In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------
# Feature array stored in NumPy format; it is unpacked further down as a 3-D
# array of (samples, features, time steps).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects. It
# is only required for object-dtype arrays -- confirm it is needed for this
# file and that the file comes from a trusted source.
data = np.load('development_numpy/development.npy', allow_pickle=True)

# Per-snippet metadata table.
metadata = pd.read_csv('metadata/development.csv')

# Contents of idx_to_feature_name.csv (presumably a feature-index -> name
# lookup; it is not used anywhere else in this block).
feature_names_mapping = pd.read_csv('metadata/idx_to_feature_name.csv')

# ---------------------------------------------------------------------------
# Targets and bookkeeping columns
# ---------------------------------------------------------------------------
# The 'word' column provides the class labels.
labels = metadata['word'].values
speaker_ids = metadata['speaker_id'].values

# Row positions 0..N-1 stand in for snippet IDs; this assumes metadata rows
# are in the same order as the samples in `data`.
snippet_ids = np.arange(metadata.shape[0])

# Report the raw shapes before any reshaping.
for _caption, _array in (
    ('Data shape:', data),
    ('Labels shape:', labels),
):
    print(_caption, _array.shape)

# ---------------------------------------------------------------------------
# Flatten each sample to one feature vector (needed for the SVM)
# ---------------------------------------------------------------------------
n_samples, n_features, n_time = data.shape
X = data.reshape((n_samples, n_features * n_time))

print('Reshaped data shape:', X.shape)

# ---------------------------------------------------------------------------
# Train / validation / test partitioning
# ---------------------------------------------------------------------------
# Stage 1: hold out 30% of all samples as the test set.
(
    X_train_val, X_test,
    y_train_val, y_test,
    snippet_ids_train_val, snippet_ids_test,
) = train_test_split(
    X, labels, snippet_ids, test_size=0.3, random_state=42)

# Stage 2: cut the remaining 70% in half, so training and validation each
# receive roughly 35% of the full dataset.
(
    X_train, X_val,
    y_train, y_val,
    snippet_ids_train, snippet_ids_val,
) = train_test_split(
    X_train_val, y_train_val, snippet_ids_train_val,
    test_size=0.5, random_state=42)

# Report the shape of every partition.
for _caption, _array in (
    ('Training data shape:', X_train),
    ('Training labels shape:', y_train),
    ('Validation data shape:', X_val),
    ('Validation labels shape:', y_val),
    ('Test data shape:', X_test),
    ('Test labels shape:', y_test),
):
    print(_caption, _array.shape)

Data shape: (45296, 175, 44)
Labels shape: (45296,)
Reshaped data shape: (45296, 7700)
Training data shape: (15853, 7700)
Training labels shape: (15853,)
Validation data shape: (15854, 7700)
Validation labels shape: (15854,)
Test data shape: (13589, 7700)
Test labels shape: (13589,)
