In [8]:
import numpy as np
import pandas as pd

# Load the data from the .npy file
data = np.load('development_numpy/development.npy', allow_pickle=True)

# Load metadata from development.csv
metadata = pd.read_csv('metadata/development.csv')

# Load feature names mapping from idx_to_feature_names.csv
feature_names_mapping = pd.read_csv('metadata/idx_to_feature_name.csv')

# Extract labels and speaker IDs from metadata
labels = metadata['word'].values  # Adjusted to use 'word' column as class labels
speaker_ids = metadata['speaker_id'].values

# Assuming the snippet IDs are sequential integers starting from 0
snippet_ids = np.arange(len(metadata))

# Check the shapes of the data and labels
print('Data shape:', data.shape)
print('Labels shape:', labels.shape)


Data shape: (45296, 175, 44)
Labels shape: (45296,)


In [9]:
from sklearn.model_selection import train_test_split

# Subsample the data (e.g., 10% of the original dataset)
subsample_fraction = 0.1
X_subsample, _, y_subsample, _, snippet_ids_subsample, _ = train_test_split(
    data, labels, snippet_ids, train_size=subsample_fraction, random_state=42)

# Check the shapes of the subsampled data
print('Subsampled data shape:', X_subsample.shape)
print('Subsampled labels shape:', y_subsample.shape)

# Flatten the feature dimensions (n_features * n_time) for use with SVM
n_samples, n_features, n_time = X_subsample.shape
X_subsample_flat = X_subsample.reshape(n_samples, n_features * n_time)

# Verify reshaped data
print('Reshaped subsampled data shape:', X_subsample_flat.shape)


Subsampled data shape: (4529, 175, 44)
Subsampled labels shape: (4529,)
Reshaped subsampled data shape: (4529, 7700)


In [10]:
# Split the subsampled data into train, validation, and test sets
X_train_val, X_test, y_train_val, y_test, snippet_ids_train_val, snippet_ids_test = train_test_split(
    X_subsample_flat, y_subsample, snippet_ids_subsample, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val, snippet_ids_train, snippet_ids_val = train_test_split(
    X_train_val, y_train_val, snippet_ids_train_val, test_size=0.5, random_state=42)

# Check the shapes after splitting
print('Training data shape:', X_train.shape)
print('Training labels shape:', y_train.shape)
print('Validation data shape:', X_val.shape)
print('Validation labels shape:', y_val.shape)
print('Test data shape:', X_test.shape)
print('Test labels shape:', y_test.shape)


Training data shape: (1585, 7700)
Training labels shape: (1585,)
Validation data shape: (1585, 7700)
Validation labels shape: (1585,)
Test data shape: (1359, 7700)
Test labels shape: (1359,)


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Verify the scaling
print('Training data mean (scaled):', np.mean(X_train_scaled))
print('Training data std (scaled):', np.std(X_train_scaled))


Training data mean (scaled): -4.266342060428542e-18
Training data std (scaled): 1.0


In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define the SVM model
svm = SVC()

# Set up the hyperparameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(svm, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Output the best hyperparameters and best score
print('Best hyperparameters (SVM):', grid_search.best_params_)
print('Best cross-validation score (SVM):', grid_search.best_score_)

# Evaluate the best SVM model on the validation and test sets
best_svm = grid_search.best_estimator_
val_score_svm = best_svm.score(X_val_scaled, y_val)
test_score_svm = best_svm.score(X_test_scaled, y_test)
print('Validation accuracy (SVM):', val_score_svm)
print('Test accuracy (SVM):', test_score_svm)


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.5s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=  14.5s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  14.6s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  17.1s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  17.2s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  17.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  19.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  19.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  19.8s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=  14.7s
[CV] END ...................C=0.1, gamma=auto, k

# Task 3

In [1]:
from sklearn.metrics import classification_report, confusion_matrix

# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    # Print classification report
    print(classification_report(y_test, y_pred))
    # Print confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)
    # Visualize the confusion matrix
    plt.figure(figsize=(14, 10))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

# Example of evaluating a model
evaluate_model(svm_model, X_test_important, y_test)


NameError: name 'svm_model' is not defined

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for SVM: {grid_search_svm.best_params_}")

# Hyperparameter tuning for k-NN
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, scoring='accuracy')
grid_search_knn.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for k-NN: {grid_search_knn.best_params_}")

# Hyperparameter tuning for Neural Network
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd']
}
grid_search_nn = GridSearchCV(MLPClassifier(max_iter=300), param_grid_nn, cv=3, scoring='accuracy')
grid_search_nn.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for Neural Network: {grid_search_nn.best_params_}")


# Task 4

In [None]:
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Hyperparameter tuning for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}
grid_search_svm = GridSearchCV(SVC(), param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for SVM: {grid_search_svm.best_params_}")

# Hyperparameter tuning for k-NN
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, scoring='accuracy')
grid_search_knn.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for k-NN: {grid_search_knn.best_params_}")

# Hyperparameter tuning for Neural Network
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd']
}
grid_search_nn = GridSearchCV(MLPClassifier(max_iter=300), param_grid_nn, cv=3, scoring='accuracy')
grid_search_nn.fit(X_train_important, y_resampled)
print(f"Best hyperparameters for Neural Network: {grid_search_nn.best_params_}")

# Function to plot the results of grid search
def plot_grid_search(cv_results, param_grid, param_names, model_name):
    scores_mean = cv_results['mean_test_score']
    scores_std = cv_results['std_test_score']
    params = cv_results['params']

    fig, ax = plt.subplots(1, len(param_names), figsize=(15, 5))
    
    for i, param_name in enumerate(param_names):
        param_values = [param[param_name] for param in params]
        ax[i].errorbar(param_values, scores_mean, yerr=scores_std, fmt='o')
        ax[i].set_title(f'{model_name}: {param_name}')
        ax[i].set_xlabel(param_name)
        ax[i].set_ylabel('Mean Test Score')

    plt.show()

# Plotting the grid search results
plot_grid_search(grid_search_svm.cv_results_, param_grid_svm, list(param_grid_svm.keys()), 'SVM')
plot_grid_search(grid_search_knn.cv_results_, param_grid_knn, list(param_grid_knn.keys()), 'k-NN')
plot_grid_search(grid_search_nn.cv_results_, param_grid_nn, list(param_grid_nn.keys()), 'Neural Network')
