In [1]:
import pickle
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [35]:
# Name of the extracted-feature set; it is also reused for the output file name below.
features = "mfcc_20_no_pitch_rand"

# NOTE: pickle.load can execute arbitrary code -- only open feature files you produced yourself.
with open(f"../../data/extracted_features_v2/{features}.pickle", "rb") as pickle_file:
    mfcc_stats_dict = pickle.load(pickle_file)

# Quick overview: how many samples were loaded for each reader.
for reader in mfcc_stats_dict:
    print(f"reader: {reader} | # samples {len(mfcc_stats_dict[reader])}")

reader: 1069 | # samples 300
reader: 19 | # samples 300
reader: 201 | # samples 300
reader: 250 | # samples 300
reader: 254 | # samples 300
reader: 26 | # samples 300
reader: 27 | # samples 300
reader: 289 | # samples 300
reader: 298 | # samples 300
reader: 311 | # samples 300
reader: 32 | # samples 300
reader: 3240 | # samples 300
reader: 39 | # samples 300
reader: 40 | # samples 300
reader: 4297 | # samples 300
reader: 60 | # samples 300
reader: 78 | # samples 300
reader: 7800 | # samples 300
reader: 83 | # samples 300
reader: 87 | # samples 300


In [None]:
def scale_features(data_dict):
    """
    Standardize all subjects' samples with one jointly fitted StandardScaler.

    The per-subject arrays are stacked row-wise, a single StandardScaler is
    fitted on (and applied to) the stacked matrix, and the scaled rows are
    then cut back into per-subject chunks in the original key order.

    Parameters:
        data_dict (dict): Maps each subject key to a 2D array
                          (samples x features).

    Returns:
        dict: A new dictionary with the same keys, each mapped to that
              subject's scaled rows.
    """
    subject_arrays = list(data_dict.values())

    # Pool every subject's rows so the scaler statistics are shared by all of them.
    pooled = np.vstack(subject_arrays)  # (total_samples, num_features)
    pooled_scaled = StandardScaler().fit_transform(pooled)

    # Row offsets at which one subject ends and the next begins
    # (the final cumulative total is dropped: np.split only needs interior cut points).
    row_counts = [len(block) for block in subject_arrays]
    boundaries = np.cumsum(row_counts)[:-1]
    chunks = np.split(pooled_scaled, boundaries)

    # Dict order matches the order used for stacking, so keys and chunks line up.
    return dict(zip(data_dict.keys(), chunks))

def scale_and_optimize_pca(data_dict, variance_threshold=0.95):
    """
    Scales the data in a dictionary using StandardScaler, determines the optimal
    number of PCA components based on the explained variance threshold,
    applies PCA, and returns the transformed data in the same dictionary structure.

    The scaler and the PCA are both fitted on the rows of all subjects pooled
    together, so every subject is projected onto the same components.

    Parameters:
        data_dict (dict): Dictionary where each key is a subject, and the value is a 2D array
                          (samples x features).
        variance_threshold (float): The minimum cumulative explained variance (0 to 1)
                                    to determine the number of PCA components. If the
                                    cumulative explained variance never reaches this value
                                    (e.g. a threshold of 1.0 missed by floating-point
                                    round-off, or a threshold above 1), all components
                                    are kept.

    Returns:
        tuple:
            dict: A new dictionary with PCA-transformed data, maintaining the same structure as the input.
            int: The number of PCA components selected.
    """
    # Combine all data into a single array
    all_samples = np.vstack(list(data_dict.values()))  # Shape: (total_samples, num_features)

    # Scale the data
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(all_samples)  # Shape: (total_samples, num_features)

    # Fit a full PCA on the standardized data to obtain the explained-variance curve
    pca = PCA()
    pca.fit(standardized_data)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Smallest number of components whose cumulative explained variance meets the threshold.
    threshold_reached = cumulative_variance >= variance_threshold
    if threshold_reached.any():
        n_components = int(np.argmax(threshold_reached)) + 1
    else:
        # np.argmax of an all-False mask is 0, which would silently select a single
        # component; when the threshold is never met, keep every component instead.
        n_components = len(cumulative_variance)

    # Apply PCA with the selected number of components
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(standardized_data)  # Shape: (total_samples, n_components)

    # Split PCA-transformed data back into original structure
    split_indices = np.cumsum([len(samples) for samples in data_dict.values()])[:-1]
    pca_subjects = np.split(pca_data, split_indices)

    # Reconstruct the dictionary with PCA-transformed data
    pca_data_dict = dict(zip(data_dict.keys(), pca_subjects))

    return pca_data_dict, n_components


In [36]:
# Standardize the features, choose the component count (default threshold), and project.
pca_transformed_data, optimal_components = scale_and_optimize_pca(mfcc_stats_dict)

# Sanity check: print the shape of the transformed array stored under each key.
for key in pca_transformed_data:
    print(f"key: {key} shape: {pca_transformed_data[key].shape}")

key: 1069 shape: (300, 70)
key: 19 shape: (300, 70)
key: 201 shape: (300, 70)
key: 250 shape: (300, 70)
key: 254 shape: (300, 70)
key: 26 shape: (300, 70)
key: 27 shape: (300, 70)
key: 289 shape: (300, 70)
key: 298 shape: (300, 70)
key: 311 shape: (300, 70)
key: 32 shape: (300, 70)
key: 3240 shape: (300, 70)
key: 39 shape: (300, 70)
key: 40 shape: (300, 70)
key: 4297 shape: (300, 70)
key: 60 shape: (300, 70)
key: 78 shape: (300, 70)
key: 7800 shape: (300, 70)
key: 83 shape: (300, 70)
key: 87 shape: (300, 70)


In [37]:
# Persist the PCA-transformed features next to the input file, prefixed with "pca_".
with open(f"../../data/extracted_features_v2/pca_{features}.pickle", "wb") as out_file:
    pickle.dump(pca_transformed_data, out_file)