In [1]:
import pickle
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [5]:
features = "mfcc_20_no_pitch_1000_rand_100_speakers"

# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at feature files produced by a trusted source.
with open(f"../../../data/extracted_features_v2/{features}.pickle", "rb") as file:
    mfcc_stats_dict = pickle.load(file)

# Sanity check: report how many samples were loaded for each reader.
for reader, reader_samples in mfcc_stats_dict.items():
    print(f"reader: {reader} | # samples {len(reader_samples)}")

reader: 1034 | # samples 1000
reader: 1069 | # samples 1000
reader: 1081 | # samples 1000
reader: 1098 | # samples 1000
reader: 118 | # samples 1000
reader: 1246 | # samples 1000
reader: 125 | # samples 1000
reader: 1263 | # samples 1000
reader: 1334 | # samples 1000
reader: 1447 | # samples 1000
reader: 1455 | # samples 1000
reader: 150 | # samples 1000
reader: 1624 | # samples 1000
reader: 1963 | # samples 1000
reader: 1970 | # samples 1000
reader: 198 | # samples 1000
reader: 2007 | # samples 1000
reader: 211 | # samples 1000
reader: 2384 | # samples 1000
reader: 2436 | # samples 1000
reader: 250 | # samples 1000
reader: 2514 | # samples 1000
reader: 26 | # samples 1000
reader: 2893 | # samples 1000
reader: 2910 | # samples 1000
reader: 2989 | # samples 1000
reader: 307 | # samples 1000
reader: 311 | # samples 1000
reader: 3240 | # samples 1000
reader: 3242 | # samples 1000
reader: 328 | # samples 1000
reader: 332 | # samples 1000
reader: 3486 | # samples 1000
reader: 374 | # sample

In [6]:
def scale_features(data_dict):
    """
    Standardizes the data in a dictionary using StandardScaler.

    A single scaler is fitted on the samples of all subjects stacked together,
    so every subject is transformed with the same per-feature statistics.

    Parameters:
        data_dict (dict): Dictionary where each key is a subject, and the value is a 2D array 
                          (samples x features). All arrays must have the same number of features.

    Returns:
        dict: A new dictionary with scaled data, maintaining the same keys, key order and
              per-subject sample counts as the input. An empty input yields an empty dictionary.
    """
    # Nothing to scale; np.vstack would otherwise raise on an empty sequence.
    if not data_dict:
        return {}

    # Materialise the values once so stacking and splitting see the same arrays.
    subject_arrays = list(data_dict.values())

    # Combine all data into a single array
    all_samples = np.vstack(subject_arrays)  # Shape: (total_samples, num_features)

    # Scale the data
    scaled_data = StandardScaler().fit_transform(all_samples)  # Shape: (total_samples, num_features)

    # Row offsets at which one subject ends and the next begins; the final
    # cumulative total is dropped because np.split wants interior cut points only.
    split_indices = np.cumsum([len(samples) for samples in subject_arrays])[:-1]
    scaled_subjects = np.split(scaled_data, split_indices)

    # Reconstruct the dictionary with scaled data (dicts preserve insertion order,
    # so keys and row blocks line up).
    return dict(zip(data_dict.keys(), scaled_subjects))

def scale_and_optimize_pca(data_dict, variance_threshold=0.95):
    """
    Scales the data in a dictionary using StandardScaler, determines the smallest
    number of PCA components whose cumulative explained variance reaches the
    threshold, and returns the PCA-projected data in the same dictionary structure.

    The scaler and the PCA are both fitted on the samples of all subjects stacked
    together. PCA is fitted once with all components and the leading columns of
    that projection are kept, which avoids a second fit over the full data set.

    Parameters:
        data_dict (dict): Dictionary where each key is a subject, and the value is a 2D array 
                          (samples x features). All arrays must have the same number of features.
        variance_threshold (float): The minimum cumulative explained variance, in (0, 1],
                                    used to determine the number of PCA components.

    Returns:
        tuple:
            dict: A new dictionary with PCA-transformed data, maintaining the same keys,
                  key order and per-subject sample counts as the input.
            int: The number of PCA components selected.
        An empty input yields ({}, 0).

    Raises:
        ValueError: If variance_threshold is not in the interval (0, 1].
    """
    # Reject thresholds that can never be met (or are trivially met); the chained
    # comparison is also False for NaN.
    if not 0.0 < variance_threshold <= 1.0:
        raise ValueError(
            f"variance_threshold must be in the interval (0, 1], got {variance_threshold!r}"
        )

    # Nothing to transform; np.vstack would otherwise raise on an empty sequence.
    if not data_dict:
        return {}, 0

    # Materialise the values once so stacking and splitting see the same arrays.
    subject_arrays = list(data_dict.values())

    # Combine all data into a single array
    all_samples = np.vstack(subject_arrays)  # Shape: (total_samples, num_features)

    # Scale the data
    standardized_data = StandardScaler().fit_transform(all_samples)

    # Fit PCA once, keeping every component, and project the data.
    pca = PCA()
    full_projection = pca.fit_transform(standardized_data)

    # Smallest component count whose cumulative explained variance meets the threshold.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    threshold_reached = cumulative_variance >= variance_threshold
    if threshold_reached.any():
        n_components = int(np.argmax(threshold_reached)) + 1
    else:
        # Floating-point round-off can leave the total just below the threshold
        # (e.g. 0.9999999 for a threshold of 1.0). argmax on an all-False mask
        # would return 0 and silently select a single component, so keep all.
        n_components = len(cumulative_variance)

    # Components are ordered by explained variance, so the leading columns are the
    # projection onto the selected components. Copy so the full-width projection
    # is not kept alive by views into it.
    pca_data = full_projection[:, :n_components].copy()  # Shape: (total_samples, n_components)

    # Split PCA-transformed data back into the original per-subject blocks.
    split_indices = np.cumsum([len(samples) for samples in subject_arrays])[:-1]
    pca_subjects = np.split(pca_data, split_indices)

    # Reconstruct the dictionary with PCA-transformed data
    pca_data_dict = dict(zip(data_dict.keys(), pca_subjects))

    return pca_data_dict, n_components


In [7]:
# Standardize and PCA-project every reader's features using the function's
# default variance threshold.
pca_transformed_data, optimal_components = scale_and_optimize_pca(mfcc_stats_dict)

# Report the shape of each reader's projected array.
for key, projected in pca_transformed_data.items():
    print(f"key: {key} shape: {projected.shape}")

key: 1034 shape: (1000, 71)
key: 1069 shape: (1000, 71)
key: 1081 shape: (1000, 71)
key: 1098 shape: (1000, 71)
key: 118 shape: (1000, 71)
key: 1246 shape: (1000, 71)
key: 125 shape: (1000, 71)
key: 1263 shape: (1000, 71)
key: 1334 shape: (1000, 71)
key: 1447 shape: (1000, 71)
key: 1455 shape: (1000, 71)
key: 150 shape: (1000, 71)
key: 1624 shape: (1000, 71)
key: 1963 shape: (1000, 71)
key: 1970 shape: (1000, 71)
key: 198 shape: (1000, 71)
key: 2007 shape: (1000, 71)
key: 211 shape: (1000, 71)
key: 2384 shape: (1000, 71)
key: 2436 shape: (1000, 71)
key: 250 shape: (1000, 71)
key: 2514 shape: (1000, 71)
key: 26 shape: (1000, 71)
key: 2893 shape: (1000, 71)
key: 2910 shape: (1000, 71)
key: 2989 shape: (1000, 71)
key: 307 shape: (1000, 71)
key: 311 shape: (1000, 71)
key: 3240 shape: (1000, 71)
key: 3242 shape: (1000, 71)
key: 328 shape: (1000, 71)
key: 332 shape: (1000, 71)
key: 3486 shape: (1000, 71)
key: 374 shape: (1000, 71)
key: 3857 shape: (1000, 71)
key: 3983 shape: (1000, 71)
key: 

In [8]:
# Persist the PCA-projected features alongside the input file, using the same
# feature-set name prefixed with "pca_".
pca_output_path = f"../../../data/extracted_features_v2/pca_{features}.pickle"
with open(pca_output_path, "wb") as file:
    pickle.dump(pca_transformed_data, file)