In [2]:
import pickle
import numpy as np

from sklearn.decomposition import PCA

In [9]:
# Feature-set name: selects the input pickle below and is reused later to
# build the output filename.
features = "mfcc_20_no_pitch_rand"
input_path = f"../../data/extracted_features_v2/{features}.pickle"

# NOTE: pickle.load can execute arbitrary code embedded in the file, so this
# must only ever be pointed at trusted, locally produced feature dumps.
with open(input_path, "rb") as file:
    mfcc_stats_dict = pickle.load(file)

# Sanity check: report how many samples were loaded for each reader key.
for reader, samples in mfcc_stats_dict.items():
    print(f"reader: {reader} | # samples {len(samples)}")

reader: 1069 | # samples 300
reader: 19 | # samples 300
reader: 201 | # samples 300
reader: 250 | # samples 300
reader: 254 | # samples 300
reader: 26 | # samples 300
reader: 27 | # samples 300
reader: 289 | # samples 300
reader: 298 | # samples 300
reader: 311 | # samples 300
reader: 32 | # samples 300
reader: 3240 | # samples 300
reader: 39 | # samples 300
reader: 40 | # samples 300
reader: 4297 | # samples 300
reader: 60 | # samples 300
reader: 78 | # samples 300
reader: 7800 | # samples 300
reader: 83 | # samples 300
reader: 87 | # samples 300


In [4]:
def apply_pca(data, n_components=None):
    """
    Fit a PCA model on a 2D dataset and project the data onto its components.

    Parameters:
    - data (numpy.ndarray): 2D array with shape (n_instances, n_features).
        Anything that is not a numpy array (e.g. a nested list) is rejected.
    - n_components (int, float, or None): Passed unchanged to
        `sklearn.decomposition.PCA`.
        If None, all components are kept.
        If float between 0 and 1, enough components are kept so that the
        explained variance exceeds that fraction.

    Returns:
    - transformed_data (numpy.ndarray): `data` expressed in the principal-component basis.
    - explained_variance_ratio (numpy.ndarray): Fraction of variance explained by each component.
    - pca_model (PCA): The fitted model, reusable for transforming other data.

    Raises:
    - ValueError: If `data` is not a numpy array or does not have exactly two dimensions.
    """
    # Short-circuits on the isinstance check, so `.ndim` is only read on real arrays.
    is_2d_array = isinstance(data, np.ndarray) and data.ndim == 2
    if not is_2d_array:
        raise ValueError("Input data must be a 2D numpy array.")

    # Fitting and projecting happen in a single pass over the data.
    pca_model = PCA(n_components=n_components)
    projected = pca_model.fit_transform(data)

    return projected, pca_model.explained_variance_ratio_, pca_model

def apply_pca_wrapper(data_dict, n_components=None):
    """
    Run `apply_pca` independently on every entry of a dictionary.

    A separate PCA model is fitted for each key, using only that key's data.

    NOTE(review): because every key gets its own fit, column j of one key's
    output is generally not the same direction in feature space as column j
    of another key's output. Confirm that downstream code does not compare
    columns across keys.

    Parameters:
    - data_dict (dict): Maps each key to a 2D list or numpy array of shape
        (n_instances, n_features).
    - n_components (int, float, or None): Number of principal components to keep.
        Forwarded to `apply_pca`; see there for details.

    Returns:
    - transformed_dict (dict): Same keys as `data_dict`; values are the
        PCA-transformed 2D arrays.
    - pca_models (dict): Same keys as `data_dict`; values are the fitted PCA models.

    Raises:
    - ValueError: If the data for any key is not 2D once converted to a numpy array.
    """
    transformed_dict, pca_models = {}, {}

    for key, raw_values in data_dict.items():
        # Nested lists are accepted here; apply_pca itself only takes ndarrays.
        matrix = np.array(raw_values)

        # Validate here so the error message can name the offending key.
        if matrix.ndim != 2:
            raise ValueError(f"Data for key '{key}' is not 2D.")

        # The explained-variance ratio is not needed by callers of the wrapper.
        projected, _ratio, fitted_model = apply_pca(matrix, n_components)

        transformed_dict[key] = projected
        pca_models[key] = fitted_model

    return transformed_dict, pca_models

In [11]:
# Fit one PCA per key; n_components is left at its default (None).
pca_dictionary, pca_models = apply_pca_wrapper(mfcc_stats_dict)

# Sanity check: print the shape of each transformed array.
for key, projected in pca_dictionary.items():
    print(f"key: {key} shape: {projected.shape}")


key: 1069 shape: (300, 140)
key: 19 shape: (300, 140)
key: 201 shape: (300, 140)
key: 250 shape: (300, 140)
key: 254 shape: (300, 140)
key: 26 shape: (300, 140)
key: 27 shape: (300, 140)
key: 289 shape: (300, 140)
key: 298 shape: (300, 140)
key: 311 shape: (300, 140)
key: 32 shape: (300, 140)
key: 3240 shape: (300, 140)
key: 39 shape: (300, 140)
key: 40 shape: (300, 140)
key: 4297 shape: (300, 140)
key: 60 shape: (300, 140)
key: 78 shape: (300, 140)
key: 7800 shape: (300, 140)
key: 83 shape: (300, 140)
key: 87 shape: (300, 140)


In [15]:
# Persist the transformed arrays next to the input features, prefixed with "pca_".
# Only the projected data is saved; the fitted models in `pca_models` are not written.
pca_output_path = f"../../data/extracted_features_v2/pca_{features}.pickle"
with open(pca_output_path, "wb") as file:
    pickle.dump(pca_dictionary, file)