## Engineering Notebook

---

In this notebook, we analyze the extracted features, assessing the necessity of normalization. We also investigate possible feature selection techniques to reduce the dimensionality of the data.
The sections are organized as follows:

1. [Load Data](#Load-Data)
2. [Feature Analysis](#2.-Feature-Analysis)
    1. [Visualize Features](#2.1.-Visualize-Features)
    2. [Feature Correlation](#2.2.-Feature-Correlation)
3. [Covariance Analysis](#3-covariance-matrix-of-the-groups)
4. [Feature Selection](#4.-Feature-Selection)
5. [Outliers Detection](#5.-Outliers-Detection)
6. [Feature Distribution](#6-Feature-Distribution)
7. [PCA](#7-PCA)
8. [Save the data](#8-Save-Data)



In [6]:
# import all the functions
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
import sys
from scipy.stats import spearmanr
import seaborn as sns

sys.path.append("../")
import pandas as pd
from sklearn.cluster import KMeans

In [7]:
# Location of the extracted-feature file and the column name of every feature.
FOLDER = "../../features/balanced/both/"
FILE_PATH = f"{FOLDER}full_data_both_bal_2s_4000hz_30mfcc_12chroma_70cqt_41rms_41zcr_41sc_61sb_41sr.npy"

# One name per feature column, numbered from 1 inside each feature family.
# The (prefix, count) pairs mirror the counts encoded in the file name above.
feature_names = [
    f"{prefix} {position}"
    for prefix, count in (
        ("MFCC", 30),
        ("Chroma", 12),
        ("CQT", 70),
        ("RMS", 41),
        ("ZCR", 41),
        ("SC", 41),
        ("SB", 61),
        ("SR", 41),
    )
    for position in range(1, count + 1)
]

In [8]:
def remove_features(data, data_df, features_to_drop):
    """Return a copy of ``data`` with the given feature columns removed from every "X".

    Parameters:
        data (dict): Mapping of split name -> dict that holds at least an "X"
            2-D array whose columns follow the order of ``data_df.columns``.
        data_df (pd.DataFrame): Frame whose column labels name the columns of "X".
        features_to_drop (iterable): Column labels to remove.

    Returns:
        dict: New mapping with the same keys; each split is a shallow copy of the
        original with "X" replaced by the reduced array. ``data`` itself is left
        untouched, so the function can be called repeatedly on the same input.

    Raises:
        KeyError: If a requested feature is not a column of ``data_df``.
    """
    names = list(features_to_drop)
    indexes = data_df.columns.get_indexer(names)

    # get_indexer marks unknown labels with -1, which np.delete would interpret
    # as "the last column" and silently drop the wrong feature.
    missing = [name for name, position in zip(names, indexes) if position < 0]
    if missing:
        raise KeyError(f"Features not found in data_df columns: {missing}")

    print("Removing features from the dataset")
    filtered_data = {}
    for key, split in data.items():
        # Shallow-copy the split so the caller's dictionary is not modified.
        filtered_split = dict(split)
        filtered_split["X"] = np.delete(split["X"], indexes, axis=1)
        print(filtered_split["X"].shape)
        filtered_data[key] = filtered_split
    return filtered_data


def read_file(file_path: str, feature_names: list, split: str = "train"):
    """Load the pickled feature dictionary and tabulate one of its splits.

    Parameters:
        file_path (str): Path to a ``.npy`` file that stores a pickled dict of
            splits; the requested split must provide "X" (2-D) and "y" arrays.
        feature_names (list): Names of the columns of "X", in order.
        split (str): Key of the split to convert into a DataFrame.
            Defaults to "train".

    Returns:
        tuple: ``(data, data_df)`` where ``data`` is the full dictionary as loaded
        and ``data_df`` is a DataFrame with one column per feature plus a final
        "label" column built from the split's "y".
    """
    # NOTE: allow_pickle=True unpickles arbitrary objects, which can execute code.
    # Only load files that come from a trusted source.
    data = np.load(file_path, allow_pickle=True).item()
    subset = data[split]

    # Append the labels as the last column so features and target share one frame.
    X = subset["X"]
    y = subset["y"].reshape(-1, 1)
    dataset = np.concatenate((X, y), axis=1)

    # list(...) lets callers pass any sequence of names, not only a list.
    data_df = pd.DataFrame(dataset, columns=list(feature_names) + ["label"])

    return data, data_df

def get_target_correlation(data_df, target="label"):
    """Compute the Spearman correlation of every column with the target column.

    Parameters:
        data_df (pd.DataFrame): Frame containing the features and the target.
        target (str): Name of the target column.

    Returns:
        pd.DataFrame: Indexed by "Feature" (one row per column of ``data_df``,
        the target column included) with the columns "Correlazione"
        (Spearman coefficient) and "P-value".
    """
    rows = []
    # One Spearman test per column against the target.
    for column in data_df.columns:
        rho, p_val = spearmanr(data_df[column], data_df[target])
        rows.append((column, rho, p_val))

    result = pd.DataFrame(rows, columns=["Feature", "Correlazione", "P-value"])
    return result.set_index("Feature")


def remove_highly_correlated_features(
    correlation_matrix_comp: pd.DataFrame,
    correlation_matrix_no_target: pd.DataFrame,
    target_variable: str = "label",
    threshold: float = 0.7,
    max_corr_count: int = 4,
    absolute: bool = False,
) -> list:
    """
    Select redundant features to remove, based on the correlation matrix.

    For every feature whose group of correlated features (correlation above
    ``threshold``) has more than ``max_corr_count`` members, the member of the
    group with the largest absolute correlation with the target is kept and all
    other members are marked for removal.

    Parameters:
        correlation_matrix_comp (pd.DataFrame): Complete correlation matrix including the target variable.
        correlation_matrix_no_target (pd.DataFrame): Correlation matrix without the target variable.
            Assumed symmetric, as a correlation matrix is.
        target_variable (str): The name of the target variable.
        threshold (float): Threshold above which features are considered highly correlated.
        max_corr_count (int): Largest group size that is tolerated. The group is counted
            on the feature's own column, so when the diagonal is 1 and
            ``threshold`` < 1 the feature itself is included in the count.
        absolute (bool): If True, compare the absolute correlation with ``threshold`` so
            strongly negative correlations also count. Defaults to False, which
            only considers positive correlations.

    Returns:
        List[str]: Features to remove, listed in the row order of
        ``correlation_matrix_no_target`` so the result is reproducible across runs.
    """
    strength = (
        correlation_matrix_no_target.abs() if absolute else correlation_matrix_no_target
    )
    # Absolute correlation with the target decides which member of a group survives.
    target_strength = correlation_matrix_comp[target_variable].abs()
    features_to_remove = set()

    for feature in strength.columns:
        # Features whose correlation with `feature` exceeds the threshold.
        above = strength[feature] > threshold
        if above.sum() <= max_corr_count:
            continue

        candidates = above.index[above]
        best = target_strength.loc[candidates].idxmax()
        features_to_remove.update(candidates.difference([best]))

    print(f" {len(features_to_remove)} features should be removed")
    # A set has no stable order; follow the matrix order instead.
    return [name for name in strength.index if name in features_to_remove]

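Remove features that have a low correlation with the target variable, as well as features that are highly correlated with each other. The cell below sweeps over a grid of thresholds and saves one filtered dataset per combination.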

In [9]:
# Grid explored by the sweep:
#   thresh1 - features whose |correlation with the target| is <= th1 are dropped
#   thresh2 - feature-to-feature correlation above which features count as redundant
#   n_features_correlated - group size above which a redundant group is pruned
thresh1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
thresh2 = [0.6, 0.7, 0.8, 0.9, 1]
n_features_correlated = [5, 10, 15, 20, 25, 30, 40]

feat_names = ["MFCC", "Chroma", "CQT", "RMS", "ZCR", "SC", "SB", "SR"]

# Everything below depends only on the input file, not on the thresholds, so it
# is computed once instead of once per grid point.
data, data_df = read_file(FILE_PATH, feature_names)
correlazione_df = get_target_correlation(data_df, target="label")
correlation_matrix = data_df.corr(method="spearman")
# Correlations are pairwise, so the feature-only matrix is the full matrix
# without the target's row and column.
correlation_matrix_no_target = correlation_matrix.drop(index="label", columns="label")

# np.save does not create missing directories.
os.makedirs(os.path.join(FOLDER, "thresholds"), exist_ok=True)

for th1 in thresh1:
    for th2 in thresh2:
        for n_feat in n_features_correlated:
            # Get features slightly correlated with the target variable
            features_slightly_corr = correlazione_df[
                np.abs(correlazione_df["Correlazione"]) <= th1
            ].index
            print(len(features_slightly_corr))

            # Get features highly correlated with each other
            highly_corr_each = remove_highly_correlated_features(
                correlation_matrix,
                correlation_matrix_no_target,
                threshold=th2,
                max_corr_count=n_feat,
            )

            # Get the features to drop (sorted so every run uses the same order)
            features_to_drop = sorted(
                set(features_slightly_corr).union(highly_corr_each)
            )
            print("Number of features to drop : ", len(features_to_drop))

            # Remove the features from the dataset. A fresh shallow copy of each
            # split is handed over so the loaded data stays intact for the next
            # grid point even if the helper modifies its argument.
            data_copy = {key: dict(split) for key, split in data.items()}
            filtered_data = remove_features(data_copy, data_df, features_to_drop)
            filtered_df = data_df.drop(columns=features_to_drop)

            # Get the number of features for each category. Match on the
            # "<family> " prefix so one family name can never match another.
            feat_count = {
                name: sum(col.startswith(f"{name} ") for col in filtered_df.columns)
                for name in feat_names
            }

            print("Number of features for each category : \n", feat_count)
            print("number of features kept : ", filtered_df.columns)
            features = filtered_df.drop(columns="label").columns

            # Save the filtered data
            filtered_data["features"] = features
            print("Saving filtered data")
            # Construct the full file path for the current feature file
            save_file_path = os.path.join(
                FOLDER,
                f"thresholds/full_data_filtered_2s_4000hz_{th1}_{th2}_{n_feat}.npy",
            )
            np.save(save_file_path, filtered_data)

0
 312 features should be removed
Number of features to drop :  312
Removing features from the dataset
(4920, 25)
(568, 25)
Number of features for each category : 
 {'MFCC': 22, 'Chroma': 1, 'CQT': 1, 'RMS': 0, 'ZCR': 1, 'SC': 0, 'SB': 0, 'SR': 0}
number of features kept :  Index(['MFCC 2', 'MFCC 3', 'MFCC 11', 'MFCC 12', 'MFCC 13', 'MFCC 14',
       'MFCC 15', 'MFCC 16', 'MFCC 17', 'MFCC 18', 'MFCC 19', 'MFCC 20',
       'MFCC 21', 'MFCC 22', 'MFCC 23', 'MFCC 24', 'MFCC 25', 'MFCC 26',
       'MFCC 27', 'MFCC 28', 'MFCC 29', 'MFCC 30', 'Chroma 2', 'CQT 7',
       'ZCR 5', 'label'],
      dtype='object')
Saving filtered data
0
 301 features should be removed
Number of features to drop :  301
Removing features from the dataset
(4920, 36)
(568, 36)
Number of features for each category : 
 {'MFCC': 22, 'Chroma': 12, 'CQT': 1, 'RMS': 0, 'ZCR': 1, 'SC': 0, 'SB': 0, 'SR': 0}
number of features kept :  Index(['MFCC 2', 'MFCC 3', 'MFCC 11', 'MFCC 12', 'MFCC 13', 'MFCC 14',
       'MFCC 15', 'M