In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras

from sklearn.ensemble import RandomForestClassifier  # Estimator whose feature importances drive RFE
from sklearn.feature_selection import RFE          # Recursive feature elimination
from sklearn.feature_selection import SelectKBest  # Class to select features based on univariate statistical tests
from sklearn.feature_selection import f_classif    # ANOVA F-value test for classification tasks

In [2]:
# Read the pre-extracted feature table and discard the 'filename' column in
# one chain; downstream cells cast every column except the last to float, so
# the remaining non-label columns must be numeric.
df = (
    pd.read_csv('../datasets/processed_data/features_30_sec.csv')
    .drop(columns='filename')
)

df.head()

Unnamed: 0,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [3]:
# Turn the class names held in the last column of df into integer codes.
class_list = df.iloc[:, -1]
label_encoder = LabelEncoder().fit(class_list)  # learn the set of distinct classes
y = label_encoder.transform(class_list)         # one integer code per row

In [4]:
# Scale every feature column (all columns of df except the trailing label)
# to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows at once. If X is later split
# into train/test sets, fit the scaler on the training portion only.
X = StandardScaler().fit_transform(
    df.iloc[:, :-1].to_numpy(dtype=float)
)

In [5]:
# Rebuild a labelled DataFrame from the standardized array: the columns reuse
# the original feature names, and the encoded labels are appended as 'label'.
X_df = pd.DataFrame(X, columns=df.columns[:-1]).assign(label=y)

In [6]:
# Univariate selection on the standardized data: score each feature in X
# against y with the ANOVA F-test and keep the k best.
k = 15  # Number of features to select
# Only the fitted selector is needed here, so fit() is used; the reduced
# matrix that fit_transform() would return is never consumed.
selector = SelectKBest(score_func=f_classif, k=k).fit(X, y)

# Translate the boolean support mask into feature names (every column of df
# except the trailing label column).
selected_features = df.columns[:-1][selector.get_support()]
print(selected_features)

Index(['chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
       'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'rolloff_mean', 'zero_crossing_rate_var',
       'perceptr_var', 'mfcc1_mean', 'mfcc2_mean', 'mfcc4_mean', 'mfcc6_mean',
       'mfcc8_mean'],
      dtype='object')


In [7]:
# Same univariate ANOVA F-test selection, run on the unscaled feature columns
# of df with the 'label' column as the target.
# The printed selection is identical to the standardized run: the F-statistic
# does not change when a feature is shifted or rescaled.
k = 15  # Number of features to select
selector = SelectKBest(score_func=f_classif, k=k).fit(
    df.drop(columns='label'), df['label']
)

# Translate the boolean support mask into feature names.
selected_features = df.columns[:-1][selector.get_support()]
print(selected_features)

Index(['chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
       'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'rolloff_mean', 'zero_crossing_rate_var',
       'perceptr_var', 'mfcc1_mean', 'mfcc2_mean', 'mfcc4_mean', 'mfcc6_mean',
       'mfcc8_mean'],
      dtype='object')


In [10]:
# Find the 10 most important features for the RandomForestClassifier using
# recursive feature elimination (RFE): the forest is refit repeatedly and the
# least important features are pruned until 10 remain.

# Build the feature frame once; it is used both for fitting and for mapping
# the support mask back to column names.
rfe_features = df.drop(columns='label')

# A fixed random_state makes the forest, and therefore the selected feature
# set, reproducible across kernel restarts. Without it every run may print a
# different list.
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(rfe_features, df['label'])

# Print the selected features
selected_features = rfe_features.columns[fit.support_]
print(selected_features)

Index(['chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
       'spectral_centroid_var', 'spectral_bandwidth_mean', 'perceptr_var',
       'mfcc4_mean', 'mfcc5_var', 'mfcc9_mean'],
      dtype='object')


In [8]:
# Correlation screening on the raw features plus the integer-encoded label:
#   1. list features that are highly correlated with another feature;
#   2. list the features most correlated with the label.
CORR_THRESHOLD = 0.85   # |r| above which a feature is reported as redundant
TOP_N_LABEL_CORR = 5    # number of label-correlated features to report

# Work on a copy that carries the encoded label instead of assigning into
# df['label']. Overwriting the shared frame would replace the original class
# names, so re-running the label-encoding cell afterwards would fit the
# encoder on integers and lose the names.
df_encoded = df.assign(label=y)

# Absolute pairwise Pearson correlations between all columns.
corr_matrix = df_encoded.corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal) so
# that each pair of columns is considered exactly once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than the threshold against any
# column that precedes them. The 'label' column is left out of this screen:
# it is the target, not a candidate feature to drop.
feature_upper = upper.drop(columns='label')
to_drop = feature_upper.columns[(feature_upper > CORR_THRESHOLD).any()].tolist()
print(to_drop)

# Extract correlations with the 'label' column.
# NOTE(review): y holds LabelEncoder integer codes for nominal classes, so
# these correlations depend on the arbitrary ordering of the codes; treat the
# ranking as a rough heuristic.
correlations_with_last_column = corr_matrix['label']
# Sort the correlations in descending order while excluding the label itself
sorted_correlations = correlations_with_last_column.drop('label').sort_values(ascending=False)

# Get the names of the features most correlated with the label
sorted_column_names = sorted_correlations.index.tolist()[:TOP_N_LABEL_CORR]

print(sorted_column_names)

['spectral_bandwidth_mean', 'rolloff_mean', 'rolloff_var', 'zero_crossing_rate_mean', 'harmony_var', 'mfcc2_mean', 'mfcc20_var']
['spectral_bandwidth_mean', 'rolloff_mean', 'spectral_centroid_mean', 'chroma_stft_mean', 'mfcc2_mean']


In [9]:
# NOTE: this cell is intentionally disabled. It is a variant of the
# correlation screening that reads the standardized frame X_df instead of df
# and reports the top 6 label-correlated features. Pearson correlation is
# unchanged by standardizing the columns, so it should flag the same features
# as the screening on df (TODO confirm, then delete this cell).

# corr_matrix = X_df.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# # Find features with correlation greater than 0.85
# to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
# print(to_drop)

# # Extract correlations with the last column (i.e., 'label' column)
# correlations_with_last_column = corr_matrix['label']
# # Sort the correlations in descending order while excluding the last column itself
# sorted_correlations = correlations_with_last_column.drop('label').sort_values(ascending=False)

# # Get the sorted column names
# sorted_column_names = sorted_correlations.index.tolist()[:6]

# print(sorted_column_names)