In [1]:
import librosa
import librosa.display
from librosa.core import load
import numpy as np
import pandas as pd
import glob
import os, sys
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif

VERSION = 4 # input file version

## Load data

In [2]:
# master_data = pd.read_csv(f'./dataset_info_combined_V{VERSION}.csv')
df_joint_train = pd.read_csv(f'./features/cache_all_features_train_V{VERSION}.csv')
feature_column_names = [i for i in df_joint_train.columns \
                        if i not in ['file_path','renamed_file_path','split','sentiment_value','emotional_category']]

print("shape of train set: ", df_joint_train.shape)

X = df_joint_train[feature_column_names]
y_s = df_joint_train['sentiment_value']
y_e = df_joint_train['emotional_category']

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_e_num = label_encoder.fit_transform(y_e)

shape of train set:  (10981, 1550)


### ANOVA F Test

In [3]:
top_N = 100

# ANOVA F-test
anova_selector = SelectKBest(f_classif, k=min(top_N, X.shape[1]))
selected_features_anova = anova_selector.fit_transform(X, y_e)

# Get p-values
_, p_values = f_classif(X, y_e)
print("p_values: ", p_values)
significant_features_id = np.where(p_values<0.05)[0]
print(f"based on p value  {len(significant_features_id)} significant_features")
# Or you can print the F-scores
scores = anova_selector.scores_
print("scores: ", scores)

# Optionally, you can sort by p-values and select features with the lowest ones manually
sorted_idx = np.argsort(p_values)
significant_features_id_topn  = sorted_idx[:top_N]
print(f"{top_N} significant_features: ", [feature_column_names[idx] for idx in significant_features_id_topn])

p_values:  [0.00000000e+000 3.37473741e-294 0.00000000e+000 ... 1.24653171e-179
 6.50236372e-047 5.50686335e-044]
based on p value  1545 significant_features
scores:  [610.51256069 210.31431021 372.5802621  ... 126.49827078  33.80220847
  31.79234655]
100 significant_features:  ['mfcc_mean_1', 'mfcc_p10_4', 'mfcc_p10_5', 'mfcc_p10_11', 'mfcc_median_33', 'mfcc_p10_13', 'Pitch_pitch_slope_without_octave_jumps', 'Pitch_mean_absolute_pitch_slope', 'Pitch_q3_pitch', 'Pitch_median_intensity', 'Pitch_q1_pitch', 'mfcc_p10_3', 'Pitch_stddev_pitch', 'Pitch_max_pitch', 'mfcc_p90_1', 'mfcc_median_11', 'mfcc_std_15', 'mfcc_p90_20', 'mfcc_p90_21', 'mfcc_median_5', 'mfcc_median_4', 'mfcc_median_3', 'mfcc_median_1', 'Pitch_mean_pitch', 'mfcc_p10_1', 'GNE_max_gne', 'GNE_mean_gne', 'mfcc_std_14', 'mfcc_std_13', 'mfcc_std_12', 'mfcc_std_11', 'mfcc_std_5', 'mfcc_std_4', 'mfcc_std_3', 'mfcc_std_1', 'mfcc_std_17', 'mfcc_std_18', 'mfcc_std_19', 'mfcc_std_20', 'mfcc_std_21', 'mfcc_std_22', 'mfcc_std_23', 'Spe

### Coorelation +  mutal info + feature-feature correlation

In [4]:
# Create an array to store correlations
correlations = np.zeros(X.shape[1])  
for i in range(X.shape[1]):
    correlations[i] = np.corrcoef(X.iloc[:, i], y_e_num)[0, 1]
print("Correlation coefficients between each feature and the labels:\n", correlations)

# to compute some form of association between categorical labels and numerical features, 
# you could consider using chi-squared tests for independence or mutual information scores.
from sklearn.feature_selection import mutual_info_classif
mutual_info_scores = mutual_info_classif(X, y_e)
print("Mutual Information scores:\n", mutual_info_scores)

# For feature-feature correlation
feature_corr_matrix = np.corrcoef(X.T)

# print("\nFeature-feature correlation matrix:\n", feature_corr_matrix)

Correlation coefficients between each feature and the labels:
 [-0.31875965  0.1874767   0.15073757 ...  0.1723755   0.09586706
 -0.09181611]
Mutual Information scores:
 [0.26351935 0.09047474 0.15920847 ... 0.05468904 0.02472287 0.02471631]


#### correlation plot (must select part of features) 

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Create a mask to hide the upper triangle since it's redundant with the lower triangle
# mask = np.triu(np.ones_like(feature_corr_matrix, dtype=bool))

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(10, 8))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)

# # Make sure the length of column_name_original matches the number of features in your correlation matrix
# assert len(column_names_origin) == feature_corr_matrix.shape[0]

# # Generate the heatmap with custom labels and show the diagonal
# sns.heatmap(feature_corr_matrix,  mask=mask, cmap=cmap, vmax=.3, center=0, annot=False,
#             square=True, linewidths=.5, cbar_kws={"shrink": .2},
#             xticklabels=column_names_origin, yticklabels=column_names_origin)

# # Rotate the x-axis labels for better readability
# plt.xticks(rotation=45, fontsize='small')
# plt.yticks(fontsize='small')

# plt.show()

### Generate ranking table using emotion labels

In [5]:
# ANOVA F-test with p-values
anova_selector = SelectKBest(f_classif, k='all')
anova_selector.fit(X, y_e)
p_values = anova_selector.pvalues_
f_scores = anova_selector.scores_
is_significant_p = (p_values < 0.05).astype(int)  
# ranking_by_f_score = np.argsort(f_scores)

# Combine results into a single dataframe
result_df = pd.DataFrame({
    'feature': feature_column_names,
    'p_value': p_values,
    'is_significant': is_significant_p,
    'f_score': f_scores,
    'correlation_coeff': correlations,
    'mutual_info_score': mutual_info_scores
})

result_df = result_df.round(4).sort_values(by='f_score',ascending=False)
# result_df.head()
result_df.to_csv(f'./features/relationship_stats_test_result_emotion_all_V{VERSION}.csv',index = False)

### Use Sentiment Label

In [6]:
# Create an array to store correlations
correlations = np.zeros(X.shape[1])  
for i in range(X.shape[1]):
    correlations[i] = np.corrcoef(X.iloc[:, i], y_s)[0, 1]

from sklearn.feature_selection import mutual_info_classif
mutual_info_scores = mutual_info_classif(X, y_s)
print("Mutual Information scores:\n", mutual_info_scores)

# ANOVA F-test with p-values
anova_selector = SelectKBest(f_classif, k='all')
anova_selector.fit(X, y_s)
p_values = anova_selector.pvalues_
f_scores = anova_selector.scores_
is_significant_p = (p_values < 0.05).astype(int)  

result_df = pd.DataFrame({
    'feature': feature_column_names,
    'p_value': p_values,
    'is_significant': is_significant_p,
    'f_score': f_scores,
    'correlation_coeff': correlations,
    'mutual_info_score': mutual_info_scores
})

result_df = result_df.round(4).sort_values(by='f_score',ascending=False)

result_df.to_csv(f'./features/relationship_stats_test_result_sentiment_all_V{VERSION}.csv',index = False)
result_df.head()

Mutual Information scores:
 [0.07998662 0.02063289 0.03424365 ... 0.01581673 0.         0.        ]


Unnamed: 0,feature,p_value,is_significant,f_score,correlation_coeff,mutual_info_score
1510,Pitch_pitch_slope_without_octave_jumps,0.0,1,1290.2384,0.301,0.115
1505,Pitch_stddev_pitch,0.0,1,872.1901,0.2545,0.0979
1509,Pitch_mean_absolute_pitch_slope,0.0,1,860.4551,0.2461,0.0912
1508,Pitch_q3_pitch,0.0,1,853.84,0.1747,0.1036
92,mfcc_median_33,0.0,1,721.284,0.1222,0.0707


In [7]:
result_df.describe()

Unnamed: 0,p_value,is_significant,f_score,correlation_coeff,mutual_info_score
count,1546.0,1546.0,1546.0,1546.0,1546.0
mean,0.011053,0.96119,53.687398,-0.015961,0.036666
std,0.06789,0.193204,87.77936,0.054504,0.019041
min,0.0,0.0,0.0827,-0.2045,0.0
25%,0.0,1.0,14.2832,-0.047075,0.020525
50%,0.0,1.0,31.1346,-0.023,0.0359
75%,0.0,1.0,51.9576,0.006675,0.050475
max,0.9206,1.0,1290.2384,0.301,0.115
