In [1]:
import numpy as np
import pandas as pd
import glob
import os, sys
# import matplotlib.pyplot as plt
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
VERSION = 4
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier


### Load Data

In [2]:
df_joint_train_low = pd.read_csv(f'./features/merged_file_v3.csv')
df_joint_train_aug = pd.read_csv(f'./features/cache_train_V4_augmented.csv')
print("shape of train set: ", df_joint_train_low.shape)
# df_joint_test = pd.read_csv(f'./features/cache_all_features_test_V{VERSION}.csv') # not balanced
df_joint_test = pd.read_csv(f'./features/cache_test_V4_resampled_250.csv')
print("shape of test  set: ", df_joint_test.shape)
feature_column_names = [i for i in df_joint_train_aug.columns \
                        if i not in ['file_path','renamed_file_path','split','sentiment_value','emotional_category']]

shape of train set:  (7648, 1546)
shape of test  set:  (680, 1546)


In [3]:
df_joint_train_low.groupby('sentiment_value')['file_path'].count()

sentiment_value
-1    4970
 0     852
 1    1826
Name: file_path, dtype: int64

In [4]:
X_train = df_joint_train_aug[feature_column_names]
X_train_low = df_joint_train_low[feature_column_names]
y_train_s = df_joint_train_aug['sentiment_value']
y_train_e = df_joint_train_aug['emotional_category']
y_train_low_s = df_joint_train_low['sentiment_value']
y_train_low_e = df_joint_train_low['emotional_category']

# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y_e_num = label_encoder.fit_transform(y_train_e)

X_test = df_joint_test[feature_column_names]
y_test_s = df_joint_test['sentiment_value']
y_test_e = df_joint_test['emotional_category']

# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y_test_e_num = label_encoder.fit_transform(y_test_e)

In [5]:
### generate selected features 
# PREFIXES = {'mfcc': 60,'chroma': 12, 'mel32': 32, 'mel64': 64,'mel128': 128, 'zcr': 1, 'rms': 1}
def generate_selected_features_by_type(feature_column_names,input,stats,number=1):
    selected_result = []
    for name in feature_column_names:
        if input+"_"+stats in name:
            selected_result.append(name)
    if number < len(selected_result):
        selected_result = selected_result[:number]
    return selected_result

# example to take mfcc 20 mean & std; mel32; zcr all 5 stats features
feature_MFCC20_mean = generate_selected_features_by_type(feature_column_names,"mfcc","mean",20)
feature_MFCC20_std  = generate_selected_features_by_type(feature_column_names,"mfcc","std",20)
feature_mel32_stats = generate_selected_features_by_type(feature_column_names,"mel32","",32*5)
feature_zcr_stats   = generate_selected_features_by_type(feature_column_names,"zcr","",5)
feature_rms_stats   = generate_selected_features_by_type(feature_column_names,"rms","",5)

In [6]:
# # REF: previous study selected prosody features
selected_spect = ['Spectrum_band_energy_difference','Spectrum_band_density_difference','Spectrum_center_of_gravity_spectrum','Spectrum_skewness_spectrum','Spectrum_kurtosis_spectrum', 'Spectrum_stddev_spectrum','Spectrum_band_density', 'Spectrum_band_energy']
selected_formant = ['Formant_f1_mean','Formant_f1_median','Formant_f3_mean','Formant_fitch_vtl','Formant_mff','Formant_formant_dispersion']
selected_GNE = ['GNE_max_gne', 'GNE_stddev_gne', 'GNE_mean_gne', 'GNE_sum_gne']
selected_pitch = ['Pitch_pitch_slope_without_octave_jumps', 'Pitch_q3_pitch','Pitch_stddev_pitch', 'Pitch_mean_absolute_pitch_slope','Pitch_mean_pitch', 'Pitch_max_pitch', 'Pitch_q1_pitch', 'Pitch_min_pitch']
selected_intensity = ['Intensity_max_intensity', 'Intensity_q3_intensity','Intensity_median_intensity', 'Intensity_mean_intensity', 'Intensity_stddev_intensity','Intensity_relative_max_intensity_time']
selected_HNR = ['HNR_stddev_hnr', 'HNR_mean_hnr','HNR_relative_min_hnr_time','HNR_max_hnr']
selected_prosody = selected_spect + selected_formant + selected_HNR + selected_intensity + selected_pitch + ['Local Jitter','Local Shimmer']

In [7]:
# Example try MFCC 20 mean & std + ZCR & RMS
feature_column_names = feature_MFCC20_mean + feature_MFCC20_std + feature_mel32_stats+ feature_zcr_stats + feature_rms_stats+ selected_prosody

X_train = df_joint_train_aug[feature_column_names]
X_test = df_joint_test[feature_column_names]
X_train_low = df_joint_train_low[feature_column_names]

In [13]:
X_train.shape, X_train_low.shape, X_test.shape

((24885, 244), (7648, 244), (680, 244))

### emotion 8-class Random Forest Classifier

In [14]:
rand_forest = RandomForestClassifier()
rand_forest.fit(X_train, y_train_e)

predictions = rand_forest.predict(X_test) 
probabilities = rand_forest.predict_proba(X_test)
print(classification_report(y_test_e, predictions))
print(confusion_matrix(y_test_e, predictions))

              precision    recall  f1-score   support

       Anger       0.64      0.71      0.67        62
    Calmness       0.40      1.00      0.57        16
     Disgust       0.47      0.26      0.33        62
        Fear       0.80      0.39      0.52        62
   Happiness       0.76      0.63      0.69       187
  Neutrality       0.66      0.87      0.75       167
     Sadness       0.54      0.61      0.57        62
    Surprise       0.92      0.92      0.92        62

    accuracy                           0.67       680
   macro avg       0.65      0.67      0.63       680
weighted avg       0.69      0.67      0.66       680

[[ 44   0   5   0  12   0   1   0]
 [  0  16   0   0   0   0   0   0]
 [  2   3  16   2  12  18   8   1]
 [  3   0   4  24   8  12  10   1]
 [ 20   7   7   3 117  28   3   2]
 [  0   8   0   1   2 145  11   0]
 [  0   5   1   0   1  16  38   1]
 [  0   1   1   0   2   1   0  57]]


In [17]:
rand_forest2 = RandomForestClassifier()
rand_forest2.fit(X_train_low, y_train_low_e)
predictions = rand_forest2.predict(X_test) 
probabilities2 = rand_forest.predict_proba(X_test)
# Evaluate the model
print(classification_report(y_test_e, predictions))
print(confusion_matrix(y_test_e, predictions))

              precision    recall  f1-score   support

       Anger       0.28      0.42      0.33        62
    Calmness       0.00      0.00      0.00        16
     Disgust       0.20      0.66      0.31        62
        Fear       0.25      0.69      0.37        62
   Happiness       0.44      0.10      0.16       187
  Neutrality       0.33      0.01      0.01       167
     Sadness       0.26      0.56      0.36        62
    Surprise       0.43      0.26      0.32        62

    accuracy                           0.26       680
   macro avg       0.27      0.34      0.23       680
weighted avg       0.33      0.26      0.20       680

[[26  0  6 23  6  0  1  0]
 [ 0  0  4  0  0  0 12  0]
 [ 5  0 41  7  1  0  8  0]
 [ 1  0  9 43  0  0  6  3]
 [51  0 37 54 18  1 11 15]
 [ 4  0 69 23  9  1 58  3]
 [ 0  0 12 13  1  1 35  0]
 [ 7  0 22  9  6  0  2 16]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2 layer decision 

In [22]:
threshold = 0.95
final_predictions = []

for prob1, prob2 in zip(probabilities, probabilities2):
    if np.max(prob1) > threshold:
        predicted_class = np.argmax(prob1)
    else:
        predicted_class = np.argmax(prob2)
    final_predictions.append(predicted_class)

final_predictions = np.array(final_predictions)
final_predictions_str = [rand_forest.classes_[prediction] for prediction in final_predictions]
print(classification_report(y_test_e, final_predictions_str ))
print(confusion_matrix(y_test_e,final_predictions_str))

              precision    recall  f1-score   support

       Anger       0.64      0.71      0.67        62
    Calmness       0.40      1.00      0.57        16
     Disgust       0.47      0.26      0.33        62
        Fear       0.80      0.39      0.52        62
   Happiness       0.76      0.63      0.69       187
  Neutrality       0.66      0.87      0.75       167
     Sadness       0.54      0.61      0.57        62
    Surprise       0.92      0.92      0.92        62

    accuracy                           0.67       680
   macro avg       0.65      0.67      0.63       680
weighted avg       0.69      0.67      0.66       680

[[ 44   0   5   0  12   0   1   0]
 [  0  16   0   0   0   0   0   0]
 [  2   3  16   2  12  18   8   1]
 [  3   0   4  24   8  12  10   1]
 [ 20   7   7   3 117  28   3   2]
 [  0   8   0   1   2 145  11   0]
 [  0   5   1   0   1  16  38   1]
 [  0   1   1   0   2   1   0  57]]


## StackingEnsembleNN    (rf+dt+knn+gb-fast -> gb-fast)

In [28]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 定义基本分类器
base_classifiers = [
    # ('mlp', MLPClassifier(hidden_layer_sizes=(100,), activation='relu', random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=15)),
    # ('svm', SVC(kernel='rbf', random_state=42)),
    # ('ada', AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42)),
    ('gb', HistGradientBoostingClassifier(loss='log_loss', learning_rate=0.01, max_iter=50, random_state=42))
]

# 定义Stacking分类器
stacking_classifier = StackingClassifier(estimators=base_classifiers, 
                                         final_estimator=HistGradientBoostingClassifier(
                                             loss='log_loss', learning_rate=0.1, max_iter=50, random_state=42))

# 训练Stacking分类器
stacking_classifier.fit(X_train, y_train_e)

# 在测试集上进行预测
y_pred = stacking_classifier.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test_e, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6323529411764706


# CNN+Attention TODO

In [38]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import keras
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import Input, Conv1D, MaxPooling1D, Dropout, Flatten, Dense, Activation, Multiply, Permute

# 定义输入层
input_audio = Input(shape=(244, 1))

# CNN层
conv1 = Conv1D(128, 5, padding='same', activation='relu')(input_audio)
drop1 = Dropout(0.1)(conv1)
pool1 = MaxPooling1D(pool_size=8)(drop1)
conv2 = Conv1D(128, 5, padding='same', activation='relu')(pool1)
drop2 = Dropout(0.1)(conv2)

# Attention机制
attention_probs = Dense(128, activation='softmax')(drop2)
attention_mul = Multiply()([drop2, attention_probs])

# 展平层
flatten = Flatten()(attention_mul)

# 全连接层
dense1 = Dense(10, activation='softmax')(flatten)

# 构建模型
model = Model(inputs=input_audio, outputs=dense1)

# 编译模型
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 打印模型结构
model.summary()


In [39]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(),  # changed from opt to Adam()
              metrics=['accuracy'])

In [40]:
x_traincnn = np.expand_dims(X_train, axis=2)
x_testcnn = np.expand_dims(X_test, axis=2)

In [41]:
cnnhistory=model.fit(x_traincnn, y_e_num, batch_size=16, epochs=100, validation_data=(x_testcnn, y_test_e_num))

Epoch 1/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.1793 - loss: 2.4467 - val_accuracy: 0.1881 - val_loss: 1.8441
Epoch 2/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.2412 - loss: 1.8567 - val_accuracy: 0.2339 - val_loss: 1.8555
Epoch 3/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.2563 - loss: 1.7712 - val_accuracy: 0.2644 - val_loss: 1.7780
Epoch 4/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.2681 - loss: 1.7461 - val_accuracy: 0.2678 - val_loss: 1.7395
Epoch 5/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.2658 - loss: 1.7567 - val_accuracy: 0.2729 - val_loss: 1.7573
Epoch 6/100
[1m478/478[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.2826 - loss: 1.7193 - val_accuracy: 0.2754 - val_loss: 1.7307
Epoch 7/100
[1m