In [1]:
import numpy as np
import pandas as pd
import glob
import os, sys
import matplotlib.pyplot as plt
import time
import warnings
# Silence every FutureWarning and UserWarning for the rest of the notebook.
# NOTE(review): the blanket UserWarning filter also hides any library warning of that
# category raised during fit/predict - confirm this is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# Feature-cache version; in the cells shown it is only referenced by commented-out loader code.
VERSION = 4
# Seed for the estimators configured below (e.g. the 'RandomForest' entry of `common_params`).
RANDOM_SEED = 26

## Load data

### Load Data - augmented train set and V4 test set

In [2]:
# --- Alternative inputs, kept for reference only (nothing in this group is executed) ---
# df_joint_train_org   = pd.read_csv(f'./features/cache_all_features_train_V{VERSION}.csv')
# df_joint_train_org   = df_joint_train_org.drop(columns=['GNE_max_gne','GNE_mean_gne','GNE_stddev_gne','GNE_sum_gne'])
# df_joint_test_org = pd.read_csv(f'./features/cache_all_features_test_V{VERSION}.csv').drop(
#     columns=['GNE_max_gne','GNE_mean_gne','GNE_stddev_gne','GNE_sum_gne'])
# print("shape of train set: ", df_joint_train_org.shape)
# print("shape of test  set: ", df_joint_test_org.shape)
# df_joint_train  = pd.read_csv(f'./features/cache_train_V4_resampled_2500.csv')
# print("shape of train set: ", df_joint_train.shape)

# Test-set feature table.
df_joint_test = pd.read_csv('./cache_all_features_test_V4.csv')
print("shape of test  set: ", df_joint_test.shape)

# Augmented training feature table.
df_joint_train_aug = pd.read_csv('./cache_train_V4_augmented.csv')

# Every training column that is neither bookkeeping nor a label counts as a feature.
non_feature_columns = {'file_path', 'renamed_file_path', 'split', 'sentiment_value', 'emotional_category'}
feature_column_names = [
    column for column in df_joint_train_aug.columns
    if column not in non_feature_columns
]

print("shape of train set: ", df_joint_train_aug.shape)
# Cell output: number of training rows per sentiment label.
df_joint_train_aug.groupby('sentiment_value')['file_path'].count()

shape of test  set:  (1180, 1550)
shape of train set:  (24885, 1546)


sentiment_value
-1    7999
 0    8560
 1    8326
Name: file_path, dtype: int64

### best guess feature combinations

In [3]:
# Select feature columns by family / statistic name pattern.
def generate_selected_features_by_type(feature_column_names,input,stats,number=1):
    """Return the first `number` column names that contain ``input + "_" + stats``.

    Parameters
    ----------
    feature_column_names : iterable of str
        Candidate column names, scanned in order.
    input : str
        Feature family prefix to look for (e.g. "mfcc").
    stats : str
        Statistic suffix; an empty string matches any column containing ``input + "_"``.
    number : int, default 1
        Maximum number of matches to keep.

    Returns
    -------
    list of str
        Matching names in their original order, truncated to `number` entries
        when more than `number` columns match.
    """
    # Substring match (not prefix match): the pattern may occur anywhere in the name.
    matches = [column for column in feature_column_names if input + "_" + stats in column]
    return matches[:number] if number < len(matches) else matches

# Name-pattern based groups: the first 20 "mfcc_mean" / "mfcc_std" columns, the first 32
# "mel32_median" / "mel32_std" columns, and up to 5 columns each containing "zcr_" / "rms_".
feature_MFCC20_mean = generate_selected_features_by_type(feature_column_names, "mfcc", "mean", 20)
feature_MFCC20_std = generate_selected_features_by_type(feature_column_names, "mfcc", "std", 20)
feature_mel32_median = generate_selected_features_by_type(feature_column_names, "mel32", "median", 32)
feature_mel32_std = generate_selected_features_by_type(feature_column_names, "mel32", "std", 32)
feature_zcr_stats = generate_selected_features_by_type(feature_column_names, "zcr", "", 5)
feature_rms_stats = generate_selected_features_by_type(feature_column_names, "rms", "", 5)

# Hand-picked columns per feature family.
selected_spect = [
    'Spectrum_band_energy_difference',
    'Spectrum_band_density_difference',
    'Spectrum_center_of_gravity_spectrum',
    'Spectrum_skewness_spectrum',
    'Spectrum_kurtosis_spectrum',
    'Spectrum_stddev_spectrum',
    'Spectrum_band_density',
    'Spectrum_band_energy',
]
selected_formant = [
    'Formant_f1_mean',
    'Formant_f1_median',
    'Formant_f3_mean',
    'Formant_fitch_vtl',
    'Formant_mff',
    'Formant_formant_dispersion',
]
selected_pitch = [
    'Pitch_pitch_slope_without_octave_jumps',
    'Pitch_q3_pitch',
    'Pitch_stddev_pitch',
    'Pitch_mean_absolute_pitch_slope',
    'Pitch_mean_pitch',
    'Pitch_max_pitch',
    'Pitch_q1_pitch',
    'Pitch_min_pitch',
]
selected_intensity = [
    'Intensity_max_intensity',
    'Intensity_q3_intensity',
    'Intensity_median_intensity',
    'Intensity_mean_intensity',
    'Intensity_stddev_intensity',
    'Intensity_relative_max_intensity_time',
]
selected_HNR = [
    'HNR_stddev_hnr',
    'HNR_mean_hnr',
    'HNR_relative_min_hnr_time',
    'HNR_max_hnr',
]
selected_prosody = selected_intensity + selected_pitch # + ['Local Jitter','Local Shimmer']

# Final model input: the pattern-based groups followed by intensity and pitch.
# The spectrum, formant and HNR lists above are defined but not part of this selection.
selected_feature_names = [
    column
    for group in (
        feature_MFCC20_mean, feature_MFCC20_std,
        feature_mel32_median, feature_mel32_std,
        feature_zcr_stats, feature_rms_stats,
        selected_intensity, selected_pitch,
    )
    for column in group
]

In [4]:
# Training data: the augmented feature table restricted to the selected feature columns,
# with the sentiment (`_s`) and emotion (`_e`) label columns as separate targets.
# The test set is used as loaded; the per-class support printed in the reports below
# (748 / 183 / 249) shows it is not class-balanced.
X_train = df_joint_train_aug[selected_feature_names]
y_train_s = df_joint_train_aug['sentiment_value']
y_train_e = df_joint_train_aug['emotional_category']

# Optional numeric encoding of the emotion labels (currently disabled).
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y_e_num = label_encoder.fit_transform(y_train_e)

# Test data: same feature columns and label columns, taken from the test table.
X_test = df_joint_test[selected_feature_names]
y_test_s = df_joint_test['sentiment_value']
y_test_e = df_joint_test['emotional_category']

# NOTE(review): if the encoder is re-enabled, the line below re-fits it on the test labels;
# `label_encoder.transform(y_test_e)` would reuse the mapping learned on the training labels.
# y_test_e_num = label_encoder.fit_transform(y_test_e)

In [5]:
# Sanity check: train and test matrices should have the same number of feature columns.
X_train.shape, X_test.shape

((24885, 128), (1180, 128))

In [6]:
# !pip install lightgbm

## Models

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier,HistGradientBoostingClassifier
from lightgbm import LGBMClassifier

# Common adjustable parameters, keyed by the names the model-construction code looks up.
#
# The SVM and KNN sweeps are generated from the grids below: grid index 0 maps to the bare
# key ('SVM' / 'KNN') and index i > 0 maps to 'SVM{i}' / 'KNN{i}', i.e. 'SVM3' -> C=0.5 and
# 'KNN5' -> n_neighbors=8.
SVM_C_GRID = [0.01, 0.05, 0.1, 0.5, 1, 2.0, 3.0, 5, 10, 20, 50]
KNN_NEIGHBORS_GRID = [2, 3, 4, 5, 6, 8, 10, 12, 15, 20, 30]

common_params = {
    'RandomForest': {'n_estimators': 100, 'criterion':'gini', 'max_depth': None,
                     'min_samples_split':100, 'bootstrap':True, 'n_jobs':3, 'random_state': RANDOM_SEED},
    # probability=True makes SVC calibrate probabilities with an internal cross-validation;
    # random_state pins that shuffling so predict_proba is reproducible between runs.
    **{f"SVM{idx or ''}": {'kernel': 'rbf', 'C': c_value, 'probability': True,
                           'random_state': RANDOM_SEED}
       for idx, c_value in enumerate(SVM_C_GRID)},
    # KNeighborsClassifier has no random component, so there is nothing to seed.
    **{f"KNN{idx or ''}": {'n_neighbors': n_neighbors}
       for idx, n_neighbors in enumerate(KNN_NEIGHBORS_GRID)},
    'GradientBoosting': {'loss': 'log_loss', 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1.0,
                         'criterion': 'friedman_mse', 'min_samples_split': 2, 'max_depth': 3,
                         'random_state': RANDOM_SEED},
    'GradientBoostingFast': {'loss': 'log_loss', 'learning_rate': 0.1, 'max_iter': 100,
                             'random_state': RANDOM_SEED},
    'AdaBoost': {'n_estimators': 50, 'learning_rate': 1.0, 'random_state': RANDOM_SEED},
    'LightGBM': {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 1.0,
                 'min_child_samples': 20, 'max_depth': -1, 'random_state': RANDOM_SEED}
}

# Models with common adjustable parameters.
# Every estimator keeps its original global name (svm, svm1, ... knn10, ...), because the
# experiment cells refer to them by name.
dtree   = DecisionTreeClassifier(random_state=RANDOM_SEED)  # seeded so tie-breaking between splits is reproducible
rforest = RandomForestClassifier(**common_params['RandomForest'])

# svm, svm1 ... svm10 are built from the 'SVM', 'SVM1' ... 'SVM10' entries, in that order.
(svm, svm1, svm2, svm3, svm4, svm5,
 svm6, svm7, svm8, svm9, svm10) = (
    SVC(**common_params[f"SVM{idx or ''}"]) for idx in range(11)
)

# knn, knn1 ... knn10 are built from the 'KNN', 'KNN1' ... 'KNN10' entries, in that order.
(knn, knn1, knn2, knn3, knn4, knn5,
 knn6, knn7, knn8, knn9, knn10) = (
    KNeighborsClassifier(**common_params[f"KNN{idx or ''}"]) for idx in range(11)
)

gboost   = GradientBoostingClassifier(**common_params['GradientBoosting'])
gb_fast  = HistGradientBoostingClassifier(**common_params['GradientBoostingFast'])
adaBoost = AdaBoostClassifier(**common_params['AdaBoost'])
lightgbm = LGBMClassifier(**common_params['LightGBM'])

In [8]:
# def exp_clf_with_feature_selected(clf_model, X_train, X_test, y_train, y_test):
#     start = time.time()
#     print(f'Model Name: {clf_model.__class__};\n Train set shape {X_train.shape}, num of class {y_train.unique().size}')
#     predictions = clf_model.fit(X_train, y_train).predict(X_test.values)
    
#     print(classification_report(y_test, predictions))
#     print(confusion_matrix(y_test, predictions))
    
#     precision, recall, f1score, support = precision_recall_fscore_support(y_test, predictions, average=None)
#     # TODO make all metrics into result dict for recording 
#     probabilities = clf_model.predict_proba(X_test.values)
#     print('prbabilities distribution: \n', pd.DataFrame(probabilities,columns=clf_model.classes_).describe())
#     print(f'Time taken: {round(time.time()-start,3)} seconds.\n')

In [9]:
import pickle

def exp_clf_with_feature_selected(clf_model, X_train, X_test, y_train, y_test,verbose=True):
    """Fit a classifier, score it on the test split, pickle it and return its metrics.

    Parameters
    ----------
    clf_model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place, so any previous
        fit of the same object is overwritten. ``predict_proba`` is used only for the
        verbose probability summary and is skipped when the estimator lacks it.
    X_train : pandas.DataFrame
        Training features; the column names are recorded in the result.
    X_test : pandas.DataFrame or array-like
        Test features, with the same columns as `X_train`.
    y_train : pandas.Series
        Training labels.
    y_test : array-like
        Test labels.
    verbose : bool, default True
        When true, print the classification report, the confusion matrix and a
        ``describe()`` summary of the predicted class probabilities.

    Returns
    -------
    (dict, estimator)
        The dict holds accuracy, macro-averaged precision/recall/f1, one
        ``<class>_precision`` / ``<class>_recall`` / ``<class>_f1-score`` float per class,
        plus ``num_classes``, ``class_names``, ``model_filename`` and ``feature_columns``.
        The second element is the fitted `clf_model`.
    """
    start = time.time()

    clf_model.fit(X_train, y_train)
    # Predict on the same container that was used for fitting (not `.values`), so the
    # estimator can check feature names instead of silently accepting a bare array.
    predictions = clf_model.predict(X_test)

    # Calculate metrics
    report = classification_report(y_test, predictions, output_dict=True)
    macro_avg = report['macro avg']
    metrics = {
        'accuracy': report['accuracy'],
        'precision': macro_avg['precision'],
        'recall': macro_avg['recall'],
        'f1-score': macro_avg['f1-score']
    }
    summary_rows = ('accuracy', 'macro avg', 'weighted avg')
    for class_name, class_scores in report.items():
        if class_name in summary_rows:
            continue
        # Plain floats for every score; a stray trailing comma previously stored the
        # recall as a 1-tuple, which broke the exported results table.
        for score_name in ('precision', 'recall', 'f1-score'):
            metrics[f"{class_name}_{score_name}"] = class_scores[score_name]

    feature_columns = list(X_train.columns)
    num_classes = y_train.nunique()
    class_names = list(y_train.unique())

    # The file name encodes only model class, class count, feature count and rounded
    # accuracy, so two runs that agree on all four overwrite each other's pickle.
    model_filename = f"./models/{clf_model.__class__.__name__}_model"
    model_filename += f"_{num_classes}cls_{len(feature_columns)}feat_{round(report['accuracy']*100)}acc.pkl"
    # Create the output directory on first use instead of failing inside open().
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    with open(model_filename, 'wb') as file:
        pickle.dump(clf_model, file)

    results = {**metrics,
        'num_classes': num_classes,
        'class_names': class_names,
        'model_filename': model_filename,
        'feature_columns': feature_columns,
    }

    if verbose:
        print(f"Model Name: {clf_model.__class__.__name__};\nTrain set shape {X_train.shape}, num of class {num_classes}")
        print(classification_report(y_test, predictions))
        print(confusion_matrix(y_test, predictions))
        # Not every estimator exposes probabilities; skip the summary rather than crash.
        if hasattr(clf_model, 'predict_proba'):
            probabilities = clf_model.predict_proba(X_test)
            print('Probabilities distribution:\n', pd.DataFrame(probabilities, columns=clf_model.classes_).describe())
    print(f"Model: {clf_model.__class__.__name__};Time taken: {round(time.time()-start, 3)} seconds.\n")

    return results, clf_model


### Sentiment 3-class Classifier Sample code

In [10]:
# 3-class sentiment run with `svm` (RBF SVC, C=0.01 per common_params['SVM']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.71      0.31      0.43       748
           0       0.29      0.79      0.43       183
           1       0.37      0.53      0.44       249

    accuracy                           0.43      1180
   macro avg       0.46      0.54      0.43      1180
weighted avg       0.57      0.43      0.43      1180

[[230 299 219]
 [ 28 145  10]
 [ 68  48 133]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.335558     0.329718     0.334724
std       0.096860     0.246889     0.240948
min       0.131565     0.007774     0.029126
25%       0.255908     0.095174     0.115059
50%       0.334962     0.292190     0.280060
75%       0.425199     0.549067     0.539015
max       0.539176     0.839309     0.822119
Model: SVC;Time taken: 1146.989 seconds.



In [11]:
# 3-class sentiment run with `svm1` (RBF SVC, C=0.05 per common_params['SVM1']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm1, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.73      0.37      0.50       748
           0       0.31      0.78      0.45       183
           1       0.42      0.58      0.49       249

    accuracy                           0.48      1180
   macro avg       0.49      0.58      0.48      1180
weighted avg       0.60      0.48      0.49      1180

[[280 274 194]
 [ 36 143   4]
 [ 66  39 144]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.359899     0.315435     0.324666
std       0.133785     0.269637     0.251623
min       0.116418     0.001445     0.021641
25%       0.245925     0.049692     0.097165
50%       0.353773     0.254108     0.257981
75%       0.480039     0.570074     0.537123
max       0.698382     0.850168     0.866858
Model: SVC;Time taken: 1049.68 seconds.



In [12]:
# 3-class sentiment run with `svm2` (RBF SVC, C=0.1 per common_params['SVM2']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm2, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.76      0.41      0.54       748
           0       0.32      0.78      0.45       183
           1       0.45      0.60      0.52       249

    accuracy                           0.51      1180
   macro avg       0.51      0.60      0.50      1180
weighted avg       0.63      0.51      0.52      1180

[[310 262 176]
 [ 38 142   3]
 [ 61  39 149]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.373087     0.311196     0.315718
std       0.148389     0.275216     0.257432
min       0.085989     0.000979     0.013441
25%       0.248386     0.038406     0.086461
50%       0.364767     0.243907     0.241159
75%       0.496717     0.575376     0.534455
max       0.851733     0.860904     0.911849
Model: SVC;Time taken: 1074.791 seconds.



In [13]:
# 3-class sentiment run with `svm3` (RBF SVC, C=0.5 per common_params['SVM3']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm3, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.80      0.54      0.65       748
           0       0.34      0.81      0.48       183
           1       0.60      0.58      0.59       249

    accuracy                           0.59      1180
   macro avg       0.58      0.65      0.57      1180
weighted avg       0.69      0.59      0.61      1180

[[404 251  93]
 [ 31 149   3]
 [ 69  35 145]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.412642     0.296897     0.290460
std       0.200607     0.282704     0.270436
min       0.025379     0.000046     0.005147
25%       0.244274     0.020439     0.066483
50%       0.402359     0.214961     0.190595
75%       0.562785     0.557246     0.449629
max       0.993212     0.892252     0.974306
Model: SVC;Time taken: 893.69 seconds.



In [14]:
# 3-class sentiment run with `svm4` (RBF SVC, C=1 per common_params['SVM4']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm4, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.82      0.56      0.67       748
           0       0.37      0.86      0.51       183
           1       0.62      0.58      0.60       249

    accuracy                           0.61      1180
   macro avg       0.60      0.67      0.59      1180
weighted avg       0.71      0.61      0.63      1180

[[422 240  86]
 [ 23 157   3]
 [ 71  33 145]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.431888     0.285411     0.282702
std       0.222502     0.281920     0.276715
min       0.018461     0.000011     0.002643
25%       0.252393     0.013795     0.060844
50%       0.417532     0.192396     0.172921
75%       0.599607     0.537712     0.434876
max       0.997280     0.899496     0.981340
Model: SVC;Time taken: 758.196 seconds.



In [15]:
# 3-class sentiment run with `svm5` (RBF SVC, C=2.0 per common_params['SVM5']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm5, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.82      0.59      0.69       748
           0       0.39      0.86      0.53       183
           1       0.63      0.59      0.61       249

    accuracy                           0.63      1180
   macro avg       0.61      0.68      0.61      1180
weighted avg       0.71      0.63      0.65      1180

[[444 221  83]
 [ 22 157   4]
 [ 75  28 146]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.454991     0.270760     0.274248
std       0.242399     0.279414     0.281033
min       0.014122     0.000002     0.002006
25%       0.259123     0.009293     0.054476
50%       0.434998     0.170139     0.159226
75%       0.643571     0.503868     0.400522
max       0.997988     0.905222     0.985190
Model: SVC;Time taken: 699.37 seconds.



In [16]:
# 3-class sentiment run with `svm6` (RBF SVC, C=3.0 per common_params['SVM6']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm6, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.82      0.61      0.70       748
           0       0.40      0.84      0.54       183
           1       0.63      0.59      0.61       249

    accuracy                           0.64      1180
   macro avg       0.62      0.68      0.62      1180
weighted avg       0.72      0.64      0.66      1180

[[459 205  84]
 [ 25 154   4]
 [ 74  27 148]]
Probabilities distribution:
                 -1            0            1
count  1180.000000  1180.000000  1180.000000
mean      0.466624     0.262174     0.271203
std       0.252483     0.277264     0.284316
min       0.009364     0.000001     0.002408
25%       0.266756     0.007438     0.050290
50%       0.448746     0.154962     0.152228
75%       0.666259     0.486621     0.402417
max       0.997407     0.918068     0.990281
Model: SVC;Time taken: 685.572 seconds.



In [17]:
# 3-class sentiment run with `svm7` (RBF SVC, C=5 per common_params['SVM7']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm7, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.82      0.65      0.73       748
           0       0.42      0.81      0.55       183
           1       0.65      0.61      0.63       249

    accuracy                           0.67      1180
   macro avg       0.63      0.69      0.64      1180
weighted avg       0.72      0.67      0.68      1180

[[488 182  78]
 [ 31 149   3]
 [ 75  23 151]]
Probabilities distribution:
                 -1             0            1
count  1180.000000  1.180000e+03  1180.000000
mean      0.481468  2.525013e-01     0.266031
std       0.262015  2.740661e-01     0.286198
min       0.006268  9.674107e-07     0.001910
25%       0.275684  5.819207e-03     0.046684
50%       0.468235  1.453960e-01     0.141977
75%       0.691326  4.530897e-01     0.393866
max       0.996809  9.435031e-01     0.993585
Model: SVC;Time taken: 662.859 seconds.



In [18]:
# 3-class sentiment run with `svm8` (RBF SVC, C=10 per common_params['SVM8']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm8, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.83      0.69      0.75       748
           0       0.45      0.81      0.58       183
           1       0.66      0.61      0.63       249

    accuracy                           0.69      1180
   macro avg       0.65      0.70      0.65      1180
weighted avg       0.73      0.69      0.70      1180

[[515 156  77]
 [ 33 148   2]
 [ 76  22 151]]
Probabilities distribution:
                 -1             0            1
count  1180.000000  1.180000e+03  1180.000000
mean      0.499569  2.424879e-01     0.257943
std       0.276258  2.719409e-01     0.290379
min       0.003219  8.623728e-07     0.001037
25%       0.278337  3.575818e-03     0.039197
50%       0.496840  1.333750e-01     0.127898
75%       0.725203  4.297191e-01     0.382482
max       0.997026  9.665202e-01     0.996741
Model: SVC;Time taken: 629.555 seconds.



In [19]:
# 3-class sentiment run with `svm9` (RBF SVC, C=20 per common_params['SVM9']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm9, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.83      0.73      0.77       748
           0       0.49      0.80      0.60       183
           1       0.67      0.59      0.63       249

    accuracy                           0.71      1180
   macro avg       0.66      0.71      0.67      1180
weighted avg       0.74      0.71      0.72      1180

[[543 136  69]
 [ 33 147   3]
 [ 82  20 147]]
Probabilities distribution:
                 -1             0            1
count  1180.000000  1.180000e+03  1180.000000
mean      0.513337  2.369479e-01     0.249715
std       0.287223  2.730310e-01     0.292536
min       0.001964  7.325207e-07     0.000758
25%       0.280285  2.520250e-03     0.035648
50%       0.516720  1.207621e-01     0.114526
75%       0.753457  4.085441e-01     0.367213
max       0.998818  9.792954e-01     0.998025
Model: SVC;Time taken: 606.62 seconds.



In [20]:
# 3-class sentiment run with `svm10` (RBF SVC, C=50 per common_params['SVM10']); fits the estimator in place.
result, m_trained = exp_clf_with_feature_selected(svm10, X_train, X_test, y_train_s, y_test_s)

Model Name: SVC;
Train set shape (24885, 128), num of class 3
              precision    recall  f1-score   support

          -1       0.83      0.74      0.78       748
           0       0.49      0.80      0.61       183
           1       0.68      0.59      0.63       249

    accuracy                           0.72      1180
   macro avg       0.67      0.71      0.67      1180
weighted avg       0.74      0.72      0.72      1180

[[552 129  67]
 [ 34 146   3]
 [ 81  22 146]]
Probabilities distribution:
                 -1             0            1
count  1180.000000  1.180000e+03  1180.000000
mean      0.528210  2.334483e-01     0.238342
std       0.302032  2.791225e-01     0.294236
min       0.000001  1.424814e-07     0.000374
25%       0.270987  1.499340e-03     0.028608
50%       0.536833  9.631812e-02     0.098305
75%       0.792668  4.142379e-01     0.336991
max       0.999569  9.867499e-01     0.999998
Model: SVC;Time taken: 585.486 seconds.



In [21]:
# change model as the first parameter in the function 
# result, m_trained = exp_clf_with_feature_selected(gb_fast, X_train, X_test, y_train_s, y_test_s)

In [22]:
# result, m_trained = exp_clf_with_feature_selected(adaBoost, X_train, X_test, y_train_s, y_test_s)

In [23]:
# result, m_trained = exp_clf_with_feature_selected(gboost, X_train, X_test, y_train_s, y_test_s)

In [24]:
# result, m_trained = exp_clf_with_feature_selected(knn, X_train, X_test, y_train_s, y_test_s)

In [25]:
# result, m_trained = exp_clf_with_feature_selected(svm, X_train, X_test, y_train_s, y_test_s)

In [26]:
# result, m_trained = exp_clf_with_feature_selected(rforest, X_train, X_test, y_train_s, y_test_s)

In [27]:
# result, m_trained = exp_clf_with_feature_selected(lightgbm, X_train, X_test, y_train_s, y_test_s)

In [28]:
# result, m_trained = exp_clf_with_feature_selected(dtree, X_train, X_test, y_train_s, y_test_s)

### How to save the experiment metric results

In [None]:
# Re-run every SVM variant on the sentiment labels and collect one metrics dict per model.
# Each call refits the estimator in place and pickles it; verbose=False suppresses the
# reports, so only the per-model timing line is printed.
exp_results = []
for clf_model in [svm,svm1,svm2,svm3,svm4,svm5,svm6,svm7,svm8,svm9,svm10]:
    result, m_trained = exp_clf_with_feature_selected(clf_model, X_train, X_test, y_train_s, y_test_s,verbose=False)
    # Appending inside the loop keeps the finished results if the run is interrupted.
    exp_results.append(result)
# Cell output: one row per model.
pd.DataFrame(exp_results)

Model: SVC;Time taken: 997.225 seconds.

Model: SVC;Time taken: 909.189 seconds.

Model: SVC;Time taken: 847.941 seconds.

Model: SVC;Time taken: 754.93 seconds.



In [None]:
# exp_results = []
# for clf_model in [rforest,adaBoost,gb_fast,gboost,knn,svm,lightgbm,dtree]:
#     result, m_trained = exp_clf_with_feature_selected(clf_model, X_train, X_test, y_train_s, y_test_s,verbose=False)
#     exp_results.append(result)
# pd.DataFrame(exp_results)

In [None]:
# Write the sentiment results table (whatever `exp_results` currently holds) to an Excel file.
# NOTE(review): `to_excel` needs an Excel writer engine such as openpyxl installed - confirm in the environment.
pd.DataFrame(exp_results).to_excel("exp_result-0316-v4-aug-svm.xlsx")

### Emotion 8-class

In [None]:
# Same sweep as the sentiment experiment, but with the emotion labels as the target.
# This resets `exp_results` and refits svm ... svm10 in place on `y_train_e`, so after this
# cell those objects are emotion models, no longer the sentiment models trained earlier.
exp_results = []
for clf_model in [svm,svm1,svm2,svm3,svm4,svm5,svm6,svm7,svm8,svm9,svm10]:
    result, m_trained = exp_clf_with_feature_selected(clf_model, X_train, X_test, y_train_e, y_test_e,verbose=False)
    exp_results.append(result)
# Cell output: one row per model.
pd.DataFrame(exp_results)

In [None]:
# Write the emotion results table (whatever `exp_results` currently holds) to an Excel file.
# NOTE(review): `to_excel` needs an Excel writer engine such as openpyxl installed - confirm in the environment.
pd.DataFrame(exp_results).to_excel("exp_result-0316-v4-aug-svm-8.xlsx")

In [None]:
# Emotion-label run with `svm` (C=0.01), verbose. The call is the cell's last expression,
# so the returned (results, model) tuple is also shown as cell output.
exp_clf_with_feature_selected(svm, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm1` (C=0.05), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm1, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm2` (C=0.1), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm2, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm3` (C=0.5), verbose; refits the estimator in place.
# Any later cell that uses `svm3` therefore sees a model trained on the emotion labels.
exp_clf_with_feature_selected(svm3, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm4` (C=1), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm4, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm5` (C=2.0), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm5, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm6` (C=3.0), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm6, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm7` (C=5), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm7, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm8` (C=10), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm8, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm9` (C=20), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm9, X_train, X_test, y_train_e, y_test_e)

In [None]:
# Emotion-label run with `svm10` (C=50), verbose; refits the estimator in place.
exp_clf_with_feature_selected(svm10, X_train, X_test, y_train_e, y_test_e)

In [None]:
# change the y labels to the emotion categories
# exp_clf_with_feature_selected(gb_fast, X_train, X_test, y_train_e, y_test_e)

In [None]:
# check how long on normal gradient boosting
# exp_clf_with_feature_selected(gboost, X_train, X_test, y_train_e, y_test_e)

### Threshold tuning

In [None]:
# Class probabilities that the threshold search below operates on.
#
# The threshold rule hard-codes probability columns 0/1/2 as sentiment -1/0/1, but the
# emotion section above refits `svm3` in place on `y_train_e`. When the notebook is run
# top to bottom, `svm3` is therefore an emotion model at this point and its probability
# columns mean something else. Refit on the sentiment labels when that has happened.
if list(getattr(svm3, 'classes_', [])) != [-1, 0, 1]:
    print("svm3 is not fitted on the sentiment labels; refitting on y_train_s (slow, see the timings above).")
    svm3.fit(X_train, y_train_s)

probabilities = svm3.predict_proba(X_test)

def calc_acc_by_thres(probabilities, threshold, y_test):
    """Score the thresholded 3-class decision rule against `y_test`.

    Decision rule per sample: predict -1 when the first probability exceeds `threshold`;
    otherwise predict 0 when the second probability exceeds the third, else 1.

    Parameters
    ----------
    probabilities : array-like of shape (n_samples, 3)
        Class probabilities with columns ordered as classes -1, 0, 1.
    threshold : float
        Minimum probability of class -1 required to predict it.
    y_test : array-like of shape (n_samples,)
        True labels the predictions are scored against.

    Returns
    -------
    (accuracy, min_f1, var_f1)
        Overall accuracy, the smallest per-class F1 score and the variance of the
        per-class F1 scores.

    Raises
    ------
    ValueError
        If `probabilities` is not a 2-D array with exactly three columns, since the
        rule would otherwise silently mislabel the classes.
    """
    probs = np.asarray(probabilities, dtype=float)
    if probs.ndim != 2 or probs.shape[1] != 3:
        raise ValueError(
            f"probabilities must have shape (n_samples, 3) for classes -1/0/1, got {probs.shape}"
        )

    # Vectorised form of the per-sample if / elif / else rule.
    predictions_adj = np.where(
        probs[:, 0] > threshold,
        -1,
        np.where(probs[:, 1] > probs[:, 2], 0, 1),
    )

    # Compare as arrays so a plain-list `y_test` is compared element-wise, and score
    # against the `y_test` argument rather than a notebook-level global.
    y_true = np.asarray(y_test)
    accuracy = np.mean(predictions_adj == y_true)
    precision, recall, f1score, _ = precision_recall_fscore_support(y_true, predictions_adj, average=None)
    return accuracy, min(f1score), np.var(f1score)

# Grid-search the class -1 probability threshold; the winner is the first threshold
# that reaches the highest minimum per-class F1. Other criteria that were tried and
# left disabled: maximising accuracy, and minimising the variance of per-class F1.
# NOTE(review): the threshold is chosen on the same test labels that are used for
# reporting afterwards, so the reported scores are optimistic.
threshold_range = np.linspace(0.25, 0.75, 100)

best_threshold, best_accuracy, best_f1score = None, 0.0, 0.0
for threshold in threshold_range:
    accuracy, min_f1_score, var_f1_score = calc_acc_by_thres(probabilities, threshold, y_test_s)
    # Strict improvement only: ties keep the earlier (lower) threshold.
    if not (min_f1_score > best_f1score):
        continue
    best_threshold, best_accuracy, best_f1score = threshold, accuracy, min_f1_score

print("Best Threshold:", best_threshold)
print("Best Accuracy:", best_accuracy)
print("Best min f1 score:", best_f1score)

In [None]:
# Apply the tuned threshold, then 110% / 120% / 130% of it, and print the reports to see the trend.

def _predict_with_threshold(probabilities, threshold):
    """Return -1 / 0 / 1 predictions: -1 when column 0 exceeds `threshold`, else 0 when column 1 exceeds column 2, else 1."""
    return [
        -1 if probs[0] > threshold else (0 if probs[1] > probs[2] else 1)
        for probs in probabilities
    ]

# The threshold search leaves best_threshold as None when no candidate reached a minimum
# F1 above 0; fail with a clear message instead of a TypeError on `None * 1.1`.
if best_threshold is None:
    raise ValueError("best_threshold is None: the threshold search found no candidate with min F1 > 0.")

# The rule hard-codes probability columns 0/1/2 as sentiment -1/0/1, but the emotion
# section refits `svm` in place on `y_train_e`; refit on the sentiment labels if needed.
if list(getattr(svm, 'classes_', [])) != [-1, 0, 1]:
    print("svm is not fitted on the sentiment labels; refitting on y_train_s (slow, see the timings above).")
    svm.fit(X_train, y_train_s)

# NOTE(review): best_threshold was tuned on svm3's probabilities, but it is applied to
# svm's probabilities here - confirm which model is intended.
probabilities = svm.predict_proba(X_test)

y_true_s = np.asarray(y_test_s)
for threshold_label, scale in (('BEST Threshold:', 1.0), ('Threshold-2:', 1.1),
                               ('Threshold-3:', 1.2), ('Threshold-4:', 1.3)):
    threshold = best_threshold * scale
    print(threshold_label, threshold)
    predictions_adj = _predict_with_threshold(probabilities, threshold)
    print(classification_report(y_test_s, predictions_adj))
    print(confusion_matrix(y_test_s, predictions_adj))
    print("Accuracy:", np.mean(np.asarray(predictions_adj) == y_true_s))