In [116]:
# Libraries and utilities

# RENINDER: FARE FIT.TRANSFORM SOLO SU TRAINING E DOPO SUL TEST SOLO .TRANSFORM
# REMINDER: NORMALIZZARE PRIMA DI FARE TUTTO IL PREPROCESSING 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler, OneHotEncoder
from sklearn import preprocessing

from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RandomizedSearchCV,GridSearchCV, RepeatedKFold, train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [117]:
df_train=pd.read_csv("../Dataset_prepared/Prepared_train.csv")
df_test=pd.read_csv("../Dataset_prepared/Prepared_test.csv")

df_train.drop('actor',axis=1,inplace=True)
df_train.drop('filename',axis=1,inplace=True)

df_test.drop('actor',axis=1,inplace=True)
df_test.drop('filename',axis=1,inplace=True)

In [118]:

#creating instance of one-hot-encoder
encoder = OneHotEncoder()
#perform one-hot encoding on 'emotion' column 
encoder_df = pd.DataFrame(encoder.fit_transform(df_train[['emotion']]).toarray())
encoder_df_test = pd.DataFrame(encoder.fit_transform(df_test[['emotion']]).toarray())
#merge one-hot encoded columns back with original DataFrame
df_train = df_train.join(encoder_df)
df_test = df_test.join(encoder_df_test)
#drop 'emotion' column
df_train.drop('emotion', axis=1, inplace=True)
df_test.drop('emotion', axis=1, inplace=True)
#rename 0=Angry column
df_train.rename(columns = {0:'IsAngry?'}, inplace = True)
df_test.rename(columns = {0:'IsAngry?'}, inplace = True)
#drop other emotion columns
df_train.drop([1,2,3,4,5,6,7],axis=1,inplace=True)
df_test.drop([1,2,3,4,5,6,7],axis=1,inplace=True)


In [119]:
df_train.head()

Unnamed: 0,vocal_channel,emotional_intensity,statement,repetition,sex,mean,q25,q50,q75,kur,...,stft_sum_w4,stft_q01_w4,stft_q05_w4,stft_q25_w4,stft_q50_w4,stft_q75_w4,stft_q95_w4,stft_kur_w4,stft_skew_w4,IsAngry?
0,speech,normal,Kids are talking by the door,1st,M,9.15e-07,-3.1e-05,0.0,0.0,15.02852,...,740.409892,0.0,0.566462,0.709962,0.799141,0.896606,1.0,6.74219,-1.706215,0.0
1,speech,normal,Kids are talking by the door,2nd,M,7.13779e-07,-3.1e-05,0.0,0.0,16.488415,...,705.975006,0.368623,0.472736,0.623183,0.744908,0.874713,1.0,-0.70042,-0.201495,0.0
2,speech,normal,Dogs are sitting by the door,1st,M,9.554485e-07,0.0,0.0,0.0,17.035218,...,694.365924,0.0,0.417919,0.643636,0.774253,0.899156,1.0,1.688986,-1.024773,0.0
3,speech,normal,Dogs are sitting by the door,2nd,M,9.176213e-07,-3.1e-05,0.0,0.0,21.824521,...,663.205667,0.30628,0.399641,0.60691,0.755213,0.886474,1.0,-0.594111,-0.412871,0.0
4,speech,normal,Kids are talking by the door,1st,M,8.108948e-07,-3.1e-05,0.0,0.0,13.236022,...,741.412796,0.248765,0.428202,0.634815,0.759914,0.878014,1.0,0.126535,-0.620782,0.0


In [120]:
# Percentage of Angry records in train and test datasets
train_values=df_train.values
train_target=np.array(df_train["IsAngry?"])
test_values=df_test.values
test_target=np.array(df_test["IsAngry?"])

In [136]:
print(f"Valori ed etichette nel train:{len(train_values), len(train_target)}")
print(f"Valori ed etichette nel test:{len(test_values), len(test_target)}")

Valori ed etichette nel train:(1828, 1828)
Valori ed etichette nel test:(624, 624)


In [138]:
np.unique(train_target, return_counts=True), np.unique(test_target, return_counts=True)


((array([0., 1.]), array([1548,  280], dtype=int64)),
 (array([0., 1.]), array([528,  96], dtype=int64)))

In [142]:
print(f"Train !Angry: {1548/1828}, Train Angry: {280/1828}")
print(f"Test  !Angry: {528/624}, Train Angry: {96/624}")

Train !Angry: 0.8468271334792122, Train Angry: 0.15317286652078774
Test  !Angry: 0.8461538461538461, Train Angry: 0.15384615384615385


In [124]:
# Label encoding categorical columns for train and test
le=preprocessing.LabelEncoder()
c = df_train.select_dtypes(include=['object']).columns.tolist()
for i in c:
    df_train[i]=le.fit_transform(df_train[i])

for i in c:
    df_test[i]=le.fit_transform(df_test[i])


#Normalization numerical pure feature for training and test
scaler = StandardScaler()
numeric_features = [n for n in df_train.columns if n!="IsAngry?" and n!= "vocal_channel" and n!= "emotional_intensity" and n!= "statement" and n!= "repetition" and n!= "sex"]
scaled_features = scaler.fit_transform(df_train[numeric_features])
df_train[numeric_features] = scaled_features


scaled_features_test = scaler.transform(df_test[numeric_features])
df_test[numeric_features] = scaled_features_test

In [125]:
col=[x for x in df_train.columns if x!="IsAngry?"]
X_train=df_train[col].values
y_train = np.array(df_train["IsAngry?"])

X_test=df_test[col].values
y_test = np.array(df_test["IsAngry?"])

DT

In [127]:
param_grid = {'max_depth': [None, 2, 5, 10, 15, 20],
              'min_samples_split': [2, 5, 10, 15, 20],
              'min_samples_leaf': [1, 5, 10, 15, 20]
}

grid = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(), # with no parameters it is with 5 split and 10 repetitions
    refit=True,
    scoring="f1_macro"
)
grid.fit(X_train, y_train)
clf = grid.best_estimator_

In [128]:
print(grid.best_params_)

{'max_depth': 15, 'min_samples_leaf': 15, 'min_samples_split': 2}


In [129]:
learner = DecisionTreeClassifier(max_depth=15,min_samples_split=2,min_samples_leaf=15)
classifier = learner.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93       528
         1.0       0.61      0.64      0.62        96

    accuracy                           0.88       624
   macro avg       0.77      0.78      0.78       624
weighted avg       0.88      0.88      0.88       624



KNN

In [144]:
param_grid = {
    "n_neighbors": np.arange(2,51),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "cityblock","minkowski"], # cityblock is the manhattan distance
}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(), # with no parameters it is with 5 split and 10 repetitions
    refit=True,
    scoring="f1_macro"
)

grid.fit(X_train, y_train)
clf = grid.best_estimator_

In [145]:
print(grid.best_params_)

{'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}


In [146]:
learner = KNeighborsClassifier(n_neighbors= 5, metric= 'cityblock', weights= 'uniform')
classifier = learner.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93       528
         1.0       0.68      0.40      0.50        96

    accuracy                           0.88       624
   macro avg       0.79      0.68      0.72       624
weighted avg       0.86      0.88      0.86       624

