In [1]:
# Libraries and utilities

# REMINDER: call fit_transform on the training set only; on the test set call transform only.
# REMINDER: normalize before doing the rest of the preprocessing.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, RandomizedSearchCV,GridSearchCV, RepeatedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

In [2]:
# Read the prepared train and test CSV files into two separate frames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Dataset_prepared/Prepared_train.csv",
        "../Dataset_prepared/Prepared_test.csv",
    )
)

In [3]:
# Preprocessing: drop identifier columns, integer-encode the categorical
# columns, standardize the remaining features and build the X / y arrays.

# 'actor' and 'filename' are removed from both frames before any modelling.
df_train = df_train.drop(columns=['actor', 'filename'])
df_test = df_test.drop(columns=['actor', 'filename'])

# Integer-encode every object-typed column. Each column gets its own encoder,
# fitted on the TRAINING data only and then applied to the test data, so a
# given category maps to the same integer code in both frames.
# LabelEncoder.transform raises ValueError if the test set contains a label
# that never appeared in training, instead of silently assigning it a code.
c = df_train.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}  # column name -> fitted encoder, kept for inverse_transform
for i in c:
    le = preprocessing.LabelEncoder()
    df_train[i] = le.fit_transform(df_train[i])
    df_test[i] = le.transform(df_test[i])
    label_encoders[i] = le

# Columns excluded from standardization: the target and the categorical
# descriptors (plus the two identifier columns already dropped above).
non_scaled = {
    "emotion", "vocal_channel", "emotional_intensity", "statement",
    "repetition", "sex", "filename", "actor",
}
numeric_features = [n for n in df_train.columns if n not in non_scaled]

# Standardize: statistics are learned on the training set and reused as-is
# on the test set.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_train[numeric_features])
df_train[numeric_features] = scaled_features

scaled_features_test = scaler.transform(df_test[numeric_features])
df_test[numeric_features] = scaled_features_test

# Feature matrix = every column except the target 'emotion'.
col = [x for x in df_train.columns if x != "emotion"]
X_train = df_train[col].values
y_train = np.array(df_train["emotion"])

X_test = df_test[col].values
y_test = np.array(df_test["emotion"])

DT pre cancellazione outliers

In [4]:
# Exhaustive grid search over decision-tree hyper-parameters on the full
# training set, selecting the combination with the best macro-averaged F1.
param_grid = {
    'max_depth': [None, 2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 5, 10, 15, 20],
}

grid = GridSearchCV(
    # Fixed seed: the tree permutes features at each split, so without it the
    # selected hyper-parameters can change from one run to the next.
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=StratifiedKFold(),  # defaults: 5 folds, no shuffling, no repetitions
    refit=True,
    scoring="f1_macro",
)
grid.fit(X_train, y_train)
clf = grid.best_estimator_

In [5]:
# Report the hyper-parameter combination selected by the grid search.
best_params = grid.best_params_
print(best_params)

{'max_depth': 10, 'min_samples_leaf': 20, 'min_samples_split': 2}


In [6]:
# Refit a decision tree with the hyper-parameters reported by the grid search
# and evaluate it on the test set.
learner = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=20,
    random_state=42,  # fixed seed so the fitted tree (and the report) is reproducible
)
classifier = learner.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.46      0.61      0.53        96
           1       0.47      0.33      0.39        96
           2       0.31      0.29      0.30        48
           3       0.26      0.21      0.23        96
           4       0.24      0.22      0.23        96
           5       0.29      0.48      0.36        48
           6       0.31      0.29      0.30        96
           7       0.44      0.44      0.44        48

    accuracy                           0.35       624
   macro avg       0.35      0.36      0.35       624
weighted avg       0.35      0.35      0.34       624



KNN pre cancellazione Outliers

In [7]:
# Exhaustive grid search over KNN hyper-parameters on the full training set,
# selecting the combination with the best macro-averaged F1.
param_grid = {
    "n_neighbors": np.arange(2, 51),
    "weights": ["uniform", "distance"],
    # "cityblock" is the Manhattan distance. "minkowski" is intentionally not
    # listed: with the default p=2 it is the Euclidean distance again, so it
    # only duplicated the "euclidean" candidates and added a third more fits.
    "metric": ["euclidean", "cityblock"],
}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(),  # defaults: 5 folds, no shuffling, no repetitions
    refit=True,
    scoring="f1_macro",
)

grid.fit(X_train, y_train)
clf = grid.best_estimator_

In [8]:
# Report the hyper-parameter combination selected by the grid search.
best_params = grid.best_params_
print(best_params)

{'metric': 'cityblock', 'n_neighbors': 40, 'weights': 'distance'}


In [9]:
# Refit KNN with the hyper-parameters reported by the grid search and
# evaluate it on the test set.
learner = KNeighborsClassifier(
    n_neighbors=40,
    weights="distance",
    metric="cityblock",
)
classifier = learner.fit(X_train, y_train)
predictions = classifier.predict(X_test)

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.55      0.59      0.57        96
           1       0.49      0.56      0.52        96
           2       0.50      0.27      0.35        48
           3       0.45      0.39      0.41        96
           4       0.34      0.45      0.39        96
           5       0.42      0.27      0.33        48
           6       0.27      0.24      0.26        96
           7       0.28      0.35      0.31        48

    accuracy                           0.41       624
   macro avg       0.41      0.39      0.39       624
weighted avg       0.42      0.41      0.41       624



In [23]:
# Build a reduced copy of the training frame with the listed row labels
# removed (presumably the outliers identified elsewhere -- confirm source).
to_delete = [
    8, 35, 93, 240, 241, 243, 249, 285, 286, 287, 292, 302, 303, 308, 309,
    311, 382, 425, 484, 554, 555, 560, 561, 562, 563, 613, 614, 656, 657,
    832, 853, 903, 1010, 1012, 1028, 1029, 1177, 1247, 1283, 1358, 1481,
    1492, 1702, 1703, 1704, 1707, 1752, 1756, 1757, 1758, 1759, 1762, 1800,
    1801,
]
df_new_train = df_train.copy()  # work on a copy so df_train stays intact

print(f"Numero di record da cancellare: {len(to_delete)}")
print(f"Record pre cancellazione: {df_new_train.shape}")
df_new_train = df_new_train.drop(labels=to_delete, axis="index")
print(f"Record after cancellazione: {df_new_train.shape}")

Numero di record da cancellare: 54
Record pre cancellazione: (1828, 259)
Record after cancellazione: (1774, 259)


In [24]:
# Split the reduced training frame into a feature matrix (every column
# except the target) and the 'emotion' target vector.
col = [name for name in df_new_train.columns if name != "emotion"]
features_frame = df_new_train[col]
X_new_train = features_frame.values
y_new_train = np.array(df_new_train["emotion"])

DT post cancellazione

In [29]:
# Exhaustive grid search over decision-tree hyper-parameters on the reduced
# training set, selecting the combination with the best macro-averaged F1.
param_grid = {
    'max_depth': [None, 2, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 5, 10, 15, 20],
}

grid = GridSearchCV(
    # Fixed seed: the tree permutes features at each split, so without it the
    # selected hyper-parameters can change from one run to the next.
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=StratifiedKFold(),  # defaults: 5 folds, no shuffling, no repetitions
    refit=True,
    scoring="f1_macro",
)
grid.fit(X_new_train, y_new_train)
clf = grid.best_estimator_

In [30]:
# Report the hyper-parameter combination selected by the grid search.
best_params = grid.best_params_
print(best_params)

{'max_depth': 5, 'min_samples_leaf': 20, 'min_samples_split': 5}


In [31]:
# Refit a decision tree on the reduced training set with the hyper-parameters
# reported by the grid search and evaluate it on the (unchanged) test set.
learner = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=20,
    random_state=42,  # fixed seed so the fitted tree (and the report) is reproducible
)
classifier = learner.fit(X_new_train, y_new_train)
predictions = classifier.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.49      0.57      0.53        96
           1       0.46      0.32      0.38        96
           2       0.30      0.27      0.29        48
           3       0.28      0.15      0.19        96
           4       0.35      0.45      0.39        96
           5       0.43      0.25      0.32        48
           6       0.25      0.41      0.31        96
           7       0.50      0.42      0.45        48

    accuracy                           0.36       624
   macro avg       0.38      0.35      0.36       624
weighted avg       0.37      0.36      0.36       624



KNN post cancellazione

In [32]:
# Exhaustive grid search over KNN hyper-parameters on the reduced training
# set, selecting the combination with the best macro-averaged F1.
param_grid = {
    "n_neighbors": np.arange(2, 51),
    "weights": ["uniform", "distance"],
    # "cityblock" is the Manhattan distance. "minkowski" is intentionally not
    # listed: with the default p=2 it is the Euclidean distance again, so it
    # only duplicated the "euclidean" candidates and added a third more fits.
    "metric": ["euclidean", "cityblock"],
}

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=StratifiedKFold(),  # defaults: 5 folds, no shuffling, no repetitions
    refit=True,
    scoring="f1_macro",
)

grid.fit(X_new_train, y_new_train)
clf = grid.best_estimator_

In [33]:
# Show the estimator refitted by the grid search with the best parameters.
best_estimator = grid.best_estimator_
print(best_estimator)

KNeighborsClassifier(metric='cityblock', n_neighbors=37, weights='distance')


In [35]:
# Refit KNN on the reduced training set with the hyper-parameters reported by
# the grid search and evaluate it on the (unchanged) test set.
learner = KNeighborsClassifier(
    n_neighbors=37,
    weights="distance",
    metric="cityblock",
)
classifier = learner.fit(X_new_train, y_new_train)
predictions = classifier.predict(X_test)

report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.57      0.58      0.58        96
           1       0.45      0.52      0.49        96
           2       0.56      0.31      0.40        48
           3       0.44      0.38      0.40        96
           4       0.32      0.42      0.36        96
           5       0.39      0.27      0.32        48
           6       0.28      0.26      0.27        96
           7       0.30      0.38      0.33        48

    accuracy                           0.41       624
   macro avg       0.41      0.39      0.39       624
weighted avg       0.41      0.41      0.40       624

