In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from itertools import product

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import pickle


In [2]:
dataset = pd.read_csv('dataset.csv', index_col=False)
not_features = ["Path", "Emotional_intensity", "Statement", "Gender"]
features_data = dataset.drop(columns=not_features)

features_data.dropna(inplace=True)
actor_list = [1, 12, 7, 24]
test_set = features_data.loc[features_data['Actor'].isin(actor_list)]
training_set = features_data.drop(test_set.index)

In [3]:
training_actors = training_set['Actor']
training_actors = training_actors.unique()

print(training_actors)

training_actors_m = []
training_actors_f = []
for actor in training_actors:
    if (actor%2 == 0):
        training_actors_f.append(actor)
    else:
        training_actors_m.append(actor)

[ 2  3  4  5  6  8  9 10 11 13 14 15 16 17 18 19 20 21 22 23]


In [4]:
# definire i modelli
model1 = RandomForestClassifier(max_samples=0.9, n_estimators=200, max_depth=None, n_jobs=-1, random_state=42)
model2 = XGBClassifier(objective='binary:logistic', reg_alpha=0.5, reg_lambda=1.0, n_estimators=200, random_state=42, learning_rate=0.1)
model3 = GradientBoostingClassifier(n_estimators=200, max_features=2, max_depth=None, random_state=42, subsample=0.9)
model4 = LogisticRegression(random_state=42, max_iter=300)
model5 = MLPClassifier(random_state=42, max_iter=300)
model6 = SVC(random_state=42)

 # definire gli iperparametri
param_gridRfc = {
    "n_estimators": [200, 400],
    "max_depth": [10, 20, None], 
    "min_samples_split": [2, 5], 
    "min_samples_leaf": [1, 2],   
}

param_gridXGB = {
    "n_estimators": [200, 400],
    "max_depth": [3, 6],          
    "learning_rate": [0.01, 0.1],
    "colsample_bytree": [0.7, 1], 
}

param_grid_gb = {
    "n_estimators": [200, 400],
    "max_depth": [3, 5],         
    "subsample": [0.8, 0.9],     
    "learning_rate": [0.01, 0.1],
}

param_grid_logreg = {
    "C": [0.1, 1],                
    "solver": ["lbfgs", "liblinear"], 
}

param_grid_mlp = {
    "hidden_layer_sizes": [(50,50), (100,)], 
    "alpha": [0.0001, 0.001],                
    "learning_rate_init": [0.001, 0.01],     
}

param_grid_svc = {
    "C": [1, 10],                 
    "kernel": ["linear", "rbf"],   
    "gamma": ["scale", "auto"],    
}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
# Funzione per eseguire la GridSearch
def GridSearch(model, param_grid): 
    best_score = 0.0
    best_params = {}

    param_names = param_grid.keys()
    param_values = param_grid.values()
    param_combinations = list(product(*param_values))

    # Funzione per essere eseguita in parallelo
    def evaluate_combination(params):
        param_combination = dict(zip(param_names, params))
        model_accuracy = []
        model_f1 = []

        for i in range(10):
            m = training_actors_m[i]
            f = training_actors_f[i]
            cross_test_set = training_set.loc[training_set['Actor'].isin([m, f])]
            cross_training_set = training_set.drop(cross_test_set.index)

            y_train = cross_training_set["Emotions"]
            X2d_train = cross_training_set[dataset.columns[6:]]
            y_test = cross_test_set["Emotions"]
            X2d_test = cross_test_set[dataset.columns[6:]]

            # Crea una nuova istanza del modello
            model_instance = model.__class__(**param_combination)

            # Fai il fitting del modello
            model_instance.fit(X2d_train, y_train)

            # Registra i risultati
            model_accuracy.append(model_instance.score(X2d_test, y_test))
            y_pred = model_instance.predict(X2d_test)
            model_f1.append(f1_score(y_test, y_pred, average="macro"))

        avg_accuracy = sum(model_accuracy) / len(model_accuracy)
        avg_f1 = sum(model_f1) / len(model_f1)

        return param_combination, avg_accuracy, avg_f1

    # Parallelizza la valutazione delle combinazioni
    results = Parallel(n_jobs=-1)(delayed(evaluate_combination)(params) for params in param_combinations)

    for param_combination, avg_accuracy, avg_f1 in results:
        if avg_accuracy > best_score:
            best_score = avg_accuracy
            best_params = param_combination

        print(f"Parameters: {param_combination}")
        print(f"Average Accuracy: {avg_accuracy}")
        print(f"Average F1 Score: {avg_f1}")
        print("--------------------")

    # Salva il miglior modello trovato
    with open('best_model.pkl', 'wb') as file:
        pickle.dump(model, file)
    
    print("Miglior modello salvato con successo.")

    print("Best parameters:")
    print(best_params)
    print(f"Best Accuracy: {best_score}")

# Prova a caricare il modello salvato
try:
    with open('best_model.pkl', 'rb') as file:
        model2 = pickle.load(file)
    print("Modello caricato con successo dal file.")
except FileNotFoundError:
    print("File non trovato, eseguo la GridSearch...")


File non trovato, eseguo la GridSearch...


In [6]:
#richiamo la funzione per stampare le migliori combinazioni, RandomForest
GridSearch(model1, param_gridRfc)

Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}
Average Accuracy: 0.3908333333333333
Average F1 Score: 0.3528077667933687
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}
Average Accuracy: 0.38666666666666666
Average F1 Score: 0.3484672544046244
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}
Average Accuracy: 0.39666666666666667
Average F1 Score: 0.3610954481246852
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}
Average Accuracy: 0.38666666666666666
Average F1 Score: 0.3502733129964137
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}
Average Accuracy: 0.39416666666666667
Average F1 Score: 0.360585528002184
--------------------
Parameters: {'n_estimators': 200

In [7]:
#XGBoost
GridSearch(model2, param_gridXGB)

Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Average Accuracy: 0.3975
Average F1 Score: 0.3749875152433307
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 1}
Average Accuracy: 0.40499999999999997
Average F1 Score: 0.38262148713385374
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Average Accuracy: 0.4141666666666667
Average F1 Score: 0.3974137949596206
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1}
Average Accuracy: 0.4133333333333334
Average F1 Score: 0.39187305273838713
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
Average Accuracy: 0.4066666666666666
Average F1 Score: 0.39547865003470667
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 6,

In [8]:
# GradientBoosting
GridSearch(model3, param_grid_gb)


Parameters: {'n_estimators': 200, 'max_depth': 3, 'subsample': 0.8, 'learning_rate': 0.01}
Average Accuracy: 0.41833333333333333
Average F1 Score: 0.3987754862297334
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'subsample': 0.8, 'learning_rate': 0.1}
Average Accuracy: 0.4175
Average F1 Score: 0.3981561950177415
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'subsample': 0.9, 'learning_rate': 0.01}
Average Accuracy: 0.4133333333333333
Average F1 Score: 0.3922952039176186
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'subsample': 0.9, 'learning_rate': 0.1}
Average Accuracy: 0.4091666666666667
Average F1 Score: 0.38322909286553963
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 5, 'subsample': 0.8, 'learning_rate': 0.01}
Average Accuracy: 0.4141666666666667
Average F1 Score: 0.3928881336712299
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 5, 'subsample': 0.8, 'learning_rate

In [9]:
GridSearch(model4, param_grid_logreg) # Logistic Regression

Parameters: {'C': 0.1, 'solver': 'lbfgs'}
Average Accuracy: 0.3783333333333333
Average F1 Score: 0.34232942116187765
--------------------
Parameters: {'C': 0.1, 'solver': 'liblinear'}
Average Accuracy: 0.3825
Average F1 Score: 0.34333233269002705
--------------------
Parameters: {'C': 1, 'solver': 'lbfgs'}
Average Accuracy: 0.37916666666666665
Average F1 Score: 0.33892926865976525
--------------------
Parameters: {'C': 1, 'solver': 'liblinear'}
Average Accuracy: 0.37583333333333335
Average F1 Score: 0.33611570692976833
--------------------
Miglior modello salvato con successo.
Best parameters:
{'C': 0.1, 'solver': 'liblinear'}
Best Accuracy: 0.3825


In [10]:
GridSearch(model5, param_grid_mlp)        # MLPClassifier

Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.0001, 'learning_rate_init': 0.001}
Average Accuracy: 0.35
Average F1 Score: 0.31452792354049597
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.0001, 'learning_rate_init': 0.01}
Average Accuracy: 0.3716666666666667
Average F1 Score: 0.3280735961096863
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.001, 'learning_rate_init': 0.001}
Average Accuracy: 0.38583333333333336
Average F1 Score: 0.3355753437431397
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.001, 'learning_rate_init': 0.01}
Average Accuracy: 0.37333333333333335
Average F1 Score: 0.3332591565735344
--------------------
Parameters: {'hidden_layer_sizes': (100,), 'alpha': 0.0001, 'learning_rate_init': 0.001}
Average Accuracy: 0.3908333333333333
Average F1 Score: 0.35612375228120197
--------------------
Parameters: {'hidden_layer_sizes': (100,), 'alpha': 0.0001, 'learning_rate_init': 0.01

In [11]:
GridSearch(model6, param_grid_svc)    # SVC

Parameters: {'C': 1, 'kernel': 'linear', 'gamma': 'scale'}
Average Accuracy: 0.3883333333333333
Average F1 Score: 0.3530346266185246
--------------------
Parameters: {'C': 1, 'kernel': 'linear', 'gamma': 'auto'}
Average Accuracy: 0.3883333333333333
Average F1 Score: 0.3530346266185246
--------------------
Parameters: {'C': 1, 'kernel': 'rbf', 'gamma': 'scale'}
Average Accuracy: 0.29
Average F1 Score: 0.20694175160601355
--------------------
Parameters: {'C': 1, 'kernel': 'rbf', 'gamma': 'auto'}
Average Accuracy: 0.26833333333333337
Average F1 Score: 0.24666106127964685
--------------------
Parameters: {'C': 10, 'kernel': 'linear', 'gamma': 'scale'}
Average Accuracy: 0.3591666666666667
Average F1 Score: 0.3244830486634488
--------------------
Parameters: {'C': 10, 'kernel': 'linear', 'gamma': 'auto'}
Average Accuracy: 0.3591666666666667
Average F1 Score: 0.3244830486634488
--------------------
Parameters: {'C': 10, 'kernel': 'rbf', 'gamma': 'scale'}
Average Accuracy: 0.3325
Average F1 S