In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from itertools import product

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import pickle
import os

In [2]:
dataset = pd.read_csv('dataset.csv', index_col=False)
not_features = ["Path", "Emotional_intensity", "Statement", "Gender"]
features_data = dataset.drop(columns=not_features)

features_data.dropna(inplace=True)
actor_list = [1, 12, 7, 24]
test_set = features_data.loc[features_data['Actor'].isin(actor_list)]
training_set = features_data.drop(test_set.index)

In [3]:
training_actors = training_set['Actor']
training_actors = training_actors.unique()

print(training_actors)

training_actors_m = []
training_actors_f = []
for actor in training_actors:
    if (actor%2 == 0):
        training_actors_f.append(actor)
    else:
        training_actors_m.append(actor)

[ 2  3  4  5  6  8  9 10 11 13 14 15 16 17 18 19 20 21 22 23]


In [4]:
# definire i modelli
model1 = RandomForestClassifier(n_jobs=-1, random_state=42)
model2 = XGBClassifier(objective='binary:logistic', random_state=42)
model3 = GradientBoostingClassifier(random_state=42, warm_start=True)
model4 = LogisticRegression(random_state=42)
model5 = MLPClassifier(random_state=42)
model6 = SVC(random_state=42)

 # definire gli iperparametri
param_gridRfc = {
    "n_estimators": [200, 400, 600],
    "max_depth": [10, 15, 20, None], 
    "min_samples_split": [2, 5], 
    "min_samples_leaf": [1, 2],   
}

param_gridXGB = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 6],          
    "colsample_bytree": [0.5, 0.7, 1], 
}

param_grid_gb = {
    "n_estimators": [100, 200, 300],
    "max_depth": [2,3],         
    "subsample": [0.6, 0.7, 0.8],     
    "learning_rate": [0.05, 0.1],
}

param_grid_logreg = {
    "C": [0.01, 0.1, 1, 10],               
    "solver": ["lbfgs", "liblinear", "saga"],
}

param_grid_mlp = {
    "hidden_layer_sizes": [(50,50), (50,50,50), (100,)],
    "alpha": [0.0001, 0.001, 0.01],                
    "learning_rate_init": [0.001, 0.01],     
}

param_grid_svc = {
    "C": [0.1, 1, 10],                
    "kernel": ["linear", "rbf"],   
    "gamma": ["scale", "auto"],    
}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
def save_model_params(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_model_params(filename):
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    return None

# Funzione per eseguire la GridSearch
def GridSearch(model, param_grid, save_file): 
    best_score = 0.0
    best_params = {}

    # Prova a caricare i risultati intermedi se esistono
    saved_data = load_model_params(save_file)
    if saved_data:
        print("Caricamento dei risultati intermedi...")
        best_score, best_params = saved_data
            
    param_names = param_grid.keys()
    param_values = param_grid.values()
    param_combinations = list(product(*param_values))

    # Funzione per essere eseguita in parallelo
    def evaluate_combination(params):
        param_combination = dict(zip(param_names, params))
        model_accuracy = []
        model_f1 = []

        for i in range(10):
            m = training_actors_m[i]
            f = training_actors_f[i]
            cross_test_set = training_set.loc[training_set['Actor'].isin([m, f])]
            cross_training_set = training_set.drop(cross_test_set.index)

            y_train = cross_training_set["Emotions"]
            X2d_train = cross_training_set[dataset.columns[6:]]
            y_test = cross_test_set["Emotions"]
            X2d_test = cross_test_set[dataset.columns[6:]]

            # Crea una nuova istanza del modello
            model_instance = model.__class__(**param_combination)

            # Fai il fitting del modello
            model_instance.fit(X2d_train, y_train)

            # Registra i risultati
            model_accuracy.append(model_instance.score(X2d_test, y_test))
            y_pred = model_instance.predict(X2d_test)
            model_f1.append(f1_score(y_test, y_pred, average="macro"))

        avg_accuracy = sum(model_accuracy) / len(model_accuracy)
        avg_f1 = sum(model_f1) / len(model_f1)

        return param_combination, avg_accuracy, avg_f1

    # Parallelizza la valutazione delle combinazioni
    results = Parallel(n_jobs=4)(delayed(evaluate_combination)(params) for params in param_combinations)

    for param_combination, avg_accuracy, avg_f1 in results:
        if avg_accuracy > best_score:
            best_score = avg_accuracy
            best_params = param_combination

    # Salva il miglior risultato raggiunto finora
        save_model_params(save_file, (best_score, best_params))
        print(f"Parameters: {param_combination}")
        print(f"Average Accuracy: {avg_accuracy}")
        print(f"Average F1 Score: {avg_f1}")
        print("--------------------")

    print("Best parameters:")
    print(best_params)
    print(f"Best Accuracy: {best_score}")
    print("--------------------")

In [6]:
#richiamo la funzione per stampare le migliori combinazioni, RandomForest
GridSearch(model1, param_gridRfc, "random_forest_params.pkl")

Caricamento dei risultati intermedi...
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}
Average Accuracy: 0.3908333333333333
Average F1 Score: 0.3540236641114832
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}
Average Accuracy: 0.39166666666666666
Average F1 Score: 0.3528395873022103
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}
Average Accuracy: 0.395
Average F1 Score: 0.36159913407977406
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}
Average Accuracy: 0.38916666666666666
Average F1 Score: 0.35316180241238676
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1}
Average Accuracy: 0.3883333333333333
Average F1 Score: 0.3526337501796798
--------------------
Param

In [7]:
#XGBoost
GridSearch(model2, param_gridXGB, "xgboost_params.pkl")

Caricamento dei risultati intermedi...


Parameters: {'n_estimators': 200, 'max_depth': 3, 'colsample_bytree': 0.5}
Average Accuracy: 0.4058333333333334
Average F1 Score: 0.38514576625405905
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'colsample_bytree': 0.7}
Average Accuracy: 0.41083333333333333
Average F1 Score: 0.3864835880678685
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 3, 'colsample_bytree': 1}
Average Accuracy: 0.4175
Average F1 Score: 0.3981892610338039
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 6, 'colsample_bytree': 0.5}
Average Accuracy: 0.4225
Average F1 Score: 0.4040523846848597
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 6, 'colsample_bytree': 0.7}
Average Accuracy: 0.42333333333333334
Average F1 Score: 0.40899666707701454
--------------------
Parameters: {'n_estimators': 200, 'max_depth': 6, 'colsample_bytree': 1}
Average Accuracy: 0.4141666666666667
Average F1 Score: 0.40160239441568335
--------------------
Par

In [8]:
# GradientBoosting
GridSearch(model3, param_grid_gb, "gradient_boosting_params.pkl")


Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.6, 'learning_rate': 0.05}
Average Accuracy: 0.40083333333333326
Average F1 Score: 0.37754971852174396
--------------------
Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.6, 'learning_rate': 0.1}
Average Accuracy: 0.4058333333333334
Average F1 Score: 0.3898456833173912
--------------------
Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.7, 'learning_rate': 0.05}
Average Accuracy: 0.4125
Average F1 Score: 0.3891098091984585
--------------------
Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.7, 'learning_rate': 0.1}
Average Accuracy: 0.4341666666666667
Average F1 Score: 0.41211083449827396
--------------------
Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.8, 'learning_rate': 0.05}
Average Accuracy: 0.4091666666666667
Average F1 Score: 0.3860898296920449
--------------------
Parameters: {'n_estimators': 100, 'max_depth': 2, 'subsample': 0.8, 'learning_rat

In [9]:
GridSearch(model4, param_grid_logreg, "logreg_params.pkl") # Logistic Regression

Parameters: {'C': 0.01, 'solver': 'lbfgs'}
Average Accuracy: 0.38666666666666666
Average F1 Score: 0.35259976381853814
--------------------
Parameters: {'C': 0.01, 'solver': 'liblinear'}
Average Accuracy: 0.3908333333333333
Average F1 Score: 0.3511943026738598
--------------------
Parameters: {'C': 0.01, 'solver': 'saga'}
Average Accuracy: 0.3525
Average F1 Score: 0.31172392533963167
--------------------
Parameters: {'C': 0.1, 'solver': 'lbfgs'}
Average Accuracy: 0.3783333333333333
Average F1 Score: 0.34232942116187765
--------------------
Parameters: {'C': 0.1, 'solver': 'liblinear'}
Average Accuracy: 0.3825
Average F1 Score: 0.34333233269002705
--------------------
Parameters: {'C': 0.1, 'solver': 'saga'}
Average Accuracy: 0.35
Average F1 Score: 0.3100519504951505
--------------------
Parameters: {'C': 1, 'solver': 'lbfgs'}
Average Accuracy: 0.37916666666666665
Average F1 Score: 0.33892926865976525
--------------------
Parameters: {'C': 1, 'solver': 'liblinear'}
Average Accuracy: 0.3

In [10]:
GridSearch(model5, param_grid_mlp, "mlp_params.pkl")        # MLPClassifier

Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.0001, 'learning_rate_init': 0.001}
Average Accuracy: 0.37416666666666665
Average F1 Score: 0.34663821232771486
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.0001, 'learning_rate_init': 0.01}
Average Accuracy: 0.33666666666666667
Average F1 Score: 0.2881826747672808
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.001, 'learning_rate_init': 0.001}
Average Accuracy: 0.355
Average F1 Score: 0.3194952412044508
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.001, 'learning_rate_init': 0.01}
Average Accuracy: 0.3816666666666667
Average F1 Score: 0.34785531354667865
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.01, 'learning_rate_init': 0.001}
Average Accuracy: 0.37
Average F1 Score: 0.32232566069748947
--------------------
Parameters: {'hidden_layer_sizes': (50, 50), 'alpha': 0.01, 'learning_rate_init': 0.01}
Average Ac

In [11]:
GridSearch(model6, param_grid_svc, "svc_params.pkl")    # SVC