In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from itertools import product

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import joblib


In [8]:
dataset = pd.read_csv('dataset.csv', index_col=False)
not_features = ["Path", "Emotional_intensity", "Statement", "Gender"]
features_data = dataset.drop(columns=not_features)

features_data.dropna(inplace=True)
actor_list = [1, 12, 7, 24]
test_set = features_data.loc[features_data['Actor'].isin(actor_list)]
training_set = features_data.drop(test_set.index)

In [9]:
training_actors = training_set['Actor']
training_actors = training_actors.unique()

print(training_actors)

training_actors_m = []
training_actors_f = []
for actor in training_actors:
    if (actor%2 == 0):
        training_actors_f.append(actor)
    else:
        training_actors_m.append(actor)

[ 2  3  4  5  6  8  9 10 11 13 14 15 16 17 18 19 20 21 22 23]


In [10]:
# Define the model
model1 = RandomForestClassifier(max_samples=0.9, n_estimators=200, max_depth=None, n_jobs=-1, random_state=42)
model2 = XGBClassifier(objective='binary:logistic', reg_alpha=0.5, reg_lambda=1.0, n_estimators=200, random_state=42)
model3 = GradientBoostingClassifier(n_estimators=200, max_features=2, max_depth=None, random_state=42, subsample=0.9)
model4 = LogisticRegression(random_state=42, max_iter=300)
model5 = MLPClassifier(random_state=42, max_iter=300)
model6 = SVC(random_state=42)

 # Define the grid
param_grid = {
    "max_samples": [0.3, 0.6, 0.7, 0.9, 0.11],
    "n_estimators": [200, 400, 600, 700, 800]
}

param_grid2 = {
    "n_estimators": [200, 400, 600],
    "reg_alpha": [0.5, 1, 1.5],
    "reg_lambda": [0.5, 1, 1.5]
}

param_grid_gb = {
    "n_estimators": [200, 400, 600],
    "max_features": [2, 3, 4],
    "subsample": [0.6, 0.8, 0.9],
    "learning_rate": [0.01, 0.1, 0.2]
}

param_grid_logreg = {
    "C": [0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["lbfgs", "liblinear"]
}

param_grid_svc = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"]
}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
def GridSearch(model, param_grid):
    best_score = 0.0
    best_params = {}

    param_names = param_grid.keys()
    param_values = param_grid.values()
    param_combinations = list(product(*param_values))

    # Definire una funzione per essere eseguita in parallelo
    def evaluate_combination(params):
        param_combination = dict(zip(param_names, params))
        model_accuracy = []
        model_f1 = []

        for i in range(10):
            m = training_actors_m[i]
            f = training_actors_f[i]
            cross_test_set = training_set.loc[training_set['Actor'].isin([m, f])]
            cross_training_set = training_set.drop(cross_test_set.index)

            y_train = cross_training_set["Emotions"]
            X2d_train = cross_training_set[dataset.columns[6:]]
            y_test = cross_test_set["Emotions"]
            X2d_test = cross_test_set[dataset.columns[6:]]

            # Crea una nuova istanza del modello
            model_instance = model.__class__(**param_combination)

            # Fai il fitting del modello
            model_instance.fit(X2d_train, y_train)

            # Registra i risultati
            model_accuracy.append(model_instance.score(X2d_test, y_test))
            y_pred = model_instance.predict(X2d_test)
            model_f1.append(f1_score(y_test, y_pred, average="macro"))

        avg_accuracy = sum(model_accuracy) / len(model_accuracy)
        avg_f1 = sum(model_f1) / len(model_f1)

        return param_combination, avg_accuracy, avg_f1

    # Parallelizza la valutazione delle combinazioni
    results = Parallel(n_jobs=-1)(delayed(evaluate_combination)(params) for params in param_combinations)

    for param_combination, avg_accuracy, avg_f1 in results:
        if avg_accuracy > best_score:
            best_score = avg_accuracy
            best_params = param_combination

        print(f"Parameters: {param_combination}")
        print(f"Average Accuracy: {avg_accuracy}")
        print(f"Average F1 Score: {avg_f1}")
        print("--------------------")

    print("Best parameters:")
    print(best_params)
    print(f"Best Accuracy: {best_score}")


In [12]:
#richiamo la funzione per stampare le migliori combinazioni, RandomForest
GridSearch(model1, param_grid)

Parameters: {'max_samples': 0.3, 'n_estimators': 200}
Average Accuracy: 0.38
Average F1 Score: 0.3442102510554439
--------------------
Parameters: {'max_samples': 0.3, 'n_estimators': 400}
Average Accuracy: 0.38083333333333336
Average F1 Score: 0.340453603861795
--------------------
Parameters: {'max_samples': 0.3, 'n_estimators': 600}
Average Accuracy: 0.38083333333333336
Average F1 Score: 0.33923549063390945
--------------------
Parameters: {'max_samples': 0.3, 'n_estimators': 700}
Average Accuracy: 0.3825
Average F1 Score: 0.3464115591069616
--------------------
Parameters: {'max_samples': 0.3, 'n_estimators': 800}
Average Accuracy: 0.3783333333333333
Average F1 Score: 0.33508326704453034
--------------------
Parameters: {'max_samples': 0.6, 'n_estimators': 200}
Average Accuracy: 0.40083333333333326
Average F1 Score: 0.36433918436002444
--------------------
Parameters: {'max_samples': 0.6, 'n_estimators': 400}
Average Accuracy: 0.39666666666666667
Average F1 Score: 0.357837265378891

In [13]:
#XGBoost
GridSearch(model2, param_grid2)

Parameters: {'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 0.5}
Average Accuracy: 0.41
Average F1 Score: 0.3935177902315908
--------------------
Parameters: {'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 1}
Average Accuracy: 0.4133333333333334
Average F1 Score: 0.39650566935893766
--------------------
Parameters: {'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 1.5}
Average Accuracy: 0.41083333333333333
Average F1 Score: 0.39225924261931416
--------------------
Parameters: {'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 0.5}
Average Accuracy: 0.4133333333333333
Average F1 Score: 0.4013813846321727
--------------------
Parameters: {'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 1}
Average Accuracy: 0.4008333333333334
Average F1 Score: 0.3835750255788266
--------------------
Parameters: {'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 1.5}
Average Accuracy: 0.40750000000000003
Average F1 Score: 0.38369620192301396
--------------------
Parameters: {'n_estimat

In [14]:
# GradientBoosting
GridSearch(model3, param_grid_gb)


In [None]:
GridSearch(model4, param_grid_logreg) # Logistic Regression

In [None]:
GridSearch(model5, param_grid)        # MLPClassifier

In [None]:
GridSearch(model6, param_grid_svc)    # SVC