In [77]:
from sklearn.linear_model import ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
import os
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [78]:
# Carica il dataset
df = pd.read_csv('../dataset/discretized_dataset.csv')
df = df.drop('votes', axis=1)
df = df.drop('avg_vote', axis=1)

def multi_value_one_hot(df, column):
    s = df[column].str.get_dummies(sep=', ')
    return df.join(s.add_prefix(column + '_'))

df = multi_value_one_hot(df, 'genre')

df.drop(columns=['genre'], inplace=True)
df['month_published'] = df['month_published'].astype(str)



In [79]:
X = df.drop('revenue_cluster', axis=1)
y = df['revenue_cluster']
print(X.head())

numerical_features = ['duration','converted_budget',
                     'dir_oscar_nomination', 'writer_oscar_nomination',
                     'cast_globe_nomination',
                     'BAFTA_writer_nom', 'BAFTA_dir_nom', 'BAFTA_act_nom', 
                     'dir_emmy_nom', 'writer_emmy_nom', 'act_emmy_nom',
                     'actors_films_before', 'director_films_before', 'writers_films_before'
                     ]

categorical_features_no_genre = ['language', 'production_company', 'month_published']
categorical_features =categorical_features_no_genre + [col for col in df.columns if col.startswith('genre_')]


   duration  converted_budget  dir_oscar_nomination  writer_oscar_nomination  \
0      88.0          175700.3                     0                        0   
1      59.0         3013850.0                     0                        0   
2      77.0          521727.6                     0                        0   
3      50.0         5598468.6                     0                        0   
4     300.0        10802441.1                     0                        0   

   cast_globe_nomination  BAFTA_act_nom  BAFTA_dir_nom  BAFTA_writer_nom  \
0                      0              0              0                 0   
1                      0              0              0                 0   
2                      0              0              0                 0   
3                      0              0              0                 0   
4                      0              0              0                 0   

   dir_emmy_nom  writer_emmy_nom  ...  genre_Horror  genre_Mus

In [80]:

# Dividere i dati in set di addestramento e test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Creare il trasformatore logaritmico
log_transformer = FunctionTransformer(np.log1p, validate=True)

from sklearn.base import BaseEstimator, TransformerMixin

class DropOtherColumns(BaseEstimator, TransformerMixin):
    def __init__(self, prefix='Other'):
        self.prefix = prefix
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Trova i nomi delle colonne che iniziano con il prefisso specificato
        other_columns = [col for col in X.columns if col.startswith(self.prefix)]
        # Rimuovi le colonne trovate
        return X.drop(columns=other_columns, errors='ignore')

# Definire le colonne da eliminare dopo l'encoding
columns_to_drop_after_encoding = ['language_Other', 'production_company_Other']

# Creare un ColumnTransformer per applicare trasformazioni
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('log', log_transformer),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),  # Applica solo l'encoding delle colonne categoriche
        ('drop_other', DropOtherColumns(), [])  # Applica il trasformatore per l'eliminazione delle colonne "Other" dopo l'encoding
    ]
)



# Random Forest

In [81]:


# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__n_estimators': [ 100, 300],
    'classifier__max_depth': [None, 10, 4],
    'classifier__min_samples_split': [2, 10],
    'classifier__min_samples_leaf': [1],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__bootstrap': [True],# False],
    'classifier__criterion': ['gini', 'entropy']
}

# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/RandomForestClassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Test set results:
Final Accuracy: 0.4892538034291234
Final F1 Score: 0.4841779213204224
Confusion Matrix:
 [[484 315 437 176]
 [225 402  86 298]
 [231  43 951  51]
 [ 66 163  24 189]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.34      0.40      1412
           1       0.44      0.40      0.42      1011
           2       0.63      0.75      0.69      1276
           3       0.26      0.43      0.33       442

    accuracy                           0.49      4141
   macro avg       0.45      0.48      0.46      4141
weighted avg       0.49      0.49      0.48      4141


Train set results:
Final Accuracy: 0.5638734605167833
Final F1 Score: 0.5626841396337882
Confusion Matri

# Logistic Regression

In [82]:
# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=categorical_features, random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', LogisticRegression())
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear']  
}

# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)


Best parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}


In [83]:

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/LogisticRegression.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)
# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Test set results:
Final Accuracy: 0.47162521130161794
Final F1 Score: 0.452990705939036
Confusion Matrix:
 [[ 342  262  550  258]
 [ 170  357  133  351]
 [ 123   38 1033   82]
 [  52  133   36  221]]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.24      0.33      1412
           1       0.45      0.35      0.40      1011
           2       0.59      0.81      0.68      1276
           3       0.24      0.50      0.33       442

    accuracy                           0.47      4141
   macro avg       0.45      0.48      0.43      4141
weighted avg       0.49      0.47      0.45      4141


Train set results:
Final Accuracy: 0.4757908717701038
Final F1 Score: 0.46055253961944304
Confusion Matrix:
 [[1432 1010 2093 1115]
 [ 685 1457  456 1445]
 [ 501  185 4105  314]
 [ 209  529  141  887]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.25      0.34      5650
     

# AdaBoost

In [84]:

# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__n_estimators': [50, 100, 150],  # Numero di stimatori
    'classifier__learning_rate': [0.01, 0.1, 1.0],  # Tasso di apprendimento
    'classifier__algorithm': ['SAMME', 'SAMME.R']  # Algoritmo per calcolare i pesi degli stimatori
}


# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/AdaBoostClassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__algorithm': 'SAMME.R', 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}
Test set results:
Final Accuracy: 0.5013281815986477
Final F1 Score: 0.5009509489778415
Confusion Matrix:
 [[547 348 368 149]
 [236 474  65 236]
 [299  50 890  37]
 [ 63 193  21 165]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.39      0.43      1412
           1       0.45      0.47      0.46      1011
           2       0.66      0.70      0.68      1276
           3       0.28      0.37      0.32       442

    accuracy                           0.50      4141
   macro avg       0.47      0.48      0.47      4141
weighted avg       0.51      0.50      0.50      4141


Train set results:
Final Accuracy: 0.5078483458101908
Final F1 Score: 0.5080638715997898
Confusion Matrix:
 [[2163 1354 1443  690]
 [ 895 1946  233  969]
 [1111  217 3601  176]
 [ 240  747   77  702]]
Classification Report:
           

# Gradient Boosting

In [85]:
# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Definire una griglia dei parametri da esplorare
param_grid= {
    'classifier__n_estimators': [100, 300],
    'classifier__learning_rate': [0.01, 0.1, 1.0],
    'classifier__max_depth': [3, 7],
    'classifier__min_samples_split': [2,  10],
    'classifier__min_samples_leaf': [1],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__subsample': [0.8, 1.0],  # Frazione di campioni da utilizzare per il fitting di ciascun albero
    'classifier__loss': ['deviance', 'exponential']  # Funzione di perdita da ottimizzare
}


# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/GradientBoostingClassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

ValueError: 
All the 1920 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
562 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\imblearn\pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'loss' parameter of GradientBoostingClassifier must be a str among {'log_loss', 'exponential'}. Got 'deviance' instead.

--------------------------------------------------------------------------------
398 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\imblearn\pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'loss' parameter of GradientBoostingClassifier must be a str among {'exponential', 'log_loss'}. Got 'deviance' instead.

--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\imblearn\pipeline.py", line 326, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\ensemble\_gb.py", line 431, in fit
    self._check_params()
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\ensemble\_gb.py", line 276, in _check_params
    self._loss = loss_class(self.n_classes_)
  File "C:\Users\BOLO\miniconda3\envs\Business\lib\site-packages\sklearn\ensemble\_gb_losses.py", line 889, in __init__
    raise ValueError(
ValueError: ExponentialLoss requires 2 classes; got 4 class(es)


# SVC

In [86]:
# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', SVC())
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__C': [0.1, 1, 10],  # Parametro di regolarizzazione
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Tipo di kernel da utilizzare
    'classifier__gamma': ['scale', 'auto'],  # Coefficiente del kernel per 'rbf', 'poly' e 'sigmoid'
    'classifier__degree': [2, 3, 4],  # Grado del polinomio per 'poly'
}


# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/SVCclassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__C': 0.1, 'classifier__degree': 2, 'classifier__gamma': 'auto', 'classifier__kernel': 'poly'}
Test set results:
Final Accuracy: 0.496015455204057
Final F1 Score: 0.4924466216601493
Confusion Matrix:
 [[551 254 437 170]
 [285 384  80 262]
 [255  32 932  57]
 [ 81 149  25 187]]
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.39      0.43      1412
           1       0.47      0.38      0.42      1011
           2       0.63      0.73      0.68      1276
           3       0.28      0.42      0.33       442

    accuracy                           0.50      4141
   macro avg       0.46      0.48      0.46      4141
weighted avg       0.50      0.50      0.49      4141


Train set results:
Final Accuracy: 0.5040449166867906
Final F1 Score: 0.5012148375183663
Confusion Matrix:
 [[2222 1060 1624  744]
 [1074 1600  310 1059]
 [ 986  154 3796  169]
 [ 320  609  106  731]]
Classification Report:
         

# KNNeighbors

In [87]:
# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', KNeighborsClassifier())
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__n_neighbors': [3, 5, 9],  # Numero di vicini da considerare
    'classifier__weights': ['uniform', 'distance'],  # Pesi utilizzati nella previsione
    'classifier__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algoritmo per calcolare i vicini
    'classifier__leaf_size': [10, 20, 30],  # Dimensione delle foglie per gli alberi di ricerca
    'classifier__p': [1, 2],  # Parametro di potenza per la distanza di Minkowski
}


# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/KNNClassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__algorithm': 'ball_tree', 'classifier__leaf_size': 20, 'classifier__n_neighbors': 9, 'classifier__p': 1, 'classifier__weights': 'uniform'}
Test set results:
Final Accuracy: 0.4494083554696933
Final F1 Score: 0.45250556780433354
Confusion Matrix:
 [[618 281 359 154]
 [336 374  80 221]
 [402  97 732  45]
 [ 99 173  33 137]]
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.44      0.43      1412
           1       0.40      0.37      0.39      1011
           2       0.61      0.57      0.59      1276
           3       0.25      0.31      0.27       442

    accuracy                           0.45      4141
   macro avg       0.42      0.42      0.42      4141
weighted avg       0.46      0.45      0.45      4141


Train set results:
Final Accuracy: 0.5801738710456411
Final F1 Score: 0.5821419779403567
Confusion Matrix:
 [[3201  913 1036  500]
 [ 981 2105  295  662]
 [1174  257 3536  138]
 [ 386  5

# GaussianNB

In [89]:
# Creare la pipeline completa con SMOTENC e RandomForestClassifier
pipeline = ImbPipeline(steps=[
    ('smote', SMOTENC(categorical_features=[X.columns.get_loc(col) for col in categorical_features], random_state=42)),
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=20)),
    ('classifier', GaussianNB())
])

# Definire una griglia dei parametri da esplorare
param_grid = {
    'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]  # Parametro di smoothing della varianza
}


# Creare GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Eseguire la ricerca della griglia sui dati di addestramento
grid_search.fit(X_train, y_train)

# Ottenere i migliori parametri trovati
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Addestrare un modello con i migliori parametri trovati sull'intero set di addestramento
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Valutare il modello finale sul set di test
y_pred_test = best_model.predict(X_test)
final_accuracy_test = accuracy_score(y_test, y_pred_test)
final_f1_test = f1_score(y_test, y_pred_test, average='weighted')
final_confusion_matrix_test = confusion_matrix(y_test, y_pred_test)
final_classification_report_test = classification_report(y_test, y_pred_test)

# Valutare il modello sul set di addestramento
y_pred_train = best_model.predict(X_train)
final_accuracy_train = accuracy_score(y_train, y_pred_train)
final_f1_train = f1_score(y_train, y_pred_train, average='weighted')
final_confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
final_classification_report_train = classification_report(y_train, y_pred_train)

# Stampa i risultati
print("Test set results:")
print("Final Accuracy:", final_accuracy_test)
print("Final F1 Score:", final_f1_test)
print("Confusion Matrix:\n", final_confusion_matrix_test)
print("Classification Report:\n", final_classification_report_test)

print("\nTrain set results:")
print("Final Accuracy:", final_accuracy_train)
print("Final F1 Score:", final_f1_train)
print("Confusion Matrix:\n", final_confusion_matrix_train)
print("Classification Report:\n", final_classification_report_train)

# Specifica il percorso del file dove vuoi salvare il modello
file_path = "../models/classification/GaussianNBClassifier.pkl"

# Crea il percorso della directory se non esiste già
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Salva il miglior modello utilizzando pickle
with open(file_path, 'wb') as file:
    pickle.dump(best_model, file)

Best parameters: {'classifier__var_smoothing': 1e-09}
Test set results:
Final Accuracy: 0.29630524028012556
Final F1 Score: 0.3048312633120409
Confusion Matrix:
 [[253 136 333 690]
 [ 98 145 112 656]
 [140  85 527 524]
 [ 44  63  33 302]]
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.18      0.26      1412
           1       0.34      0.14      0.20      1011
           2       0.52      0.41      0.46      1276
           3       0.14      0.68      0.23       442

    accuracy                           0.30      4141
   macro avg       0.37      0.35      0.29      4141
weighted avg       0.42      0.30      0.30      4141


Train set results:
Final Accuracy: 0.3007123883120019
Final F1 Score: 0.30600291860913253
Confusion Matrix:
 [[ 961  545 1381 2763]
 [ 437  572  424 2610]
 [ 555  332 2219 1999]
 [ 152  238  147 1229]]
Classification Report:
               precision    recall  f1-score   support

           0       0