In [None]:
# Model selection con optuna

#dataset di test
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

import optuna
from optuna.storages import RDBStorage

from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

#modelli
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

#score
from sklearn.metrics import f1_score

In [None]:
# Number of Optuna trials requested for each study (passed as n_trials to study.optimize).
num_trials = 100

In [None]:
# Load the pre-split X_train, X_test, y_train, y_test from Excel workbooks with pandas.
# Features: only the columns from positional index 8 onward are kept.
# Targets: 1 is subtracted from every value
# (presumably to turn 1-based class labels into 0-based ones — TODO confirm).
X_train = np.array(pd.read_excel('X_train_2.xlsx').iloc[:,8:])
X_test = np.array(pd.read_excel('X_test_2.xlsx').iloc[:,8:])
y_train = np.array(pd.read_excel('y_train_2.xlsx'))-1
y_test = np.array(pd.read_excel('y_test_2.xlsx'))-1

In [None]:
# Adapt the data: flatten both target arrays to 1-D.
y_test = y_test.reshape(-1,)
y_train = y_train.reshape(-1,)

In [None]:
# Toy-dataset switch: when set to True, X_train/X_test/y_train/y_test are
# overwritten with an 80/20 split of scikit-learn's digits dataset.
# No random_state is passed to train_test_split, so the split changes on every run.
dataset_di_prova = False
if dataset_di_prova:
    data = load_digits()
    X= data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Quick baseline: fit an SVC with default hyperparameters on the training data
# and display its weighted F1 on the test data.
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
f1_score(y_test, y_pred, average='weighted') # the average parameter is needed for multiclass targets

## Random Forest

In [None]:
def find_best_params_random_forest(X_train, y_train, X_test, y_test):
    """Tune a RandomForestClassifier with Optuna and return the trial history.

    Every trial is scored by the mean weighted F1 of a 10-fold cross-validation
    run on the training data only. The test set is deliberately left untouched
    so it can still be used for an unbiased final evaluation.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used for the cross-validation.
    X_test, y_test : array-like
        Not used during tuning; kept so the signature matches the other
        ``find_best_params_*`` functions and existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The result of ``study.trials_dataframe()`` (one row per trial).

    Notes
    -----
    The number of trials comes from the notebook-level ``num_trials``.
    The study is stored in ``study_random_forest.db``; because
    ``load_if_exists=True`` an already stored study with the same name is
    resumed, so trials from earlier runs are part of the returned frame.
    """
    model = RandomForestClassifier(n_jobs=-1)
    study = optuna.create_study(direction='maximize',
                                study_name="study_random_forest",
                                storage=RDBStorage("sqlite:///study_random_forest.db"),
                                load_if_exists=True
                                )
    # Fixed fold seed: all trials are scored on the same splits, so differences
    # in score come from the hyperparameters and not from fold reshuffling.
    kfold = KFold(n_splits=10, shuffle=True, random_state=0)

    def objective(trial):
        # Hyperparameters explored for the model.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 100),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 200),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        }
        model.set_params(**params)
        # Weighted F1 is used because accuracy is not suited to imbalanced data.
        # cross_val_score clones the estimator for each fold; error_score='raise'
        # keeps fit failures loud instead of turning them into NaN scores.
        scores = cross_val_score(model, X_train, y_train, cv=kfold,
                                 scoring='f1_weighted', error_score='raise')
        return scores.mean()

    study.optimize(objective, n_trials=num_trials)
    return study.trials_dataframe()

## Support Vector Machine

In [None]:
def find_best_params_svm(X_train, y_train, X_test, y_test):
    """Tune an SVC with Optuna and return the trial history.

    Every trial is scored by the mean weighted F1 of a 10-fold cross-validation
    run on the training data only, the same protocol as the other
    ``find_best_params_*`` tuners, so the ``value`` columns are comparable and
    the test set is not consumed by hyperparameter selection.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used for the cross-validation.
    X_test, y_test : array-like
        Not used during tuning; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The result of ``study.trials_dataframe()`` (one row per trial).

    Notes
    -----
    The number of trials comes from the notebook-level ``num_trials``.
    The study is stored in ``study_svm.db``; because ``load_if_exists=True``
    an already stored study with the same name is resumed.
    """
    model = SVC()
    study = optuna.create_study(direction='maximize',
                                study_name="study_svm",
                                storage=RDBStorage("sqlite:///study_svm.db"),
                                load_if_exists=True
                                )
    # Fixed fold seed so that every trial is evaluated on identical splits.
    kfold = KFold(n_splits=10, shuffle=True, random_state=0)

    def objective(trial):
        # Hyperparameters explored for the model. 'degree' is always sampled
        # although SVC only uses it with the 'poly' kernel; the search space is
        # left unchanged so previously stored trials stay compatible.
        params = {
            'C': trial.suggest_float('C', 0.1, 1000),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
            'degree': trial.suggest_int('degree', 1, 10),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        }
        model.set_params(**params)
        # Weighted F1 is used because accuracy is not suited to imbalanced data.
        scores = cross_val_score(model, X_train, y_train, cv=kfold,
                                 scoring='f1_weighted', error_score='raise')
        return scores.mean()

    study.optimize(objective, n_trials=num_trials)
    return study.trials_dataframe()

## XGBoost

In [None]:
def find_best_params_xgb(X_train, y_train, X_test, y_test):
    """Tune an XGBClassifier with Optuna and return the trial history.

    Every trial is scored by the mean weighted F1 of a 10-fold cross-validation
    run on the training data only. The test set is deliberately left untouched
    so it can still be used for an unbiased final evaluation.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used for the cross-validation.
    X_test, y_test : array-like
        Not used during tuning; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The result of ``study.trials_dataframe()`` (one row per trial).

    Notes
    -----
    The number of trials comes from the notebook-level ``num_trials``.
    The study is stored in ``study_xgb.db``; because ``load_if_exists=True``
    an already stored study with the same name is resumed.
    """
    model = xgb.XGBClassifier(n_jobs=-1)
    study = optuna.create_study(direction='maximize',
                                study_name="study_xgb",
                                storage=RDBStorage("sqlite:///study_xgb.db"),
                                load_if_exists=True
                                )
    # Fixed fold seed so that every trial is evaluated on identical splits.
    kfold = KFold(n_splits=10, shuffle=True, random_state=0)

    def objective(trial):
        # Hyperparameters explored for the model; each one is suggested exactly
        # once and collected in a dict, which also avoids a local variable
        # named 'objective' shadowing this function.
        # NOTE(review): every candidate 'objective' is a binary one — confirm
        # that the target really has two classes.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
            'gamma': trial.suggest_float('gamma', 0.01, 1),
            'subsample': trial.suggest_float('subsample', 0.01, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'objective': trial.suggest_categorical(
                'objective', ['binary:logistic', 'binary:logitraw', 'binary:hinge']),
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        }
        model.set_params(**params)
        # Weighted F1 is used because accuracy is not suited to imbalanced data.
        # cross_val_score clones the estimator for each fold; error_score='raise'
        # keeps fit failures loud instead of turning them into NaN scores.
        scores = cross_val_score(model, X_train, y_train, cv=kfold,
                                 scoring='f1_weighted', error_score='raise')
        return scores.mean()

    study.optimize(objective, n_trials=num_trials)
    return study.trials_dataframe()

# Model Selection

In [None]:
def model_selection_optuna(X_test,y_test,X_train,y_train):
    """Run the Optuna search for every candidate model and report the best trial.

    Note the parameter order: the test arrays come first, then the training
    arrays. They are forwarded to each tuner as
    ``(X_train, y_train, X_test, y_test)``.

    Parameters
    ----------
    X_test, y_test : array-like
        Test features and labels.
    X_train, y_train : array-like
        Training features and labels.

    Returns
    -------
    dict
        Maps each model name ('RandomForest', 'SVM', 'xgb') to the trials
        DataFrame returned by its tuner.
    """
    # Candidate models and the function that searches their hyperparameters.
    models = {
        'RandomForest': find_best_params_random_forest,
        'SVM': find_best_params_svm,
        'xgb': find_best_params_xgb
    }

    print("--- model selection ---")
    print("Modelli utilizzati:", models.keys())

    trials_dataframes = {}
    print("Trovo i parametri migliori per ogni modello...")
    print("\n")
    for model_name, find_best_params in models.items():
        print("modello:", model_name)
        trials_dataframes[model_name] = find_best_params(X_train, y_train, X_test, y_test)
        print("\n")

    print("risultati model selection:")
    for model_name, trials_df in trials_dataframes.items():
        print("modello:", model_name)
        # The studies maximise their objective, so the best trial is the one
        # with the highest 'value': sort in descending order.
        print(trials_df.sort_values(by='value', ascending=False).head(1))
        print("\n")

    return trials_dataframes

In [None]:
# Run the full model selection. Arguments are positional and follow the
# function's own order: test arrays first, then training arrays.
model_selection = model_selection_optuna(X_test,y_test,X_train,y_train)