# Applicazione degli algoritmi

### Importazione dei dati

In [1]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.manifold import TSNE
from sklearn.metrics import auc, accuracy_score, roc_curve, recall_score, matthews_corrcoef, f1_score, confusion_matrix
from statistics import mean

import os
import pickle
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore")
ConvergenceWarning("ignore")



In [2]:
def importa_dati(path, sep=','):
    """Funzione per l'importazione di un dataset con un formato csv in un DataFrame di Pandas.
       - path: percorso in cui è salvato il file.csv (stringa);
       - sep: separatore utilizzato nel file (stringa)"""
    data = pd.read_csv(path, sep=sep)
    data.drop('Unnamed: 0', axis=1, inplace=True)
    return data

In [3]:
df_ohe = importa_dati('/Users/eliaceccolini/Documents/Uni/Tesi/Dataset_finale/Datasets/Dataset_preprocessato_OHE.csv')
df_le = importa_dati('/Users/eliaceccolini/Documents/Uni/Tesi/Dataset_finale/Datasets/Dataset_preprocessato_LE.csv')

***
### Dati nulli

In [4]:
def sostituisci_nulli(data, method):
    """Funzione per sostiuire i dati nulli utilizzando il metodo method.
       - data: Dataset (DataFrame)
       - method: metodo con cui sostiuire i nulli (String)"""
    null_cols = [c for c in data.columns if data[c].isnull().sum() > 0]
    data.fillna(method=method, inplace=True)

In [5]:
# dataset preprocessato con LE
sostituisci_nulli(df_le, 'ffill')

# dataset preprocessato con OHE
sostituisci_nulli(df_ohe, 'ffill')

***
### Divisione della variabile risposta dalle feature

In [6]:
def estrapola_variabile_risposta(data, var):
    """Funzione per dividere il dataset in X e y, quindi per separare la variabile risposta dalle feature.
       - data: Dataset (DataFrame)
       - var: nome della variabile risposta (String)"""
    y = data[var]
    X = data.drop(var, axis=1)
    return X, y

In [7]:
# dataset preprocessato con LE
X_le, y_le = estrapola_variabile_risposta(df_le, 'best_response')

# dataset preprocessato con OHE
X_ohe, y_ohe = estrapola_variabile_risposta(df_ohe, 'best_response')

***
### Funzione per eseguire cross validation e salvare i relativi grafici

In [8]:
def classificazione_crossvalidate(X, y, exp_name, num_trials, p_grid, model=None, model_name=None):
    """Funzione per evidenziare la t-distributed stochastic neighbor embedding del dataset in due dimensioni
       e per applicare il modello model_name al dataset per predire y a partire dalle feature X.
       - X: feature usate per la predizione di y (DataFrame)
       - y: variabile risposta (Series)
       - exp_name: nome dell'esperimento (String)
       - num_trials: numero di iterazioni da effettuare per la predizione (int)
       - p_grid: parametri della griglia (Dictionary)
       - model: modello ML da utilizzare
       - model_name: nome del modello (String)"""
    
    # visualizzazione del dataset in uno spazio bidimensionale (tsne)
    np.random.seed(1)
    tsne = TSNE(n_components=2, verbose=0, random_state=123)
    z = tsne.fit_transform(X)
    df = pd.DataFrame()
    df["y"] = y
    df["comp-1"] = z[:,0]
    df["comp-2"] = z[:,1]
    plt.figure()
    sns.scatterplot(x="comp-1", y="comp-2", hue=df.y.tolist(),data=df)
    plt.title("TSNE_"+exp_name)
    plt.savefig("immagini/tsne/TSNE_"+ exp_name +".png", dpi=600)
    plt.close()
    
    # divisione del datset in train e test
    X, X_test_final, y, y_test_final = train_test_split(X, y, test_size=0.33, random_state=1)
    
    # score che verranno considerati
    myscoring = ['balanced_accuracy', 'roc_auc', 'average_precision', 'recall']
    
    # inizializzazione dei vettori per i punteggi nel train set
    nested_scores = np.zeros(num_trials)
    bal_acc_train_scores = np.zeros((num_trials,1))
    roc_auc_train_scores = np.zeros((num_trials,1))
    ave_pre_train_scores = np.zeros((num_trials,1))
    recall_train_scores = np.zeros((num_trials,1))
    # inizializzazione dei vettori per i punteggi nel test set
    bal_acc_test_scores = np.zeros((num_trials,1))
    roc_auc_test_scores = np.zeros((num_trials,1))
    ave_pre_test_scores = np.zeros((num_trials,1))
    recall_test_scores = np.zeros((num_trials,1))
    
    # inizio iterazioni
    for i in range(num_trials):
        print('Iterazione numero '+str(i))
        np.random.seed(i)
        cv_inner = KFold(n_splits=5, shuffle=True, random_state=1)
        cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
        # definizione griglia
        clf = GridSearchCV(model, p_grid, scoring='recall', n_jobs=1, cv=cv_inner, refit=True, return_train_score=True)
        # esecuzione della cross validation innestata
        scores = cross_validate(clf, X=X, y=y, cv=cv_outer, return_train_score=True, return_estimator=True, scoring=myscoring)
        
        # salvataggio e stampa dei risultati all'i-esima iterazione
        bal_acc_train_scores[i] = np.mean(scores['train_balanced_accuracy'])
        roc_auc_train_scores[i] = np.mean(scores['train_roc_auc'])
        ave_pre_train_scores[i] = np.mean(scores['train_average_precision'])
        recall_train_scores[i] = np.mean(scores['train_recall'])
        print('Train: balanced_accuracy ' + str( bal_acc_train_scores[i]))
        print('Train: roc_auc ' + str(roc_auc_train_scores[i]))
        print('Train: average_precision ' + str(ave_pre_train_scores[i]))
        print('Train: recall ' + str(recall_train_scores[i]))
        bal_acc_test_scores[i] = np.mean(scores['test_balanced_accuracy'])
        roc_auc_test_scores[i] = np.mean(scores['test_roc_auc'])
        ave_pre_test_scores[i] = np.mean(scores['test_average_precision'])
        recall_test_scores[i] = np.mean(scores['test_recall'])
        print('Test: balanced_accuracy ' + str( bal_acc_test_scores[i]))
        print('Test: roc_auc ' + str(roc_auc_test_scores[i]))
        print('Test: average_precision ' + str(ave_pre_test_scores[i]))
        print('Test: recall ' + str(recall_test_scores[i]))
        
        # fpr: tasso falsi positivi, tpr: tasso veri positivi
        mean_fpr = np.linspace(0, 1, 1000)
        tprs = []
        # divisione train e test set
        for j, (train_ix, test_ix) in enumerate(cv_inner.split(X, y)):
            X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
            y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
            classifier_fold = scores['estimator'][j].best_estimator_
            classifier_fold.fit(X_train, y_train)
            y_pred_labels = classifier_fold.predict(X_test)
            # rapporto falsi positivi e veri positivi e soglia utilizzata per il loro calcolo
            fpr, tpr, thresholds = roc_curve(y_test, y_pred_labels)
            # calcolo area sotto la curva
            roc_auc = auc(fpr, tpr)
            interp_tpr = np.interp(mean_fpr, fpr, tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
        
        # plot ROC AUC medie
        plt.figure()
        plt.plot([0, 1], [0, 1], '--', color='r', label='Random classifier', lw=2, alpha=0.8)
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        plt.title('Mean AUC=%0.3f' % mean_auc)
        plt.plot(mean_fpr, mean_tpr, color='b', label='Mean ROC', lw=2, alpha=0.8)

        ## calcolo deviazione standard
        std_tpr = np.std(tprs, axis=0)
        tprs_upper_std = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower_std = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower_std, tprs_upper_std, color='green', alpha=.2,label=r'$\pm$ 1 SD')

        ## calcolo 99.9% CI
        z = 3.291
        SE = std_tpr / np.sqrt(num_trials * 5)
        tprs_upper_95CI = mean_tpr + (z * SE)
        tprs_lower_95CI = mean_tpr - (z * SE)
        plt.fill_between(mean_fpr, tprs_lower_95CI, tprs_upper_95CI, color='grey', alpha=.5,label=r'$\pm$ 99.9% CI')
        
        # salvataggio curva ROC
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('Tasso di Falsi Positivi')
        plt.ylabel('Tasso di Veri Positivi')
        plt.legend(loc="lower right")
        os.makedirs("ROCcurves", exist_ok=True)
        plt.savefig("immagini/ROCcurves/ROCcurve_" + exp_name + "_" + model_name + ".png", dpi=600)
        plt.close()

    mean_results_matrix = np.zeros((3,2))

    train_accuracy_global_mean = np.mean(bal_acc_train_scores)
    train_accuracy_global_std = np.std(bal_acc_train_scores)

    test_accuracy_global_mean = np.mean(bal_acc_test_scores)
    test_accuracy_global_std = np.std(bal_acc_test_scores)
    mean_results_matrix[0,0] = test_accuracy_global_mean
    mean_results_matrix[0,1] = test_accuracy_global_std

    roc_auc_global_test_mean = np.mean(roc_auc_test_scores)
    roc_auc_global_test_std = np.std(roc_auc_test_scores)
    mean_results_matrix[1,0] = roc_auc_global_test_mean
    mean_results_matrix[1,1] = roc_auc_global_test_std

    test_recall_global_mean = np.mean(recall_test_scores)
    test_recall_global_std = np.std(recall_test_scores)
    mean_results_matrix[2,0] = test_recall_global_mean
    mean_results_matrix[2,1] = test_recall_global_std

    mean_results_df = pd.DataFrame(data=mean_results_matrix, columns=["mean","std"], index=["Test accuracy","Test ROC AUC","Test recall"])
    os.makedirs("risultati", exist_ok=True)
    mean_results_df.to_html("risultati/Mean_Results_"+ exp_name + "_" + model_name +".html")


    print("Train accuracy mean: " + str(train_accuracy_global_mean) + " std: " + str(train_accuracy_global_std))
    print("Test ROC AUC mean: " + str(roc_auc_global_test_mean) + " std: " + str(roc_auc_global_test_std))
    print("Test accuracy mean: " + str(test_accuracy_global_mean) + " std: " + str(test_accuracy_global_std))
    print("Test recall mean: " + str(test_recall_global_mean) + " std: " + str(test_recall_global_std))


    # modello finale
    print("Training final classifier...")
    clf_final = GridSearchCV(estimator=model, param_grid=p_grid, scoring='recall', 
                             n_jobs=-1, refit=True, cv=cv_inner, verbose=0, return_train_score=True)
    clf_final.fit(X,y)
    best_model = clf_final.best_estimator_
    print("Best final estimator:")
    print(best_model)
    
    # se il modello è un albero o un random forest vengono stampate e salvate le var più significative
    if ('Tree' in model_name or 'Forest' in model_name or 'XGBoost' in model_name):
        feature_names = X.columns
        feature_importances = best_model.feature_importances_
        # Ordinare le feature per importanza in modo decrescente e selezionare solo le prime 10
        top_10_indices = feature_importances.argsort()[::-1][:10]
        top_10_importances = feature_importances[top_10_indices]
        top_10_feature_names = feature_names[top_10_indices]
        plt.figure()
        plt.barh(range(len(top_10_importances)), top_10_importances, align='center')
        plt.yticks(range(len(top_10_importances)), top_10_feature_names)
        plt.xlabel('Importanza delle variabili')
        plt.ylabel('Variabili')
        plt.title('Decision Tree Feature Importances')
        plt.tight_layout()
        plt.savefig('immagini/importanza_variabili/Importanza_variabili_'+exp_name+'.png', dpi=600)
        
    # se il modello è un SVM vengono stampati e salvati i pesi relativi alle var più significative
    if 'SVM' in model_name:
        coefficients = model.coef_
        # associazione dei coefficienti alle variabili
        variable_coefficients = list(zip(X_train.columns, coefficients))
        # ordinamento in ordine decrescente delle variabili in base al valore assoluto del peso assegnotogli
        variable_coefficients.sort(key=lambda x: abs(x[1]), reverse=True)
        top_10_variables = variable_coefficients[:10]
        # Estrazione delle variabili e coefficienti per il grafico
        variables, coefficients = zip(*top_10_variables)
        plt.figure()
        plt.barh(range(len(variables)), coefficients, align='center')
        plt.yticks(range(len(variables)), variables)
        plt.xlabel('Peso')
        plt.ylabel('Variabile')
        plt.title('Le prime 10 variabili più importanti')
        plt.tight_layout()
        plt.savefig('immagini/importanza_variabili/Importanza_variabili_'+exp_name+'.png', dpi=600)
        
    # se il modello è un albero viene stampata e salvata la struttura dell'albero decisionale
    if 'Tree' in model_name:
        plt.figure()
        plt.title('albero_decisionale'+exp_name)
        plot_tree(decision_tree=best_model, feature_names=X.columns)
        plt.savefig('immagini/alberi_decisionali/Albero_decisionale_'+exp_name+'.png', dpi=600)
    
    # salvataggio del modello allenato
    os.makedirs("modelli", exist_ok=True)
    pickle.dump(best_model, open("modelli/Modello_"+ exp_name + "_" + model_name +".pkl",'wb'))
    
    # predizione del test set
    y_final_pred_labels = best_model.predict(X_test_final)
    final_model_accuracy = accuracy_score(y_test_final, y_final_pred_labels)
    # stampa dei punteggi finali
    print("Final estimator accuracy: " + str(final_model_accuracy))
    fpr, tpr, thresholds = roc_curve(y_test_final, y_final_pred_labels)
    roc_auc = auc(fpr, tpr)
    print("roc_auc final model: " + str(np.round(roc_auc,3)))
    final_model_recall = recall_score(y_test_final, y_final_pred_labels)
    print("Final estimator recall: " + str(final_model_recall))
    final_model_matt_corrcoef = matthews_corrcoef(y_test_final, y_final_pred_labels)
    print("Final estimator Matthews Correlation Coefficient: " + str(final_model_matt_corrcoef))
    final_model_f1score = f1_score(y_test_final, y_final_pred_labels)
    print("Final estimator F1 score: " + str(final_model_f1score))
    
    # stampa e salvataggio della matrice di confusione
    conf_matrix = confusion_matrix(y_test_final, y_final_pred_labels)
    plt.figure()
    plt.title('Matrice_Confusione_' + exp_name)
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
    plt.savefig('immagini/matrici_confusione/Matrice_confusione_'+exp_name+'.png', dpi=600)
    
    return final_model_accuracy, roc_auc, final_model_recall, final_model_f1score, final_model_matt_corrcoef

***
### Applicazione modelli
I modelli che saranno applicati sono:
- Naive Bayes
- Decision Tree Classifier
- Random Forest Classifier
- SVM
- XGBoost
- MLP (devo ancora aggiungerlo)

In [9]:
def applica_modelli(X, y, exp_name):
    """Funzione per applicare i modelli Naive Bayes, Decision Tree Classifier, Random Forest, SVM, XGBoost e MLP
       al dataset data.
       - X: feature del dataset (DataFrame)
       - y: variabile risposta (Series)
       - exp_name: nome esperimento (String)"""
    
    res = np.zeros((5, 5))
    
    # Naive Bayes
    p_grid = {'var_smoothing': np.logspace(0,-9, num=100)}
    model = GaussianNB()
    acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X, y, 'NB_'+exp_name, 50, p_grid, model, 'NaiveBayes')
    res[0] = [acc, roc_auc, recall, f1score, marr_corrcoef]
    
    # Decision Tree Classifier
    p_grid = {"criterion":['gini','entropy'], "max_depth":[2,4,6,8,10,12]}
    model = DecisionTreeClassifier()
    acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X, y, 'DT_'+exp_name, 50, p_grid, model, 'DecisionTree')
    res[1] = [acc, roc_auc, recall, f1score, marr_corrcoef]
    
    # Random Forest Classifier
    p_grid = {'n_estimators': [5, 10, 15, 20], 'max_depth': [2, 5, 7, 9]}
    model = RandomForestClassifier()
    acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X, y, 'RF_'+exp_name, 50, p_grid, model, 'RandomForest')
    res[2] = [acc, roc_auc, recall, f1score, marr_corrcoef]
    
    # SVM
    p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
    model = SVC(kernel="linear", max_iter=100)
    acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X, y, 'SVM_'+exp_name, 50, p_grid, model, 'SVM')
    res[3] = [acc, roc_auc, recall, f1score, marr_corrcoef]
    
    # XGBoost
    p_grid = {"gamma":[0, 0.1, 0.2,0.3,0.4,0.5],
              "max_depth": [3,5,10],
              "n_estimators":[5,10, 20, 100],
              "ubsample": [0.25, 0.5, 1],
              "verbosity": [0]}
    model = XGBClassifier(silent=True)
    acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X, y, 'XGBoost_'+exp_name, 50, p_grid, model, 'XGBoost')
    res[4] = [acc, roc_auc, recall, f1score, marr_corrcoef]
    return res

In [10]:
# esperimento: dataset pre-processato con LE nella versione binaria
#scores_le = applica_modelli(X_le, y_le, 'LE_BIN')

# esperimento: dataset pre-processato con OHE nella versione binaria
#scores_ohe = applica_modelli(X_ohe, y_ohe, 'OHE_BIN')

In [11]:
p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
model = SVC(kernel="linear", max_iter=100)
acc, roc_auc, recall, f1score, marr_corrcoef = classificazione_crossvalidate(X_le, y_le, 'SVM_LE_BIN', 50, p_grid, model, 'SVM')

Iterazione numero 0
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 1
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 2
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 3
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_acc

Iterazione numero 30
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 31
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 32
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced_accuracy [0.5132104]
Test: roc_auc [0.48408858]
Test: average_precision [0.56984452]
Test: recall [0.64441963]
Iterazione numero 33
Train: balanced_accuracy [0.49092145]
Train: roc_auc [0.47958208]
Train: average_precision [0.55741263]
Train: recall [0.60592591]
Test: balanced



Best final estimator:
SVC(C=1, gamma=0.01, kernel='linear', max_iter=100)


AttributeError: 'SVC' object has no attribute 'dual_coef_'