# GridSearch et cross validation

## 1. Intro

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree, ensemble, linear_model, svm, neighbors
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score

from joblib import dump, load # pour enregistrer et charger les modèles.

from IPython.display import display_html # pour pouvoir afficher deux df côte à côte.

In [2]:
# Seed shared by every stochastic step of this cell (split and both resamplers),
# so that re-running the notebook rebuilds exactly the same data sets.
RANDOM_STATE = 123

# Load the pre-processed data set, drop the 'Unnamed: 0' column
# (presumably a leftover CSV index -- TODO confirm) and parse 'Date' as datetime.
df = pd.read_csv('../../../../data/processed/alex/ready_Cloud9am_Cloud3pm.csv')
df = df.drop(columns = 'Unnamed: 0')
df['Date'] = pd.to_datetime(df['Date'])

# Features / target separation: 'Date' and 'NonMesNum' are excluded from the features.
X = df.drop(columns = ['RainTomorrow', 'Date', 'NonMesNum']).copy()
y = df['RainTomorrow'].copy()

# Train / test split (25 % of the rows are held out for testing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = RANDOM_STATE)

# Standardise the features: the scaler is fitted on the training set only,
# then applied unchanged to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Under-sampled training set built with ClusterCentroids (seeded for reproducibility).
cc = ClusterCentroids(random_state = RANDOM_STATE)
X_cc, y_cc = cc.fit_resample(X_train, y_train)

# Over-sampled training set built with SMOTE (seeded for reproducibility).
smote = SMOTE(random_state = RANDOM_STATE)
X_sm, y_sm = smote.fit_resample(X_train, y_train)



## 2. Essais

### 2.1 Fonction report

In [14]:
def report(model, sampling, eval_train = False):
    """Fit a pre-configured model on a resampled training set and display its report.

    The estimator is looked up in the notebook-level ``models`` dictionary, fitted on
    the resampled training data selected by ``sampling``, and evaluated either on that
    same training data or on the notebook-level ``X_test`` / ``y_test``. The confusion
    matrix and the classification report are displayed side by side as HTML tables.

    Parameters
    ----------
    model : str
        Key of the estimator in the ``models`` dictionary.
    sampling : str
        'SMOTE' to train on ``X_sm`` / ``y_sm``, 'CC' to train on ``X_cc`` / ``y_cc``.
    eval_train : bool, default False
        When true, the report is computed on the training data the model was fitted
        on; otherwise it is computed on ``X_test`` / ``y_test``.

    Returns
    -------
    dict
        The headline metrics of the displayed classification report: ``acc``,
        ``prec_0``, ``rec_0``, ``f1_0``, ``prec_1``, ``rec_1``, ``f1_1`` and
        ``f1_weighted_avg``.

    Raises
    ------
    ValueError
        If ``sampling`` is neither 'SMOTE' nor 'CC'.
    """
    # Pick the training data (and the title suffix) matching the requested resampling.
    # Dedicated local names are used so the notebook-level X_train / y_train are not shadowed.
    if sampling == 'SMOTE':
        X_fit, y_fit = X_sm, y_sm
        sampling_label = ' avec SMOTE'
    elif sampling == 'CC':
        X_fit, y_fit = X_cc, y_cc
        sampling_label = ' avec ClusterCentroids'
    else:
        raise ValueError("sampling must be 'SMOTE' or 'CC', got " + repr(sampling))

    # Fetch the estimator and fit it on the resampled training data.
    clf = models[model]
    clf.fit(X_fit, y_fit)

    # Choose the evaluation set: the training data itself or the held-out test data.
    if eval_train:
        X_eval, y_true = X_fit, y_fit
    else:
        X_eval, y_true = X_test, y_test
    y_pred = clf.predict(X_eval)

    # Confusion matrix and classification report, both as DataFrames.
    conf_mat = pd.crosstab(y_true, y_pred, rownames = ['Classes réelles'], colnames = ['Classes prédites'])
    class_rep = pd.DataFrame.from_dict(classification_report(y_true, y_pred, output_dict=True, digits = 2)).T

    # Inline styling lets the two tables sit next to each other in the cell output.
    df_cm = conf_mat.style.set_table_attributes("style='display:inline'").set_caption('Confusion Matrix')
    df_cr = class_rep.style.set_table_attributes("style='display:inline'").set_caption('Classification Report')

    # Title built from the model key and the resampling method, then HTML display.
    title = 'Rapport pour'+ ' ' + str(model) + ' ' + sampling_label
    display_html(title + df_cm._repr_html_()+df_cr._repr_html_(), raw=True)

    # Collect the headline metrics. The two class rows are read by position (.iloc)
    # and the summary rows by label (.loc): integer keys on a string-labelled Series
    # rely on a deprecated positional fallback in pandas.
    return {
        'acc'    : float(class_rep.loc['accuracy', 'precision']),
        'prec_0' : float(class_rep['precision'].iloc[0]),
        'rec_0'  : float(class_rep['recall'].iloc[0]),
        'f1_0'   : float(class_rep['f1-score'].iloc[0]),
        'prec_1' : float(class_rep['precision'].iloc[1]),
        'rec_1'  : float(class_rep['recall'].iloc[1]),
        'f1_1'   : float(class_rep['f1-score'].iloc[1]),
        'f1_weighted_avg' : float(class_rep.loc['weighted avg', 'f1-score']),
    }



### 2.2 Essais

#### 2.2.a Dictionnaires

In [6]:
# Search space for the grid search: model key -> [base estimator, parameter grid].
# The tree-based estimators are seeded so that repeated searches give the same result.
dict_grid = {
    'logreg' : [
        linear_model.LogisticRegression(class_weight = 'balanced'),
        {"C" : np.logspace(-3, 3, num = 7)},
    ],
    'dt' : [
        tree.DecisionTreeClassifier(class_weight = 'balanced', random_state = 123),
        {"criterion" : ['gini', 'entropy', 'log_loss'],
         "max_depth" : [5, 10, 50, 100]},
    ],
    'rdf' : [
        ensemble.RandomForestClassifier(class_weight = 'balanced', random_state = 123),
        {"n_estimators" : [5, 10, 50, 100, 300],
         "criterion" : ['gini', 'entropy', 'log_loss']},
    ],
    'knn' : [
        neighbors.KNeighborsClassifier(),
        {"n_neighbors" : [10, 25, 50, 100],
         "metric" : ['minkowski', 'manhattan', 'chebyshev'],
         "weights" : ['uniform', 'distance']},
    ],
    'svm' : [
        svm.SVC(class_weight = 'balanced'),
        {"C" : np.logspace(-2, 2, num = 5),
         "kernel" : ['rbf', 'poly', 'sigmoid'],
         "gamma" : np.logspace(-2, 2, num = 5)},
    ],
}

In [10]:
# Estimators looked up by key in report(); the hyper-parameters are meant to be the
# best ones found by GridSearchCV. The tree-based models are seeded so that the
# reports are reproducible from one run to the next.
# NOTE(review): knn uses n_neighbors = 5, which is not among the values searched in
# dict_grid ([10, 25, 50, 100]) -- confirm this is the intended setting.
models = {"logreg" : linear_model.LogisticRegression(C = 1),
          "dt"  : tree.DecisionTreeClassifier(criterion =  'gini', max_depth =  50, random_state = 123),
          "rdf" : ensemble.RandomForestClassifier(n_estimators =  100, criterion =  'gini', random_state = 123),
          "knn" : neighbors.KNeighborsClassifier(n_neighbors =  5, metric = 'minkowski'),
          "svm" : svm.SVC(kernel = 'rbf', C =  10, gamma =  0.1)}

#### 2.2.b Mesures

In [12]:
# Random forest trained on the SMOTE-resampled data, evaluated on the held-out test set.
# eval_train is passed explicitly so the call matches the three-argument signature of report().
report('rdf', 'SMOTE', eval_train = False)

Classes prédites,0,1
Classes réelles,Unnamed: 1_level_1,Unnamed: 2_level_1
0,960,133
1,124,273

Unnamed: 0,precision,recall,f1-score,support
0,0.885609,0.878317,0.881948,1093.0
1,0.672414,0.687657,0.67995,397.0
accuracy,0.827517,0.827517,0.827517,0.827517
macro avg,0.779011,0.782987,0.780949,1490.0
weighted avg,0.828805,0.827517,0.828127,1490.0


In [15]:
# Same model and resampling, but scored on the data it was trained on.
report(model = 'rdf', sampling = 'SMOTE', eval_train = True)

Classes prédites,0,1
Classes réelles,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3291,0
1,0,3291

Unnamed: 0,precision,recall,f1-score,support
0,1.0,1.0,1.0,3291.0
1,1.0,1.0,1.0,3291.0
accuracy,1.0,1.0,1.0,1.0
macro avg,1.0,1.0,1.0,6582.0
weighted avg,1.0,1.0,1.0,6582.0


## 3. Essais à la main

In [35]:
# Hand-written grid search on a logistic regression: two solvers crossed with
# two values of C, scored with the weighted F1 over 3 cross-validation folds.
clf_lr = linear_model.LogisticRegression(max_iter = 1000)
params_lr = {
    'solver': ['liblinear', 'lbfgs'],
    'C': [0.1, 1],
}

gridcv = GridSearchCV(clf_lr, params_lr, scoring = 'f1_weighted', cv = 3)

In [36]:
# Run the search on the scaled training data (not the SMOTE / ClusterCentroids sets).
gridcv.fit(X = X_train, y = y_train)

In [37]:
# Parameter combination selected by the grid search (gridcv is scored with 'f1_weighted').
gridcv.best_params_

{'C': 0.1, 'solver': 'liblinear'}

In [38]:
# Per-candidate summary of the search: parameters, mean and standard deviation of the test score.
pd.DataFrame(gridcv.cv_results_).loc[:, ['params', 'mean_test_score', 'std_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'C': 0.1, 'solver': 'liblinear'}",0.85545,0.005583
1,"{'C': 0.1, 'solver': 'lbfgs'}",0.854406,0.004543
2,"{'C': 1, 'solver': 'liblinear'}",0.853739,0.005737
3,"{'C': 1, 'solver': 'lbfgs'}",0.853739,0.005737
