# 05. Réduire l'overfitting

Test sur le df Clouds, avec SMOTE et rdf otpimisé

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree, ensemble, linear_model, svm, neighbors
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score

from joblib import dump, load # pour enregistrer et charger les modèles.

from IPython.display import display_html # pour pouvoir afficher deux df côte à côte.

In [3]:
# Instanciation df
df = pd.read_csv('ready_Cloud9am_Cloud3pm.csv')
df.drop(columns = 'Unnamed: 0', inplace = True)
df['Date'] = pd.to_datetime(df['Date'])

# Séparation data / target:
X = df.drop(columns = ['RainTomorrow', 'Date', 'NonMesNum']).copy()
y = df['RainTomorrow'].copy()

# Séparation du jeu d'entrainement et du jeu de test:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123)

# Scale de X_train, X_test:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# UnderSample grâce à ClusterCentroids
cc = ClusterCentroids()
X_cc, y_cc = cc.fit_resample(X_train, y_train)

# Oversample avec SMOTE
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_train, y_train)



In [4]:
# Chargement des grid:
grid_logreg = load('alex_df_clouds/grid_saves/grid_logreg.joblib')
grid_dt = load('alex_df_clouds/grid_saves/grid_dt.joblib')
grid_rdf = load('alex_df_clouds/grid_saves/grid_rdf.joblib')
grid_knn = load('alex_df_clouds/grid_saves/grid_knn.joblib')
grid_svm = load('alex_df_clouds/grid_saves/grid_svm.joblib')

In [5]:
# Définition d'un dictionnaire pour appeler dans la fonction report, avec les best params de GridSearchCV
models = {"logreg" : linear_model.LogisticRegression(C = grid_logreg.best_params_['C']), 
          "dt"  : tree.DecisionTreeClassifier(criterion =  grid_dt.best_params_['criterion'], max_depth =  grid_dt.best_params_['max_depth']),
          "rdf" : ensemble.RandomForestClassifier(n_estimators =  grid_rdf.best_params_['n_estimators'], criterion =  grid_rdf.best_params_['criterion']),
          "knn" : neighbors.KNeighborsClassifier(n_neighbors =  grid_knn.best_params_['n_neighbors'], metric = grid_knn.best_params_['metric']),
          "svm" : svm.SVC(kernel = grid_svm.best_params_['kernel'], C =  grid_svm.best_params_['C'], gamma =  grid_svm.best_params_['gamma'])}

In [20]:
cv = KFold(n_splits = 5, random_state = 111, shuffle = True)
clf = models['rdf']
scores = cross_validate(clf, X_sm, y_sm, cv=cv, scoring='f1_weighted', return_estimator = True)
#scores
scores['test_score'].mean()

0.8995637322371686

In [14]:
y_test_pred = clf.predict(X_test)
conf_mat = pd.crosstab(y_test,y_test_pred, rownames = ['Classes réelles'], colnames = ['Classes prédites'])       # Création au format df
class_rep = pd.DataFrame.from_dict(classification_report(y_test, y_test_pred, output_dict=True, digits = 2)).T    # création au format df
    
# Affichage de la matrice de confusion et du rapport de classification:
df_cm = conf_mat.style.set_table_attributes("style='display:inline'").set_caption('Confusion Matrix')
df_cr = class_rep.style.set_table_attributes("style='display:inline'").set_caption('Classification Report')

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.