# Voting et Stacking

On utilise ici les versions standard des classifieurs (hyperparamètres par défaut, non optimisés).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style = "darkgrid", palette = "colorblind")

from sklearn import tree, ensemble, linear_model, svm, neighbors
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score

from joblib import dump, load # pour enregistrer et charger les modèles.

from IPython.display import display_html # pour pouvoir afficher deux df côte à côte.

# Number of CPU cores requested from the estimators / samplers that accept
# an n_jobs-style argument.
n_coeurs = 4

## 1. Préparation des données et chargement des modèles

In [2]:
# Load the preprocessed dataset.
df = pd.read_csv('../../../../data/processed/model_weatherAUS.csv')
# Drop the 'Unnamed: 0' column (presumably the index saved by an earlier
# to_csv call -- TODO confirm). Re-assignment is used instead of inplace=True.
df = df.drop(columns = 'Unnamed: 0')
df['Date'] = pd.to_datetime(df['Date'])

# Initial target proportions, as recorded by the original author:
# RainTomorrow
# 0    0.778382
# 1    0.221618

# Split features / target ('Date' is not used as a feature).
X = df.drop(columns = ['RainTomorrow', 'Date']).copy()
y = df['RainTomorrow'].copy()

# Train / test split.
# stratify = y keeps the initial class proportions of y in both the train and
# the test samples, but it does NOT rebalance the classes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123, stratify = y)

# Standardise the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training set with SMOTE.
# SMOTE's own n_jobs argument is deprecated since imbalanced-learn 0.10; the
# supported way to parallelise the neighbour search is to pass a
# NearestNeighbors estimator. n_neighbors = 6 reproduces the default
# k_neighbors = 5 (SMOTE adds one neighbour for the sample itself when given
# an int).
smote_neighbors = neighbors.NearestNeighbors(n_neighbors = 6, n_jobs = n_coeurs)
smote = SMOTE(random_state = 12, k_neighbors = smote_neighbors)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

print('Classes échantillon SMOTE :', dict(pd.Series(y_sm).value_counts()))



Classes échantillon SMOTE : {0: 82189, 1: 82189}


## 2. Voting

In [13]:
# Base classifiers with default hyper-parameters (no tuning).
logreg = linear_model.LogisticRegression()
# The tree-based models are stochastic: seed them so that a
# "Restart & Run All" reproduces the same predictions.
dt = tree.DecisionTreeClassifier(random_state = 123)
knn = neighbors.KNeighborsClassifier()
rdf = ensemble.RandomForestClassifier(random_state = 123)
# Use the directly imported SVC class: the original `svm = svm.SVC()` rebound
# the name of the imported `sklearn.svm` module, so running this cell a second
# time raised AttributeError. The variable name `svm` is kept because later
# cells reference it.
svm = SVC()

In [14]:
# Majority-vote ("hard") ensemble over the five base classifiers, fitted on
# the scaled training set (not on the SMOTE-resampled one).
vclf = VotingClassifier(
    estimators = [
        ('logreg', logreg),
        ('dt', dt),
        ('rdf', rdf),
        ('knn', knn),
        ('svm', svm),
    ],
    voting = 'hard',
    n_jobs = n_coeurs,
    verbose = True,
)
vclf.fit(X_train, y_train)

### 2.1 Evaluation sur test

In [15]:
# Predict on the held-out test set and cross-tabulate true vs. predicted
# classes (rows: true classes, columns: predicted classes).
y_test_pred = vclf.predict(X_test)
cm = pd.crosstab(
    index = y_test,
    columns = y_test_pred,
    rownames = ['Classes réelles'],
    colnames = ['Classes prédites'],
)
cm

Classes prédites,0,1
Classes réelles,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26279,1118
1,3724,4076


In [10]:
# Per-class precision / recall / F1 on the test set.
# NOTE(review): the saved output of this cell carries an older execution count
# than the hard-voting fit and disagrees with the confusion matrix above
# (recall 0.46 vs 4076/7800 ~ 0.52) -- re-run the notebook in order.
print(classification_report(y_true = y_test, y_pred = y_test_pred, digits = 2))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91     27397
           1       0.81      0.46      0.59      7800

    accuracy                           0.86     35197
   macro avg       0.83      0.71      0.75     35197
weighted avg       0.85      0.86      0.84     35197



### 2.2 Evaluation sur train

In [21]:
# Same evaluation on the training set, to compare against the test scores.
y_train_pred = vclf.predict(X_train)
cm = pd.crosstab(
    index = y_train,
    columns = y_train_pred,
    rownames = ['Classes réelles'],
    colnames = ['Classes prédites'],
)
cm

Classes prédites,0,1
Classes réelles,Unnamed: 1_level_1,Unnamed: 2_level_1
0,82189,0
1,9380,14021


In [22]:
# Per-class precision / recall / F1 on the training set.
print(classification_report(y_true = y_train, y_pred = y_train_pred, digits = 2))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95     82189
           1       1.00      0.60      0.75     23401

    accuracy                           0.91    105590
   macro avg       0.95      0.80      0.85    105590
weighted avg       0.92      0.91      0.90    105590



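Il y a clairement de l'overfitting : sur la classe positive, la précision atteint 1,00 sur le jeu d'entraînement contre environ 0,8 sur le jeu de test.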

### 2.3 Voting Soft

In [11]:
# Probability-averaging ("soft") ensemble. The SVC is left out of this one,
# presumably because SVC() with default settings exposes no predict_proba
# -- TODO confirm.
vclf_soft = VotingClassifier(
    estimators = [
        ('logreg', logreg),
        ('dt', dt),
        ('rdf', rdf),
        ('knn', knn),
    ],
    voting = 'soft',
)

vclf_soft.fit(X_train, y_train)

# Note: y_test_pred and cm are overwritten here with the soft-voting results.
y_test_pred = vclf_soft.predict(X_test)
cm = pd.crosstab(
    index = y_test,
    columns = y_test_pred,
    rownames = ['Classes réelles'],
    colnames = ['Classes prédites'],
)
cm


Classes prédites,0,1
Classes réelles,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25892,1505
1,3652,4148


In [12]:
# Per-class precision / recall / F1 of the soft-voting ensemble on the test set.
print(classification_report(y_true = y_test, y_pred = y_test_pred, digits = 2))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91     27397
           1       0.73      0.53      0.62      7800

    accuracy                           0.85     35197
   macro avg       0.81      0.74      0.76     35197
weighted avg       0.84      0.85      0.84     35197



Je préfère la version 'hard', qui donne une meilleure précision sur la classe positive (0,81 contre 0,73 pour la version 'soft', d'après les rapports de classification affichés ci-dessus).

## 3. Stacking