## Le Randomforest

In [1]:
# Manipulation
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump, load

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from ipywidgets import Dropdown, interact
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, make_scorer, classification_report


In [2]:
# Importation de la donnée
df = pd.read_csv("work_data.csv")
X = df.drop('Exited', axis=1)
y = df['Exited']

In [3]:
# Normalisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Enregistrement du scaler
dump(scaler, './scaler.joblib')

['./scaler.joblib']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, shuffle=True)

### SMOTE

In [5]:
smote = SMOTE(k_neighbors=1, sampling_strategy=0.3, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_scaled, y)

In [6]:
print(f"""
Original X shape: {X_train.shape}
SMOTE X shape: {X_train_sm.shape}
""")


Original X shape: (7984, 10)
SMOTE X shape: (10332, 10)



In [7]:
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline*100, 2), "%")

Baseline Accuracy: 79.57 %


### Paramétisation 

In [9]:
params_rf = { 
                'n_estimators' : [50,100, 200, 250, 500, 800],
               'max_features' : [1, 0.5, 0.2],
               'random_state' : [3, 4, 5],
               "max_depth": range(5,31,5),
               "min_samples_leaf": [1,2],
               "min_samples_split": [2,3]
               }

rf = RandomForestClassifier()
grid_search_rf = GridSearchCV(
                            rf, 
                            params_rf,  
                            n_jobs=-1, # Use all available CPU cores for parallel computation
                            cv=5, 
                            scoring=["recall", "accuracy", 'precision'], # Set the evaluation metrics to be used for scoring
                            refit="recall" # Choose the metric to optimize during randomized search
                            )
grid_search_rf.fit(X_train_sm, y_train_sm)

print ("Score final : ", round(grid_search_rf.score(X_train_sm, y_train_sm) *100,4), " %")
print ("Meilleurs parametres: ", grid_search_rf.best_params_)


In [None]:
# Utilisation du meilleur modèle trouvé pour la prédiction
rdf = grid_search_rf.best_estimator_
rdf = rdf.fit(X_train_sm, y_train_sm)
# Prediction
y_train_predi = rdf.predict(X_train_sm)
y_test_predi = rdf.predict(X_test)

### Rapport de classification

In [None]:
# Classification Report
print("\nTrain Classification Report:")
print(classification_report(y_train_sm, y_train_predi))
print("\nTest Classification Report:")
print(classification_report(y_test, y_test_predi))

True

### Variables importantes

In [None]:
features = X.columns
importances = rdf.feature_importances_
feat_imp = pd.Series(importances, index=features)
# Plot 10 most important features
feat_imp.sort_values().tail(10).plot(kind="barh")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance with Smote")

### Matrixe de confusion

In [None]:
confusion_mat = confusion_matrix(y_test, y_test_predi)
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat, display_labels=rdf.classes_)

# Representation confusion matrix
disp.plot()
plt.show()