# MALE Project

DJAROUD Adam
LEVEQUE Clément

## Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score, f1_score, recall_score, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import sklearn.metrics

## Models

### Linear Discriminant Analysis

In [3]:
## 1. Chargement des données
df = pd.read_csv('celldata.csv')

# Séparer la cible (Y) et les variables explicatives (X)
X = df.drop('Churn', axis=1)
y = df['Churn']

## 2. Définition des Catégories de Variables
# Variables numériques continues 
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'Salary']

# Variables catégorielles nominales 
categorical_features = ['Geography', 'Gender']

# Variables binaires déjà bianires 
binary_features = ['HasCrCard', 'IsActiveMember']


## 3. Création du Préprocesseur 

preprocessor = ColumnTransformer(
    transformers=[
        # 1. Standardisation pour les variables numériques
        ('num', StandardScaler(), numerical_features),
        
        # 2. Encodage One-Hot pour les variables catégorielles
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        
        # 3. Passer les variables binaires directement
        ('bin', 'passthrough', binary_features)
    ],
    remainder='drop' 
)

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


## 4. Implémentation et Évaluation de LDA

print("--- Résultats Linear Discriminant Analysis (LDA) ---")

# Création du Pipeline
lda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearDiscriminantAnalysis())])

# Entraînement
lda_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_lda = lda_pipeline.predict(X_test)
y_proba_lda = lda_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_lda = roc_auc_score(y_test, y_proba_lda)
recall_lda = recall_score(y_test, y_pred_lda)
f1_lda = f1_score(y_test, y_pred_lda)
accuracy_lda = accuracy_score(y_test, y_pred_lda)
precision_lda = precision_score(y_test, y_pred_lda)

print(f"Accuracy (Précision globale) : {accuracy_lda:.4f}")
print(f"Recall (Rappel) - Churn : {recall_lda:.4f}")
print(f"Precision (Précision) - Churn : {precision_lda:.4f}") 
print(f"F1-Score : {f1_lda:.4f}")
print(f"AUC : {auc_lda:.4f} ")


--- Résultats Linear Discriminant Analysis (LDA) ---
Accuracy (Précision globale) : 0.8087
Recall (Rappel) - Churn : 0.2195
Precision (Précision) - Churn : 0.5638
F1-Score : 0.3159
AUC : 0.7598 


### Quadratic Discriminant Analysis

In [4]:
# Création du Pipeline
qda_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', QuadraticDiscriminantAnalysis())])

# Entraînement
qda_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_qda = qda_pipeline.predict(X_test)
y_proba_qda = qda_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_qda = roc_auc_score(y_test, y_proba_qda)
recall_qda = recall_score(y_test, y_pred_qda)
f1_qda = f1_score(y_test, y_pred_qda)
accuracy_qda = accuracy_score(y_test, y_pred_qda)
precision_qda = precision_score(y_test, y_pred_qda)

print(f"Accuracy (Précision globale) : {accuracy_qda:.4f}")
print(f"Recall (Rappel) - Churn : {recall_qda:.4f}")
print(f"Precision (Précision) - Churn : {precision_qda:.4f}")
print(f"F1-Score : {f1_qda:.4f}")
print(f"AUC : {auc_qda:.4f} ")

Accuracy (Précision globale) : 0.6175
Recall (Rappel) - Churn : 0.5590
Precision (Précision) - Churn : 0.2769
F1-Score : 0.3704
AUC : 0.6051 




### Logistic regression

In [5]:
## c) Logistic Regression

print("\n--- Résultats Logistic Regression ---")

# Création du Pipeline (C=1.0 est la régularisation par défaut)
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression(solver='liblinear', random_state=42))])

# Entraînement
lr_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_lr = lr_pipeline.predict(X_test)
y_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_lr = roc_auc_score(y_test, y_proba_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr) 

print(f"Accuracy (Précision globale) : {accuracy_lr:.4f}")
print(f"Recall (Rappel) - Churn : {recall_lr:.4f}")
print(f"Precision (Précision) - Churn : {precision_lr:.4f}")
print(f"F1-Score : {f1_lr:.4f}")
print(f"AUC : {auc_lr:.4f}")




--- Résultats Logistic Regression ---
Accuracy (Précision globale) : 0.8121
Recall (Rappel) - Churn : 0.1946
Precision (Précision) - Churn : 0.6026
F1-Score : 0.2942
AUC : 0.7563


### KNN

In [6]:

## d) KNN (K-Nearest Neighbors)

# Nous allons utiliser K=5 comme valeur par défaut, une validation croisée pourrait optimiser ce paramètre.
print("\n--- Résultats K-Nearest Neighbors (KNN, K=5) ---")

# Création du Pipeline
knn_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', KNeighborsClassifier(n_neighbors=5))])

# Entraînement
knn_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_knn = knn_pipeline.predict(X_test)
y_proba_knn = knn_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_knn = roc_auc_score(y_test, y_proba_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn) 


print(f"Accuracy (Précision globale) : {accuracy_knn:.4f}")
print(f"Recall (Rappel) - Churn : {recall_knn:.4f}")
print(f"Precision (Précision) - Churn : {precision_knn:.4f}")
print(f"F1-Score : {f1_knn:.4f}")
print(f"AUC : {auc_knn:.4f}")


--- Résultats K-Nearest Neighbors (KNN, K=5) ---
Accuracy (Précision globale) : 0.8396
Recall (Rappel) - Churn : 0.4099
Precision (Précision) - Churn : 0.6644
F1-Score : 0.5070
AUC : 0.7785


### AdaBoost

In [7]:
## e) AdaBoost (Adaptive Boosting)

print("\n--- Résultats AdaBoost ---")

# Création de la Pipeline 
adaboost_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', AdaBoostClassifier(
                                        estimator=DecisionTreeClassifier(max_depth=1), 
                                        n_estimators=100,
                                        random_state=42))])

# Entraînement
adaboost_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_ada = adaboost_pipeline.predict(X_test)
y_proba_ada = adaboost_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_ada = roc_auc_score(y_test, y_proba_ada)
recall_ada = recall_score(y_test, y_pred_ada)
precision_ada = precision_score(y_test, y_pred_ada) 
f1_ada = f1_score(y_test, y_pred_ada)
accuracy_ada = accuracy_score(y_test, y_pred_ada)

print(f"Accuracy (Précision globale) : {accuracy_ada:.4f}")
print(f"Recall (Rappel) - Churn : {recall_ada:.4f}")
print(f"Precision (Précision) - Churn : {precision_ada:.4f}")
print(f"F1-Score : {f1_ada:.4f}")
print(f"AUC : {auc_ada:.4f}")


--- Résultats AdaBoost ---
Accuracy (Précision globale) : 0.8604
Recall (Rappel) - Churn : 0.4493
Precision (Précision) - Churn : 0.7587
F1-Score : 0.5644
AUC : 0.8421


### Gradient Boosting

In [8]:
## f) Gradient Boosting (GBM)

print("\n--- Résultats Gradient Boosting ---")

# Création du Pipeline
# Utilise la technique de gradient boosting pour construire une forêt d'arbres.
gbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', GradientBoostingClassifier(
                                   n_estimators=100, 
                                   learning_rate=0.1, 
                                   max_depth=3, 
                                   random_state=42))])

# Entraînement
gbm_pipeline.fit(X_train, y_train)

# Prédiction
y_pred_gbm = gbm_pipeline.predict(X_test)
y_proba_gbm = gbm_pipeline.predict_proba(X_test)[:, 1]

# Évaluation des métriques clés
auc_gbm = roc_auc_score(y_test, y_proba_gbm)
recall_gbm = recall_score(y_test, y_pred_gbm)
precision_gbm = precision_score(y_test, y_pred_gbm) 
f1_gbm = f1_score(y_test, y_pred_gbm)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)

print(f"Accuracy (Précision globale) : {accuracy_gbm:.4f}")
print(f"Recall (Rappel) - Churn : {recall_gbm:.4f}")
print(f"Precision (Précision) - Churn : {precision_gbm:.4f}")
print(f"F1-Score : {f1_gbm:.4f}")
print(f"AUC : {auc_gbm:.4f}")


--- Résultats Gradient Boosting ---
Accuracy (Précision globale) : 0.8679
Recall (Rappel) - Churn : 0.4576
Precision (Précision) - Churn : 0.8007
F1-Score : 0.5823
AUC : 0.8678


### Classification Tree

In [9]:
model_tree = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('tree', DecisionTreeClassifier())
])

modelfit = model_tree.fit(X_train, y_train)
pY_train = modelfit.predict_proba(X_test)

predxclass = np.argmax(pY_train, axis=1)
E_train = (y_test != predxclass).sum()/len(y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(y_test, model_tree.predict_proba(X_test)[:,1])}")

The accuracy on the training set is %5.2f-> 0.79
Confusion matrix : [[1645  272]
 [ 232  251]]
Recall : 0.5196687370600414
Precision : 0.47992351816443596
F1-score : 0.4990059642147117
ROC-AUC score : 0.6888901849097806


### Bagging

In [10]:
model_bagging = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', BaggingClassifier(n_estimators=10, random_state=0))
])

modelfit = model_bagging.fit(X_train, y_train)
pY_train = modelfit.predict_proba(X_test)

predxclass = np.argmax(pY_train, axis=1)
E_train = (y_test != predxclass).sum()/len(y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(y_test, model_bagging.predict_proba(X_test)[:,1])}")

The accuracy on the training set is %5.2f-> 0.85
Confusion matrix : [[1833   84]
 [ 276  207]]
Recall : 0.42857142857142855
Precision : 0.711340206185567
F1-score : 0.5348837209302325
ROC-AUC score : 0.8140631226975379


### Random Forest

In [11]:
model_random_forest = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', RandomForestClassifier(max_depth=2000, random_state=0))
])

modelfit = model_random_forest.fit(X_train, y_train)
pY_train = modelfit.predict_proba(X_test)
predxclass = np.argmax(pY_train, axis=1)
E_train = (y_test != predxclass).sum()/len(y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(y_test, model_random_forest.predict_proba(X_test)[:,1])}")

The accuracy on the training set is %5.2f-> 0.8658333333333333
Confusion matrix : [[1857   60]
 [ 262  221]]
Recall : 0.4575569358178054
Precision : 0.7864768683274022
F1-score : 0.5785340314136126
ROC-AUC score : 0.844756137468936


### Extra trees

In [12]:
model_extra_trees = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('bagging', ExtraTreesClassifier(max_depth=2000, random_state=0, bootstrap=True, oob_score=True))
])

modelfit = model_extra_trees.fit(X_train, y_train)
pY_train = modelfit.predict_proba(X_test)
predxclass = np.argmax(pY_train, axis=1)
E_train = (y_test != predxclass).sum()/len(y_test)

print("The accuracy on the training set is %5.2f->", 1-E_train)
print(f"Confusion matrix : {sklearn.metrics.confusion_matrix(y_test, predxclass)}")
print(f"Recall : {sklearn.metrics.recall_score(y_test, predxclass)}")
print(f"Precision : {sklearn.metrics.precision_score(y_test, predxclass)}")
print(f"F1-score : {sklearn.metrics.f1_score(y_test, predxclass)}")
print(f"ROC-AUC score : {sklearn.metrics.roc_auc_score(y_test, model_extra_trees.predict_proba(X_test)[:,1])}")

The accuracy on the training set is %5.2f-> 0.8591666666666666
Confusion matrix : [[1863   54]
 [ 284  199]]
Recall : 0.41200828157349895
Precision : 0.7865612648221344
F1-score : 0.5407608695652174
ROC-AUC score : 0.8387593408005738
