# Taux de désabonnement des clients d'une entreprise

Importer les modules nécessaires :

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import statistics

Importer et lire les données :

In [2]:
data = pd.read_csv("data.csv")
data

Unnamed: 0,num_ligne,ID_Client,Nom,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


Vérifier les possibles valeurs manquantes :

In [3]:
display(data.isna().sum())

num_ligne         0
ID_Client         0
Nom               0
Score_Credit      0
Pays              0
Sex               0
Age               0
Tenure            0
Balance           0
Num_Produit       0
il_a_CrCard       0
Membre_actif      0
Salaire_estime    0
Exited            0
dtype: int64

Définir X et y, avec y la colonne target confirmant ou non le désabonnement.

In [4]:
X = data.drop(columns=["num_ligne", "Exited"])
X

Unnamed: 0,ID_Client,Nom,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime
0,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [5]:
y = np.array(data["Exited"])
y

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

Appliquer LabelEncoder à X pour convertir les valeurs catégoriques :

In [6]:
le = LabelEncoder()
X['Nom'] = le.fit_transform(X['Nom'])
X['Pays'] = le.fit_transform(X['Pays'])
X['Sex'] = le.fit_transform(X['Sex'])
X

Unnamed: 0,ID_Client,Nom,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime
0,15634602,1115,619,0,0,42,2,0.00,1,1,1,101348.88
1,15647311,1177,608,2,0,41,1,83807.86,1,0,1,112542.58
2,15619304,2040,502,0,0,42,8,159660.80,3,1,0,113931.57
3,15701354,289,699,0,0,39,1,0.00,2,0,0,93826.63
4,15737888,1822,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,1999,771,0,1,39,5,0.00,2,1,0,96270.64
9996,15569892,1336,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,15584532,1570,709,0,0,36,7,0.00,1,0,1,42085.58
9998,15682355,2345,772,1,1,42,3,75075.31,2,1,0,92888.52


Normaliser les valeurs de X avec StandardScaler :

In [7]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
display(X)

array([[-0.78321342, -0.46418322, -0.32622142, ...,  0.64609167,
         0.97024255,  0.02188649],
       [-0.60653412, -0.3909112 , -0.44003595, ..., -1.54776799,
         0.97024255,  0.21653375],
       [-0.99588476,  0.62898807, -1.53679418, ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [-1.47928179,  0.07353887,  0.60498839, ..., -1.54776799,
         0.97024255, -1.00864308],
       [-0.11935577,  0.98943914,  1.25683526, ...,  0.64609167,
        -1.03067011, -0.12523071],
       [-0.87055909,  1.4692527 ,  1.46377078, ...,  0.64609167,
        -1.03067011, -1.07636976]])

Diviser les données entre entraînement et test :

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [9]:
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8000, 12)

(2000, 12)

(8000,)

(2000,)

## RandomForest

Entraîner RandomForest sur les données d'entraînement :

In [10]:
randomforest = RandomForestClassifier(n_estimators=100)

randomforest.fit(X_train, y_train)

RandomForestClassifier()

Tester le modèle sur les données de test :

In [11]:
y_pred = randomforest.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [12]:
accuracy_score(y_test, y_pred)

0.8655

## RandomForest avec PCA

In [13]:
pca = PCA(n_components=0.95)

In [14]:
X_pca = pca.fit_transform(X_train)

In [15]:
randomforest_pca = RandomForestClassifier(n_estimators=100)

randomforest_pca.fit(X_pca, y_train)

RandomForestClassifier()

In [16]:
X_test_pca = pca.transform(X_test)

In [17]:
y_pred_pca = randomforest_pca.predict(X_test_pca)
y_pred_pca

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
accuracy_score(y_test, y_pred_pca)

0.855

## LogisticRegression

In [19]:
logisticregression = LogisticRegression(max_iter=1000)

logisticregression.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [20]:
y_pred2 = logisticregression.predict(X_test)
y_pred2

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
accuracy_score(y_test, y_pred2)

0.8055

## LogisticRegression avec PCA

In [22]:
logisticregression_pca = LogisticRegression(max_iter=1000)

logisticregression_pca.fit(X_pca, y_train)

LogisticRegression(max_iter=1000)

In [23]:
y_pred_pca2 = logisticregression_pca.predict(X_test_pca)
y_pred_pca2

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
accuracy_score(y_test, y_pred_pca2)

0.8055

## Mode des quatre méthodes

In [25]:
y_pred_mode = []
for i in range(len(y_test)):
    y_pred_mode.append(statistics.mode([y_pred[i], y_pred_pca[i], y_pred2[i], y_pred_pca2[i]]))

In [26]:
accuracy_score(y_test, y_pred_mode)

0.8565

## Conclusion

RandomForest sans PCA reste légèrement plus efficace pour faire des prédictions sur la base de données, avec une précision d'environ 0.86.