# Taux de désabonnement des clients d'une entreprise

## 1. Récupération et préparation des données

In [46]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset from CSV.
# The file location can be overridden with the CLIENTS_CSV environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original local path is used unchanged.
chemin_csv = os.environ.get(
    "CLIENTS_CSV",
    "C:/Users/utilisateur/Documents/GitHub/clients/data.csv",
)
clients = pd.read_csv(chemin_csv, sep=',')

clients.head()

Unnamed: 0,num_ligne,ID_Client,Nom,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Nous supprimons les colonnes `num_ligne` et `ID_Client`, qui ne présentent pas d'intérêt pour l'analyse.

In [3]:
# Remove the row counter and the customer identifier in a single call.
# Rebinding `clients` (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
clients = clients.drop(columns=['num_ligne', 'ID_Client'], errors='ignore')
clients.head()

Unnamed: 0,Nom,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Exited
0,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Count the missing values in each column
clients.isnull().sum()

Nom               0
Score_Credit      0
Pays              0
Sex               0
Age               0
Tenure            0
Balance           0
Num_Produit       0
il_a_CrCard       0
Membre_actif      0
Salaire_estime    0
Exited            0
dtype: int64

In [5]:
# Inspect the qualitative (object-dtype) columns

df_cat = clients.select_dtypes(include=['object'])
print(df_cat)

            Nom     Pays     Sex
0      Hargrave   France  Female
1          Hill    Spain  Female
2          Onio   France  Female
3          Boni   France  Female
4      Mitchell    Spain  Female
...         ...      ...     ...
9995   Obijiaku   France    Male
9996  Johnstone   France    Male
9997        Liu   France  Female
9998  Sabbatini  Germany    Male
9999     Walker   France  Female

[10000 rows x 3 columns]


In [6]:
# Print the number of distinct values of every qualitative column
for col in df_cat.columns:
    print(col, ' = ', len(pd.unique(clients[col])))

Nom  =  2932
Pays  =  3
Sex  =  2


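Avec 2 932 valeurs distinctes, la colonne `Nom` a une cardinalité trop élevée pour être encodée en variables muettes et ne présente pas vraiment d'intérêt pour la prédiction : nous la retirons.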


In [7]:
# Remove the customer name column.
# Rebinding `clients` with errors='ignore' (instead of an in-place drop) lets
# this cell be executed again without raising KeyError.
clients = clients.drop(columns=['Nom'], errors='ignore')
clients.head()

Unnamed: 0,Score_Credit,Pays,Sex,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# Create the dummy (one-hot) variables for Sex and Pays and append them to
# the frame.
# dtype=int pins 0/1 integer indicators: pandas >= 2.0 returns booleans by
# default, which would no longer match the 0/1 values displayed below.
# NOTE(review): every level is kept, so Female/Male (and the three country
# columns) are perfectly collinear; consider drop_first=True if that matters
# for the linear models.
clients = pd.concat([clients,
                     pd.get_dummies(clients.Sex, dtype=int),
                     pd.get_dummies(clients.Pays, dtype=int)], axis=1)

# Remove the original categorical columns now that they are encoded
clients = clients.drop(columns=['Pays', 'Sex'])
clients.head()

Unnamed: 0,Score_Credit,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Exited,Female,Male,France,Germany,Spain
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,1,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0,0,0,1


In [19]:
# Split the frame into target (Y) and predictors (X)

# Target: the `Exited` column as a NumPy array
Y = np.array(clients['Exited'])
print(Y.shape)

# Predictors: every column except `Exited`
X = clients.drop(columns='Exited')
print(X.shape)

(10000,)
(10000, 13)


Maintenant que les données sont prêtes, nous allons les standardiser

In [10]:
# Print the predictors before scaling (pandas abbreviates the long frame).
print(X)

      Score_Credit  Age  Tenure    Balance  Num_Produit  il_a_CrCard  \
0              619   42       2       0.00            1            1   
1              608   41       1   83807.86            1            0   
2              502   42       8  159660.80            3            1   
3              699   39       1       0.00            2            0   
4              850   43       2  125510.82            1            1   
...            ...  ...     ...        ...          ...          ...   
9995           771   39       5       0.00            2            1   
9996           516   35      10   57369.61            1            1   
9997           709   36       7       0.00            1            0   
9998           772   42       3   75075.31            2            1   
9999           792   28       4  130142.79            1            1   

      Membre_actif  Salaire_estime  Female  Male  France  Germany  Spain  
0                1       101348.88       1     0       1    

In [11]:
# Remember the column labels: the scaler below returns a bare array, and the
# labels are needed to rebuild a DataFrame afterwards.
cols = X.columns

In [12]:

from sklearn.preprocessing import StandardScaler

# Standardise every feature: fit the scaler on X, then replace X with the
# scaled array.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed later in the notebook, so test rows contribute
# to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

print(X)

[[-0.32622142  0.29351742 -1.04175968 ...  0.99720391 -0.57873591
  -0.57380915]
 [-0.44003595  0.19816383 -1.38753759 ... -1.00280393 -0.57873591
   1.74273971]
 [-1.53679418  0.29351742  1.03290776 ...  0.99720391 -0.57873591
  -0.57380915]
 ...
 [ 0.60498839 -0.27860412  0.68712986 ...  0.99720391 -0.57873591
  -0.57380915]
 [ 1.25683526  0.29351742 -0.69598177 ... -1.00280393  1.72790383
  -0.57380915]
 [ 1.46377078 -1.04143285 -0.35020386 ...  0.99720391 -0.57873591
  -0.57380915]]


In [15]:
# Rebuild a labelled DataFrame from the scaled array.
# `cols` is passed as-is: wrapping it in a list (columns=[cols]) makes pandas
# build a one-level MultiIndex, so every label becomes a tuple such as
# ('Age',) instead of the plain string 'Age'.
X = pd.DataFrame(X, columns=cols)
X.head()

Unnamed: 0,Score_Credit,Age,Tenure,Balance,Num_Produit,il_a_CrCard,Membre_actif,Salaire_estime,Female,Male,France,Germany,Spain
0,-0.326221,0.293517,-1.04176,-1.225848,-0.911583,0.646092,0.970243,0.021886,1.095988,-1.095988,0.997204,-0.578736,-0.573809
1,-0.440036,0.198164,-1.387538,0.11735,-0.911583,-1.547768,0.970243,0.216534,1.095988,-1.095988,-1.002804,-0.578736,1.74274
2,-1.536794,0.293517,1.032908,1.333053,2.527057,0.646092,-1.03067,0.240687,1.095988,-1.095988,0.997204,-0.578736,-0.573809
3,0.501521,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.03067,-0.108918,1.095988,-1.095988,0.997204,-0.578736,-0.573809
4,2.063884,0.388871,-1.04176,0.785728,-0.911583,0.646092,0.970243,-0.365276,1.095988,-1.095988,-1.002804,-0.578736,1.74274


Les données sont maintenant prêtes à être utilisées. Nous pouvons commencer l'analyse.

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 10 % of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): the saved execution counts show the X/Y extraction cell
# (In [19]) ran after the scaling cells (In [12], In [15]) and right before
# this one (In [20]), so the recorded outputs of the models below were
# probably computed on unscaled features. Restart the kernel and run all
# cells to refresh them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=9,
)
print(X_train.shape[0], X_test.shape[0])

9000 1000


## Essai de prédiction avec une régression logistique

In [26]:
from sklearn.linear_model import LogisticRegression
import time

#### Model training, timed with wall-clock timestamps
temps_debut = time.time()
model_logistic = LogisticRegression(max_iter=1000, random_state=9)
model_logistic.fit(X_train, y_train)
temps_fin = time.time()
temps_ecoule = temps_fin - temps_debut

print("temps écoulé : ", f"{temps_ecoule:.2f}")

temps écoulé :  0.15


In [29]:
# Classification rate (accuracy) of the logistic regression on the test set

from sklearn.metrics import accuracy_score
y_pred_log = model_logistic.predict(X_test)

print("taux de classification : ", accuracy_score(y_true=y_test, y_pred=y_pred_log))

taux de classification :  0.787


## Essai de prédiction avec un Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

#### Model training (100 trees, fixed seed), timed with wall-clock timestamps
temps_debut = time.time()
model_forest = RandomForestClassifier(n_estimators=100, random_state=9).fit(X_train, y_train)
temps_fin = time.time()
temps_ecoule = temps_fin - temps_debut

print("temps écoulé : ", f"{temps_ecoule:.2f}")

temps écoulé :  1.07


In [32]:
# Classification rate (accuracy) of the random forest on the test set

y_pred_rforest = model_forest.predict(X_test)

print("taux de classification : ", accuracy_score(y_true=y_test, y_pred=y_pred_rforest))

taux de classification :  0.865


## Essai de prédiction avec un SGD classifier

In [38]:
from sklearn.linear_model import SGDClassifier

temps_debut = time.time()
# Linear classifier trained by stochastic gradient descent (hinge loss, L2
# penalty). random_state is fixed like for the other models of the notebook:
# SGD shuffles the training data, so without a seed every run yields a
# different model and a different score.
model_sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=500, random_state=9)

model_sgd.fit(X_train, y_train)

temps_fin = time.time()
temps_ecoule = temps_fin - temps_debut
print("temps écoulé : ", format(temps_ecoule, '.2f'))

temps écoulé :  0.25


In [37]:
# Classification rate (accuracy) of the SGD classifier on the test set

y_pred_sgd = model_sgd.predict(X_test)

print("taux de classification : ", accuracy_score(y_true=y_test, y_pred=y_pred_sgd))

taux de classification :  0.687


## Essai avec un SVM

In [41]:
# Search for the best parameters (abandoned), then fit a default SVC
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Grid search kept for reference only: the computation took too long and was
# stopped after 15 minutes.
#param_grid={'C': [1, 10, 25, 50, 100, 1000], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': [1e-3, 1e-4, 0.1]}
#model_grid = GridSearchCV(svm.SVC(), param_grid, cv =10)

temps_debut = time.time()

# SVC with its default hyper-parameters
model_svm = svm.SVC().fit(X_train, y_train)

temps_fin = time.time()
temps_ecoule = temps_fin - temps_debut
print("temps écoulé : ", f"{temps_ecoule:.2f}")

temps écoulé :  1.64


In [45]:
# Classification rate (accuracy) of the SVM on the test set

y_pred_svm = model_svm.predict(X_test)

print("taux de classification : ", accuracy_score(y_test, y_pred_svm))

# Compare the first 40 predictions with the true labels; accuracy alone can
# hide a model that only predicts one class.
print(y_pred_svm[0:40])
print(y_test[0:40])

taux de classification :  0.795
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]
[0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0]


## Essai avec un réseau de neurones

In [48]:
# All Keras symbols come from tensorflow.keras: mixing the standalone `keras`
# package with `tensorflow.keras` can combine incompatible layer/model
# classes depending on the installed versions.
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.utils import to_categorical


# Sequential model definition
model_seq = models.Sequential()

# Conv1D needs 3-D input (samples, steps, channels): each sample is declared
# as one channel over its features, so the model is built and summary() can
# report output shapes and parameter counts.
# NOTE(review): the training data must be reshaped to (n_samples, n_features, 1)
# before fitting - confirm against the cells that follow.
model_seq.add(layers.Input(shape=(X_train.shape[1], 1)))
model_seq.add(Conv1D(64, kernel_size=3, activation='relu'))
model_seq.add(Conv1D(32, kernel_size=3, activation='relu'))
model_seq.add(Flatten())
# Output layer of size 2 to match Y (presumably one-hot encoded with
# to_categorical - TODO confirm)
model_seq.add(Dense(2, activation='softmax'))

model_seq.summary()