# Librairies

In [51]:
!pip install imbalanced-learn



In [2]:
import os
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib import pyplot
import sys 
from sklearn.metrics import *
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler

# Algorithmes
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

#Imbalanced learn
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours, OneSidedSelection
from imblearn.pipeline import Pipeline, make_pipeline

#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Importation des données

In [3]:
os.chdir(r"C:\Users\cornuch\Desktop\SISE\Fouille_donnees_massives\projet_fouille")  

In [4]:
df = pd.read_table("guillaume.txt", sep=";", header=0, decimal=",")

In [5]:
df=df.drop(df[df["CodeDecision"]==4].index, axis= 0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4646772 entries, 0 to 4646772
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   ZIBZIN                    object 
 1   IDAvisAutorisationCheque  int64  
 2   FlagImpaye                int64  
 3   Montant                   float64
 4   DateTransaction           object 
 5   CodeDecision              int64  
 6   VerifianceCPT1            int64  
 7   VerifianceCPT2            int64  
 8   VerifianceCPT3            int64  
 9   D2CB                      int64  
 10  ScoringFP1                float64
 11  ScoringFP2                float64
 12  ScoringFP3                float64
 13  TauxImpNb_RB              float64
 14  TauxImpNB_CPM             float64
 15  EcartNumCheq              int64  
 16  NbrMagasin3J              int64  
 17  DiffDateTr1               float64
 18  DiffDateTr2               float64
 19  DiffDateTr3               float64
 20  CA3TRetMtt              

In [7]:
df.drop(['Heure'], axis = "columns", inplace=True)

In [9]:
DateTransaction_1=df["DateTransaction"].str.split(' ',1)
df['Date']=DateTransaction_1.str[0]
df['Heure']=DateTransaction_1.str[1]
df.drop(['DateTransaction'], axis='columns', inplace=True)

In [11]:
df["Heure_num"] = df['Heure'].str[:2]
df["Heure_num"]=df["Heure_num"].astype("int64")

In [13]:
df.drop(['ZIBZIN', 'IDAvisAutorisationCheque', 'CodeDecision','VerifianceCPT2','VerifianceCPT3','TauxImpNb_RB','DiffDateTr1','DiffDateTr2','CA3TRetMtt'], axis='columns', inplace=True)

**Train et Test**

In [14]:
train=df.loc[df['Date']<='2017-08-31']
test=df.loc[df['Date']>='2017-09-01']

In [15]:
train = train.drop(["Date","Heure"], axis = "columns")
test = test.drop(["Date","Heure"], axis = "columns")

In [35]:
train.shape
#84%

(3899362, 13)

In [36]:
test.shape
#16%

(747410, 13)

In [38]:
train['FlagImpaye']. value_counts(normalize = True)

0    0.993993
1    0.006007
Name: FlagImpaye, dtype: float64

In [39]:
test['FlagImpaye']. value_counts(normalize = True)

0    0.991206
1    0.008794
Name: FlagImpaye, dtype: float64

In [41]:
train_sample=train.sample(10080)

In [42]:
train_sample['FlagImpaye']. value_counts(normalize = True)

0    0.99494
1    0.00506
Name: FlagImpaye, dtype: float64

In [45]:
test_sample=test.sample(1920)

In [46]:
test_sample['FlagImpaye']. value_counts(normalize = True)

0    0.991667
1    0.008333
Name: FlagImpaye, dtype: float64

In [47]:
# Définir la variable cible 
y_train = train["FlagImpaye"]
y_test = test["FlagImpaye"]

In [48]:
# Définir les variables explicatives 
X_train = train.drop(["FlagImpaye"], axis = "columns")
X_test = test.drop(["FlagImpaye"], axis = "columns")

In [49]:
X_train.head()

Unnamed: 0,Montant,VerifianceCPT1,D2CB,ScoringFP1,ScoringFP2,ScoringFP3,TauxImpNB_CPM,EcartNumCheq,NbrMagasin3J,DiffDateTr3,CA3TR,Heure_num
0,20.0,0,551,0.0,0.0,0.0,52.076034,0,1,4.0,0.0,7
1,20.0,0,551,0.0,0.0,0.0,52.076034,1,2,4.0,8.61,7
2,57.64,0,549,0.0,0.0,0.0,52.076034,0,1,4.0,0.0,7
3,54.29,1,267,0.0,0.0,0.0,53.554234,0,1,4.0,0.0,7
4,26.9,0,549,0.003769,8.586333,0.001192,52.076034,1,1,4.0,32.25,8


In [50]:
y_train_sample = train_sample["FlagImpaye"]
y_test_sample = test_sample["FlagImpaye"]

In [51]:
X_train_sample = train_sample.drop(["FlagImpaye"], axis = "columns")
X_test_sample = test_sample.drop(["FlagImpaye"], axis = "columns")

**Centrage et réduction**

In [22]:
def mean_norm(df_input):
    return df_input.apply(lambda x: (x-x.mean())/ x.std(), axis=0)

In [23]:
X_train_scale=mean_norm(X_train)

In [24]:
X_train_scale.head()

Unnamed: 0,Montant,VerifianceCPT1,D2CB,ScoringFP1,ScoringFP2,ScoringFP3,TauxImpNB_CPM,EcartNumCheq,NbrMagasin3J,DiffDateTr3,CA3TR,Heure_num
0,-0.420302,-0.581824,1.017656,-0.08459,-0.239505,-0.275801,0.818043,-0.043817,-0.194165,-0.451325,-0.202861,-2.122399
1,-0.420302,-0.581824,1.017656,-0.08459,-0.239505,-0.275801,0.818043,-0.043812,4.975724,-0.451325,0.038808,-2.122399
2,-0.022127,-0.581824,1.008573,-0.08459,-0.239505,-0.275801,0.818043,-0.043817,-0.194165,-0.451325,-0.202861,-2.122399
3,-0.057565,1.545332,-0.272152,-0.08459,-0.239505,-0.275801,0.906727,-0.043817,-0.194165,-0.451325,-0.202861,-2.122399
4,-0.34731,-0.581824,1.008573,-0.084434,0.637044,-0.273204,0.818043,-0.043812,-0.194165,-0.451325,0.702346,-1.820898


In [64]:
X_train_sample_scale=mean_norm(X_train_sample)
X_test_sample_scale=mean_norm(X_test_sample)

# Définition des critères des performances

In [25]:
def perte(Montant):
    if Montant<=20:
        Montant=Montant*0
    elif 20<Montant<=50:
        Montant=0.2*Montant
    elif 50<Montant<=100:
        Montant=0.3*Montant
    elif 100<Montant<=200:
        Montant=0.5*Montant
    elif Montant>200:
        Montant=0.8*Montant
    return(Montant)

In [26]:
def Calcul_Marge(Montant, yReel, yPred):
    # Création de dfmerge
    dfmerge = pd.concat([Montant, yReel], axis=1)
    dfmerge["Ypred"] = yPred
    
    # Création de la variable Marge
    # Création d'une colonne Marge égale au Montant
    dfmerge["Marge"] = dfmerge["Montant"]
    #Calcul du gain ou de la perte selon la différence entre réel et prédiction
    dfmerge.loc[((dfmerge["FlagImpaye"] == 1) & (dfmerge["Ypred"] == 1)), "Marge"] = 0
    dfmerge.loc[((dfmerge["FlagImpaye"] == 0) & (dfmerge["Ypred"] == 1)), "Marge"] = 0.7 * 0.05 * dfmerge["Montant"]
    dfmerge.loc[((dfmerge["FlagImpaye"] == 1) & (dfmerge["Ypred"] == 0)), "Marge"] = dfmerge["Montant"].apply(lambda x: -perte(x))
    dfmerge.loc[((dfmerge["FlagImpaye"] == 0) & (dfmerge["Ypred"] == 0)), "Marge"] = 0.05 * dfmerge["Montant"]
    
    # Calcul du Marge_totale
    Marge_totale = dfmerge["Marge"].sum()
    
    return Marge_totale

# Modification du seuil d'affectation

In [28]:
from numpy import arange
from numpy import argmax

In [29]:
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
 return (pos_probs >= threshold).astype('int')

In [30]:
def modif_seuil(classifier,X_test_scale,y_test):
    # predict probabilities
    yhat = classifier.predict_proba(X_test_scale)
    # keep probabilities for the positive outcome only
    probs = yhat[:, 1]
    # define thresholds
    thresholds = arange(0.1, 1, 0.1)
    # evaluate each threshold
    scores = [f1_score(y_test, to_labels(probs, t)) for t in thresholds]
    # get best threshold
    ix = argmax(scores)
    s=('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix], scores[ix]))
    return s

In [96]:
def resultats(predicted):
    conf_mat = confusion_matrix(y_true=y_test_sample, y_pred=predicted)
    print('Confusion matrix:\n', conf_mat)
    f1 = f1_score(y_test_sample, y_pred=predicted)
    print('F1_score :\n', f1)
    cr=classification_report(y_test_sample,y_pred=predicted)
    print('Estimateurs :\n', cr)
    auc = roc_auc_score(y_test_sample, predicted)
    print('Auc Score :\n', auc)
    Marge_totale = Calcul_Marge(Montant = X_test_sample["Montant"], yReel = y_test_sample, yPred=predicted)
    print("Marge = " + str(round(Marge_totale, 2)) + " euros")

# 1. Undersampling

In [32]:
%%time
sampler1=OneSidedSelection(random_state=0)
X_res1, y_res1 = sampler1.fit_resample(X_train_scale, y_train)

CPU times: total: 40min 58s
Wall time: 41min 1s


**Le ré-échantillonnage est trop long, nous optons pour un échantillon aléatoire du train et test de départ en vérifiant les proportions des FlagImpaye**

## 1.1 One Sided Selection

In [57]:
%%time

resampling1 = OneSidedSelection(random_state=0)

model1 = LogisticRegression(random_state=0)

clf1 = Pipeline([('One Sided Selection', resampling1), ('Logistic Regression', model1)])

classifier1=clf1.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 922 ms
Wall time: 827 ms


In [81]:
%%time
predicted1=clf1.predict(X_test_sample)

CPU times: total: 0 ns
Wall time: 0 ns


In [97]:
resultats(predicted1)

Confusion matrix:
 [[1242  662]
 [   7    9]]
F1_score :
 0.02620087336244542
Estimateurs :
               precision    recall  f1-score   support

           0       0.99      0.65      0.79      1904
           1       0.01      0.56      0.03        16

    accuracy                           0.65      1920
   macro avg       0.50      0.61      0.41      1920
weighted avg       0.99      0.65      0.78      1920

Auc Score :
 0.6074054621848739
Marge = 5246.5 euros


## 1.2 Edited Nearest Neighbours

In [102]:
%%time

resampling2 = EditedNearestNeighbours()

model1 = LogisticRegression(random_state=0)

clf2 = Pipeline([('ENN', resampling2), ('Logistic Regression', model1)])

classifier2=clf2.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 828 ms
Wall time: 790 ms


In [86]:
predicted2=clf2.predict(X_test_sample)

In [98]:
resultats(predicted2)

Confusion matrix:
 [[1264  640]
 [   8    8]]
F1_score :
 0.024096385542168672
Estimateurs :
               precision    recall  f1-score   support

           0       0.99      0.66      0.80      1904
           1       0.01      0.50      0.02        16

    accuracy                           0.66      1920
   macro avg       0.50      0.58      0.41      1920
weighted avg       0.99      0.66      0.79      1920

Auc Score :
 0.5819327731092437
Marge = 5254.16 euros


# 2. Over and Under sampling

## 2.1 SMOTEENN

In [106]:
%%time

resampling3 = SMOTEENN(random_state=0)

model1 = LogisticRegression(random_state=0)

clf3 = Pipeline([('SMOTEENN', resampling3), ('Logistic Regression', model1)])

classifier3=clf3.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 1.34 s
Wall time: 1.3 s


In [107]:
predicted3=clf3.predict(X_test_sample)

In [108]:
resultats(predicted3)

Confusion matrix:
 [[ 858 1046]
 [   1   15]]
F1_score :
 0.027855153203342614
Estimateurs :
               precision    recall  f1-score   support

           0       1.00      0.45      0.62      1904
           1       0.01      0.94      0.03        16

    accuracy                           0.45      1920
   macro avg       0.51      0.69      0.32      1920
weighted avg       0.99      0.45      0.62      1920

Auc Score :
 0.6940651260504201
Marge = 4963.44 euros


## 2.2 SMOTETomek

In [124]:
%%time

resampling4 = SMOTEENN(random_state=0)

model1 = LogisticRegression(random_state=0)

clf4 = Pipeline([('SMOTETomek', resampling4), ('Logistic Regression', model1)])

classifier4=clf4.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 1.45 s
Wall time: 1.36 s


In [125]:
predicted4=clf4.predict(X_test_sample)

In [126]:
resultats(predicted4)

Confusion matrix:
 [[ 858 1046]
 [   1   15]]
F1_score :
 0.027855153203342614
Estimateurs :
               precision    recall  f1-score   support

           0       1.00      0.45      0.62      1904
           1       0.01      0.94      0.03        16

    accuracy                           0.45      1920
   macro avg       0.51      0.69      0.32      1920
weighted avg       0.99      0.45      0.62      1920

Auc Score :
 0.6940651260504201
Marge = 4963.44 euros


# 3. Point de comparaison avec les méthodes testées en oversampling sur le même sampling data set

# 3.1 SMOTE

In [118]:
%%time

resampling5 = SMOTE(random_state=0)

model1 = LogisticRegression(random_state=0)

clf5 = Pipeline([('SMOTE', resampling5), ('Logistic Regression', model1)])

classifier5=clf5.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 141 ms
Wall time: 46.9 ms


In [119]:
predicted5=clf5.predict(X_test_sample)

In [120]:
resultats(predicted5)

Confusion matrix:
 [[ 875 1029]
 [   1   15]]
F1_score :
 0.028301886792452827
Estimateurs :
               precision    recall  f1-score   support

           0       1.00      0.46      0.63      1904
           1       0.01      0.94      0.03        16

    accuracy                           0.46      1920
   macro avg       0.51      0.70      0.33      1920
weighted avg       0.99      0.46      0.62      1920

Auc Score :
 0.6985294117647058
Marge = 4979.73 euros


# 3.2 Borderline SMOTE

In [121]:
%%time

resampling6 = BorderlineSMOTE(random_state=0)

model1 = LogisticRegression(random_state=0)

clf6 = Pipeline([('BorderlineSMOTE', resampling6), ('Logistic Regression', model1)])

classifier6=clf6.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 359 ms
Wall time: 132 ms


In [122]:
predicted6=clf6.predict(X_test_sample)

In [123]:
resultats(predicted6)

Confusion matrix:
 [[1556  348]
 [  11    5]]
F1_score :
 0.027100271002710025
Estimateurs :
               precision    recall  f1-score   support

           0       0.99      0.82      0.90      1904
           1       0.01      0.31      0.03        16

    accuracy                           0.81      1920
   macro avg       0.50      0.56      0.46      1920
weighted avg       0.98      0.81      0.89      1920

Auc Score :
 0.5648634453781513
Marge = 5376.7 euros


## 3.3 ADASYN

In [127]:
%%time

resampling7 = ADASYN(random_state=0)

model1 = LogisticRegression(random_state=0)

clf7 = Pipeline([('ADASYN', resampling7), ('Logistic Regression', model1)])

classifier7=clf7.fit(X_train_sample_scale, y_train_sample)

CPU times: total: 188 ms
Wall time: 84.7 ms


In [128]:
predicted7=clf7.predict(X_test_sample)

In [129]:
resultats(predicted7)

Confusion matrix:
 [[ 885 1019]
 [   1   15]]
F1_score :
 0.02857142857142857
Estimateurs :
               precision    recall  f1-score   support

           0       1.00      0.46      0.63      1904
           1       0.01      0.94      0.03        16

    accuracy                           0.47      1920
   macro avg       0.51      0.70      0.33      1920
weighted avg       0.99      0.47      0.63      1920

Auc Score :
 0.7011554621848739
Marge = 4993.58 euros


**Les résulats en under et over-and-under sampling ne semble pas apporter pas améliore de manière significative, ne terme de performances, les résultats obtenus avec BorderlineSMOTE**