In [1]:
import os
# Move to the directory that holds the project's CSV files.
# The location can be overridden with the FDM_DATA_DIR environment variable so the
# notebook also runs on other machines; the original path stays the default.
os.chdir(os.environ.get("FDM_DATA_DIR", r"C:\Users\pauli\Documents\M2\fouille de données\projet\fichiers"))


In [None]:
import mlflow
import os
# Silence the Git-related messages.
os.environ.update(GIT_PYTHON_REFRESH="quiet")
# Define the experiment: tracking server first, then the experiment name.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Fouilles de Données Massives")

In [3]:
import random
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import StratifiedKFold

#lib de pre-process
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

#lib de metriques
from sklearn.metrics import precision_recall_curve, confusion_matrix, f1_score, make_scorer



In [4]:
# Cleaned dataset, plus the row labels of the train and test partitions
# (each index file keeps them in a column named "0").
df = pd.read_csv('df_clean.csv', sep=",")
index_train = pd.read_csv('index_train.csv', sep=",")["0"].values.tolist()
index_test = pd.read_csv('index_test.csv', sep=",")["0"].values.tolist()

In [5]:
# Keep fewer rows: draw a random 3 % of each partition
# (open question from the author: would under-sampling alone be enough?).
# NOTE: the variable names say "30" but the fraction actually drawn is 0.03.
SAMPLE_FRACTION = 0.03
# Dedicated seeded generator: the draw is reproducible across kernel restarts
# and does not disturb the global `random` state that is seeded later on.
_sampler = random.Random(1)
index_train30 = _sampler.sample(index_train, round(len(index_train) * SAMPLE_FRACTION))
index_test30 = _sampler.sample(index_test, round(len(index_test) * SAMPLE_FRACTION))


In [6]:
# Target column and feature matrix (every column except the target).
y = df["FlagImpaye"]
X = df.drop(columns='FlagImpaye')

# Full train / test partitions, selected by row label.
Xtrain, ytrain = X.loc[index_train, :], y.loc[index_train]
Xtest, ytest = X.loc[index_test, :], y.loc[index_test]

# Same partitions restricted to the randomly drawn subsamples.
Xtrain30, ytrain30 = X.loc[index_train30, :], y.loc[index_train30]
Xtest30, ytest30 = X.loc[index_test30, :], y.loc[index_test30]


## Retravailler le jeu de données d'entraînement

In [None]:
from collections import Counter

# Class counts before any resampling, for the full and the subsampled training sets.
print('Original dataset shape %s' % Counter(ytrain))
print('0.03 dataset shape %s' % Counter(ytrain30))

# Under-sampling of the full training set with Tomek links.
tl = TomekLinks()
Xtrain_tl, ytrain_tl = tl.fit_resample(X=Xtrain, y=ytrain)
print('Tomeklinks resampled original dataset shape %s' % Counter(ytrain_tl))

# Over-sampling of the subsampled training set with SMOTE.
sm = SMOTE(k_neighbors=5, sampling_strategy=0.5, random_state=1)
Xtrain_smote, ytrain_smote = sm.fit_resample(X=Xtrain30, y=ytrain30)
print('Smote resampled 0.03 dataset shape %s' % Counter(ytrain_smote))

# both combined... (not done yet)




Original dataset shape Counter({0: 91491, 1: 448})


In [7]:
# From this cell on, the generic names point at the subsampled partitions.
Xtrain, ytrain = Xtrain30, ytrain30
Xtest, ytest = Xtest30, ytest30

In [8]:
# Normalise the features: fit on the training set, apply to train and test.
# NOTE(review): scikit-learn's Normalizer rescales each row (sample), not each
# column - confirm this is intended rather than a per-feature scaler.
normalizer = Normalizer()
Xtrain = normalizer.fit_transform(Xtrain)
Xtest = normalizer.transform(Xtest)

## Boucle de recherche du meilleur modèle avec les meilleurs hyperparamètres

In [16]:
import warnings
# Silence every warning raised from here on (applies to the whole kernel).
warnings.filterwarnings("ignore")

seed = 1

# Hyper-parameter values to test, one grid per entry of `modeles_list` (same order).
# Keys carry the "estimator__" prefix because select_model wraps each model in an
# RFECV before handing it to GridSearchCV.
# Example of a wider range to explore: np.arange(start = 5, stop = 250, step = 50)
params_modeles = [
    # Grid for the decision tree.
    {'estimator__max_features': [5],
     'estimator__min_samples_split': [50],
     'estimator__max_depth': [30],
     'estimator__min_samples_leaf': [10]},
    # Grid for the logistic regression.
    # `None` (not the string "none") disables the penalty: the string form is
    # deprecated since scikit-learn 1.2 and rejected by later releases.
    {"estimator__solver": ["newton-cg"],
     "estimator__penalty": [None],
     "estimator__max_iter": [10000]}
]
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Candidate models, in the same order as their grids in `params_modeles`.
modeles_list = [
    DecisionTreeClassifier(),
    LogisticRegression()
]

# Seed both global generators (NumPy and the standard library).
np.random.seed(seed)
random.seed(seed)


def select_model(modeles, parameters, Xtrain, ytrain, Xtest, ytest, feature_names=None):
    """Grid-search every candidate model and collect its indicators.

    Each model is wrapped in an RFECV (recursive feature elimination with
    5-fold cross-validation) and tuned with a 5-fold GridSearchCV; both use the
    macro-averaged F1 score. The tuned model is then evaluated on the train and
    test sets.

    Parameters
    ----------
    modeles : sequence of estimators
        Candidate models.
    parameters : sequence of dict
        One parameter grid per model, in the same order as ``modeles``. Keys
        must use the ``estimator__`` prefix because of the RFECV wrapper.
    Xtrain, ytrain : training features and labels.
    Xtest, ytest : test features and labels.
    feature_names : sequence of str, optional
        Names of the columns of ``Xtrain``. When omitted, ``Xtrain.columns`` is
        used if it exists; otherwise the notebook-level ``X.columns`` is used,
        as the original implementation did.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by its position in ``modeles``, with columns:
        ``best`` (fitted RFECV), ``score`` (best cross-validated score),
        ``ft_imp`` (names of the selected features), ``ftest`` / ``ftrain``
        (F-measure of class 1, rounded to 4 decimals, 0 when undefined),
        ``lr_auc`` (area under the test precision-recall curve) and
        ``time_train`` (fit duration in seconds).
    """
    # Imported once here instead of on every loop iteration; `auc` was missing
    # altogether and made the original function fail with a NameError.
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import auc
    from sklearn.model_selection import GridSearchCV

    if feature_names is None:
        feature_names = getattr(Xtrain, "columns", None)
        if feature_names is None:
            # Xtrain is a bare array (e.g. after normalisation): fall back to
            # the notebook-level feature frame.
            feature_names = X.columns
    feature_names = pd.Index(feature_names)

    columns = ['best', 'score', 'ft_imp', 'ftest', 'ftrain', 'lr_auc', 'time_train']
    f1 = make_scorer(f1_score, average='macro')
    records = []

    for i, modele in enumerate(modeles):
        # Keep only the useful features.
        # If RFECV is ever removed, drop the "estimator__" prefix from the grids
        # and pass `modele` directly to GridSearchCV.
        sel = RFECV(estimator=modele, step=1, cv=5, scoring=f1)
        # Test every parameter combination with cross-validation.
        model = GridSearchCV(estimator=sel,
                             param_grid=parameters[i],
                             scoring=f1,
                             verbose=False,
                             cv=5)
        start_time = time.time()
        model.fit(Xtrain, ytrain)
        full_time = time.time() - start_time

        rankTrain = model.predict(Xtrain)
        rankTest = model.predict(Xtest)

        # F-measure of the positive class; zero_division=0 avoids NaN when the
        # model predicts no positive at all.
        ftrain = round(f1_score(ytrain, rankTrain, zero_division=0), 4)
        ftest = round(f1_score(ytest, rankTest, zero_division=0), 4)

        # Precision-recall AUC on the TEST set. The original paired `ytest` with
        # the train predictions, which have a different length. Continuous scores
        # are preferred; hard predictions are the last resort.
        if hasattr(model, "predict_proba"):
            test_scores = model.predict_proba(Xtest)[:, 1]
        elif hasattr(model, "decision_function"):
            test_scores = model.decision_function(Xtest)
        else:
            test_scores = rankTest
        lr_precision, lr_recall, _ = precision_recall_curve(ytest, test_scores)
        lr_auc = auc(lr_recall, lr_precision)

        # Names of the features kept by the recursive elimination.
        features = list(feature_names[model.best_estimator_.support_])

        records.append({
            'best': model.best_estimator_,
            'score': model.best_score_,
            'ft_imp': features,
            'ftest': ftest,
            'ftrain': ftrain,
            'lr_auc': lr_auc,
            'time_train': full_time,
        })

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

# Run the search for every candidate model and keep the table of indicators.
df_ind = select_model(
    modeles=modeles_list,
    parameters=params_modeles,
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
)


KeyboardInterrupt: 

In [None]:
# Display the indicator table returned by select_model (one row per candidate model).
df_ind

In [None]:
# Apply and save the best model.
import pickle  # was never imported: the original cell failed with a NameError

# Row label of the candidate with the best F-measure on the test set.
# NOTE(review): selecting on the test score makes `ftest` an optimistic estimate;
# consider selecting on the cross-validated `score` column instead.
indice = df_ind['ftest'].idxmax()
# Keep the model with the best F-measure (single .loc lookup, no chained indexing).
best_model = df_ind.loc[indice, "best"]
# NOTE(review): the search ran on Normalizer-transformed subsamples, whereas this
# refit uses the raw, full X / y (test rows included) - confirm this is intended.
best_model.fit(X, y)
# The context manager guarantees the file is flushed and closed.
with open("./model.pickle.dat", "wb") as model_file:
    pickle.dump(best_model, model_file)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Gradient tree boosting (scikit-learn's implementation, listed by the author under "XGBoost").
param = {"loss": "log_loss", "learning_rate": 0.1, "n_estimators": 100, "min_samples_split": 2}
gbc = GradientBoostingClassifier(**param)
gbc.fit(Xtrain, ytrain)

# Nearest neighbours.
knc = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=2)
knc.fit(Xtrain, ytrain)

# Decision tree.
# `param` is only recorded here; the estimator below is built with its defaults.
param = {
    "criterion": "gini",
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
}
dtc = DecisionTreeClassifier().fit(Xtrain, ytrain)

# Random forest.
# Same remark: `param` is not passed to the estimator.
param = {
    "n_estimators": 100,
    "criterion": "gini",
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "oob_score": False,
    "warm_start": False,
    "max_samples": None,
}
rfc = RandomForestClassifier().fit(Xtrain, ytrain)

from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVM, with the inputs standardised first.
param = {"kernel": "rbf", "degree": 3}
# The kernel can be changed here.
svc = make_pipeline(StandardScaler(), SVC(**param))
svc.fit(Xtrain, ytrain)

# K-means.
kmeans = KMeans(n_init="auto", n_clusters=2, random_state=0)
kmeans.fit(Xtrain)
# Not convincing: the labels y are not even taken into account...

# LOF (local outlier factor).
lof = LocalOutlierFactor(n_neighbors=2)
lof.fit(Xtrain)

# Auto-encoders
# what are they ??? (not investigated yet)

# Neural network.
mlpc = MLPClassifier(max_iter=100, random_state=1)
mlpc.fit(Xtrain, ytrain)

# LDA (linear discriminant analysis).
adl = LinearDiscriminantAnalysis()
adl.fit(Xtrain, ytrain)

# QDA (quadratic discriminant analysis).
# The old `sklearn.qda` module no longer exists; the estimator now lives in
# sklearn.discriminant_analysis. The `QDA` alias keeps the name the cell defined before.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
qda = QDA()
qda.fit(Xtrain, ytrain)

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Cost-sensitive learning (idea only, nothing implemented in this cell):
# - weight the errors;
# - change the weight of each class in the surrogate of the error rate;
# - give a weight to each entry of the confusion matrix (cost at the scale of each class).

# Ensemble methods
# Bagging: `param` is only recorded and the classifier is created but not fitted here.
param = {"max_features": 0.5, "max_samples": 0.5}
bagging = BaggingClassifier(KNeighborsClassifier())

# Boosting: AdaBoost scored with 5-fold cross-validation.
adab = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(estimator=adab, X=Xtrain, y=ytrain, cv=5)

# Logistic regression without penalty.
from sklearn.linear_model import LogisticRegression
# `None` (not the string "none") disables the penalty: the string form is
# deprecated since scikit-learn 1.2 and rejected by later releases.
param = {"solver": "saga", "penalty": None, "max_iter": 100}
logit = LogisticRegression(**param, random_state=1)
logit.fit(Xtrain, ytrain)



