In [75]:
import os
import random
import numpy as np
import pandas as pd
import time
import zipfile
import pickle
import mlflow.sklearn


from sklearn.model_selection import StratifiedKFold
#lib de metriques
from sklearn.metrics import precision_recall_curve, confusion_matrix, f1_score, make_scorer, auc


# Commandes pour telecharger les données depuis kaggle

In [76]:
kaggle_username = 'paulineattal'
kaggle_key = '842753570fc0411f2cc69d2e7a242157'
path = 'C:/Users/pauli/Documents/M2/fouille_de_donnees/projet/fichiers/kaggle'
os.chdir(path)

In [77]:
!kaggle config set -n username -v $kaggle_username
!kaggle config set -n key -v $kaggle_key
!kaggle datasets download -d axeltrc/fraudes-bancaires-smotetomek10 -p $path

- username is now set to: paulineattal
- key is now set to: 842753570fc0411f2cc69d2e7a242157
fraudes-bancaires-smotetomek10.zip: Skipping, found more recently modified local copy (use --force to force download)


In [78]:

with zipfile.ZipFile('fraudes-bancaires-smotetomek10.zip', 'r') as zip_ref :
    zip_ref.extractall()
    for file in os.listdir():
        Xtrain = pd.read_csv('Xtrain_SMOTETomek.csv', sep=",")
        ytrain = pd.read_csv('ytrain_SMOTETomek.csv', sep=",")
        ytest = pd.read_csv('ytest.csv', sep=",")
        Xtest = pd.read_csv('Xtest.csv', sep=",")

In [None]:
index_train = Xtrain.index.to_list()
print(len(index_train))
random_index = random.sample(index_train, round(len(index_train)*0.1))
print(len(random_index))
Xtrain = Xtrain.loc[random_index,]
ytrain = ytrain.loc[random_index,]

In [79]:
ytest.reset_index(drop=True, inplace=True)
Xtest.reset_index(drop=True, inplace=True)
ytrain = ytrain.reset_index(drop=True)
Xtrain = Xtrain.reset_index(drop=True)

## Boucle de recherche sur meilleur modele avec les meilleurs hyperparametres

In [None]:
!mlflow ui

In [24]:
import warnings
warnings.filterwarnings("ignore")

import mlflow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Fouilles de Données Massives")

# On stocke ci-dessous les valeurs des hyper-paramètres que l'on souhaite tester
#np.arange(start = 5, stop = 250, step = 50)
params_modeles = [
{"loss":["log_loss"],
 "learning_rate":[0.09],
 "n_estimators":[200],
 "min_samples_split":[2]}
]

from sklearn.ensemble import GradientBoostingClassifier 
modeles_list = [
    GradientBoostingClassifier()
]


def select_model(modeles, parameters, Xtrain, ytrain, Xtest, ytest) :
    df = pd.DataFrame(columns = ['best','score', 'ftest', 'ftrain','rappel', 'precision', 'lr_auc', 'lr_precision', 'lr_recall', 'time_train'])    
    
    for i, modele in enumerate(modeles):
        modele_name = str(modele)
        
        #start run mlflow
        my_run = mlflow.start_run(run_name = modele_name)
        
        f1 = make_scorer(f1_score , average='macro')
        #test all models with CV
        from sklearn.model_selection import GridSearchCV
        model = GridSearchCV(estimator=modele,
                            param_grid=parameters[i],
                            scoring = f1,
                            verbose = False,
                            cv = 3)
        start_time = time.time()
        model.fit(Xtrain, ytrain)
        full_time = time.time() - start_time
        
        rankTrain = model.predict(Xtrain)
        rankTest = model.predict(Xtest)

        #calcul metrics
        #calcul de la f-mesure pour mesurer la performance du modele 
        ctrain = confusion_matrix(ytrain, rankTrain)
        rappel = round(ctrain[1,1]/(ctrain[1,1]+ctrain[1,0]),4)
        precision = round(ctrain[1,1]/(ctrain[1,1]+ctrain[0,1]),4)
        ftrain = round(2*ctrain[1,1]/(2*ctrain[1,1]+ctrain[0,1]+ctrain[1,0]),4)
        ctest = confusion_matrix(ytest, rankTest)
        ftest = round(2*ctest[1,1]/(2*ctest[1,1]+ctest[0,1]+ctest[1,0]),4)
        #calcul de l-AUC Precision-Rappel
        lr_precision, lr_recall, _ = precision_recall_curve(ytest, rankTest)
        lr_auc =  auc(lr_recall, lr_precision)
        
        #df with all indicators
        df.loc[i]=[model.best_estimator_, model.best_score_, ftest, ftrain, rappel, precision, lr_auc, lr_precision, lr_recall, full_time]
        
        #save info i mlflow
        mlflow.sklearn.log_model(modele,modele_name)
        #artifact
        #mlflow.log_artifact("guillaume.txt")
        #stocker les métriques
        my_run.metrics = {}
        #rajout des éléments de performance
        my_run.metrics['best'] = model.best_score_
        my_run.metrics['ftest'] = ftest
        my_run.metrics['ftrain'] = ftrain
        my_run.metrics['precision'] = precision
        my_run.metrics['rappel'] = rappel
        my_run.metrics['lr_auc']= lr_auc
        my_run.metrics['full_time']=full_time
        mlflow.log_metrics(my_run.metrics)
        #paramètres de l'algo
        mlflow.log_params(modele.get_params())
        mlflow.end_run()
        
    #return df with all indicators  
    return df

df_ind = select_model(modeles_list, params_modeles, Xtrain, ytrain, Xtest, ytest)


In [72]:
df_ind

Unnamed: 0,best,score,ftest,ftrain,rappel,precision,lr_auc,lr_precision,lr_recall,time_train
0,([DecisionTreeRegressor(criterion='friedman_ms...,0.859674,0.1125,0.7321,0.5854,0.9769,0.117885,"[0.008794358124244893, 0.12654379631388943, 1.0]","[1.0, 0.10132359653126426, 0.0]",1430.800826


# recuperer le modele depuis mlflow

In [None]:
path_save = 'C:/Users/pauli/Documents/M2/fouille_de_donnees/projet/SISE_Fraudes_Bancaires/methodes'
os.chdir(path_save)

run_id = "a76b8d4468c44427a007505886a930ee"
artifact_path = "artifacts/GradientBoostingClassifier()/model.pkl"
artifact_path_bis = 'mlflow-artifacts:/541795256308840198/a76b8d4468c44427a007505886a930ee/artifacts/GradientBoostingClassifier()'

model = mlflow.sklearn.load_model(artifact_path_bis)
model.fit(Xtrain, ytrain)
pickle.dump(model, open("./model.pickle.dat", "wb"))


####voir si c'est ok ... 
rankTrain = model.predict(Xtrain)
rankTest = model.predict(Xtest)
#calcul metrics
#calcul de la f-mesure pour mesurer la performance du modele 
ctrain = confusion_matrix(ytrain, rankTrain)
rappel = round(ctrain[1,1]/(ctrain[1,1]+ctrain[1,0]),4)
precision = round(ctrain[1,1]/(ctrain[1,1]+ctrain[0,1]),4)
ftrain = round(2*ctrain[1,1]/(2*ctrain[1,1]+ctrain[0,1]+ctrain[1,0]),4)
ctest = confusion_matrix(ytest, rankTest)
ftest = round(2*ctest[1,1]/(2*ctest[1,1]+ctest[0,1]+ctest[1,0]),4)

print(ftest)

# recuperer le modele depuis le DF

In [21]:
#appliquer et sauvegarder le meilleur modele
#get index of df wich have best f1 score
indice=df_ind['ftest'].idxmax()
#keep model with best f score
best_model = df_ind["best"][indice]
best_model.fit(Xtrain, ytrain)
pickle.dump(best_model, open("./model.pickle.dat", "wb"))


NameError: name 'df_ind' is not defined

# afficher les modeles 

In [None]:
#lancer les runs si on veut faire une seule boucle
for modele in liste:
    #le run
    therun = mlflow.start_run(run_name = modele[0])    
    #entraînement du modèle
    themodele = modele[1].fit(XTrain,yTrain)
    #prédiction
    predmodele = themodele.predict(XTest)
    #récupérer les résultats
    therun.metrics = {}
    therun.metrics['Accuracy'] = metrics.accuracy_score(yTest,predmodele)
    therun.metrics['Rappel'] = metrics.recall_score(yTest,predmodele,pos_label='good')
    therun.metrics['Precision'] = metrics.precision_score(yTest,predmodele,pos_label='good')
    #rajouter le modèle dans le log
    mlflow.sklearn.log_model(themodele,modele[0])
    #info sur les donnees
    mlflow.log_artifact("working_conditions.xlsx")
    #stocker les métriques
    mlflow.log_metrics(therun.metrics)
    #paramètres de l'algo
    mlflow.log_params(themodele.get_params())
    #finaliser
    mlflow.end_run()