In [1]:
import os
import random
import numpy as np
import pandas as pd
import time
import zipfile
import pickle
import mlflow.sklearn


from sklearn.model_selection import StratifiedKFold
#lib de metriques
from sklearn.metrics import precision_recall_curve, confusion_matrix, f1_score, make_scorer, auc


# Commandes pour télécharger les données depuis Kaggle

In [2]:
# Kaggle credentials are read from the environment so that no secret has to be
# typed into (and saved with) the notebook. The empty-string fallbacks keep the
# original default values when the variables are not set.
kaggle_username = os.environ.get('KAGGLE_USERNAME', '')
kaggle_key = os.environ.get('KAGGLE_KEY', '')

# Directory in which the dataset is downloaded and extracted. Leave it empty to
# use the current working directory: os.chdir('') raises FileNotFoundError, and
# an empty value would also leave the `-p` option of the download command
# without an argument.
path = ''
path = path or os.getcwd()
os.chdir(path)

In [3]:
# Register the Kaggle credentials with the CLI, then download the dataset
# archive into `path`.
# WARNING: `kaggle config set` echoes the value it stores, so the API key ends
# up in clear text in this cell's output. Clear the output before sharing or
# committing the notebook.
!kaggle config set -n username -v $kaggle_username
!kaggle config set -n key -v $kaggle_key
!kaggle datasets download -d axeltrc/fraudes-bancaires-smotetomek10 -p $path

- username is now set to: paulineattal
- key is now set to: <REDACTED>
fraudes-bancaires-smotetomek10.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:

# Extract the downloaded archive into the current directory.
with zipfile.ZipFile('fraudes-bancaires-smotetomek10.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load each CSV exactly once. The reads used to sit inside a
# `for file in os.listdir()` loop that never used its loop variable, so all four
# files were parsed again for every entry of the directory.
Xtrain = pd.read_csv('Xtrain_SMOTETomek.csv', sep=",")
ytrain = pd.read_csv('ytrain_SMOTETomek.csv', sep=",")
ytest = pd.read_csv('ytest.csv', sep=",")
Xtest = pd.read_csv('Xtest.csv', sep=",")

Cellule à exécuter pour réduire le jeu de données : on ne conserve que 10 % de la base d'entraînement

In [5]:
# Down-sample the training set to a fraction of its rows to keep the
# hyper-parameter search tractable.
# NOTE: this cell overwrites Xtrain/ytrain, so running it twice samples the
# already reduced set again.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42  # fixed seed: the same subset is drawn on every run

index_train = Xtrain.index.to_list()
print(len(index_train))
# A dedicated generator is used so the global `random` state is left untouched.
random_index = random.Random(SAMPLE_SEED).sample(
    index_train, round(len(index_train) * SAMPLE_FRACTION)
)
print(len(random_index))
# The same index labels are selected in both frames so that features and
# targets stay aligned.
Xtrain = Xtrain.loc[random_index]
ytrain = ytrain.loc[random_index]

3337616
333762


In [8]:
# Replace the index of every frame with a fresh 0..n-1 range, in place; the
# previous index labels are dropped rather than kept as a column.
for frame in (ytest, Xtest, ytrain, Xtrain):
    frame.reset_index(drop=True, inplace=True)

## Boucle de recherche du meilleur modèle avec les meilleurs hyperparamètres

In [None]:
# Start the MLflow tracking UI from the notebook.
# NOTE(review): the server presumably runs in the foreground and keeps this
# cell busy until interrupted (its execution count is None) — consider starting
# it from a separate terminal before running the cells below.
!mlflow ui

In [16]:
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import mlflow

# Tracking server and experiment that the runs below are recorded under.
tracking_uri = "http://localhost:5000"
experiment_name = "Fouilles de Données Massives"

mlflow.set_tracking_uri(tracking_uri)
# Kept as the last expression so the resulting Experiment object is displayed.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/541795256308840198', creation_time=1673438592181, experiment_id='541795256308840198', last_update_time=1673438592181, lifecycle_stage='active', name='Fouilles de Données Massives', tags={}>

In [14]:

from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameter grids to explore: one dict per candidate model, listed in the
# same order as `modeles_list` (the two lists are matched by position).
params_modeles = [
    dict(
        loss=["log_loss"],
        learning_rate=[0.09, 0.11],
        n_estimators=[100],
        min_samples_split=[5],
    ),
]

# Candidate estimators, created with their default settings.
modeles_list = [GradientBoostingClassifier()]


def select_model(modeles, parameters, Xtrain, ytrain, Xtest, ytest):
    """Grid-search each candidate model, evaluate it and log the run to MLflow.

    For every estimator in ``modeles`` a 3-fold ``GridSearchCV`` scored with the
    macro F1 is run over ``parameters[i]``. The refitted search is then used to
    predict the train and test sets, and the resulting indicators are both
    collected in the returned frame and logged to an MLflow run named after the
    estimator.

    The confusion-matrix indexing assumes binary labels whose positive class is
    sorted last (e.g. 0/1).

    Parameters
    ----------
    modeles : sequence of estimators
        Unfitted scikit-learn estimators to tune.
    parameters : sequence of dict
        ``parameters[i]`` is the ``param_grid`` used for ``modeles[i]``.
    Xtrain, ytrain
        Training features and labels.
    Xtest, ytest
        Hold-out features and labels.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by position in ``modeles``, with columns
        ``best`` (fitted best estimator), ``score`` (best cross-validated
        score), ``ftest`` / ``ftrain`` (F-measure of the positive class on the
        test / train set), ``rappel`` and ``precision`` (recall and precision of
        the positive class on the TRAIN set), ``lr_auc``, ``lr_precision``,
        ``lr_recall`` (precision-recall curve on the test set and its area) and
        ``time_train`` (duration of the search in seconds).
    """
    # Imported once here rather than on every loop iteration.
    from sklearn.model_selection import GridSearchCV

    columns = ['best', 'score', 'ftest', 'ftrain', 'rappel', 'precision',
               'lr_auc', 'lr_precision', 'lr_recall', 'time_train']
    f1 = make_scorer(f1_score, average='macro')

    # Labels are flattened to 1-D: a single-column frame would otherwise be
    # passed to the estimators as a column vector.
    y_train = np.ravel(ytrain)
    y_test = np.ravel(ytest)

    def positive_class_scores(cm):
        """Return (recall, precision, F-measure) of class 1 from a 2x2 matrix."""
        tp, fp, fn = cm[1, 1], cm[0, 1], cm[1, 0]
        recall = round(tp / (tp + fn), 4)
        precision = round(tp / (tp + fp), 4)
        f_measure = round(2 * tp / (2 * tp + fp + fn), 4)
        return recall, precision, f_measure

    rows = []
    for i, modele in enumerate(modeles):
        modele_name = str(modele)
        search = GridSearchCV(estimator=modele,
                              param_grid=parameters[i],
                              scoring=f1,
                              verbose=False,
                              cv=3)

        # The context manager closes the run even when fitting or scoring
        # raises, so a failure does not leave an active run behind.
        with mlflow.start_run(run_name=modele_name):
            start_time = time.time()
            search.fit(Xtrain, y_train)
            full_time = time.time() - start_time

            rankTrain = search.predict(Xtrain)
            rankTest = search.predict(Xtest)

            # Positive-class indicators derived from the confusion matrices.
            rappel, precision, ftrain = positive_class_scores(
                confusion_matrix(y_train, rankTrain))
            _, _, ftest = positive_class_scores(
                confusion_matrix(y_test, rankTest))

            # The precision-recall curve needs continuous scores; hard 0/1
            # predictions collapse it to a single operating point. Fall back to
            # the predictions only when the model exposes no score at all.
            if hasattr(search, "predict_proba"):
                test_scores = search.predict_proba(Xtest)[:, 1]
            elif hasattr(search, "decision_function"):
                test_scores = search.decision_function(Xtest)
            else:
                test_scores = rankTest
            lr_precision, lr_recall, _ = precision_recall_curve(y_test, test_scores)
            lr_auc = auc(lr_recall, lr_precision)

            rows.append({
                'best': search.best_estimator_,
                'score': search.best_score_,
                'ftest': ftest,
                'ftrain': ftrain,
                'rappel': rappel,
                'precision': precision,
                'lr_auc': lr_auc,
                'lr_precision': lr_precision,
                'lr_recall': lr_recall,
                'time_train': full_time,
            })

            # Log the tuned, fitted estimator and its parameters: logging the
            # untouched `modele` would store an unfitted model with default
            # hyper-parameters and discard the result of the search.
            mlflow.sklearn.log_model(search.best_estimator_, modele_name)
            mlflow.log_metrics({
                'best': search.best_score_,
                'ftest': ftest,
                'ftrain': ftrain,
                'precision': precision,
                'rappel': rappel,
                'lr_auc': lr_auc,
                'full_time': full_time,
            })
            mlflow.log_params(search.best_estimator_.get_params())

    # Build the summary once instead of growing a DataFrame row by row.
    return pd.DataFrame(rows, columns=columns)

# Run the search over the configured candidates and keep the per-model summary.
df_ind = select_model(
    modeles=modeles_list,
    parameters=params_modeles,
    Xtrain=Xtrain,
    ytrain=ytrain,
    Xtest=Xtest,
    ytest=ytest,
)


# Récupérer le modèle depuis MLflow

In [17]:
# Directory in which the retrained model is saved. Leave it empty to stay in
# the current working directory (os.chdir('') raises FileNotFoundError).
path_save = ''
if path_save:
    os.chdir(path_save)

# Location of the logged model. The URI is assembled from the experiment and
# run ids so that a different run only has to be set in one place.
experiment_id = "541795256308840198"
run_id = "a76b8d4468c44427a007505886a930ee"
artifact_path = "artifacts/GradientBoostingClassifier()/model.pkl"  # not used by this cell
artifact_path_bis = f'mlflow-artifacts:/{experiment_id}/{run_id}/artifacts/GradientBoostingClassifier()'

model = mlflow.sklearn.load_model(artifact_path_bis)

# Retrain on the training data currently in memory. To train on the complete
# dataset, reload it first:
#Xtrain = pd.read_csv('Xtrain_SMOTETomek.csv', sep=",")
#ytrain = pd.read_csv('ytrain_SMOTETomek.csv', sep=",")
model.fit(Xtrain, ytrain)
# Save the retrained model on the local disk.
with open('model.dat', 'wb') as f:
    pickle.dump(model, f)


# F-measure of the positive class on the test set, from the confusion matrix:
# 2*TP / (2*TP + FP + FN).
rankTest = model.predict(Xtest)
ctest = confusion_matrix(ytest, rankTest)
ftest = round(2*ctest[1,1]/(2*ctest[1,1]+ctest[0,1]+ctest[1,0]),4)
print(ftest)

0.1067


# Récupérer le modèle depuis le DataFrame

In [21]:
# Retrain and save the best model found by the search.
# NOTE: this cell needs `df_ind`, produced by the cell that calls select_model;
# run that cell first (the saved output shows a NameError from a run where it
# had not been executed).

# Row with the highest test F-measure; the cast makes idxmax work even when the
# column was stored with object dtype.
indice = df_ind['ftest'].astype(float).idxmax()
# Fitted estimator stored on that row.
best_model = df_ind.loc[indice, "best"]

# Reload the complete training set and retrain the selected model on it.
Xtrain = pd.read_csv('Xtrain_SMOTETomek.csv', sep=",")
ytrain = pd.read_csv('ytrain_SMOTETomek.csv', sep=",")
best_model.fit(Xtrain, ytrain)
# Save the model that was just trained (the cell used to pickle `mon_modele`,
# a name that is only defined by a different cell).
with open('model.dat', 'wb') as f:
    pickle.dump(best_model, f)


# F-measure of the positive class on the test set, computed with the model
# selected above (the cell used to evaluate the unrelated `model` variable).
rankTest = best_model.predict(Xtest)
ctest = confusion_matrix(ytest, rankTest)
ftest = round(2*ctest[1,1]/(2*ctest[1,1]+ctest[0,1]+ctest[1,0]),4)
print(ftest)

NameError: name 'df_ind' is not defined

# Appliquer un modèle déjà entraîné

In [18]:
# Directory that contains the saved model. Leave it empty to stay in the
# current working directory (os.chdir('') raises FileNotFoundError).
path_save = ''
if path_save:
    os.chdir(path_save)
# SECURITY: unpickling can execute arbitrary code — only load a model.dat that
# you produced yourself.
with open('model.dat', 'rb') as f:
    mon_modele = pickle.load(f)

# F-measure of the positive class on the test set, from the confusion matrix:
# 2*TP / (2*TP + FP + FN).
rankTest = mon_modele.predict(Xtest)
ctest = confusion_matrix(ytest, rankTest)
ftest = round(2*ctest[1,1]/(2*ctest[1,1]+ctest[0,1]+ctest[1,0]),4)
print(ftest)

0.1067
