### Chargement des bibliothèques et des données

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import json
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn import metrics
import warnings
import datetime
import time
import re
from sklearn.model_selection import train_test_split


In [2]:
# Mount Google Drive inside the Google Colab runtime so its files can be accessed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Switch the working directory to the Drive root; relative paths used in later
# cells (e.g. 'data_train.csv') are resolved against this directory.
import os
os.chdir('/content/drive/My Drive')


Mounted at /content/drive


In [3]:
import mlflow
import mlflow.sklearn
import pandas as pd
import joblib
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import shutil


# Read the training table from the current working directory.
data_train = pd.read_csv('data_train.csv')

# Separate the label vector from the feature matrix: y is the 'TARGET' column,
# X is every other column of data_train.
y = data_train['TARGET']
X = data_train.drop('TARGET', axis=1)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)



In [4]:
!pip install catboost



In [5]:
!pip install mlflow



## MLflow Tracking

In [6]:
!pip install lightgbm




In [9]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import mlflow
import lightgbm as lgb
from imblearn.pipeline import Pipeline

# Build the pipeline: mean imputation of missing values, random under-sampling,
# SMOTE over-sampling, then a LightGBM classifier.
lgbm_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ("undersampler", RandomUnderSampler(sampling_strategy=0.2, random_state=0)),
    ("smote", SMOTE(sampling_strategy=0.3, random_state=0)),
    ('model', lgb.LGBMClassifier(random_state=42))
])

mlflow.set_experiment('lgbm_pipeline')

with mlflow.start_run(run_name='lgbm_pipeline'):
    clf = lgbm_pipeline
    clf.fit(X_train, y_train)

    # Hard class labels, used by the threshold-dependent metrics below.
    y_pred = clf.predict(X_test)

    # Scores for the positive class. ROC AUC measures how well the model ranks
    # samples, so it must be fed continuous scores; computing it from 0/1 labels
    # collapses the curve to a single operating point.
    # Assumes TARGET is a binary 0/1 label, so column 1 is the positive class.
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Compute the evaluation metrics on the held-out test set.
    auc_roc_score = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Record the metrics on the active MLflow run.
    mlflow.log_metric('AUC-ROC Score', auc_roc_score)
    mlflow.log_metric('Accuracy', accuracy)
    mlflow.log_metric('Precision', precision)
    mlflow.log_metric('Recall', recall)
    mlflow.log_metric('F1 Score', f1)

    # `clf` is the whole imblearn Pipeline (imputer + samplers + model), not a
    # bare LightGBM model, so it is logged with the scikit-learn flavor rather
    # than mlflow.lightgbm.
    mlflow.sklearn.log_model(clf, "lgbm_model")


2023/09/18 18:01:56 INFO mlflow.tracking.fluent: Experiment with name 'lgbm_pipeline' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 29955, number of negative: 99850
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16229
[LightGBM] [Info] Number of data points in the train set: 129805, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230769 -> initscore=-1.203973
[LightGBM] [Info] Start training from score -1.203973




In [12]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import mlflow
import catboost
import joblib
from imblearn.pipeline import Pipeline  # Import correct


# Build the pipeline: mean imputation of missing values, random under-sampling,
# SMOTE over-sampling, then a CatBoost classifier.
catboost_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ("undersampler", RandomUnderSampler(sampling_strategy=0.2, random_state=0)),
    ("smote", SMOTE(sampling_strategy=0.3, random_state=0)),
    ('model', catboost.CatBoostClassifier(random_state=42, verbose=0))
])

mlflow.set_experiment('catboost_pipeline')

with mlflow.start_run(run_name='catboost_pipeline'):
    clf = catboost_pipeline
    clf.fit(X_train, y_train)

    # Hard class labels, used by the threshold-dependent metrics below.
    y_pred = clf.predict(X_test)

    # Scores for the positive class. ROC AUC measures how well the model ranks
    # samples, so it must be fed continuous scores; computing it from 0/1 labels
    # collapses the curve to a single operating point.
    # Assumes TARGET is a binary 0/1 label, so column 1 is the positive class.
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Compute the evaluation metrics on the held-out test set.
    auc_roc_score = roc_auc_score(y_test, y_proba)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Record the metrics on the active MLflow run.
    mlflow.log_metric('AUC-ROC Score', auc_roc_score)
    mlflow.log_metric('Accuracy', accuracy)
    mlflow.log_metric('Precision', precision)
    mlflow.log_metric('Recall', recall)
    mlflow.log_metric('F1 Score', f1)

    # Keep a local pickle of the fitted pipeline in the working directory.
    joblib.dump(clf, "catboost_model.pkl")

    # Log the fitted pipeline to MLflow with the scikit-learn flavor. The earlier
    # pyfunc call used loader_module="joblib" (which has no `_load_pyfunc` entry
    # point) and passed the pickle as `code_path`, so the logged model could not
    # be loaded back.
    mlflow.sklearn.log_model(clf, "catboost_model")




In [13]:
pip install pyngrok



In [14]:
import os
from getpass import getpass

from pyngrok import ngrok

# Stop any ngrok tunnels that are still running.
ngrok.kill()

# Read the ngrok auth token from the environment, or prompt for it, instead of
# hard-coding it in the notebook. A token that was ever committed in a notebook
# must be treated as compromised and rotated in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get('NGROK_AUTH_TOKEN') or getpass('ngrok auth token: ')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPS tunnel to local port 5000 (http://localhost:5000).
ngrok_tunnel = ngrok.connect(addr='5000', proto='http', bind_tls=True)

# Print the public URL of the ngrok tunnel.
print("MLflow tracking UI:", ngrok_tunnel.public_url)



MLflow tracking UI: https://e390-35-245-156-126.ngrok-free.app


In [15]:
# Start the MLflow tracking UI (the captured log shows it listening on
# http://127.0.0.1:5000).
# NOTE: the command runs in the foreground, so this cell stays busy until the
# server is interrupted (see "Handling signal: int" in the captured log).
!mlflow ui

[2023-09-18 18:09:20 +0000] [50439] [INFO] Starting gunicorn 21.2.0
[2023-09-18 18:09:20 +0000] [50439] [INFO] Listening at: http://127.0.0.1:5000 (50439)
[2023-09-18 18:09:20 +0000] [50439] [INFO] Using worker: sync
[2023-09-18 18:09:20 +0000] [50444] [INFO] Booting worker with pid: 50444
[2023-09-18 18:09:20 +0000] [50445] [INFO] Booting worker with pid: 50445
[2023-09-18 18:09:20 +0000] [50446] [INFO] Booting worker with pid: 50446
[2023-09-18 18:09:20 +0000] [50447] [INFO] Booting worker with pid: 50447
[2023-09-18 18:42:49 +0000] [50439] [INFO] Handling signal: int

Aborted!
[2023-09-18 18:42:49 +0000] [50445] [INFO] Worker exiting (pid: 50445)
[2023-09-18 18:42:49 +0000] [50444] [INFO] Worker exiting (pid: 50444)
[2023-09-18 18:42:49 +0000] [50446] [INFO] Worker exiting (pid: 50446)
[2023-09-18 18:42:49 +0000] [50447] [INFO] Worker exiting (pid: 50447)
[2023-09-18 18:42:51 +0000] [50439] [INFO] Shutting down: Master
