In [11]:
!pip install mlflow
!pip install --upgrade jinja2
!pip install --upgrade Flask
!pip install setuptools




In [None]:
# Start the MLflow tracking server in the background.
# Running `!mlflow server ...` directly keeps this cell busy until it is
# interrupted, so no later cell can execute under "Restart & Run All".
import socket
import subprocess
import sys
import time

MLFLOW_HOST, MLFLOW_PORT = "127.0.0.1", 8080


def _server_is_up(host, port):
    """Return True when something accepts TCP connections on host:port."""
    try:
        with socket.create_connection((host, port), timeout=1):
            return True
    except OSError:
        return False


# Re-running the cell must not try to bind the port a second time.
if not _server_is_up(MLFLOW_HOST, MLFLOW_PORT):
    # `sys.executable -m mlflow` uses the mlflow installed in the kernel's environment.
    mlflow_server = subprocess.Popen(
        [sys.executable, "-m", "mlflow", "server",
         "--host", MLFLOW_HOST, "--port", str(MLFLOW_PORT)]
    )

    # Poll for roughly 30 seconds until the server accepts connections.
    for _ in range(60):
        if mlflow_server.poll() is not None:
            raise RuntimeError(
                f"mlflow server exited early with code {mlflow_server.returncode}"
            )
        if _server_is_up(MLFLOW_HOST, MLFLOW_PORT):
            break
        time.sleep(0.5)
    else:
        raise TimeoutError("mlflow server did not start listening in time")


In [22]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor


In [23]:
# To connect to the tracking server, use the URI that was assigned to the server when it was started.

client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

# `client` allows programmatic interaction with the MLflow tracking server.

Deleting existing experiment with ID: 248685807844681205
Experiment 'Bankrupt_Model' is now marked as deleted.


In [24]:
 # Experiment setup: create (or reuse) the "Bankrupt_Model" experiment and make it active.

# This cell is the first one to use the fluent `mlflow.*` API, so the module is
# imported and pointed at the tracking server here. Without this the cell
# raises NameError on a fresh kernel, and without the tracking URI the fluent
# API would not talk to the server that `client` is connected to.
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:8080")

# Description of the experiment
experiment_description = (
    "This is the bankrupt prediction project. "
    "This experiment contains models for predicting company bankruptcy."
)

# Tags attached to the experiment
experiment_tags = {
    "project_name": "bankruptcy-prediction",
    "sector": "finance",
    "team": "risk-analysis",
    "project_quarter": "Q1-2025",
    "mlflow.note.content": experiment_description,
}

# `create_experiment` raises when the name is already taken, so only create
# the experiment when it does not exist yet; otherwise reuse its id. This keeps
# the cell re-runnable. `bankrupt_experiment` holds the experiment id in both cases.
existing_experiment = mlflow.get_experiment_by_name("Bankrupt_Model")
if existing_experiment is None:
    bankrupt_experiment = mlflow.create_experiment(
        name="Bankrupt_Model", tags=experiment_tags
    )
else:
    bankrupt_experiment = existing_experiment.experiment_id

# Look up the experiments tagged with the "bankruptcy-prediction" project
bank_experiments = mlflow.search_experiments(
    filter_string="tags.project_name = 'bankruptcy-prediction'"
)

# Show the metadata of the first match (nothing to show if the search is empty)
if bank_experiments:
    print(vars(bank_experiments[0]))

# Make this experiment the active one for the runs logged below
mlflow.set_experiment("Bankrupt_Model")

# Run name and artifact path used when saving the model
run_name = "bankrupt_model_test"
artifact_path = "bankrupt_model"





{'_experiment_id': '759386245684293040', '_name': 'Bankrupt_Model', '_artifact_location': 'mlflow-artifacts:/759386245684293040', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the bankrupt prediction project. This experiment contains models for predicting company bankruptcy.', 'project_name': 'bankruptcy-prediction', 'project_quarter': 'Q1-2025', 'sector': 'finance', 'team': 'risk-analysis'}, '_creation_time': 1742986077652, '_last_update_time': 1742986077652}


In [25]:
import mlflow
# Point the fluent API (mlflow.start_run, mlflow.log_*, ...) at the local
# tracking server, the same URI that is passed to MlflowClient.
mlflow.set_tracking_uri("http://127.0.0.1:8080")


In [26]:

import pandas as pd 
import numpy as np 
import gzip
import json
import pickle
import re
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
import arff
import pandas as pd
import os

In [27]:
def wrangle(filepath):
    """Load the ARFF file at ``filepath`` and return it as a cleaned DataFrame.

    Steps, in order:

    * parse the file with ``arff.load`` and build a frame from its ``'data'``
      rows, with columns named after the first item of each ``'attributes'``
      entry;
    * replace ``Attr`` by ``feat_`` in every column name and rename the
      ``class`` column to ``bankrupt``;
    * cast ``bankrupt`` to ``np.int64`` and derive a textual ``status`` column
      from it;
    * drop the ``feat_37`` column.
    """
    with open(filepath, 'r') as handle:
        parsed = arff.load(handle)

    raw_names = [attribute[0] for attribute in parsed['attributes']]
    frame = pd.DataFrame(parsed['data'], columns=raw_names)

    # Attr<N> -> feat_<N>, then give the label column its final name.
    prefixed = [re.sub(pattern='Attr', repl='feat_', string=name) for name in frame.columns]
    frame.columns = ['bankrupt' if name == 'class' else name for name in prefixed]

    def describe(flag):
        # Human-readable version of the 0/1 label.
        if flag == 1:
            return "The company in bankrupt"
        return "The company is safe"

    # Store the label as an integer and derive the status text from it.
    labels = frame['bankrupt'].astype(np.int64)
    frame['bankrupt'] = labels
    frame['status'] = labels.apply(describe)

    # feat_37 is removed; per the original note it is the column with the most
    # missing values (not verifiable from this code alone).
    return frame.drop(columns='feat_37')

# Location of the ARFF file, relative to the notebook's working directory.
url = "data/poland.arff"

# Load and clean the dataset with wrangle().
df = wrangle(url)


In [28]:
# SPLIT THE DATA

# Label vector: the "bankrupt" column.
target = "bankrupt"
y = df[target]

# Feature matrix: every float64 column of the cleaned frame.
X = df.select_dtypes(include="float64")

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X shape: (7027, 63)
y shape: (7027,)


In [29]:
# Decision-tree baseline: median imputation followed by a single decision tree,
# fitted on the training split.
model_reg = make_pipeline(
    SimpleImputer(strategy="median"),
    DecisionTreeClassifier(random_state=42),
).fit(X_train, y_train)

# Predictions on the held-out split
y_pred = model_reg.predict(X_test)

# Metrics, computed in the order accuracy, precision, recall, F1
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results
print(f"Decision Tree - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Log the parameter, the metrics and the fitted pipeline to MLflow
with mlflow.start_run(run_name="Decision_Tree"):
    mlflow.log_param("imputer_strategy", "median")
    mlflow.log_metrics(
        {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        }
    )
    mlflow.sklearn.log_model(model_reg, "decision_tree_model")


Decision Tree - Accuracy: 0.9474, Precision: 0.3544, Recall: 0.5490, F1-score: 0.4308




🏃 View run Decision_Tree at: http://127.0.0.1:8080/#/experiments/759386245684293040/runs/f634764714bb4ba29c9969fd4bafd3eb
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/759386245684293040


In [30]:
# Random forest: imputation followed by a seeded forest, tuned by grid search.
clf = make_pipeline(SimpleImputer(), RandomForestClassifier(random_state=42))

# Grid over the imputation strategy, the number of trees and the tree depth
params = dict(
    simpleimputer__strategy=["mean", "median"],
    randomforestclassifier__n_estimators=range(25, 100, 25),
    randomforestclassifier__max_depth=range(10, 50, 10),
)

# 5-fold cross-validated search over the grid, using all available cores
model = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

# Best hyperparameters found by the search
best_params = model.best_params_

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Metrics, computed in the order accuracy, precision, recall, F1
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results
print(f"Random Forest - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Log the best hyperparameters, the metrics and the best pipeline to MLflow
with mlflow.start_run(run_name="Random_Forest"):
    mlflow.log_params(best_params)
    mlflow.log_metrics(
        {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        }
    )
    mlflow.sklearn.log_model(model.best_estimator_, "random_forest_model")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Random Forest - Accuracy: 0.9822, Precision: 0.9643, Recall: 0.5294, F1-score: 0.6835




🏃 View run Random_Forest at: http://127.0.0.1:8080/#/experiments/759386245684293040/runs/6503661835ab4e04988433ef96f17cc2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/759386245684293040


In [31]:
# Gradient boosting: imputation followed by a boosted-tree classifier, tuned by
# grid search. The classifier is seeded like the decision tree and the random
# forest, so that re-running the cell reproduces the same fitted model.
clf = make_pipeline(SimpleImputer(), GradientBoostingClassifier(random_state=42))

# Grid over the imputation strategy, the number of boosting stages and the tree depth
params = {
    "simpleimputer__strategy": ["mean", "median"],
    "gradientboostingclassifier__n_estimators": range(20, 31, 5),
    "gradientboostingclassifier__max_depth": range(2, 5)
}

# 5-fold cross-validated search over the grid, using all available cores
model = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)

# Fit on the training split as-is (no resampling is applied in this notebook)
model.fit(X_train, y_train)

# Best hyperparameters found by the search
best_params = model.best_params_

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the results
print(f"Gradient Boosting - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Log the best hyperparameters, the metrics and the best pipeline to MLflow
with mlflow.start_run(run_name="Gradient_Boosting"):
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(model.best_estimator_, "gradient_boosting_model")


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Gradient Boosting - Accuracy: 0.9829, Precision: 0.9355, Recall: 0.5686, F1-score: 0.7073




🏃 View run Gradient_Boosting at: http://127.0.0.1:8080/#/experiments/759386245684293040/runs/8e602661df574319877ec7b3380b0d65
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/759386245684293040
