# Setting up MLflow

In [1]:
# Install the following librairies (it is better to create a venv (or conda) virtual environment first and install these librairies in it)
!pip install mlflow
!pip install --upgrade jinja2
!pip install --upgrade Flask
!pip install setuptools




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:

# Start a local MLflow tracking server in the background.
#
# Running `!mlflow server ...` directly blocks the kernel until the cell is
# interrupted, and interrupting it also stops the server, so the cells below
# would have nothing to connect to. Launching it as a child process keeps the
# notebook usable while the server stays up.
import socket
import subprocess
import time

MLFLOW_HOST = "127.0.0.1"
MLFLOW_PORT = 8080


def _server_is_listening(host, port):
    """Return True if a TCP connection to ``host:port`` can be opened."""
    try:
        with socket.create_connection((host, port), timeout=1):
            return True
    except OSError:
        return False


if _server_is_listening(MLFLOW_HOST, MLFLOW_PORT):
    # Re-running this cell must not try to bind the same port a second time.
    print(f"A server is already listening on {MLFLOW_HOST}:{MLFLOW_PORT}; not starting another one.")
else:
    mlflow_server_process = subprocess.Popen(
        ["mlflow", "server", "--host", MLFLOW_HOST, "--port", str(MLFLOW_PORT)]
    )

    # Wait (up to ~60 s) until the server accepts connections so that the
    # following cells do not race against its start-up.
    for _ in range(60):
        if _server_is_listening(MLFLOW_HOST, MLFLOW_PORT):
            break
        if mlflow_server_process.poll() is not None:
            raise RuntimeError(
                f"mlflow server exited early with code {mlflow_server_process.returncode}"
            )
        time.sleep(1)
    else:
        raise RuntimeError("Timed out waiting for the mlflow server to start")

# To stop the server later: mlflow_server_process.terminate()


^C


## Using the MLflow Client API


- Initiate a new Experiment.

- Start Runs within an Experiment.

- Document parameters, metrics, and tags for your Runs.

- Log artifacts linked to runs, such as models, tables, plots, and more.



In [2]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [3]:
# Build a client bound to the tracking server. The URI has to be the same
# host and port that the server was started with.
client = MlflowClient(
    tracking_uri="http://127.0.0.1:8080",
)

# The client gives programmatic access to the MLflow tracking server.

We now have a client interface to the tracking server that can both send data to and retrieve data from the tracking server.



In [6]:
# Ask the tracking server for its experiments (default search arguments).
all_experiments = client.search_experiments()

# Print the returned list; each entry is shown through its Experiment repr.
print(all_experiments)


[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1725353428642, experiment_id='0', last_update_time=1725353428642, lifecycle_stage='active', name='Default', tags={}>]


### Create two experiments

In [4]:
# Free-text description attached to both experiments through the
# "mlflow.note.content" tag below.
experiment_description = (
    "This is the credit default prediction project. "
    "This experiment involves building models to estimate the probability of default for each client."
)

# Tags shared by both experiments.
experiment_tags = {
    "project_name": "credit-default-prediction",
    "department": "risk-management",
    "team": "risk-ml",
    "project_quarter": "Q4-2023",
    "mlflow.note.content": experiment_description,
}


def get_or_create_experiment(name, tags=None):
    """Return the ID of the experiment called ``name``, creating it if needed.

    Looking the experiment up first (instead of calling ``create_experiment``
    and catching every ``Exception``) keeps this cell re-runnable without
    hiding real failures: connection or server errors propagate unchanged
    instead of turning into an ``AttributeError`` on a ``None`` lookup result.

    Args:
        name: Experiment name to look up or create.
        tags: Tags applied only when the experiment is created.

    Returns:
        The experiment ID of the existing or newly created experiment.
    """
    existing = client.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    return client.create_experiment(name=name, tags=tags)


# One experiment per model family.
rf_experiment_id = get_or_create_experiment("Random_Forest_Model", experiment_tags)
lr_experiment_id = get_or_create_experiment("Logistic_Regression_Model", experiment_tags)


### DATASET

In [5]:
import pandas as pd

# Load the loan dataset from the CSV file located next to the notebook.
data = pd.read_csv(filepath_or_buffer="Loan_Data.csv")

In [12]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [17]:
# Count the missing values in each column.
data.isna().sum()

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

In [18]:
# Number of rows per value of the target column.
class_distribution = data["default"].value_counts()

# Header line followed by the counts, each on its own line.
print('Répartition des classes dans la colonne "default" :', class_distribution, sep="\n")


Répartition des classes dans la colonne "default" :
0    8149
1    1851
Name: default, dtype: int64


### Logging our first runs with MLflow

In [6]:
import mlflow
from sklearn.model_selection import train_test_split


In [7]:
# Point the module-level (fluent) MLflow API at the tracking server for the
# rest of this session, so no explicit client instance is needed below.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


In [8]:
# Name for this training run; if no name is given, a unique one is
# auto-generated for the run.
rf_run_name = "Random_Forest_Run"

# Artifact path under which the trained model is logged inside the run.
rf_artifact_path = "rf_model"

# Make "Random_Forest_Model" the active experiment and keep the returned
# Experiment metadata.
credit_default_experiment = mlflow.set_experiment(experiment_name="Random_Forest_Model")


In [9]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into features and target: the customer identifier carries no
# predictive signal, and "default" is the label to predict.
X = data.drop(columns=["customer_id", "default"])
y = data["default"]

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the RandomForestClassifier model
params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 888,
    # Class weighting: re-weights the classes to compensate for the class imbalance.
    "class_weight": 'balanced',
    "bootstrap": True,
}

# Build the model and fit it on the training data
rf = RandomForestClassifier(**params)
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate classification metrics on the validation set
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}

# Initiate the MLflow run context
with mlflow.start_run(run_name=rf_run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the classification metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use.
    # The input example only needs to illustrate the expected input, so a few
    # rows are logged instead of storing the whole validation set (and every
    # customer's data in it) alongside the model.
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val.head(5), artifact_path=rf_artifact_path
    )




In [10]:
# Name for this training run; if no name is given, a unique one is
# auto-generated for the run.
lr_run_name = "Logistic_Regression_Run"

# Artifact path under which the trained model is logged inside the run.
lr_artifact_path = "lr_model"

# Make "Logistic_Regression_Model" the active experiment and keep the returned
# Experiment metadata.
credit_default_experiment = mlflow.set_experiment(experiment_name="Logistic_Regression_Model")

In [11]:
# Split the data into features and target: the customer identifier carries no
# predictive signal, and "default" is the label to predict.
X = data.drop(columns=["customer_id", "default"])
y = data["default"]

# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the LogisticRegression model, including class weight to handle class imbalance
params = {
    "penalty": "l2",
    "C": 1.0,
    "solver": "liblinear",
    "random_state": 888,
    "max_iter": 100,
    "class_weight": "balanced"  # Automatically adjust weights inversely proportional to class frequencies
}

# Build the model and fit it on the training data
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

# Predict on the validation set
y_pred = lr.predict(X_val)

# Calculate classification metrics on the validation set
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}

# Initiate the MLflow run context
with mlflow.start_run(run_name=lr_run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the classification metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use.
    # The input example only needs to illustrate the expected input, so a few
    # rows are logged instead of storing the whole validation set (and every
    # customer's data in it) alongside the model.
    mlflow.sklearn.log_model(
        sk_model=lr, input_example=X_val.head(5), artifact_path=lr_artifact_path
    )


