In [21]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path
import os
project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/credit-risk-probability-week4
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
from src import DataManager, ExperimentRunner, TrainModels, ModelRegistryManager
from scripts.constants import (
    READY_TO_MODEL_DATA_FILE_NAME,
    Columns,
    TARGET_COL,
    MODEL_NAME
)
from tabulate import tabulate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import mlflow
import pandas as pd
from tabulate import tabulate

In [23]:
# Import Model Ready Data Set
dm = DataManager()
model_ready_df = dm.load_csv(file_name=READY_TO_MODEL_DATA_FILE_NAME, load_clean=True)

# "Unnamed: 0" is typically the leftover index column of a CSV that was saved
# with its index. Drop it without mutating in place, and tolerate its absence
# (errors="ignore") so the cell keeps working if the upstream export stops
# writing that column.
model_ready_df = model_ready_df.drop(columns=["Unnamed: 0"], errors="ignore")

Loading ../data/processed/final_data.csv...
Sucessfully loaded ../data/processed/final_data.csv!


In [24]:
# Render the first rows of the model-ready frame as a plain-text grid table.
model_ready_preview = tabulate(
    model_ready_df.head(),
    headers="keys",
    tablefmt="grid",
)
print(model_ready_preview)

+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    | CustomerId      |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 | CustomerId_1    |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------

In [25]:
# Keep the customer identifiers aside; they are not a model feature.
customer_id_series = model_ready_df[Columns.CustomerId.value]

# DataFrame.drop (without inplace) already returns a new frame and leaves
# model_ready_df untouched, so the extra .copy() beforehand was redundant.
working_df = model_ready_df.drop(columns=[Columns.CustomerId.value])

print(tabulate(working_df.head(), headers="keys", tablefmt="grid"))

+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+--------------

In [26]:
# Build the trainer on the feature frame (CustomerId already dropped), using
# TARGET_COL as the label column.
trainer = TrainModels(working_df, target_col=TARGET_COL)
# Set up MLflow before any run is logged; on a fresh tracking store this is
# where the 'credit-risk-models' experiment gets created (see the cell output).
trainer.initialize_mlflow()
# Presumably creates the train/test split that run_experiment() evaluates on —
# TODO confirm in src.TrainModels. Must run before the experiment cells below.
trainer.split_data()

2025/12/15 20:16:44 INFO mlflow.tracking.fluent: Experiment with name 'credit-risk-models' does not exist. Creating a new experiment.


### 1 - Logistic Regression (Baseline Model)

In [27]:
# Baseline: logistic regression with default regularisation; max_iter is raised
# to 1000 iterations for the solver.
# The estimator is passed with the `model=` keyword, matching how every other
# experiment cell in this notebook constructs its ExperimentRunner.
trainer.run_experiment(
    run_name="LogisticRegression_Baseline",
    runner=ExperimentRunner(
        model=LogisticRegression(max_iter=1000), model_name="LogisticRegression"
    ),
)



Successfully registered model 'credit-risk-models'.
Created version '1' of model 'credit-risk-models'.


{'accuracy': 0.986648865153538,
 'precision': 0.9866310160427807,
 'recall': 1.0,
 'f1': 0.9932705248990579,
 'roc_auc': 0.9836166543483617}

### 2 - Random Forest Classifier (Baseline Model)

In [28]:
# Baseline: random forest with library defaults and a fixed seed.
rf_baseline_model = RandomForestClassifier(random_state=42)
rf_baseline_runner = ExperimentRunner(
    model=rf_baseline_model,
    model_name="RandomForest",
)

# Last expression of the cell, so the returned metrics are displayed.
trainer.run_experiment(run_name="RandomForest_Baseline", runner=rf_baseline_runner)

Registered model 'credit-risk-models' already exists. Creating a new version of this model...
Created version '2' of model 'credit-risk-models'.


{'accuracy': 0.9853137516688919,
 'precision': 0.9892328398384926,
 'recall': 0.9959349593495935,
 'f1': 0.9925725860904794,
 'roc_auc': 0.9739467849223947}

### 3 - Logistic Regression (Hyperparameter Tuning)

In [29]:
# Search space for the tuned logistic regression. liblinear is the only solver
# listed; it supports both the l1 and l2 penalties sampled here.
lr_param_dist = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=["l1", "l2"],
    class_weight=[None, "balanced"],
    solver=["liblinear"],
)

lr = LogisticRegression(max_iter=1000, random_state=42)

# 10 sampled candidates, 5-fold CV, ranked by ROC-AUC; seeded for repeatability.
lr_search_settings = dict(
    n_iter=10,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
lr_search = RandomizedSearchCV(
    estimator=lr,
    param_distributions=lr_param_dist,
    **lr_search_settings,
)

lr_tuned_runner = ExperimentRunner(
    model=lr,
    model_name="LogisticRegression",
    param_search=lr_search,
)

# Last expression of the cell, so the returned metrics are displayed.
trainer.run_experiment(run_name="LogisticRegression_Tuned", runner=lr_tuned_runner)

Registered model 'credit-risk-models' already exists. Creating a new version of this model...
Created version '3' of model 'credit-risk-models'.


{'accuracy': 0.9732977303070761,
 'precision': 0.9944903581267218,
 'recall': 0.978319783197832,
 'f1': 0.9863387978142076,
 'roc_auc': 0.9832471051983246}

### 4 - Random Forest Classifier (Hyperparameter Tuning)

In [30]:
# Seeded estimator that the randomized search will clone and fit.
rf = RandomForestClassifier(random_state=42)

# Candidate values for forest size, tree depth and split threshold.
rf_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# 10 sampled candidates, 3-fold CV, ranked by ROC-AUC; seeded for repeatability.
rf_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    random_state=42,
)

rf_tuned_runner = ExperimentRunner(
    model=rf, model_name="RandomForest", param_search=rf_search
)

# Last expression of the cell, so the returned metrics are displayed.
trainer.run_experiment(run_name="RandomForest_Tuned", runner=rf_tuned_runner)

Registered model 'credit-risk-models' already exists. Creating a new version of this model...
Created version '4' of model 'credit-risk-models'.


{'accuracy': 0.9826435246995995,
 'precision': 0.9852744310575636,
 'recall': 0.997289972899729,
 'f1': 0.9912457912457913,
 'roc_auc': 0.9809066272480906}

### Model Comparison and Winner Model

In [47]:
# Fetch Run ID for Models

# NOTE(review): MODEL_NAME is used here as the *experiment* name; in this
# notebook's output the experiment and the registered model share the name
# 'credit-risk-models' — confirm that is intended.
experiment = mlflow.get_experiment_by_name(MODEL_NAME)

# get_experiment_by_name returns None when no such experiment exists; fail with
# a clear message instead of an AttributeError on `.experiment_id`.
if experiment is None:
    raise ValueError(
        f"MLflow experiment '{MODEL_NAME}' not found; run the training cells above first."
    )

runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id], max_results=10)
runs_df = runs[["run_id", "tags.mlflow.runName"]]

print(runs_df)

                             run_id          tags.mlflow.runName
0  aeef71b99f394ee8ba82570712765351           RandomForest_Tuned
1  f5aa9860c6ba47fb981a6a02738aa5b2     LogisticRegression_Tuned
2  ca18223f58df4d7687ce58ac157b00f3        RandomForest_Baseline
3  3a230646c6234a2390a6999cdf85d0c1  LogisticRegression_Baseline


In [73]:
import mlflow
from tabulate import tabulate
import pandas as pd

client = mlflow.tracking.MlflowClient()

# One record per run: run name, run id, then every metric logged on that run.
rows = []
for run_id, model_name in zip(runs_df["run_id"], runs_df["tags.mlflow.runName"]):
    run = client.get_run(run_id)
    metrics = run.data.metrics

    record = {"model_name": model_name, "run_id": run_id}
    record.update(metrics)
    rows.append(record)

df = pd.DataFrame(rows)

comparison_table = tabulate(df, headers="keys", tablefmt="grid", showindex=False)
print(comparison_table)

+-----------------------------+----------------------------------+------------+----------+-----------+-------------+----------+
| model_name                  | run_id                           |   accuracy |   recall |   roc_auc |   precision |       f1 |
| RandomForest_Tuned          | aeef71b99f394ee8ba82570712765351 |   0.982644 | 0.99729  |  0.980907 |    0.985274 | 0.991246 |
+-----------------------------+----------------------------------+------------+----------+-----------+-------------+----------+
| LogisticRegression_Tuned    | f5aa9860c6ba47fb981a6a02738aa5b2 |   0.973298 | 0.97832  |  0.983247 |    0.99449  | 0.986339 |
+-----------------------------+----------------------------------+------------+----------+-----------+-------------+----------+
| RandomForest_Baseline       | ca18223f58df4d7687ce58ac157b00f3 |   0.985314 | 0.995935 |  0.973947 |    0.989233 | 0.992573 |
+-----------------------------+----------------------------------+------------+----------+-----------+--

### Analysis

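The Logistic Regression baseline model will be selected for production because it achieved the highest ROC-AUC (0.9836), indicating the strongest discrimination between high-risk and low-risk customers. It also achieved perfect recall (1.0) on the test set, meaning no high-risk customer was missed, which is critical in credit-risk scenarios where false negatives are costly. Despite its simplicity, the model also delivered the highest F1 score (0.9933), demonstrating a strong balance between risk detection and false-alert control. Caveat: the reported metrics imply a heavily imbalanced test set (roughly 738 high-risk vs. 11 low-risk customers), so recall and F1 on the high-risk class are close to what an always-high-risk classifier would score. The margins between models are small and should be confirmed with a confusion matrix or cross-validation before relying on this choice.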



In [83]:
# Registry helper bound to the registered model name.
manager = ModelRegistryManager(model_name=MODEL_NAME)

# promote_to_production() returns the promoted version and its metrics;
# how the version is chosen is decided inside ModelRegistryManager.
version, metrics = manager.promote_to_production()

promotion_message = f"Promoted {MODEL_NAME} version {version} to Production "
print(promotion_message)

[<ModelVersion: aliases=[], creation_timestamp=1765819007904, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1765822158013, metrics=[<Metric: dataset_digest=None, dataset_name=None, key='accuracy', model_id='m-0ea9f8c38d714992befd3d265fa8a87a', run_id='3a230646c6234a2390a6999cdf85d0c1', step=0, timestamp=1765819004594, value=0.986648865153538>,
 <Metric: dataset_digest=None, dataset_name=None, key='recall', model_id='m-0ea9f8c38d714992befd3d265fa8a87a', run_id='3a230646c6234a2390a6999cdf85d0c1', step=0, timestamp=1765819004601, value=1.0>,
 <Metric: dataset_digest=None, dataset_name=None, key='roc_auc', model_id='m-0ea9f8c38d714992befd3d265fa8a87a', run_id='3a230646c6234a2390a6999cdf85d0c1', step=0, timestamp=1765819004604, value=0.9836166543483617>,
 <Metric: dataset_digest=None, dataset_name=None, key='precision', model_id='m-0ea9f8c38d714992befd3d265fa8a87a', run_id='3a230646c6234a2390a6999cdf85d0c1', step=0, timestamp=1765819004597, 