In [35]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path
import os
# The repository root is taken to be the parent of the current working
# directory (the notebook is expected to run from notebooks/). It is placed
# at the front of sys.path, once, so project-local packages can be imported.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

print(f"Added to path: {project_root}")

# Load IPython's autoreload extension and switch it to mode 2, so that
# edits to project modules are picked up without restarting the kernel.
# NOTE: re-running this cell in a live kernel only prints an
# "already loaded" notice; it is harmless.
%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/credit-risk-probability-week4
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
from src import DataManager, ExperimentRunner
from scripts.constants import READY_TO_MODEL_DATA_FILE_NAME, Columns, TARGET_COL
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Local MLflow file store, kept in an "mlruns" directory under project_root.
MLRUNS_PATH = os.path.join(project_root, "mlruns")

# Build the file:// URI with pathlib instead of string concatenation.
# f"file://{path}" yields a malformed URI on Windows (drive letters and
# backslashes) and leaves spaces/special characters unescaped; as_uri()
# produces a well-formed, percent-encoded URI and gives the same result as
# before for plain POSIX paths. abspath() guards as_uri()'s requirement that
# the path be absolute.
mlflow.set_tracking_uri(Path(os.path.abspath(MLRUNS_PATH)).as_uri())

In [38]:
# Import Model Ready Data Set
dm = DataManager()

# "Unnamed: 0" is the label pandas gives a header-less leading CSV column
# (presumably a saved index - confirm against the export step). Drop it
# without inplace mutation, and tolerate its absence so this cell keeps
# working if the CSV is later written without that column.
model_ready_df = dm.load_csv(
    file_name=READY_TO_MODEL_DATA_FILE_NAME, load_clean=True
).drop(columns=["Unnamed: 0"], errors="ignore")

Loading ../data/processed/final_data.csv...
Sucessfully loaded ../data/processed/final_data.csv!


In [39]:
# Render the first rows of the loaded frame as a grid-style text table.
preview_table = tabulate(model_ready_df.head(), headers="keys", tablefmt="grid")
print(preview_table)

+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    | CustomerId      |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 | CustomerId_1    |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------

In [40]:
# Set the customer identifier column aside and continue with a copy of the
# frame that no longer contains it.
customer_id_col = Columns.CustomerId.value
customer_id_series = model_ready_df[customer_id_col]

working_df = model_ready_df.copy()
working_df = working_df.drop(columns=[customer_id_col])

# Preview the identifier-free frame as a grid-style text table.
working_preview = tabulate(working_df.head(), headers="keys", tablefmt="grid")
print(working_preview)

+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+--------------

In [41]:
# Separate the feature columns from the target column, then split both into
# train and test partitions. The split is stratified on the target and uses
# a fixed random_state so it is repeatable; test_size is left at the
# library default.
X = working_df.drop(columns=[TARGET_COL])
y = working_df[TARGET_COL]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [42]:
# Select the MLflow experiment used by the runs below; the call is kept as
# the last expression so its return value is shown as the cell output.
mlflow.set_experiment(experiment_name="credit-risk-models")

<Experiment: artifact_location='file:///Users/elshaday/DEV/10Academy/credit-risk-probability-week4/mlruns/141968432534321252', creation_time=1765717363504, experiment_id='141968432534321252', last_update_time=1765717363504, lifecycle_stage='active', name='credit-risk-models', tags={}>

### Model 1 - Logistic Regression (Baseline Model)

In [43]:
# Baseline model: a default LogisticRegression is wrapped in an
# ExperimentRunner, then trained on the train split, evaluated on the test
# split and logged, all inside one named MLflow run.
baseline_name = "LogisticRegression"
with mlflow.start_run(run_name=baseline_name):
    runner = ExperimentRunner(model=LogisticRegression(), model_name=baseline_name)
    runner.train(X_train, y_train)
    runner.evaluate(X_test, y_test)
    runner.log_to_mlflow()



### Model 2 - Random Forest Classifier (Second Model)

In [44]:
# Second model: a RandomForestClassifier with a fixed random_state is
# wrapped in an ExperimentRunner, then trained on the train split, evaluated
# on the test split and logged, all inside one named MLflow run.
forest_name = "RandomForest"
with mlflow.start_run(run_name=forest_name):
    forest_model = RandomForestClassifier(random_state=42)
    runner = ExperimentRunner(model=forest_model, model_name=forest_name)
    runner.train(X_train, y_train)
    runner.evaluate(X_test, y_test)
    runner.log_to_mlflow()

