In [23]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path
import os
project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/credit-risk-probability-week4
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
from src import DataManager
from scripts.constants import READY_TO_MODEL_DATA_FILE_NAME, Columns, TARGET_COL
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
import mlflow
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Local MLflow tracking store, kept in the "mlruns" folder under project_root.
MLRUNS_PATH = os.path.join(project_root, "mlruns")

# Build the file URI with pathlib instead of string concatenation: as_uri()
# produces a well-formed file:/// URI on every OS (including Windows drive
# paths) and percent-encodes characters such as spaces. It requires an
# absolute path, which project_root is because it derives from Path.cwd().
mlflow.set_tracking_uri(Path(MLRUNS_PATH).as_uri())

In [26]:
# Load the model-ready dataset through the project's DataManager.
dm = DataManager()
model_ready_df = dm.load_csv(
    file_name=READY_TO_MODEL_DATA_FILE_NAME,
    load_clean=True,
)
# Remove the "Unnamed: 0" column from the loaded frame, in place
# (presumably the index written out when the CSV was saved - TODO confirm).
model_ready_df.drop(labels=["Unnamed: 0"], axis="columns", inplace=True)

Loading ../data/processed/final_data.csv...
Sucessfully loaded ../data/processed/final_data.csv!


In [27]:
# Preview the first rows of the loaded frame as a grid-formatted text table.
print(
    tabulate(
        model_ready_df.head(),
        headers="keys",
        tablefmt="grid",
    )
)

+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    | CustomerId      |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 | CustomerId_1    |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+-----------------+--------------------+--------------------------+------------------------------+------------------------+--------------------

In [28]:
# Keep the customer identifiers aside, then build the working frame from a
# copy of the loaded data with the identifier column removed.
customer_id_series = model_ready_df[Columns.CustomerId.value]
working_df = model_ready_df.copy().drop(
    labels=[Columns.CustomerId.value],
    axis="columns",
)

# Preview the first rows of the working frame.
print(
    tabulate(
        working_df.head(),
        headers="keys",
        tablefmt="grid",
    )
)

+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+----------------------------+------------------------------+----------------+
|    |   TransactionCount |   TotalTransactionAmount |   UniqueProductCategoryCount |   TransactionAmountSTD |   AverageTransactionHour |   AverageTransactionAmount |   MostCommonChannel |   MostCommonTransactionDay |   MostCommonTransactionMonth |   is_high_risk |
|  0 |           -3.45181 |                 -1.42267 |                     -1.91337 |              -3.59831  |                -1.04588  |                  -1.42267  |            -1.85149 |                  -0.162459 |                    -0.44516  |              1 |
+----+--------------------+--------------------------+------------------------------+------------------------+--------------------------+----------------------------+---------------------+--------------

In [29]:
# Separate the features from the target column, then create the
# train/test partitions.
X = working_df.drop(columns=[TARGET_COL])
y = working_df[TARGET_COL]

# Stratifying on y keeps the class proportions of the target in both
# partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

### Model 1 - Logistic Regression (Baseline Model)

In [30]:
# Baseline classifier: logistic regression with scikit-learn's defaults.
lr_model = LogisticRegression()

# Train Model
# (message fixed: this model is Logistic Regression, not Linear Regression)
print("Training Logistic Regression...")
lr_model.fit(X_train, y_train)

print("\nPredicting...")
# Hard class predictions for the threshold-based metrics.
lr_y_pred = lr_model.predict(X_test)
# Column 1 of predict_proba is the probability of the second entry in
# lr_model.classes_ (the positive label when the target is encoded 0/1);
# ROC-AUC needs these scores rather than hard predictions.
lr_y_pred_proba = lr_model.predict_proba(X_test)[:, 1]

lr_accuracy = accuracy_score(y_test, lr_y_pred)
# zero_division=0 reports 0 instead of warning when nothing is predicted positive.
lr_precision = precision_score(y_test, lr_y_pred, zero_division=0)
lr_recall = recall_score(y_test, lr_y_pred)
lr_f1 = f1_score(y_test, lr_y_pred)
lr_roc_auc = roc_auc_score(y_test, lr_y_pred_proba)

# NOTE(review): the recorded run shows recall 1.0000 with precision equal to
# accuracy (0.9861), which is consistent with the model predicting the
# positive class for every test row - check the class balance and a
# confusion matrix before trusting these numbers.
print("\nLogistic Regression Model Performance")
print(f"Accuracy  : {lr_accuracy:.4f}")
print(f"Precision : {lr_precision:.4f}")
print(f"Recall    : {lr_recall:.4f}")
print(f"F1 Score  : {lr_f1:.4f}")
print(f"ROC-AUC   : {lr_roc_auc:.4f}")

Training Linear Regression...

Predicting...

Logistic Regression Model Performance
Accuracy  : 0.9861
Precision : 0.9861
Recall    : 1.0000
F1 Score  : 0.9930
ROC-AUC   : 0.9867


### Save Logistic Regression (Baseline Model) Results

In [31]:
# Select the experiment, then record the logistic-regression test metrics
# as a single MLflow run.
mlflow.set_experiment("credit-risk-models")
with mlflow.start_run(run_name="LogisticRegression"):
    for metric_name, metric_value in (
        ("Logistic Regression Accuracy", lr_accuracy),
        ("Logistic Regression Precision", lr_precision),
        ("Logistic Regression Recall", lr_recall),
        ("Logistic Regression F1 Score", lr_f1),
        ("Logistic Regression ROC-AUC", lr_roc_auc),
    ):
        mlflow.log_metric(metric_name, metric_value)

### Model 2 - Random Forest Classifier (Second Model)

In [32]:
# Second model: random forest. The seed is fixed (same value as the
# train/test split) because the forest's bootstrap sampling and feature
# selection are random; without it every run reports different metrics.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics.
rf_y_pred = rf_model.predict(X_test)
# Column 1 of predict_proba is the probability of the second entry in
# rf_model.classes_ (the positive label when the target is encoded 0/1).
rf_y_prob = rf_model.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_y_pred)
# zero_division=0 matches the logistic-regression evaluation: report 0
# instead of warning when nothing is predicted positive.
rf_precision = precision_score(y_test, rf_y_pred, zero_division=0)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)
rf_roc_auc = roc_auc_score(y_test, rf_y_prob)

print("Random Forest Evaluation Metrics")
print(f"Accuracy  : {rf_accuracy:.4f}")
print(f"Precision : {rf_precision:.4f}")
print(f"Recall    : {rf_recall:.4f}")
print(f"F1 Score  : {rf_f1:.4f}")
print(f"ROC-AUC   : {rf_roc_auc:.4f}")

Random Forest Evaluation Metrics
Accuracy  : 0.9829
Precision : 0.9871
Recall    : 0.9957
F1 Score  : 0.9914
ROC-AUC   : 0.9450


In [33]:
# Record the random-forest test metrics as a single MLflow run. No
# experiment is selected here, so the run goes to whichever experiment is
# currently active in the session.
with mlflow.start_run(run_name="Random Forest"):
    for metric_name, metric_value in (
        ("Random Forest Accuracy", rf_accuracy),
        ("Random Forest Precision", rf_precision),
        ("Random Forest Recall", rf_recall),
        ("Random Forest F1 Score", rf_f1),
        ("Random Forest ROC-AUC", rf_roc_auc),
    ):
        mlflow.log_metric(metric_name, metric_value)