## Import Libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sys
sys.path.append('..')
from scripts.modeling import ModelTrainer
from src.load import load_data
# Shared ModelTrainer instance; the cells below call its train_model, predict
# and evaluate_model methods for every model they fit.
model_trainer = ModelTrainer()

## Load training and test data

In [2]:
# Read the resampled training split and the test split from the processed
# data folder (features first, then target, for each split).
X_train, y_train = (
    load_data('../Data/processed/X_train_resample_credit.csv'),
    load_data('../Data/processed/y_train_resample_credit.csv'),
)
X_test, y_test = (
    load_data('../Data/processed/X_test_credit.csv'),
    load_data('../Data/processed/y_test_credit.csv'),
)
# Display the container types returned by load_data for features and target.
(type(X_train), type(y_train))

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

## Class Imbalance Check (SMOTE was applied to the training set only)

In [3]:
# Count rows per class label in the training target to confirm that the
# resampled data is balanced.
class_counts = y_train.value_counts()
print("Class distribution after SMOTE:\n", class_counts)

Class distribution after SMOTE:
 Class
0        226602
1        226602
Name: count, dtype: int64


##  Baseline Model: Logistic Regression


In [4]:
# The targets were loaded as DataFrames; squeeze them so the shapes printed
# below are one-dimensional.
y_train, y_test = y_train.squeeze(), y_test.squeeze()

# Sanity check: compare feature and target shapes for each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(453204, 30) (453204,)
(56746, 30) (56746,)


In [5]:
# Baseline: fit a logistic regression (saga solver, fixed seed) on the
# training split via the shared trainer.
lr = model_trainer.train_model(
    LogisticRegression(solver='saga', max_iter=3000, random_state=42, n_jobs=-1),
    X_train,
    y_train,
)

# Score the test split; presumably returns hard labels and class scores
# (verify against ModelTrainer.predict).
y_pred_lr, y_proba_lr = model_trainer.predict(lr, X_test)

# Report F1-score, AUC-PR and the confusion matrix for the baseline.
metrics_lr = model_trainer.evaluate_model(y_test, y_pred_lr, y_proba_lr)

F1-score: 0.10006027727546715
AUC-PR: 0.7150096950022679
Confusion Matrix:
 [[55170  1481]
 [   12    83]]


## Build Ensemble Model: Random Forest

In [6]:

# Ensemble: fit a random forest (200 trees, depth capped at 10, fixed seed)
# on the training split via the shared trainer.
rf = model_trainer.train_model(
    RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1),
    X_train,
    y_train,
)

# Score the test split; presumably returns hard labels and class scores
# (verify against ModelTrainer.predict).
y_pred_rf, y_proba_rf = model_trainer.predict(rf, X_test)

# Report F1-score, AUC-PR and the confusion matrix for the ensemble model.
metrics_rf = model_trainer.evaluate_model(y_test, y_pred_rf, y_proba_rf)

F1-score: 0.6666666666666666
AUC-PR: 0.7846592931829347
Confusion Matrix:
 [[56592    59]
 [   18    77]]


## Final Model Selection (Clear Winner)
### Best Model: Random Forest
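Why?

- Best F1-score: 0.667 vs. 0.100 for Logistic Regression.
- Best AUC-PR: 0.785 vs. 0.715 for Logistic Regression.
- Massive reduction in false positives: 59 vs. 1,481 on the test set.
- Stable and reliable predictions: precision on the fraud class rises from about 5% (83 of 1,564 flagged) to about 57% (77 of 136 flagged).

Trade-off: Random Forest misses slightly more fraud cases (18 false negatives vs. 12, i.e. recall of about 81% vs. 87%), so the decision threshold should be reviewed if missed fraud is costlier than false alarms.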