## Import Libraries

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sys
sys.path.append('..')
from scripts.modeling import ModelTrainer
from src.load import load_data
# One shared ModelTrainer instance; the model cells below call its
# train_model, predict and evaluate_model methods.
model_trainer = ModelTrainer()

## Load training and test data

In [2]:
# Load the resampled training files and the test files from the processed-data
# folder (paths are relative to this notebook's directory).
X_train = load_data('../Data/processed/x_train_resample.csv')
y_train = load_data('../Data/processed/y_train_resample.csv')
X_test = load_data('../Data/processed/X_test.csv')
y_test = load_data('../Data/processed/y_test.csv')

# Fail fast if the four files are out of sync: every feature row needs a label,
# and train/test must expose the same number of feature columns.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"
assert X_train.shape[1] == X_test.shape[1], "train / test feature counts differ"

In [12]:
# Show which container types are currently bound to the training features and labels.
tuple(type(obj) for obj in (X_train, y_train))


(pandas.core.frame.DataFrame, numpy.ndarray)

## Class Balance Check (SMOTE was applied upstream, to the training set only)

In [3]:
# Count how many training labels fall into each class, then print the tally.
class_counts = y_train.value_counts()
print("Class distribution after SMOTE:\n", class_counts)

Class distribution after SMOTE:
 class
0        93502
1        93502
Name: count, dtype: int64


##  Baseline Model: Logistic Regression


In [11]:
# Drop length-1 axes from both label containers so they report a 1-D shape,
# then print the (features, labels) shapes for each split.
y_train, y_test = (labels.squeeze() for labels in (y_train, y_test))
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(187004, 193) (187004,)
(25830, 193) (25830,)


In [7]:
# Baseline: logistic regression (SAGA solver) trained on the resampled training data.
lr = model_trainer.train_model(
    LogisticRegression(solver='saga', max_iter=3000, random_state=42, n_jobs=-1),
    X_train,
    y_train,
)

# predict returns a pair, unpacked here as predicted labels and probabilities.
y_pred_lr, y_proba_lr = model_trainer.predict(lr, X_test)

# evaluate_model prints F1, AUC-PR and the confusion matrix for the test split.
metrics_lr = model_trainer.evaluate_model(y_test, y_pred_lr, y_proba_lr)

F1-score: 0.32931392931392933
AUC-PR: 0.33180730165856814
Confusion Matrix:
 [[17794  5582]
 [  870  1584]]


## Build Ensemble Model: Random Forest

In [9]:

# Ensemble model: 200 depth-limited trees with a fixed seed.
# n_jobs=-1 builds the trees on all available cores, matching the logistic-regression
# cell; per-tree seeds are derived from random_state, so the fitted forest does not
# depend on the number of workers.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf = model_trainer.train_model(rf, X_train, y_train)  # Train model
y_pred_rf, y_proba_rf = model_trainer.predict(rf, X_test)  # Predictions
# evaluate_model prints F1, AUC-PR and the confusion matrix for the test split.
metrics_rf = model_trainer.evaluate_model(y_test, y_pred_rf, y_proba_rf)  # Evaluation

F1-score: 0.7038107752956636
AUC-PR: 0.6354495336055523
Confusion Matrix:
 [[23364    12]
 [ 1115  1339]]


## Final Model Selection (Clear Winner)
### Best Model: Random Forest
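Why?

- Best F1-score: 0.70 vs. 0.33 for logistic regression.

- Best AUC-PR: 0.64 vs. 0.33.

- Massive reduction in false positives: 12 vs. 5,582 on the test set.

- Reliable positive predictions: about 99% precision (1,339 of the 1,351 test cases flagged as class 1 are correct). Stability across random seeds or cross-validation folds has not been measured yet.

Caveat: Random Forest recalls fewer positives than the baseline (1,339 of 2,454, about 55%, vs. 1,584, about 65%), so false negatives rise from 870 to 1,115. If missed positives are costly, tune the decision threshold rather than relying on the default cut-off.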