## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the project-local `scripts` package below resolve when the notebook is
# run from its own folder (verify against the repository layout).
sys.path.append('..')
from scripts.modeling import ModelTrainer

# Single helper instance reused by the later cells for training, prediction,
# evaluation, and cross-validation.
model_trainer = ModelTrainer()

## Load Feature and Target Data

In [2]:
# Locations of the saved feature matrix and target vector.
feature_file = '../Data/processed/x_credit.npy'
target_file = '../Data/processed/y_credit.npy'

# NOTE(review): allow_pickle=True permits np.load to unpickle object arrays,
# and unpickling can execute arbitrary code. Only load files from a trusted
# source; if the arrays are plain numeric, this flag may be unnecessary —
# confirm against how the .npy files were written.
X = np.load(feature_file, allow_pickle=True)
y = np.load(target_file, allow_pickle=True)

# Quick sanity check: display the container types that were loaded.
type(X), type(y)

(numpy.ndarray, numpy.ndarray)

## Data Preprocessing

In [3]:
# Stratified train/test split: hold out 20% of the samples for testing,
# stratifying on y so both partitions keep the same class proportions.
# The fixed random_state makes the split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

## Baseline Model: Logistic Regression


In [4]:
# Baseline classifier settings: SAGA solver with a generous iteration cap,
# "balanced" class weights, a fixed seed, and all available cores requested.
lr_settings = {
    "solver": 'saga',
    "max_iter": 3000,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
lr = LogisticRegression(**lr_settings)

# Fit on the training split; `lr` is rebound to whatever train_model returns.
lr = model_trainer.train_model(lr, X_train, y_train)

# Predict on the held-out split; the helper returns two values, used here as
# the predicted labels and the scores/probabilities for evaluation.
lr_outputs = model_trainer.predict(lr, X_test)
y_pred_lr, y_proba_lr = lr_outputs

# Score against the true test labels. The F1 / AUC-PR / confusion-matrix
# text under this cell is presumably printed by evaluate_model itself.
metrics_lr = model_trainer.evaluate_model(y_test, y_pred_lr, y_proba_lr)

F1-score: 0.10246913580246914
AUC-PR: 0.7346170640457217
Confusion Matrix:
 [[55209  1442]
 [   12    83]]




## Build Ensemble Model: Random Forest

In [5]:

# Ensemble classifier settings: 200 trees limited to depth 10, at least
# 5 samples required to split a node, "balanced" class weights, a fixed
# seed, and all available cores.
rf_settings = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_settings)

# Fit on the training split; `rf` is rebound to whatever train_model returns.
rf = model_trainer.train_model(rf, X_train, y_train)

# Predict on the held-out split; the helper returns two values, used here as
# the predicted labels and the scores/probabilities for evaluation.
rf_outputs = model_trainer.predict(rf, X_test)
y_pred_rf, y_proba_rf = rf_outputs

# Score against the true test labels. The F1 / AUC-PR / confusion-matrix
# text under this cell is presumably printed by evaluate_model itself.
metrics_rf = model_trainer.evaluate_model(y_test, y_pred_rf, y_proba_rf)

F1-score: 0.8
AUC-PR: 0.7758886978338179
Confusion Matrix:
 [[56641    10]
 [   25    70]]


### Stratified K-Fold Cross-Validation (k=5)

In [6]:
# Cross-validate both estimators on the training split only, so the held-out
# test split stays untouched. Only the second element of each returned pair
# (the summary results) is kept; the first is discarded.
# NOTE(review): `lr` and `rf` were already fitted in earlier cells — confirm
# that cross_validation refits (or clones) the estimator for every fold.
cv_args = (X_train, y_train)
_, lr_cv_results = model_trainer.cross_validation(lr, *cv_args)
_, rf_cv_results = model_trainer.cross_validation(rf, *cv_args)

In [7]:
# Build a side-by-side comparison of the cross-validation summaries.
# Each summary mapping is read for its '<metric>_mean' and '<metric>_std'
# entries, rendered as "mean ± std" with four decimal places.
# The display labels are paired with their summaries once, so the table rows
# cannot drift out of order between columns.
cv_summaries = {
    'Logistic Regression': lr_cv_results,
    'Random Forest': rf_cv_results,
}

cv_results = {
    'Model': list(cv_summaries),
    'F1 Score (mean ± std)': [
        f"{summary['f1_mean']:.4f} ± {summary['f1_std']:.4f}"
        for summary in cv_summaries.values()
    ],
    'AUC-PR (mean ± std)': [
        f"{summary['auc_pr_mean']:.4f} ± {summary['auc_pr_std']:.4f}"
        for summary in cv_summaries.values()
    ],
}
comparison_df = pd.DataFrame(cv_results)

# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display instead of plain printed text.
comparison_df

                 Model F1 Score (mean ± std) AUC-PR (mean ± std)
0  Logistic Regression       0.1087 ± 0.0127     0.7505 ± 0.0288
1        Random Forest       0.8333 ± 0.0291     0.8120 ± 0.0269


In [8]:
# Recommend whichever model has the higher mean cross-validated F1 score.
# Random Forest must be strictly better; a tie falls back to Logistic
# Regression. AUC-PR is not considered in this decision.
if rf_cv_results['f1_mean'] > lr_cv_results['f1_mean']:
    recommended_model = "Random Forest"
else:
    recommended_model = "Logistic Regression"
print(f"Recommended model based on CV F1-score: {recommended_model}")


Recommended model based on CV F1-score: Random Forest
