In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score


In [2]:
# -------------------------
# Load credit card dataset
# -------------------------
credit_df = pd.read_csv("../data/cleaned_credit_data.csv")

In [3]:
# Separate features & label
X = credit_df.drop(columns=['Class'])
y = credit_df['Class']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)


In [4]:
# -------------------------
# 1. Logistic Regression
# -------------------------
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_probs = lr.predict_proba(X_test)[:, 1]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# -------------------------
# 2. Random Forest
# -------------------------
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_probs = rf.predict_proba(X_test)[:, 1]

In [6]:
# -------------------------
# 3. Evaluate
# -------------------------
def evaluate(y_true, preds, probs, model_name):
    print(f"\n📊 {model_name}")
    print("Confusion Matrix:\n", confusion_matrix(y_true, preds))
    print("\nClassification Report:\n", classification_report(y_true, preds, digits=4))
    print("🔹 ROC-AUC Score:", roc_auc_score(y_true, probs))
    print("🔸 PR-AUC Score :", average_precision_score(y_true, probs))



In [7]:
evaluate(y_test, lr_preds, lr_probs, "Logistic Regression")
evaluate(y_test, rf_preds, rf_probs, "Random Forest")


📊 Logistic Regression
Confusion Matrix:
 [[84963    13]
 [   59    83]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9993    0.9998    0.9996     84976
           1     0.8646    0.5845    0.6975       142

    accuracy                         0.9992     85118
   macro avg     0.9319    0.7922    0.8485     85118
weighted avg     0.9991    0.9992    0.9991     85118

🔹 ROC-AUC Score: 0.9569694574905657
🔸 PR-AUC Score : 0.6932525801723478

📊 Random Forest
Confusion Matrix:
 [[84971     5]
 [   31   111]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998     84976
           1     0.9569    0.7817    0.8605       142

    accuracy                         0.9996     85118
   macro avg     0.9783    0.8908    0.9301     85118
weighted avg     0.9996    0.9996    0.9996     85118

🔹 ROC-AUC Score: 0.9660226764939097
🔸 PR-AUC Score : 0.80392659794692
