In [1]:
import sys
from pathlib import Path

# Ensure project root is on path
project_root = Path().resolve().parents[2]
sys.path.append(str(project_root))


In [None]:
# Import packages and libraries
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
)

from credit_risk.data.load_data import load_cleaned_data
from credit_risk.data.split_data import DataSplitter
from credit_risk.features.build_features import FeatureBuilder
from credit_risk.utils.logging import get_logger
from credit_risk.utils.config import model_config



In [3]:
# Notebook-wide logger; the name passed here is what appears in the logger
# column of every record emitted from these cells.
EXPERIMENT_LOGGER_NAME = "logistic_sgd_experiments"
logger = get_logger(EXPERIMENT_LOGGER_NAME)
logger.info("Starting Logistic Regression with SGD experiments")


2026-01-22 01:58:00 | INFO | logistic_sgd_experiments | Starting Logistic Regression with SGD experiments


In [4]:
# Fetch the cleaned dataset through the project loader, then record its
# (rows, columns) shape in the experiment log.
df = load_cleaned_data()
logger.info(f"Loaded cleaned data: {df.shape}")


2026-01-22 01:58:05 | INFO | credit_risk.data.load_data | Loading cleaned data from D:\Projects\lending-club-credit-risk\data\processed\cleaned_data.parquet
2026-01-22 01:58:07 | INFO | credit_risk.data.load_data | Cleaned data shape: (1345309, 30)
2026-01-22 01:58:07 | INFO | logistic_sgd_experiments | Loaded cleaned data: (1345309, 30)


In [5]:
# Partition the full frame with the project splitter. split() returns three
# parts; only the first two (train, validation) are used in this notebook and
# the third is deliberately discarded.
splitter = DataSplitter()
train_df, val_df, _ = splitter.split(df)

# Log the size of each retained partition.
logger.info(f"Train shape: {train_df.shape}")
logger.info(f"Validation shape: {val_df.shape}")


2026-01-22 01:58:18 | INFO | credit_risk.data.split_data | Split sizes → train=941716, val=201796, test=201797
2026-01-22 01:58:18 | INFO | logistic_sgd_experiments | Train shape: (941716, 30)
2026-01-22 01:58:18 | INFO | logistic_sgd_experiments | Validation shape: (201796, 30)


In [6]:
# One FeatureBuilder instance is shared by both splits: it is called with
# fit=True on the training frame and fit=False on the validation frame
# (presumably so fitted transformations are learned from training data only —
# confirm in FeatureBuilder).
feature_builder = FeatureBuilder()

X_train, y_train = feature_builder.build_features(train_df, fit=True)
X_val, y_val = feature_builder.build_features(val_df, fit=False)

# build_features may hand back None for the target; fail fast here rather
# than deep inside model fitting or metric computation.
assert y_train is not None, "y_train is None – target column missing!"
assert y_val is not None, "y_val is None – target column missing!"

# Record the resulting design-matrix dimensions.
logger.info(f"X_train shape: {X_train.shape}")
logger.info(f"X_val shape: {X_val.shape}")


2026-01-22 01:58:26 | INFO | credit_risk.features.build_features | Building features


  df["earliest_cr_line"] = pd.to_datetime(df["earliest_cr_line"], errors="coerce")


2026-01-22 01:58:39 | INFO | credit_risk.features.build_features | Building features


  df["earliest_cr_line"] = pd.to_datetime(df["earliest_cr_line"], errors="coerce")


2026-01-22 01:58:40 | INFO | logistic_sgd_experiments | X_train shape: (941716, 135)
2026-01-22 01:58:40 | INFO | logistic_sgd_experiments | X_val shape: (201796, 135)


In [7]:
# Hyperparameters come from the shared project config rather than being
# hard-coded in the notebook; they are forwarded unchanged as keyword args.
sgd_params = model_config.SGD_LOGISTIC_PARAMS
sgd_logistic = SGDClassifier(**sgd_params)

logger.info("Initialized SGD Logistic Regression model")


2026-01-22 02:03:37 | INFO | logistic_sgd_experiments | Initialized SGD Logistic Regression model


In [8]:
# Train the classifier on the training features and targets built earlier.
sgd_logistic.fit(X_train, y_train)

logger.info("Model training completed")


2026-01-22 02:05:33 | INFO | logistic_sgd_experiments | Model training completed


In [9]:
# Hard class predictions for the validation set.
y_val_pred = sgd_logistic.predict(X_val)

# Per-class probabilities; column index 1 is kept as the score used by the
# ranking metrics (assumes that column corresponds to label 1 — verify
# against sgd_logistic.classes_).
val_class_proba = sgd_logistic.predict_proba(X_val)
y_val_proba = val_class_proba[:, 1]


In [10]:
# Ranking quality: ROC-AUC computed from the predicted scores.
roc_auc = roc_auc_score(y_val, y_val_proba)
logger.info(f"Validation ROC-AUC: {roc_auc:.4f}")

# Per-class precision / recall / F1 computed from the hard predictions.
print("Classification Report (Validation):")
val_report = classification_report(y_val, y_val_pred)
print(val_report)


2026-01-22 02:06:30 | INFO | logistic_sgd_experiments | Validation ROC-AUC: 0.6945
Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.78      0.94      0.85    151847
           1       0.50      0.18      0.27     49949

    accuracy                           0.75    201796
   macro avg       0.64      0.56      0.56    201796
weighted avg       0.71      0.75      0.71    201796



In [11]:
# Mean of each target vector, reported as the positive-class share of the
# split; a gap between the two numbers means the splits differ in prevalence.
train_pos_ratio = y_train.mean()
print("Positive class ratio (train):", train_pos_ratio)
val_pos_ratio = y_val.mean()
print("Positive class ratio (val):", val_pos_ratio)


Positive class ratio (train): 0.1869066682524243
Positive class ratio (val): 0.2475222501932645


In [12]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation set. Rows are true labels and columns are
# predicted labels. labels=[0, 1] pins the layout to 2x2 so the four-way
# unpacking below stays valid even if one class never appears.
cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])

# With labels ordered [0, 1], ravel() yields tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix (Validation)")
print(cm)

# Captions follow the convention already used for TN/TP: class 1 is the bad
# loan and predicting 1 means rejecting the application. Therefore:
#   false positive = true 0 predicted 1 -> a good loan that was rejected
#   false negative = true 1 predicted 0 -> a bad loan that was approved
print("\nBreakdown:")
print(f"True Negatives (Good loans correctly approved): {tn}")
print(f"False Positives (Good loans rejected): {fp}")
print(f"False Negatives (Bad loans approved): {fn}")
print(f"True Positives (Bad loans correctly rejected): {tp}")


Confusion Matrix (Validation)
[[142738   9109]
 [ 40846   9103]]

Breakdown:
True Negatives (Good loans correctly approved): 142738
False Positives (Bad loans approved): 9109
False Negatives (Good loans rejected): 40846
True Positives (Bad loans correctly rejected): 9103


In [13]:
def ks_statistic(y_true, y_prob):
    """
    Compute the Kolmogorov-Smirnov (KS) statistic of a binary scorer.

    Observations are ranked by score from highest to lowest. KS is the
    largest absolute gap between the cumulative share of bads (label 1) and
    the cumulative share of goods (label 0) captured down to a score
    threshold.

    The gap is evaluated only at distinct score values. Observations that
    share a score cannot be separated by any threshold, so they enter the
    cumulative counts together; comparing row by row inside a tie would make
    the result depend on an arbitrary ordering and overstate KS.

    Parameters
    ----------
    y_true : array-like
        Labels; entries equal to 1 count as bad, entries equal to 0 as good.
        Any other value is counted in neither group.
    y_prob : array-like
        Scores, paired with ``y_true`` by position (any pandas index is
        ignored).

    Returns
    -------
    float
        KS statistic in [0, 1], or NaN when either class is absent
        (including empty input).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different lengths.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            "y_true and y_prob must have the same length "
            f"(got {labels.shape[0]} and {scores.shape[0]})"
        )

    is_bad = labels == 1
    is_good = labels == 0
    n_bad = int(is_bad.sum())
    n_good = int(is_good.sum())
    # Without both classes the cumulative shares are undefined.
    if n_bad == 0 or n_good == 0:
        return float("nan")

    # Highest score first; a stable sort keeps the ranking deterministic.
    order = np.argsort(-scores, kind="stable")
    ranked_scores = scores[order]
    cum_bad = np.cumsum(is_bad[order]) / n_bad
    cum_good = np.cumsum(is_good[order]) / n_good

    # True at the last row of every run of equal scores, i.e. at each point
    # where a real threshold could be placed.
    run_end = np.append(ranked_scores[1:] != ranked_scores[:-1], True)

    return float(np.abs(cum_bad - cum_good)[run_end].max())


# Class-separation measure on the validation scores, sent both to the
# experiment log and to the cell output.
ks = ks_statistic(y_val, y_val_proba)

logger.info(f"Validation KS Statistic: {ks:.4f}")
print(f"KS Statistic (Validation): {ks:.4f}")


2026-01-22 02:16:00 | INFO | logistic_sgd_experiments | Validation KS Statistic: 0.2794
KS Statistic (Validation): 0.2794
