## Logistic SGD Hyperparameter Search (Exhaustive Grid)

This notebook performs an exhaustive search over all combinations in the defined grid and reports the best validation ROC-AUC.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid

from credit_risk.features.build_features import FeatureBuilder
from credit_risk.utils.paths import project_root


In [2]:
train_df = pd.read_parquet(project_root / 'data' / 'samples' / 'train.parquet')
val_df = pd.read_parquet(project_root / 'data' / 'samples' / 'val.parquet')

feature_builder = FeatureBuilder()
X_train, y_train = feature_builder.build_features(train_df, fit=True)
X_val, y_val = feature_builder.build_features(val_df, fit=False)

X_train.shape, X_val.shape

2026-02-17 21:24:45 | INFO | credit_risk.features.build_features | Building features
2026-02-17 21:25:00 | INFO | credit_risk.features.build_features | Building features


((941716, 135), (201796, 135))

In [3]:
param_grid = {
    'alpha': [1e-5, 1e-4, 1e-3],
    'penalty': ['l2', 'l1'],
    'learning_rate': ['optimal', 'adaptive'],
    'eta0': [0.001, 0.01],
    'max_iter': [1000, 2000],
    'tol': [1e-3, 1e-4],
}

grid = list(ParameterGrid(param_grid))
print(f'Total combinations: {len(grid)}')

Total combinations: 96


In [4]:
results = []
best_auc = -1.0
best_params = None

for idx, params in enumerate(grid, start=1):
    clf = SGDClassifier(
        loss='log_loss',
        random_state=42,
        **params,
    )
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_prob)

    row = {'trial': idx, 'roc_auc': auc, **params}
    results.append(row)

    if auc > best_auc:
        best_auc = auc
        best_params = params

    if idx % 10 == 0 or idx == len(grid):
        print(f'Completed {idx}/{len(grid)} | current best ROC-AUC={best_auc:.6f}')

results_df = pd.DataFrame(results).sort_values('roc_auc', ascending=False).reset_index(drop=True)
results_df.head(10)

Completed 10/96 | current best ROC-AUC=0.695997
Completed 20/96 | current best ROC-AUC=0.695997
Completed 30/96 | current best ROC-AUC=0.696143
Completed 40/96 | current best ROC-AUC=0.696982
Completed 50/96 | current best ROC-AUC=0.696982
Completed 60/96 | current best ROC-AUC=0.696982
Completed 70/96 | current best ROC-AUC=0.696982
Completed 80/96 | current best ROC-AUC=0.696982
Completed 90/96 | current best ROC-AUC=0.696982
Completed 96/96 | current best ROC-AUC=0.696982


Unnamed: 0,trial,roc_auc,alpha,eta0,learning_rate,max_iter,penalty,tol
0,50,0.696982,0.0001,0.01,optimal,1000,l2,0.0001
1,54,0.696982,0.0001,0.01,optimal,2000,l2,0.0001
2,38,0.696982,0.0001,0.001,optimal,2000,l2,0.0001
3,34,0.696982,0.0001,0.001,optimal,1000,l2,0.0001
4,26,0.696143,1e-05,0.01,adaptive,1000,l2,0.0001
5,30,0.696143,1e-05,0.01,adaptive,2000,l2,0.0001
6,29,0.696118,1e-05,0.01,adaptive,2000,l2,0.001
7,25,0.696118,1e-05,0.01,adaptive,1000,l2,0.001
8,4,0.695997,1e-05,0.001,optimal,1000,l1,0.0001
9,8,0.695997,1e-05,0.001,optimal,2000,l1,0.0001


In [5]:
print('Best ROC-AUC:', round(best_auc, 6))
print('Best params:', best_params)

Best ROC-AUC: 0.696982
Best params: {'alpha': 0.0001, 'eta0': 0.001, 'learning_rate': 'optimal', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
