In [1]:
!pip install -q torch torchvision scikit-learn pandas pyarrow tqdm

from google.colab import drive
drive.mount('/content/drive')

import os, random, math, time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_fscore_support
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


Mounted at /content/drive
Device: cpu


In [2]:
# Edit DATA_PATH if you already have processed file
# Accepts parquet or csv. If your processed file exists, set it.
DATA_PATH = '/content/drive/MyDrive/loan-data/processed_sample.parquet'  # change if needed
FALLBACK_RAW = '/content/drive/MyDrive/loan-data/accepted_sample.csv'     # raw sample if processed not present

def load_data(path):
    if path.endswith('.parquet'):
        return pd.read_parquet(path)
    else:
        return pd.read_csv(path, low_memory=False)

if os.path.exists(DATA_PATH):
    df = load_data(DATA_PATH)
    print("Loaded processed file:", DATA_PATH)
elif os.path.exists(FALLBACK_RAW):
    print("Processed not found. Will do quick fallback preprocessing from raw sample.")
    raw = pd.read_csv(FALLBACK_RAW, low_memory=False)
    # Quick fallback preprocess: keep small set of numeric features + target
    # This is minimal; replace with your real preprocessing if you have it.
    # Create binary target
    def map_target(x):
        x = str(x).lower()
        if 'fully paid' in x: return 0
        if 'charged off' in x or 'default' in x: return 1
        return np.nan
    raw['target'] = raw['loan_status'].apply(map_target)
    keep = ['loan_amnt','int_rate','annual_inc','dti','fico_range_low','fico_range_high','term','emp_length','purpose','home_ownership','target']
    keep = [c for c in keep if c in raw.columns]
    df = raw[keep].copy()
    # basic cleaning
    if 'int_rate' in df.columns:
        df['int_rate'] = df['int_rate'].astype(str).str.replace('%','').astype(float)
    # simplify term like "36 months" -> 36
    if 'term' in df.columns:
        df['term'] = df['term'].astype(str).str.extract(r'(\d+)').astype(float)
    # fill numeric na with median, categorical with mode
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in df.columns if c not in num_cols and c!='target']
    for c in num_cols:
        df[c] = df[c].fillna(df[c].median())
    for c in cat_cols:
        df[c] = df[c].fillna('MISSING').astype(str)
    # one-hot a few low-cardinal cats
    from sklearn.preprocessing import OneHotEncoder
    low_cat = [c for c in ['emp_length','purpose','home_ownership'] if c in df.columns]
    if low_cat:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        ohe_mat = ohe.fit_transform(df[low_cat])
        ohe_cols = list(ohe.get_feature_names_out(low_cat))
        ohe_df = pd.DataFrame(ohe_mat, columns=ohe_cols, index=df.index)
        df = pd.concat([df.drop(columns=low_cat), ohe_df], axis=1)
    # drop rows with no target
    df = df[df['target'].notna()].reset_index(drop=True)
    print("Fallback processed rows:", df.shape)
else:
    raise FileNotFoundError("No processed file found and no raw sample found. Upload one to Drive and set DATA_PATH.")

# Final check
print("Data shape:", df.shape)
print("Columns sample:", df.columns.tolist()[:30])


Loaded processed file: /content/drive/MyDrive/loan-data/processed_sample.parquet
Data shape: (113034, 15)
Columns sample: ['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'fico_range_low', 'fico_range_high', 'term_ 36 months', 'term_ 60 months', 'home_ownership_ANY', 'home_ownership_MORTGAGE', 'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'target']


In [3]:
# Ensure target is int
df['target'] = df['target'].astype(int)

# Define feature columns (all except 'target')
FEATURES = [c for c in df.columns if c != 'target']
print("Num features:", len(FEATURES))

# Optional: drop any remaining non-numeric columns
non_numeric = [c for c in FEATURES if not np.issubdtype(df[c].dtype, np.number)]
if non_numeric:
    print("Dropping non-numeric features (very small projects should encode instead):", non_numeric)
    FEATURES = [c for c in FEATURES if c not in non_numeric]
    df = df[FEATURES + ['target']]

X = df[FEATURES].values.astype(np.float32)
y = df['target'].values.astype(np.int64)

# train/val/test split (stratify)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=SEED, stratify=y_temp)

print("train/val/test sizes:", X_train.shape[0], X_val.shape[0], X_test.shape[0])

# Create PyTorch datasets/dataloaders
BATCH = 256
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, drop_last=False)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH, shuffle=False)


Num features: 14
train/val/test sizes: 72341 18086 22607


In [4]:
INPUT_DIM = X_train.shape[1]
HIDDEN = [256, 128]   # change if desired
DROPOUT = 0.2

class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dims, dropout=0.2):
        super().__init__()
        layers = []
        prev = in_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(dropout))
            prev = h
        layers.append(nn.Linear(prev,1))  # single logit
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x).squeeze(-1)  # returns (batch,)

model = MLP(INPUT_DIM, HIDDEN, dropout=DROPOUT).to(DEVICE)
print(model)


MLP(
  (net): Sequential(
    (0): Linear(in_features=14, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [9]:
# -------------------------
# 4) Loss, optimizer, scheduler, utils  (FIXED)
# -------------------------

# compute pos_weight from training labels
pos = int(y_train.sum())
neg = int(len(y_train) - pos)
pos_weight = torch.tensor([neg / (pos + 1e-9)], dtype=torch.float32).to(DEVICE)
print("pos,neg,pos_weight:", pos, neg, float(pos_weight.item()))

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# FIXED: remove verbose=True because your PyTorch version doesn't support it
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3
)

def get_preds_labels(loader):
    model.eval()
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(DEVICE).float()
            logits = model(xb)                 # (batch,1)
            probs  = torch.sigmoid(logits)     # (batch,1)
            all_probs.append(probs.cpu().numpy())
            all_labels.append(yb.cpu().numpy())
    probs = np.vstack(all_probs).reshape(-1)
    labels = np.vstack(all_labels).reshape(-1)
    return probs, labels

def eval_on(loader):
    probs, labels = get_preds_labels(loader)
    auc = roc_auc_score(labels, probs) if len(np.unique(labels))>1 else float('nan')
    preds = (probs >= 0.5).astype(int)
    f1 = f1_score(labels, preds, zero_division=0)
    precision, recall, _, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    return {
        'auc': float(auc),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }


pos,neg,pos_weight: 8642 63699 7.370863437652588


In [10]:
# -------------------------
# 5) Training loop
# -------------------------

EPOCHS = 30
best_val = -1.0
best_state = None
CKPT_PATH = os.path.join(PROJ_DRIVE, "best_mlp_checkpoint.pt")

for epoch in range(1, EPOCHS+1):
    model.train()
    epoch_loss = 0.0

    for xb, yb in train_loader:
        xb = xb.to(DEVICE).float()
        yb = yb.to(DEVICE).float()   # shape (batch,1)

        optimizer.zero_grad()
        logits = model(xb)           # shape (batch,1)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * xb.size(0)

    epoch_loss = epoch_loss / len(train_loader.dataset)

    # eval
    val_metrics = eval_on(val_loader)
    print(f"Epoch {epoch:02d} loss {epoch_loss:.6f}  val_auc {val_metrics['auc']:.4f}  val_f1 {val_metrics['f1']:.4f}")

    # scheduler (NO verbose arg)
    scheduler.step(val_metrics['auc'] if not math.isnan(val_metrics['auc']) else val_metrics['f1'])

    # save best
    if val_metrics['auc'] > best_val:
        best_val = val_metrics['auc']
        best_state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            'val': val_metrics
        }
        torch.save(best_state, CKPT_PATH)
        print("  --> saved best model")


Epoch 01 loss 1.125103  val_auc 0.6944  val_f1 0.2926
  --> saved best model
Epoch 02 loss 1.111750  val_auc 0.6959  val_f1 0.2921
  --> saved best model
Epoch 03 loss 1.110088  val_auc 0.6954  val_f1 0.2954
Epoch 04 loss 1.108688  val_auc 0.6960  val_f1 0.2918
  --> saved best model
Epoch 05 loss 1.108385  val_auc 0.6983  val_f1 0.2969
  --> saved best model
Epoch 06 loss 1.106619  val_auc 0.6970  val_f1 0.2966
Epoch 07 loss 1.106910  val_auc 0.6957  val_f1 0.2932
Epoch 08 loss 1.105179  val_auc 0.6983  val_f1 0.2942
  --> saved best model
Epoch 09 loss 1.103831  val_auc 0.6973  val_f1 0.2909
Epoch 10 loss 1.103403  val_auc 0.6968  val_f1 0.2897
Epoch 11 loss 1.102219  val_auc 0.6986  val_f1 0.2923
  --> saved best model
Epoch 12 loss 1.101543  val_auc 0.6968  val_f1 0.2932
Epoch 13 loss 1.101975  val_auc 0.6985  val_f1 0.2944
Epoch 14 loss 1.101575  val_auc 0.6979  val_f1 0.2929
Epoch 15 loss 1.102107  val_auc 0.6985  val_f1 0.2948
Epoch 16 loss 1.099842  val_auc 0.6982  val_f1 0.294

In [11]:
EPOCHS = 30
best_val = -1
best_state = None

for epoch in range(1, EPOCHS+1):
    model.train()
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(DEVICE).float()
        yb = yb.to(DEVICE).float()
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss = epoch_loss / len(train_loader.dataset)

    # eval
    val_metrics = eval_on(val_loader)
    print(f"Epoch {epoch:02d} loss {epoch_loss:.4f}  val_auc {val_metrics['auc']:.4f}  val_f1 {val_metrics['f1']:.4f}")

    # scheduler and save best
    scheduler.step(val_metrics['auc'] if not math.isnan(val_metrics['auc']) else val_metrics['f1'])
    if val_metrics['auc'] > best_val:
        best_val = val_metrics['auc']
        best_state = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch, 'val': val_metrics }
        # save checkpoint to Drive
        torch.save(best_state, "/content/drive/MyDrive/loan-data/best_mlp_checkpoint.pt")
        print("  --> saved best model")


Epoch 01 loss 1.0965  val_auc 0.6986  val_f1 0.2950
  --> saved best model
Epoch 02 loss 1.0965  val_auc 0.6988  val_f1 0.2951
  --> saved best model
Epoch 03 loss 1.0960  val_auc 0.6985  val_f1 0.2951
Epoch 04 loss 1.0957  val_auc 0.6986  val_f1 0.2951
Epoch 05 loss 1.0971  val_auc 0.6989  val_f1 0.2946
  --> saved best model
Epoch 06 loss 1.0955  val_auc 0.6987  val_f1 0.2947
Epoch 07 loss 1.0953  val_auc 0.6987  val_f1 0.2950
Epoch 08 loss 1.0966  val_auc 0.6988  val_f1 0.2932
Epoch 09 loss 1.0963  val_auc 0.6985  val_f1 0.2952
Epoch 10 loss 1.0953  val_auc 0.6988  val_f1 0.2945
Epoch 11 loss 1.0951  val_auc 0.6987  val_f1 0.2950
Epoch 12 loss 1.0969  val_auc 0.6986  val_f1 0.2952
Epoch 13 loss 1.0959  val_auc 0.6982  val_f1 0.2920
Epoch 14 loss 1.0957  val_auc 0.6987  val_f1 0.2950
Epoch 15 loss 1.0945  val_auc 0.6987  val_f1 0.2958
Epoch 16 loss 1.0965  val_auc 0.6989  val_f1 0.2959
  --> saved best model
Epoch 17 loss 1.0961  val_auc 0.6989  val_f1 0.2953
  --> saved best model
E

In [12]:
# load best
ckpt = torch.load("/content/drive/MyDrive/loan-data/best_mlp_checkpoint.pt", map_location=DEVICE)
model.load_state_dict(ckpt['model'])
print("Loaded checkpoint epoch", ckpt['epoch'])

test_metrics = eval_on(test_loader)
print("Test metrics:")
print(f" AUC:  {test_metrics['auc']:.4f}")
print(f" F1:   {test_metrics['f1']:.4f}")
print(f" Prec: {test_metrics['precision']:.4f}")
print(f" Rec:  {test_metrics['recall']:.4f}")

# Save a small CSV of predictions vs labels for analysis
probs, labels = get_preds_labels(test_loader)
out_df = pd.DataFrame({'prob':probs, 'pred':(probs>=0.5).astype(int), 'label':labels})
out_df.to_csv('/content/drive/MyDrive/loan-data/test_predictions.csv', index=False)
print("Saved test_predictions.csv")


Loaded checkpoint epoch 22
Test metrics:
 AUC:  0.6891
 F1:   0.2914
 Prec: 0.1864
 Rec:  0.6675
Saved test_predictions.csv


Model: MLP with two hidden layers [256,128], BatchNorm, Dropout(0.2).
Loss: BCEWithLogitsLoss with pos_weight to correct class imbalance.
Train/Val/Test splits: 64% / 16% / 20% (stratified by target).
Metrics reported: AUC (primary for probabilistic discrimination) and F1 (binary decision quality).
Files saved to Drive:
- best_mlp_checkpoint.pt
- test_predictions.csv
Recommended next steps:
- tune architecture and learning rate via small grid search
- calibrate predicted probabilities (Platt or isotonic)
- experiment with class-threshold other than 0.5 to maximize business metric (expected profit)
