In [1]:
import os
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import joblib


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_fscore_support


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
DATA_PATH = "/content/drive/MyDrive/shod/accepted_2007_to_2018Q4.csv.gz" # update to your CSV path
PROCESSED_DIR = "data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)
ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e4409515df0>

In [3]:
# 2) Load data
print("Loading data...")
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Loaded rows:", len(df))

Loading data...
Loaded rows: 2260701


In [4]:
# 3) EDA (light) & target mapping
# Show basic distribution - print a few lines
print(df.columns.tolist())
print(df.head(3))


# Create a consistent target: 0 = fully paid, 1 = default/charged off
# Adjust labels according to your dataset's loan_status categories
paid = ['Fully Paid']
defaulted = ['Charged Off', 'Default']
# Filter dataset to rows that belong to these categories (safe approach)
df = df[df['loan_status'].isin(paid + defaulted)].copy()
df['target'] = df['loan_status'].apply(lambda x: 0 if x in paid else 1)
print("After filtering for clear outcomes, rows:", len(df))

['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq',

In [7]:
# 4) Feature engineering & selected features
# Convert int_rate strings like '13.56%' to numeric
def parse_int_rate(x):
  if pd.isna(x):
    return np.nan
  if isinstance(x, str) and x.endswith('%'):
    try:
      return float(x.strip().strip('%'))/100.0
    except:
      return np.nan
  try:
    return float(x)
  except:
    return np.nan


if 'int_rate' in df.columns:
  df['int_rate'] = df['int_rate'].apply(parse_int_rate)


# Example selected features - adapt if your CSV lacks any of these columns
num_features = [f for f in ['loan_amnt','int_rate','annual_inc','dti','open_acc','revol_util'] if f in df.columns]
cat_features = [f for f in ['grade','home_ownership','purpose','emp_length','verification_status'] if f in df.columns]


# Derived feature
if 'annual_inc' in df.columns and 'loan_amnt' in df.columns:
  df['income_to_loan_ratio'] = df['annual_inc'] / (df['loan_amnt'] + 1e-8)
  derived_features = ['income_to_loan_ratio']
else:
  derived_features = []


selected_features = num_features + derived_features + cat_features
print("Selected features:", selected_features)


# Drop rows missing the selected features or target
df_sub = df.dropna(subset=selected_features + ['target']).copy()
print("After dropping NAs rows:", len(df_sub))

Selected features: ['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'open_acc', 'revol_util', 'income_to_loan_ratio', 'grade', 'home_ownership', 'purpose', 'emp_length', 'verification_status']
After dropping NAs rows: 1266011


In [8]:
# 5) Preprocessing pipeline
# Use OneHotEncoder handle_unknown depending on sklearn version
from sklearn import __version__ as sklearn_version
print('scikit-learn version', sklearn_version)


# Create pipelines
num_pipeline = Pipeline([('scaler', StandardScaler())])
# Use sparse_output argument for newer sklearn, else sparse
try:
  cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
  cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)


cat_pipeline = Pipeline([('onehot', cat_encoder)])


preprocessor = ColumnTransformer(transformers=[
('num', num_pipeline, num_features),
('cat', cat_pipeline, cat_features)
], remainder='drop')

scikit-learn version 1.6.1


In [11]:
# 6) Train/test split and fit preprocessor
X = df_sub[selected_features]
y = df_sub['target'].astype(int)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)


print('Fitting preprocessor on train...')
preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)
X_test_proc = preprocessor.transform(X_test)


# Try to get feature names for convenience
try:
  out_names = preprocessor.get_feature_names_out(selected_features)
except Exception:
  out_names = None


# Save preprocessor
joblib.dump(preprocessor, os.path.join(ARTIFACTS_DIR, 'preprocessor.joblib'))
print('Saved preprocessor to artifacts')


# Persist processed arrays
np.save(os.path.join(PROCESSED_DIR, 'X_train_proc.npy'), X_train_proc)
np.save(os.path.join(PROCESSED_DIR, 'X_test_proc.npy'), X_test_proc)


y_train.to_csv(os.path.join(PROCESSED_DIR, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(PROCESSED_DIR, 'y_test.csv'), index=False)


print('Shapes:', X_train_proc.shape, X_test_proc.shape)

Fitting preprocessor on train...
Saved preprocessor to artifacts
Shapes: (1012808, 47) (253203, 47)


In [12]:
# 7) Supervised classifier (PyTorch MLP)
class MLP(nn.Module):
  def __init__(self, input_dim):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(input_dim, 128),
      nn.ReLU(),
      nn.Dropout(0.2),
      nn.Linear(128,64),
      nn.ReLU(),
      nn.Linear(64,1),
      nn.Sigmoid()
    )
  def forward(self, x):
    return self.net(x)

In [13]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train_proc, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_proc, dtype=torch.float32)
try:
  y_train_vals = pd.read_csv(os.path.join(PROCESSED_DIR, 'y_train.csv')).values.flatten()
  y_test_vals = pd.read_csv(os.path.join(PROCESSED_DIR, 'y_test.csv')).values.flatten()
except Exception:
  y_train_vals = y_train.values
  y_test_vals = y_test.values


y_train_tensor = torch.tensor(y_train_vals, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test_vals, dtype=torch.float32).unsqueeze(1)


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)


input_dim = X_train_proc.shape[1]
model = MLP(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [14]:
# Training loop with simple early stopping by epochs
EPOCHS = 10
for epoch in range(EPOCHS):
  model.train()
  epoch_loss = 0
  for xb, yb in train_loader:
    optimizer.zero_grad()
    preds = model(xb)
    loss = criterion(preds, yb)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
  print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {epoch_loss/len(train_loader):.4f}")

Epoch 1/10 - Loss: 0.4537
Epoch 2/10 - Loss: 0.4514
Epoch 3/10 - Loss: 0.4509
Epoch 4/10 - Loss: 0.4506
Epoch 5/10 - Loss: 0.4504
Epoch 6/10 - Loss: 0.4501
Epoch 7/10 - Loss: 0.4500
Epoch 8/10 - Loss: 0.4499
Epoch 9/10 - Loss: 0.4497
Epoch 10/10 - Loss: 0.4497


In [15]:
# Save supervised model state_dict
torch.save(model.state_dict(), os.path.join(ARTIFACTS_DIR, 'mlp_supervised.pth'))


# Predict probabilities helper
def predict_proba(model, X_numpy):
  model.eval()
  with torch.no_grad():
    probs = model(torch.tensor(X_numpy, dtype=torch.float32)).numpy().flatten()
  return probs


# Evaluate
y_prob = predict_proba(model, X_test_proc)
auc = roc_auc_score(y_test_vals, y_prob)
# best threshold by maximizing F1 on test (or better: validation set)
best_f1 = 0
best_t = 0.5
for t in np.linspace(0.1, 0.9, 41):
  y_pred = (y_prob >= t).astype(int)
  f1 = f1_score(y_test_vals, y_pred)
  if f1 > best_f1:
    best_f1 = f1
    best_t = t


print('Supervised AUC:', auc)
print('Supervised best F1 on test:', best_f1, 'at threshold', best_t)

Supervised AUC: 0.7072591301415952
Supervised best F1 on test: 0.4193311462566714 at threshold 0.22


In [None]:
# 8) Build RL dataset and compute rewards
# Assumption: historical "action" column may not exist. We'll assume behavior_action=1 for approved rows in dataset.
# If your dataset contains columns indicating whether an application was approved (vs. rejected), use that column.


# For simplicity: we'll set behavior_action = 1 (approved) for all rows present (accepted loans dataset),
# but note this biases the behavior policy. Document it in your report.


behavior_actions = np.ones(len(df_sub), dtype=int)


# Create rewards according to brief: if deny -> 0. if approve & fully paid -> loan_amnt * int_rate. if approve & default -> -loan_amnt
# We need loan_amnt and int_rate in df_sub
if 'loan_amnt' not in df_sub.columns:
  raise ValueError('loan_amnt is required for reward calculation')


rewards = []
for idx, row in df_sub.iterrows():
approved = 1 # historical
if approved == 0:
rewards.append(0.0)
else:
if row['target'] == 0:
# fully paid -> gain interest
ir = row['int_rate'] if 'int_rate' in row and not pd.isna(row['int_rate']) else 0.0
rewards.append(row['loan_amnt'] * ir)
else:
rewards.append(-row['loan_amnt'])
rewards = np.array(rewards, dtype=float)


# Observations must be the preprocessed X; ensure alignment with df_sub index
# We'll transform the whole df_sub[selected_features] using the fitted preprocessor
X_all_proc = preprocessor.transform(df_sub[selected_features])


# Actions: for dataset, use behavior_actions aligned to df_sub
actions = behavior_actions # note: all ones here


# Since d3rlpy expects episodes, we create single-step episodes (obs, action, reward, next_obs, terminal)
# d3rlpy.MDPDataset can be built from arrays (observations, actions, rewards, terminals, next_observations)


try:
# next_observations are zeros or same as obs since episode ends
terminals = np.ones(len(X_all_proc), dtype=bool)
next_obs = np.zeros_like(X_all_proc)
dataset = MDPDataset(observations=X_all_proc, actions=actions.reshape(-1,1), rewards=rewards.reshape(-1,1), terminals=terminals, next_observations=next_obs)
print('Built MDPDataset for d3rlpy')
except Exception as e:
print('Failed to build MDPDataset:', e)
dataset = None