In [26]:
import numpy as np
from scipy.stats import norm   # gives you Φ (cdf) and Φ^{-1} (ppf)
import matplotlib.pyplot as plt # optional, only if you want a histogram

In [27]:
import pandas as pd

# Load the training CSV
df = pd.read_csv("data/GiveMeSomeCredit/cs-training.csv")

# Quick look at the first rows
#print(df.head())

# Check the shape (#rows, #columns)
#print(df.shape)

# Inspect column names
#print(df.columns)

In [28]:
# How many defaults vs non-defaults?
#df["SeriousDlqin2yrs"].value_counts()

# Basic stats for income (often has missing values)
#print(df["MonthlyIncome"].describe())

# Check for missing data
print(df.isnull().sum())

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [29]:
import numpy as np
rng = np.random.default_rng(42)

In [30]:
def relu(Z):
    return np.maximum(0.0,Z)

def sigmoid(Z):
    return 1.0/(1.0+np.exp(-Z))

In [31]:
def he_init(d,H,rng):
    W1 = rng.normal(loc=0.0, scale = np.sqrt(2.0/d),size = (d,H) )
    b1 = np.zeros((1,H))
    W2 = rng.normal(loc = 0.0, scale = np.sqrt(2.0/H), size = (H,1) )
    b2 = np.zeros((1,1))
    return W1, b1, W2, b2

In [32]:
def forward(X, W1, b1, W2, b2):
    Z1 = X@W1 + b1
    A1 = relu(Z1)
    Z2 = A1@W2 + b2
    Yhat = sigmoid(Z2)
    cache = (X, Z1, A1, Z2, Yhat)
    return Yhat, cache

In [33]:
def bce_loss(Yhat, y, eps=1e-12):
    Yhat = np.clip(Yhat, eps, 1.0 - eps)
    N = y.shape[0]
    loss = -(y * np.log(Yhat) + (1-y) * np.log(1 - Yhat) ).mean()
    return loss

In [34]:
d, H = 10, 64
W1, b1, W2, b2 = he_init(d, H, rng)

X = rng.normal(size=(32, d))
y = rng.integers(low=0, high=2, size=(32,1))

Yhat, cache = forward(X, W1, b1, W2, b2)

In [25]:
rng = np.random.default_rng(0)
d = .shape[1]   # number of features
H = 64      
W1, b1, W2, b2 = he_init(d, H, rng)

NameError: name 'X_train' is not defined

In [24]:
# Define dimensions consistently here (no hard-coding)
           # hidden units (you can try 32 or 128 later)

11


In [10]:
print("Yhat shape", Yhat.shape)
print("Loss:", bce_loss(Yhat, y))

Yhat shape (32, 1)
Loss: 1.2126950363957207


In [11]:
def backward(X,y,cache,W1,b1,W2,b2):
    Z1, A1, Z2, Yhat = cache
    N = X.shape[0]
    
    G2 = (Yhat-y)/N
    
    dW2 = A1.T@G2
    db2 = np.sum(G2, axis = 0, keepdims = True)
    
    G1 = (G2@W2.T) * (Z1>0)
    
    dW1 = X.T@G1
    db1 = np.sum(G1, axis=0, keepdims=True) # (1,H)
    
    return dW1, db1, dW2, db2

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# 1) load
df = pd.read_csv("data/GiveMeSomeCredit/cs-training.csv")

# 2) target y (column is 0/1)
y = df["SeriousDlqin2yrs"].values.reshape(-1, 1)

# 3) features X (drop target; keep only numeric columns)
X = df.drop(columns=["SeriousDlqin2yrs"])
X = X.select_dtypes(include=[np.number]).values  # shape (N, d_raw)

# 4) impute missing values with median
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)

# 5) standardize features (mean 0, std 1)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 6) train/val split (stratify preserves default rate)
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

N_tr, d = X_tr.shape
N_va = X_va.shape[0]
print("train shape:", X_tr.shape, "val shape:", X_va.shape)

train shape: (120000, 11) val shape: (30000, 11)


In [17]:
rng = np.random.default_rng(0)
H = 64  # you can try 32, 128 later

W1, b1, W2, b2 = he_init(d, H, rng)


In [18]:
epochs = 40
batch  = 256
lr     = 1e-3       # start modest; increase to 3e-3 if loss plateaus
weight_decay = 1e-5 # L2 regularization on weights

# (optional) class imbalance weight on positives:
pos = y_tr.sum()
neg = len(y_tr) - pos
pos_weight = float(neg / max(pos, 1))   # e.g., ~10-20 typically
use_pos_weight = True  # set False if you want plain BCE first

In [19]:
def bce_loss_weighted(Yhat, y, pos_weight=1.0, eps=1e-12):
    Yhat = np.clip(Yhat, eps, 1.0 - eps)
    # weight the positive term by pos_weight
    loss = -(pos_weight * y * np.log(Yhat) + (1 - y) * np.log(1 - Yhat)).mean()
    return loss


In [20]:
def backward_weighted(X, y, cache, W2, pos_weight=1.0):
    Z1, A1, Z2, Yhat = cache
    N = X.shape[0]
    # start with standard error
    G2 = (Yhat - y)
    # up-weight positive-class errors approximately
    G2[y.astype(bool)] *= pos_weight
    G2 /= N
    dW2 = A1.T @ G2
    db2 = np.sum(G2, axis=0, keepdims=True)
    G1 = (G2 @ W2.T) * (Z1 > 0)
    dW1 = X.T @ G1
    db1 = np.sum(G1, axis=0, keepdims=True)
    return dW1, db1, dW2, db2


In [21]:
def eval_loss(Xe, ye, W1, b1, W2, b2, weighted=False, pos_w=1.0):
    Yhat, _ = forward(Xe, W1, b1, W2, b2)
    if weighted:
        return bce_loss_weighted(Yhat, ye, pos_w)
    else:
        return bce_loss(Yhat, ye)

hist = {"train": [], "val": []}

for ep in range(1, epochs + 1):
    # shuffle training data each epoch
    idx = rng.permutation(N_tr)
    X_tr = X_tr[idx]; y_tr = y_tr[idx]

    # iterate over mini-batches
    for start in range(0, N_tr, batch):
        stop = min(start + batch, N_tr)
        Xb = X_tr[start:stop]
        yb = y_tr[start:stop]

        # forward
        Yhat, cache = forward(Xb, W1, b1, W2, b2)

        # backward (weighted or not)
        if use_pos_weight:
            dW1, db1, dW2, db2 = backward_weighted(Xb, yb, cache, W2, pos_weight)
            loss_b = bce_loss_weighted(Yhat, yb, pos_weight)
        else:
            dW1, db1, dW2, db2 = backward(Xb, yb, cache, W2)
            loss_b = bce_loss(Yhat, yb)

        # L2 regularization on weights
        dW1 += weight_decay * W1
        dW2 += weight_decay * W2

        # SGD update
        W1 -= lr * dW1
        b1 -= lr * db1
        W2 -= lr * dW2
        b2 -= lr * db2

    # end epoch: evaluate full train/val loss
    tr_loss = eval_loss(X_tr, y_tr, W1, b1, W2, b2, weighted=use_pos_weight, pos_w=pos_weight)
    va_loss = eval_loss(X_va, y_va, W1, b1, W2, b2, weighted=use_pos_weight, pos_w=pos_weight)
    hist["train"].append(tr_loss); hist["val"].append(va_loss)

    if ep % 5 == 0 or ep == 1:
        print(f"epoch {ep:02d} | train={tr_loss:.4f} | val={va_loss:.4f}")


ValueError: too many values to unpack (expected 4)

In [36]:
# ===== 0) Imports & seed =====
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)

# ===== 1) Activation functions =====
def relu(Z):
    return np.maximum(0.0, Z)

def sigmoid(Z):
    # numerically stable enough for this use
    return 1.0 / (1.0 + np.exp(-Z))

# ===== 2) He initialization (for ReLU) =====
def he_init(d, H, rng):
    W1 = rng.normal(0.0, np.sqrt(2.0/d), size=(d, H))
    b1 = np.zeros((1, H))
    W2 = rng.normal(0.0, np.sqrt(2.0/H), size=(H, 1))
    b2 = np.zeros((1, 1))
    return W1, b1, W2, b2

# ===== 3) Forward pass =====
def forward(X, W1, b1, W2, b2):
    # X: (N, d)
    Z1 = X @ W1 + b1            # (N, H) affine
    A1 = relu(Z1)               # (N, H) nonlinearity
    Z2 = A1 @ W2 + b2           # (N, 1) logit
    Yhat = sigmoid(Z2)          # (N, 1) probability
    cache = (Z1, A1, Z2, Yhat)  # we need these for backprop
    return Yhat, cache

# ===== 4) Binary cross-entropy loss (mean over batch) =====
def bce_loss(Yhat, y, eps=1e-12):
    Yhat = np.clip(Yhat, eps, 1.0 - eps)
    return -(y * np.log(Yhat) + (1 - y) * np.log(1 - Yhat)).mean()

# ===== 5) Backward pass (gradients via chain rule) =====
def backward(X, y, cache, W2):
    Z1, A1, Z2, Yhat = cache
    N = X.shape[0]
    G2 = (Yhat - y) / N               # (N, 1) output error
    dW2 = A1.T @ G2                   # (H, 1)
    db2 = np.sum(G2, axis=0, keepdims=True)   # (1, 1)
    G1 = (G2 @ W2.T) * (Z1 > 0)       # (N, H) elementwise mask for ReLU
    dW1 = X.T @ G1                    # (d, H)
    db1 = np.sum(G1, axis=0, keepdims=True)   # (1, H)
    return dW1, db1, dW2, db2

# ===== 6) Load & prepare data (Give Me Some Credit) =====
# Put the CSV at: data/cs-training.csv
df = pd.read_csv("data/GiveMeSomeCredit/cs-training.csv")

# Target and features
y_all = df["SeriousDlqin2yrs"].values.reshape(-1, 1)  # (N,1)
X_all = df.drop(columns=["SeriousDlqin2yrs"])
X_all = X_all.select_dtypes(include=[np.number]).values  # keep numeric only

# Impute missing with median, then standardize
imputer = SimpleImputer(strategy="median")
scaler  = StandardScaler()

X_all = imputer.fit_transform(X_all)
X_all = scaler.fit_transform(X_all)

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

# Define dimensions consistently here (no hard-coding)
d = X_train.shape[1]   # number of features
H = 64                 # hidden units (you can try 32 or 128 later)

# ===== 7) Initialize parameters =====
W1, b1, W2, b2 = he_init(d, H, rng)

# ===== 8) Training loop (mini-batch SGD) =====
epochs = 30
batch_size = 256
lr = 1e-3   # try 3e-3 if loss plateaus; 3e-4 if it oscillates

n_train = X_train.shape[0]
for ep in range(1, epochs + 1):
    # shuffle indices for this epoch
    idx = rng.permutation(n_train)

    # iterate over mini-batches by index (no in-place array shuffling)
    for start in range(0, n_train, batch_size):
        stop = min(start + batch_size, n_train)
        bidx = idx[start:stop]
        Xb = X_train[bidx]
        yb = y_train[bidx]

        # forward -> loss
        Yhat_b, cache_b = forward(Xb, W1, b1, W2, b2)
        loss_b = bce_loss(Yhat_b, yb)

        # backward -> grads
        dW1, db1, dW2, db2 = backward(Xb, yb, cache_b, W2)

        # SGD update
        W1 -= lr * dW1
        b1 -= lr * db1
        W2 -= lr * dW2
        b2 -= lr * db2

    # end-epoch: evaluate full train/val loss
    train_pred, _ = forward(X_train, W1, b1, W2, b2)
    val_pred, _   = forward(X_val,   W1, b1, W2, b2)
    train_loss = bce_loss(train_pred, y_train)
    val_loss   = bce_loss(val_pred,   y_val)

    if ep % 5 == 0 or ep == 1:
        print(f"epoch {ep:02d} | train={train_loss:.4f} | val={val_loss:.4f}")

# ===== 9) Example predictions (probabilities) =====
probs_val = val_pred.ravel()
print("First 10 validation PDs:", probs_val[:10])


epoch 01 | train=0.3600 | val=0.3588
epoch 05 | train=0.2579 | val=0.2582
epoch 10 | train=0.2464 | val=0.2469
epoch 15 | train=0.2438 | val=0.2442
epoch 20 | train=0.2419 | val=0.2423
epoch 25 | train=0.2404 | val=0.2406
epoch 30 | train=0.2390 | val=0.2392
First 10 validation PDs: [0.03671458 0.02298935 0.04932452 0.08167806 0.03269545 0.01250404
 0.08270306 0.64588307 0.07126863 0.14015793]
