Task 1: Image Classification


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

Load the Fashion-MNIST train and test CSV files, scale the pixel values to [0, 1], reshape each row into a 1×28×28 image tensor, and build the DataLoaders.

In [None]:
# 1. Load dataset & basic preprocessing
# Absolute locations of the two Fashion-MNIST CSV splits.
TRAIN_CSV_PATH = "/content/fashion-mnist_train.csv"
TEST_CSV_PATH  = "/content/fashion-mnist_test.csv"

# Read each split into its own DataFrame (training file first, then test file).
train_df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)
test_df  = pd.read_csv(filepath_or_buffer=TEST_CSV_PATH)

def preprocess_fashion_df(df):
    """Turn a Fashion-MNIST style DataFrame into image and label tensors.

    The first column is read as the class label; every remaining column is
    read as a pixel value and divided by 255.0.

    Args:
        df: DataFrame whose column 0 holds labels and whose remaining
            columns hold 28 * 28 = 784 pixel values per row.

    Returns:
        Tuple ``(images, labels)`` where ``images`` is a float32 tensor of
        shape (N, 1, 28, 28) and ``labels`` is an int64 tensor of shape (N,).
    """
    label_values = df.iloc[:, 0].to_numpy()
    pixel_values = df.iloc[:, 1:].to_numpy()

    # Scale first, then fold each flat row into a single-channel 28 x 28 image.
    scaled_pixels = pixel_values / 255.0
    image_array = scaled_pixels.reshape(-1, 1, 28, 28)

    images = torch.tensor(image_array, dtype=torch.float32)
    labels = torch.tensor(label_values, dtype=torch.long)
    return images, labels

# Convert each split into an (images, labels) tensor pair
X_train, y_train = preprocess_fashion_df(train_df)
X_test,  y_test  = preprocess_fashion_df(test_df)

# Wrap the tensor pairs in datasets, then batch them.
# The training loader shuffles; the second loader keeps row order.
# Note: `val_loader` is built from the *test* split.
BATCH_SIZE = 64
train_dataset = TensorDataset(X_train, y_train)
test_dataset  = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

Task 2: Loan Default Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch

In [2]:
# 1. Load dataset

# Read the raw loan table and discard the Loan_ID identifier column
# in a single chained expression
df = pd.read_csv("Loan payments data.csv").drop(columns=["Loan_ID"])

In [None]:
# 2. Binary classification label

# loan_status values expected in the raw data:
#   PAIDOFF, COLLECTION, COLLECTION_PAIDOFF
# Target encoding:
#   PAIDOFF         → 0
#   any other value → 1  (default / high-risk)

is_not_paid_off = df["loan_status"].ne("PAIDOFF")
df["loan_status_binary"] = is_not_paid_off.astype(int)

In [None]:
# 3. Date preprocessing: parse effective_date, due_date, paid_off_time

# Strict parsing for these two columns: an unparseable value raises
for date_col in ("effective_date", "due_date"):
    df[date_col] = pd.to_datetime(df[date_col])

# Lenient parsing here: errors="coerce" turns unparseable entries into NaT
df["paid_off_time"] = pd.to_datetime(df["paid_off_time"], errors="coerce")

In [None]:
# 4. Feature engineering from dates

loan_start = df["effective_date"]

# Scheduled length of the loan in whole days (due_date - effective_date)
df["loan_duration"] = (df["due_date"] - loan_start).dt.days

# Whole days from loan start until payoff; NaN wherever paid_off_time is NaT.
# Those gaps are filled with 0, and the indicator column below records
# which rows had no payoff timestamp.
days_until_payoff = (df["paid_off_time"] - loan_start).dt.days
df["actual_pay_duration"] = days_until_payoff.fillna(0)

# 1 when paid_off_time is absent, 0 otherwise
df["paid_off_missing"] = df["paid_off_time"].isna().astype(int)

In [None]:
# 5. Handle missing values in past_due_days

# A missing past_due_days is treated as "no overdue" (borrower paid on time),
# so only the NaN rows are overwritten with 0

df.loc[df["past_due_days"].isna(), "past_due_days"] = 0

In [None]:
# 6. Select feature columns

# NOTE(review): past_due_days, actual_pay_duration and paid_off_missing are
# derived from repayment information — this looks like possible target
# leakage for loan_status_binary; confirm these would be known at
# prediction time.
numeric_cols = [
    "Principal", "terms", "past_due_days",
    "age", "loan_duration", "actual_pay_duration",
]
categorical_cols = ["education", "Gender"]
binary_cols = ["paid_off_missing"]  # 0/1 indicator, used as-is

# Feature frame (independent copy of the selected columns) and target series
feature_cols = [*numeric_cols, *categorical_cols, *binary_cols]
X_raw = df.loc[:, feature_cols].copy()
y = df.loc[:, "loan_status_binary"]

In [None]:
# 7. One-hot encode categorical features

# drop_first=True omits one dummy column per categorical feature
categorical_part = X_raw[categorical_cols]
X_cat = pd.get_dummies(categorical_part, drop_first=True)

# Reassemble column-wise in the order: numeric, one-hot, binary
numeric_part = X_raw[numeric_cols]
binary_part = X_raw[binary_cols]
X_processed = pd.concat([numeric_part, X_cat, binary_part], axis="columns")

In [None]:
# 8. Train/validation split (performed BEFORE scaling)

# Split the unscaled features first so that validation rows never influence
# the scaler statistics. Fitting StandardScaler on the full dataset and then
# splitting leaks the validation mean/std into training.
X_train_unscaled, X_val_unscaled, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

# 9. Standardize features

# Fit mean/std on the training rows only, then apply that same transform to
# the validation rows. Every column of X_processed is scaled, including the
# one-hot and binary indicator columns.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_val = scaler.transform(X_val_unscaled)

# Full feature matrix transformed with the train-fitted scaler; kept so that
# any code referring to X_scaled still finds it defined.
X_scaled = scaler.transform(X_processed)

In [None]:
# 10. Convert to PyTorch tensors

# This cell rebinds X_train / X_val / y_train / y_val from arrays/Series to
# tensors. Going through np.asarray accepts NumPy arrays, pandas Series and
# CPU tensors alike, so the cell can be re-executed safely (a tensor has no
# Series-style `.values` attribute). torch.tensor always copies its input.
# Targets are float32 because BCEWithLogitsLoss expects float targets.
# NOTE(review): targets have shape (N,); presumably the model output must be
# squeezed to match — confirm against the training loop.
X_train = torch.tensor(np.asarray(X_train), dtype=torch.float32)
X_val = torch.tensor(np.asarray(X_val), dtype=torch.float32)
y_train = torch.tensor(np.asarray(y_train), dtype=torch.float32)
y_val = torch.tensor(np.asarray(y_val), dtype=torch.float32)

In [None]:
# 11. Compute class weights for imbalanced classification

# Count training labels per class (1 = default, 0 = no default)
num_pos = y_train.eq(1).sum().item()
num_neg = y_train.eq(0).sum().item()

# Negatives-to-positives ratio; a value above 1 means positives are rarer
pos_weight_value = num_neg / num_pos
pos_weight = torch.tensor([pos_weight_value], dtype=torch.float32)

print("Class counts:", num_neg, "(no default) /", num_pos, "(default)")
print("pos_weight for BCEWithLogitsLoss =", pos_weight.item())

# Usage hint: hand the weight to the loss when building it, e.g.
# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

print("Preprocessing complete. X_train, X_val, y_train, y_val are ready.")
print("Input dim =", X_train.shape[1])