In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv("train.csv")

y = df["Survived"]
x = df.drop(columns = ["Survived", "PassengerId", "Name"])
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

num_features = x.select_dtypes(include=["int64", "float64"]).columns
cat_features = x.select_dtypes(include=["object"]).columns

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_imputer = SimpleImputer(strategy = "median")
num_scaler = StandardScaler()

Xtr_num = num_scaler.fit_transform(
    num_imputer.fit_transform(x_train[num_features])
)
Xva_num = num_scaler.transform(
    num_imputer.transform(x_val[num_features])
)

In [8]:
def build_cat_maps(df, cols):
    maps = {}
    for c in cols:
        s = df[c].astype("string").fillna("__nan__")
        uniq = pd.unique(s)
        maps[c] = {v: i + 1 for i, v in enumerate(uniq)}
    return maps

def encode_catgegories(df, cols, maps) -> np.ndarray:
    out = np.zeros((len(df), len(cols)), dtype = np.int64)
    for j, c in enumerate(cols):
        s = df[c].astype("string").fillna("__nan__")
        m = maps[c]
        out[:, j] = s.map(lambda x: m.get(x, 0)).to_numpy(dtype=np.int64)
    return out
    

In [9]:
cat_maps = build_cat_maps(x_train, cat_features)

In [12]:
Xtr_cat = encode_catgegories(x_train, cat_features, cat_maps)
Xva_cat = encode_catgegories(x_val, cat_features, cat_maps)

In [19]:
import torch
from torch.utils.data import TensorDataset, DataLoader

Xtr_num_t = torch.tensor(Xtr_num, dtype=torch.float32)
Xva_num_t = torch.tensor(Xva_num, dtype=torch.float32)

Xtr_cat_t = torch.tensor(Xtr_cat, dtype=torch.long)
Xva_cat_t = torch.tensor(Xva_cat, dtype=torch.long)

ytr_t = torch.tensor(y_train.values, dtype = torch.float32)
yva_t = torch.tensor(y_val.values, dtype = torch.float32)

train_loader = DataLoader(TensorDataset(Xtr_num_t, Xtr_cat_t, ytr_t), batch_size = 32, shuffle = True)

In [28]:
import torch.nn as nn
import math


class EmbeddingNN(nn.Module):
    def __init__(self, num_dim, cat_cards):
        super().__init__()
        self.embs = nn.ModuleList([
            nn.Embedding(n+1, min(16, n//2+1))
            for n in cat_cards
        ])
        emb_dim = sum(e.embedding_dim for e in self.embs)
        
        self.mlp = nn.Sequential(
            nn.Linear(num_dim + emb_dim, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x_num, x_cat):
        embs = []
        for i, emb in enumerate(self.embs):
            embs.append(emb(x_cat[:, i]))
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x).squeeze(1)


from sklearn.metrics import roc_auc_score

model = EmbeddingNN(
    num_dim=Xtr_num.shape[1],
    cat_cards=[len(cat_maps[c]) for c in cat_features]
)

opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(1, 21):
    model.train()
    for xb_num, xb_cat, yb in train_loader:
        logits = model(xb_num, xb_cat)
        loss = loss_fn(logits, yb)

        opt.zero_grad()
        loss.backward()
        opt.step()

    model.eval()
    with torch.no_grad():
        val_logits = model(Xva_num_t, Xva_cat_t)
        val_prob = torch.sigmoid(val_logits).numpy()

    auc = roc_auc_score(y_val, val_prob)
    print(f"Epoch {epoch:02d} | AUC: {auc:.4f}")


Epoch 01 | AUC: 0.7329
Epoch 02 | AUC: 0.7603
Epoch 03 | AUC: 0.7759
Epoch 04 | AUC: 0.7978
Epoch 05 | AUC: 0.8221
Epoch 06 | AUC: 0.8315
Epoch 07 | AUC: 0.8369
Epoch 08 | AUC: 0.8292
Epoch 09 | AUC: 0.8357
Epoch 10 | AUC: 0.8422
Epoch 11 | AUC: 0.8379
Epoch 12 | AUC: 0.8295
Epoch 13 | AUC: 0.8324
Epoch 14 | AUC: 0.8348
Epoch 15 | AUC: 0.8300
Epoch 16 | AUC: 0.8366
Epoch 17 | AUC: 0.8366
Epoch 18 | AUC: 0.8391
Epoch 19 | AUC: 0.8401
Epoch 20 | AUC: 0.8306
