In [23]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
y = df_train["SalePrice"]
y_log = np.log1p(y)
x = df_train.drop(columns = ["SalePrice", "Id"])
x_test = df_test.drop(columns = ["Id"])

x_train, x_eval, y_train, y_eval = train_test_split(
    x, y_log, test_size = 0.2, random_state = 42)

num_features = x.select_dtypes(include=["int64", "float64"]).columns
cat_features = x.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy = "median")
num_scaler = StandardScaler()

Xtr_num = num_imputer.fit_transform(x_train[num_features])
Xtr_num = num_scaler.fit_transform(Xtr_num)

Xva_num = num_imputer.transform(x_eval[num_features])
Xva_num = num_scaler.transform(Xva_num)

Xtr_num, Xva_num

(array([[-0.8667643 , -0.01246836, -0.21289571, ..., -0.09274033,
         -0.13341669,  1.65006527],
        [ 0.07410996, -0.50235683, -0.26524463, ..., -0.09274033,
         -0.5080097 ,  0.89367742],
        [-0.63154574, -0.14607431, -0.17784146, ..., -0.09274033,
         -0.5080097 ,  0.13728958],
        ...,
        [-0.8667643 , -0.45782152, -0.23409563, ..., -0.09274033,
         -0.88260272, -1.37548612],
        [-0.16110861, -0.6804981 , -0.28337613, ..., -0.09274033,
         -0.13341669, -0.61909827],
        [ 1.48542135, -0.76956873, -0.65139925, ..., -0.09274033,
         -0.13341669,  0.89367742]], shape=(1168, 36)),
 array([[-0.8667643 , -0.01246836, -0.21159396, ..., -0.09274033,
         -1.63178875, -1.37548612],
        [ 0.07410996,  1.23452047,  0.14564323, ..., -0.09274033,
         -0.88260272,  1.65006527],
        [-0.63154574, -0.63596278, -0.16082574, ..., -0.09274033,
         -1.25719573,  1.65006527],
        ...,
        [ 0.07410996, -0.32421557, -

In [24]:
from typing import Dict, List

def build_category_maps(df: pd.DataFrame, cat_cols: List[str]) -> Dict[str, Dict[str, int]]:
    maps = {}
    for c in cat_cols:
        s = df[c].astype("string").fillna("__nan__")
        uniq = pd.unique(s)
        maps[c] = {v: i+1 for i, v in enumerate(uniq)}
    return maps

def encode_categories(df: pd.DataFrame, cat_cols: List[str], maps: Dict[str, Dict[str, int]]) -> np.ndarray:
    out = np.zeros((len(df), len(cat_cols)), dtype = np.int64)
    for j, c in enumerate(cat_cols):
        s = df[c].astype("string").fillna("__nan__")
        m = maps[c]
        out[:, j] = s.map(lambda x: m.get(x, 0)).to_numpy(dtype=np.int64)

    return out


cat_maps = build_category_maps(x_train, cat_features)
Xtr_cat = encode_categories(x_train, cat_features, cat_maps)
Xva_cat = encode_categories(x_eval, cat_features, cat_maps)

Xtr_cat, Xva_cat

        

(array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]], shape=(1168, 43)),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [2, 1, 3, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]], shape=(292, 43)))

In [30]:
import torch
from torch.utils.data import TensorDataset, DataLoader

Xtr_num_t = torch.tensor(Xtr_num, dtype=torch.float32)
Xva_num_t = torch.tensor(Xva_num, dtype=torch.float32)

Xtr_cat_t = torch.tensor(Xtr_cat, dtype=torch.long)
Xva_cat_t = torch.tensor(Xva_cat, dtype=torch.long)

y_scaler = StandardScaler()
ytr_n = y_scaler.fit_transform(y_train.to_numpy().reshape(-1,1)).astype(np.float32)
yva_n = y_scaler.transform(y_eval.to_numpy().reshape(-1,1)).astype(np.float32)

ytr_t = torch.tensor(ytr_n)
yva_t = torch.tensor(yva_n)

train_loader = DataLoader(TensorDataset(Xtr_num_t, Xtr_cat_t, ytr_t), batch_size=32, shuffle=True)

print("train unknown rate:", (Xtr_cat == 0).mean())
print("val unknown rate:", (Xva_cat == 0).mean())

print("num_features:", len(num_features), "cat_features:", len(cat_features))
print("cat cardinalities max:", max(len(cat_maps[c]) for c in cat_features))


train unknown rate: 0.0
val unknown rate: 0.00015928639694170118
num_features: 36 cat_features: 43
cat cardinalities max: 25


In [58]:
import torch.nn as nn
import math


def default_emb_dim(n_cat: int) -> int:
    return int(min(50, round(math.sqrt(n_cat))))

class EmbeddingNN(nn.Module):
    def __init__(self, num_dim: int, cat_cardinalities: List[int]):
        super().__init__()
        self.emb_layers = nn.ModuleList()
        emb_dims = []
        for n in cat_cardinalities:
            d = default_emb_dim(n)
            emb_dims.append(d)
            self.emb_layers.append(nn.Embedding(num_embeddings=n+1, embedding_dim=d))

        self.emb_out_dim = sum(emb_dims)
        in_dim = num_dim + self.emb_out_dim

        self.mlp = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(128, 1),
        )

    def forward(self, x_num, x_cat):
        embs = []
        for i, emb in enumerate(self.emb_layers):
            embs.append(emb(x_cat[:, i]))
        x = torch.cat([x_num] + embs, dim=1)
        return self.mlp(x)


cat_cardinalities = [len(cat_maps[c]) for c in cat_features]
model = EmbeddingNN(num_dim=Xtr_num.shape[1], cat_cardinalities=cat_cardinalities)

loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5)


In [59]:
def rmse_t(y_true, y_pred):
    return torch.sqrt(torch.mean((y_true - y_pred) ** 2)).item()

def logrmse_from_norm(y_true_n_t, y_pred_n_t):
    y_true_log = y_scaler.inverse_transform(y_true_n_t.detach().cpu().numpy())
    y_pred_log = y_scaler.inverse_transform(y_pred_n_t.detach().cpu().numpy())
    return float(np.sqrt(np.mean((y_true_log - y_pred_log) ** 2)))


best_val = float("inf")
best_state = None
patience = 30
pat_left = patience

for epoch in range(1, 201):
    model.train()
    for xb_num, xb_cat, yb in train_loader:
        pred = model(xb_num, xb_cat)
        loss = loss_fn(pred, yb)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_pred = model(Xva_num_t, Xva_cat_t)
        val_rmse = rmse_t(yva_t, val_pred)
        val_rmse_from_norm = logrmse_from_norm(yva_t, val_pred)

    scheduler.step(val_rmse_from_norm)
    lr = optimizer.param_groups[0]["lr"]

    if val_rmse < best_val - 1e-5:
        best_val = val_rmse
        best_logmse = val_rmse_from_norm
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        pat_left = patience
    else:
        pat_left -= 1

    print(f"Epoch {epoch:03d} | val RMSE: {val_rmse:.5f} | val LOGRMSE: {val_rmse_from_norm:.5f} | best: {best_logmse:.5f} | lr: {lr:.2e} | pat_left: {pat_left}")

    if pat_left == 0:
        print(f"Early stopped at epoch {epoch}, best={best_logmse:.5f}")
        break

model.load_state_dict(best_state)
best_logmse

Epoch 001 | val RMSE: 0.45263 | val LOGRMSE: 0.17673 | best: 0.17673 | lr: 1.00e-03 | pat_left: 30
Epoch 002 | val RMSE: 0.40233 | val LOGRMSE: 0.15708 | best: 0.15708 | lr: 1.00e-03 | pat_left: 30
Epoch 003 | val RMSE: 0.44061 | val LOGRMSE: 0.17203 | best: 0.15708 | lr: 1.00e-03 | pat_left: 29
Epoch 004 | val RMSE: 0.38194 | val LOGRMSE: 0.14912 | best: 0.14912 | lr: 1.00e-03 | pat_left: 30
Epoch 005 | val RMSE: 0.37780 | val LOGRMSE: 0.14751 | best: 0.14751 | lr: 1.00e-03 | pat_left: 30
Epoch 006 | val RMSE: 0.37135 | val LOGRMSE: 0.14499 | best: 0.14499 | lr: 1.00e-03 | pat_left: 30
Epoch 007 | val RMSE: 0.40151 | val LOGRMSE: 0.15677 | best: 0.14499 | lr: 1.00e-03 | pat_left: 29
Epoch 008 | val RMSE: 0.43517 | val LOGRMSE: 0.16991 | best: 0.14499 | lr: 1.00e-03 | pat_left: 28
Epoch 009 | val RMSE: 0.37569 | val LOGRMSE: 0.14668 | best: 0.14499 | lr: 1.00e-03 | pat_left: 27
Epoch 010 | val RMSE: 0.36355 | val LOGRMSE: 0.14194 | best: 0.14194 | lr: 1.00e-03 | pat_left: 30
Epoch 011 

0.14094407856464386