<a href="https://colab.research.google.com/github/AgilAptanaDwiPutra/UAP_PRAKTIKUM/blob/main/UAP_MODEL_PRETRAINED_FT_TRANSFORMER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip -q install scikit-learn

In [72]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

In [73]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cpu


In [74]:
df = pd.read_csv("dataset_lagu.csv")

if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

assert "popularity" in df.columns
print(df.shape)

(114000, 20)


In [75]:
TARGET = "popularity"

# ================================
# TARGET & FEATURES (ANTI LEAKAGE)
# ================================
y = df[TARGET].values.astype("float32")
X = df.drop(columns=[TARGET])

assert "popularity" not in X.columns
print("✅ No leakage")

# ================================
# ARTISTS TOP-K ENCODING
# ================================
TOP_K = 2000
if "artists" in X.columns:
    top_artists = X["artists"].value_counts().head(TOP_K).index
    X["artists_top"] = X["artists"].where(
        X["artists"].isin(top_artists),
        "__OTHER__"
    )
    X = X.drop(columns=["artists"])
    print("✅ artists_top added")

# ================================
# DROP OTHER HIGH-CARD COLUMNS
# ================================
for c in ["track_id", "track_name", "album_name"]:
    if c in X.columns:
        X = X.drop(columns=[c])

# ================================
# NUMERIC / CATEGORICAL
# ================================
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

print("NUM COLS:", num_cols)
print("CAT COLS:", cat_cols)

# ================================
# SPLIT
# ================================
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.15, random_state=42
)

✅ No leakage
✅ artists_top added
NUM COLS: ['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
CAT COLS: ['track_genre', 'artists_top']


In [76]:
# ===== NUMERIC =====
scaler = StandardScaler()
Xn_train = scaler.fit_transform(X_train[num_cols]) if num_cols else np.zeros((len(X_train), 0))
Xn_val   = scaler.transform(X_val[num_cols]) if num_cols else np.zeros((len(X_val), 0))
Xn_test  = scaler.transform(X_test[num_cols]) if num_cols else np.zeros((len(X_test), 0))

# ===== CATEGORICAL (SAFE) =====
cat_maps = {}
cat_cards = []

for c in cat_cols:
    uniq = pd.unique(X_train[c].astype(str))
    mp = {v: i+1 for i, v in enumerate(uniq)}  # 0 = UNKNOWN
    cat_maps[c] = mp
    cat_cards.append(len(mp) + 1)

def encode_cat(df_part):
    if len(cat_cols) == 0:
        return np.zeros((len(df_part), 0), dtype=np.int64)

    arr = np.zeros((len(df_part), len(cat_cols)), dtype=np.int64)
    for j, c in enumerate(cat_cols):
        mp = cat_maps[c]
        vals = df_part[c].astype(str).values
        for i, v in enumerate(vals):
            arr[i, j] = mp.get(v, 0)
    return arr

Xc_train = encode_cat(X_train)
Xc_val   = encode_cat(X_val)
Xc_test  = encode_cat(X_test)

# ===== SANITY CHECK (ANTI INDEX ERROR) =====
for i, c in enumerate(cat_cols):
    mx = max(Xc_train[:, i].max(), Xc_val[:, i].max(), Xc_test[:, i].max())
    print(f"[CHECK] {c}: max={mx}, card={cat_cards[i]}")
    assert mx < cat_cards[i], f"❌ INDEX ERROR di {c}"

[CHECK] track_genre: max=114, card=115
[CHECK] artists_top: max=2001, card=2002


In [77]:
class TabDataset(Dataset):
    def __init__(self, Xn, Xc, y):
        self.Xn = torch.tensor(Xn, dtype=torch.float32)
        self.Xc = torch.tensor(Xc, dtype=torch.long)
        self.y  = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.Xn[i], self.Xc[i], self.y[i]

batch_size = 1024 if device.type == "cuda" else 256

train_loader = DataLoader(TabDataset(Xn_train, Xc_train, y_train), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(TabDataset(Xn_val, Xc_val, y_val), batch_size=batch_size)
test_loader  = DataLoader(TabDataset(Xn_test, Xc_test, y_test), batch_size=batch_size)

In [78]:
class TabDataset(Dataset):
    def __init__(self, Xn, Xc, y):
        self.Xn = torch.tensor(Xn, dtype=torch.float32)
        self.Xc = torch.tensor(Xc, dtype=torch.long)
        self.y  = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.Xn[i], self.Xc[i], self.y[i]

batch_size = 1024 if device.type == "cuda" else 256

train_loader = DataLoader(TabDataset(Xn_train, Xc_train, y_train), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(TabDataset(Xn_val, Xc_val, y_val), batch_size=batch_size)
test_loader  = DataLoader(TabDataset(Xn_test, Xc_test, y_test), batch_size=batch_size)

In [79]:
def eval_reg(model, loader):
    model.eval()
    p, t = [], []
    with torch.no_grad():
        for xn, xc, y in loader:
            xn, xc, y = xn.to(device), xc.to(device), y.to(device)
            p.append(model(xn, xc).cpu().numpy())
            t.append(y.cpu().numpy())
    p = np.concatenate(p)
    t = np.concatenate(t)
    rmse = np.sqrt(mean_squared_error(t, p))
    mae = mean_absolute_error(t, p)
    r2 = r2_score(t, p)
    return rmse, mae, r2


model = FTTransformer(len(num_cols), cat_cards).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
loss_fn = nn.MSELoss()

best_rmse = 1e9
best_state = None
patience, bad = 5, 0
train_loss, val_rmse = [], []

for epoch in range(1, 30):
    model.train()
    tot = 0
    for xn, xc, y in train_loader:
        xn, xc, y = xn.to(device), xc.to(device), y.to(device)
        opt.zero_grad()
        loss = loss_fn(model(xn, xc), y)
        loss.backward()
        opt.step()
        tot += loss.item() * y.size(0)

    tr_loss = tot / len(train_loader.dataset)
    rmse, mae, r2 = eval_reg(model, val_loader)

    train_loss.append(tr_loss)
    val_rmse.append(rmse)

    print(f"Epoch {epoch:02d} | train_loss={tr_loss:.2f} | val_RMSE={rmse:.2f} MAE={mae:.2f} R2={r2:.3f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_state = {k: v.cpu() for k, v in model.state_dict().items()}
        bad = 0
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping")
            break

model.load_state_dict(best_state)



Epoch 01 | train_loss=930.83 | val_RMSE=25.83 MAE=21.53 R2=-0.325
Epoch 02 | train_loss=548.28 | val_RMSE=22.53 MAE=19.17 R2=-0.008
Epoch 03 | train_loss=455.23 | val_RMSE=19.61 MAE=14.86 R2=0.236
Epoch 04 | train_loss=343.04 | val_RMSE=18.12 MAE=13.13 R2=0.348
Epoch 05 | train_loss=306.07 | val_RMSE=17.69 MAE=12.55 R2=0.378
Epoch 06 | train_loss=289.38 | val_RMSE=17.44 MAE=11.71 R2=0.396
Epoch 07 | train_loss=277.01 | val_RMSE=16.92 MAE=11.70 R2=0.431
Epoch 08 | train_loss=268.75 | val_RMSE=16.77 MAE=11.39 R2=0.442
Epoch 09 | train_loss=261.63 | val_RMSE=16.78 MAE=11.44 R2=0.441
Epoch 10 | train_loss=255.64 | val_RMSE=16.67 MAE=11.15 R2=0.448
Epoch 11 | train_loss=252.24 | val_RMSE=16.55 MAE=10.90 R2=0.456
Epoch 12 | train_loss=247.74 | val_RMSE=16.39 MAE=10.82 R2=0.466
Epoch 13 | train_loss=242.61 | val_RMSE=16.38 MAE=10.69 R2=0.467
Epoch 14 | train_loss=240.20 | val_RMSE=16.37 MAE=10.77 R2=0.468
Epoch 15 | train_loss=237.66 | val_RMSE=16.38 MAE=10.63 R2=0.467
Epoch 16 | train_loss=2

<All keys matched successfully>

In [80]:
rmse, mae, r2 = eval_reg(model, test_loader)
print(f"[TEST] RMSE={rmse:.2f} | MAE={mae:.2f} | R2={r2:.3f}")

[TEST] RMSE=15.85 | MAE=10.18 | R2=0.490


In [83]:
os.makedirs("artifacts", exist_ok=True)

torch.save(
    {
        "model_state": model.state_dict(),
        "num_cols": num_cols,
        "cat_cols": cat_cols,
        "cat_maps": cat_maps,
        "cat_cards": cat_cards,
        "scaler_mean": scaler.mean_.tolist(),
        "scaler_scale": scaler.scale_.tolist(),
    },
    "artifacts/fttransformer_regression.pt",
)

print("✅ Model saved: artifacts/fttransformer_regression.pt")

✅ Model saved: artifacts/fttransformer_regression.pt


In [82]:
rm -rf artifacts