## 1. Import

In [20]:
# file/path uilities
import os
import glob
from pathlib import Path

# for data manipulation/math
import pandas as pd
import numpy as np
import random

# encoding (type_name to number) / split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold

# progress bar
from tqdm import tqdm

# deep learning framework
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence # padding to handle variable length sequences
from torch.utils.data import Dataset, DataLoader

## 2. 하이퍼파라미터 세팅

In [None]:
#--- hyperparameter ---

SEED = 42

# cross-validation
N_SPLITS = 5 # number of folds
FOLD = 0 # which fold for validation

# sequence length
K = 50 # number of events to consider before the target event if smaller than K, pad with zeros
MIN_EVENTS = 2

# training parameters
EPOCHS = 30
BATCH_SIZE = 256
LR = 1e-3
WEIGHT_DECAY = 1e-5

# model parameters
HIDDEN_SIZE = 256 # LSTM hidden size
NUM_LAYERS = 2 # number of LSTM layers
DROPOUT = 0.1 

# data loader parameters
NUM_WORKERS = 0

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

print("Using device:", DEVICE)





Using device: cpu


## 3. 데이터 로드 및 전처리

In [None]:
TRAIN_PATH = "../data/train.csv"

df = pd.read_csv(TRAIN_PATH)

# sort events inside each episode by time, then action_id
# action_id is used for duplicate time_seconds
df = df.sort_values(["game_episode", "time_seconds", "action_id"]).reset_index(drop=True)

# fill missing category text
df["type_name"] = df["type_name"].fillna("__NA_TYPE__")
df["result_name"] = df["result_name"].fillna("__NA_RES__")

# change category text to idx(number)
# mapping number is just name, no matter with performance
le_type = LabelEncoder()
le_res  = LabelEncoder()
df["type_id"] = le_type.fit_transform(df["type_name"])
df["res_id"]  = le_res.fit_transform(df["result_name"])

# helper for goal geometry (fixed by problem: attacking to the right, goal center at (105,34))
GOAL_X, GOAL_Y = 105.0, 34.0

episodes = []
targets  = []
episode_keys = []
episode_game_ids = []

# build sequences per game_episode
for key, g in df.groupby("game_episode"):
    g = g.reset_index(drop=True)
    if len(g) < 2:
        continue

    # target is the last event's end point (always pass in your checks)
    tx, ty = float(g.loc[len(g)-1, "end_x"]), float(g.loc[len(g)-1, "end_y"])
    if np.isnan(tx) or np.isnan(ty):
        continue

    # compute dt inside episode
    t = g["time_seconds"].astype("float32").values
    dt = np.zeros_like(t, dtype="float32")
    dt[1:] = t[1:] - t[:-1]
    dt[dt < 0] = 0.0  # time-reversal safe-guard

    # base coords
    sx = g["start_x"].astype("float32").values
    sy = g["start_y"].astype("float32").values
    ex = g["end_x"].astype("float32").values
    ey = g["end_y"].astype("float32").values

    # leak-safe masking for last event's end
    is_end = np.ones(len(g), dtype="float32")
    is_end[-1] = 0.0
    ex_mask = ex.copy()
    ey_mask = ey.copy()
    ex_mask[-1] = 0.0
    ey_mask[-1] = 0.0

    # is_start is always 1 in event-token
    is_start = np.ones(len(g), dtype="float32")

    # goal geometry from start position
    dxg = GOAL_X - sx
    dyg = GOAL_Y - sy
    dist_to_goal  = np.sqrt(dxg**2 + dyg**2).astype("float32")
    angle_to_goal = np.arctan2(dyg, dxg).astype("float32")  # radians

    # categorical ids per event
    type_id = g["type_id"].astype("int64").values
    res_id  = g["res_id"].astype("int64").values

    # continuous features per event (T, F_cont)
    # x,y -> start_x,start_y
    # end_x,end_y -> masked for last event
    # dt -> time gap
    # is_start,is_end -> flags
    # dist_to_goal, angle_to_goal -> geometry
    cont = np.stack(
        [sx, sy, ex_mask, ey_mask, dt, is_start, is_end, dist_to_goal, angle_to_goal],
        axis=1
    ).astype("float32")

    episodes.append({
        "cont": cont,         # (T, 9)
        "type_id": type_id,   # (T,)
        "res_id": res_id      # (T,)
    })
    targets.append(np.array([tx, ty], dtype="float32"))
    episode_keys.append(key)
    episode_game_ids.append(int(str(key).split("_")[0]))

print("num episodes:", len(episodes))
print("example cont shape:", episodes[0]["cont"].shape, "| example target:", targets[0])


num episodes: 15428
example cont shape: (49, 9) | example target: [97.13403 41.79307]


## 4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [23]:
# build dataset with padding collate for variable-length episodes
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import GroupKFold
import numpy as np

class EpisodeDataset(Dataset):
    # store sequences and targets
    def __init__(self, episodes, targets, keys):
        # episodes: list of dict(cont, type_id, res_id)
        self.episodes = episodes
        self.targets = targets
        self.keys = keys

    # number of episodes
    def __len__(self):
        return len(self.episodes)

    # return one episode
    def __getitem__(self, idx):
        ep = self.episodes[idx]
        cont = torch.from_numpy(ep["cont"])                 # (T, F)
        type_id = torch.from_numpy(ep["type_id"])           # (T,)
        res_id  = torch.from_numpy(ep["res_id"])            # (T,)
        y = torch.from_numpy(self.targets[idx])             # (2,)
        key = self.keys[idx]
        return cont, type_id, res_id, y, key

def collate_fn(batch):
    # unpack batch
    conts, type_ids, res_ids, ys, keys = zip(*batch)

    # lengths for packing
    lengths = torch.tensor([c.shape[0] for c in conts], dtype=torch.long)

    # pad to max length in batch
    cont_pad = pad_sequence(conts, batch_first=True, padding_value=0.0)       # (B, T, F)
    type_pad = pad_sequence(type_ids, batch_first=True, padding_value=0)      # (B, T)
    res_pad  = pad_sequence(res_ids,  batch_first=True, padding_value=0)      # (B, T)
    y = torch.stack(ys, dim=0).float()                                        # (B, 2)

    return cont_pad.float(), type_pad.long(), res_pad.long(), lengths, y, keys

# group split by game_id
episode_game_ids = np.array(episode_game_ids, dtype=np.int64)
gkf = GroupKFold(n_splits=5)

tr_idx, va_idx = next(gkf.split(np.zeros(len(episodes)), np.zeros(len(episodes)), groups=episode_game_ids))

train_eps = [episodes[i] for i in tr_idx]
train_tg  = [targets[i]  for i in tr_idx]
train_keys= [episode_keys[i] for i in tr_idx]

valid_eps = [episodes[i] for i in va_idx]
valid_tg  = [targets[i]  for i in va_idx]
valid_keys= [episode_keys[i] for i in va_idx]

train_ds = EpisodeDataset(train_eps, train_tg, train_keys)
valid_ds = EpisodeDataset(valid_eps, valid_tg, valid_keys)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True,  collate_fn=collate_fn)
valid_loader = DataLoader(valid_ds, batch_size=256, shuffle=False, collate_fn=collate_fn)

print("train episodes:", len(train_ds), "| valid episodes:", len(valid_ds))

cont_pad, type_pad, res_pad, lengths, y, keys = next(iter(train_loader))
print("batch cont_pad:", tuple(cont_pad.shape))
print("batch type_pad:", tuple(type_pad.shape), "batch res_pad:", tuple(res_pad.shape))
print("lengths:", tuple(lengths.shape), "y:", tuple(y.shape))
print("example key:", keys[0])


train episodes: 12320 | valid episodes: 3108
batch cont_pad: (256, 173, 9)
batch type_pad: (256, 173) batch res_pad: (256, 173)
lengths: (256,) y: (256, 2)
example key: 126357_49


## 5. LSTM 베이스라인 모델 정의

In [24]:
# define lstm model with categorical embeddings
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class PassLSTM(nn.Module):
    # lstm for sequence regression
    def __init__(self, cont_dim, n_type, n_res, emb_dim=16, hidden=256, num_layers=1, dropout=0.0):
        super().__init__()

        # embeddings for categories
        self.type_emb = nn.Embedding(n_type, emb_dim)
        self.res_emb  = nn.Embedding(n_res,  emb_dim)

        in_dim = cont_dim + emb_dim + emb_dim

        # lstm backbone
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=False
        )

        # regression head -> (x,y)
        self.head = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 2)
        )

    def forward(self, cont_pad, type_pad, res_pad, lengths):
        # embed categories
        te = self.type_emb(type_pad)  # (B,T,emb)
        re = self.res_emb(res_pad)    # (B,T,emb)

        # concat all features
        x = torch.cat([cont_pad, te, re], dim=-1)  # (B,T,in_dim)

        # pack padded sequence to ignore padding steps
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)

        # use last layer hidden state
        h_last = h_n[-1]  # (B, hidden)
        out = self.head(h_last)  # (B,2)
        return out

# sizes for embeddings
n_type = int(df["type_id"].max() + 1)
n_res  = int(df["res_id"].max() + 1)
cont_dim = 9

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PassLSTM(cont_dim=cont_dim, n_type=n_type, n_res=n_res, emb_dim=16, hidden=256).to(device)

print("device:", device)
print("n_type:", n_type, "n_res:", n_res, "cont_dim:", cont_dim)


device: cpu
n_type: 26 n_res: 9 cont_dim: 9


## 6. 모델 학습 및 검증

In [25]:
# train loop with euclidean metric
import torch
import torch.nn as nn
import numpy as np

def mean_euclidean(pred, true):
    # compute mean euclidean distance
    d = torch.sqrt(((pred - true) ** 2).sum(dim=1))
    return d.mean().item()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.SmoothL1Loss()  # stable for regression

EPOCHS = 30

best_val = 1e9
best_state = None

for epoch in range(1, EPOCHS + 1):
    # train
    model.train()
    tr_losses = []

    for cont_pad, type_pad, res_pad, lengths, y, keys in train_loader:
        cont_pad = cont_pad.to(device)
        type_pad = type_pad.to(device)
        res_pad  = res_pad.to(device)
        lengths  = lengths.to(device)
        y        = y.to(device)

        optimizer.zero_grad()
        pred = model(cont_pad, type_pad, res_pad, lengths)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        tr_losses.append(loss.item())

    # valid
    model.eval()
    val_dists = []
    with torch.no_grad():
        for cont_pad, type_pad, res_pad, lengths, y, keys in valid_loader:
            cont_pad = cont_pad.to(device)
            type_pad = type_pad.to(device)
            res_pad  = res_pad.to(device)
            lengths  = lengths.to(device)
            y        = y.to(device)

            pred = model(cont_pad, type_pad, res_pad, lengths)
            val_dists.append(mean_euclidean(pred, y))

    tr_loss = float(np.mean(tr_losses))
    val_dist = float(np.mean(val_dists))

    print(f"[epoch {epoch}] train_loss={tr_loss:.4f} | valid_mean_dist={val_dist:.4f}")

    # save best
    if val_dist < best_val:
        best_val = val_dist
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

# load best
if best_state is not None:
    model.load_state_dict(best_state)

print("best valid_mean_dist:", best_val)


[epoch 1] train_loss=32.2651 | valid_mean_dist=30.6791
[epoch 2] train_loss=16.7121 | valid_mean_dist=26.0290
[epoch 3] train_loss=14.5704 | valid_mean_dist=20.5087
[epoch 4] train_loss=10.5141 | valid_mean_dist=16.4103
[epoch 5] train_loss=9.5484 | valid_mean_dist=15.7581
[epoch 6] train_loss=9.2841 | valid_mean_dist=15.6826
[epoch 7] train_loss=9.0207 | valid_mean_dist=15.5961
[epoch 8] train_loss=8.9537 | valid_mean_dist=15.0756
[epoch 9] train_loss=8.8639 | valid_mean_dist=15.0737
[epoch 10] train_loss=8.7824 | valid_mean_dist=14.8192
[epoch 11] train_loss=8.6867 | valid_mean_dist=14.9198
[epoch 12] train_loss=8.5941 | valid_mean_dist=14.6976
[epoch 13] train_loss=8.5559 | valid_mean_dist=15.0847
[epoch 14] train_loss=8.5913 | valid_mean_dist=14.7909
[epoch 15] train_loss=8.4692 | valid_mean_dist=14.6760
[epoch 16] train_loss=8.3902 | valid_mean_dist=14.7133
[epoch 17] train_loss=8.3885 | valid_mean_dist=14.6581
[epoch 18] train_loss=8.4596 | valid_mean_dist=14.3820
[epoch 19] trai

## 7. 평가 데이터셋 추론

In [32]:
# 7) inference on test episodes listed in test.csv
import os
import numpy as np
import pandas as pd
import torch

# paths (edit if your notebook location changes)
TEST_META_PATH = "../data/test.csv"
SUBMISSION_PATH = "../data/sample_submission.csv"
DATA_ROOT = "../data"  # base folder that contains ./test/...

# constants for goal geometry
GOAL_X = 105.0
GOAL_Y = 34.0

# device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

test_meta = pd.read_csv(TEST_META_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

# build one episode features using the same preprocessing as training
def build_episode_from_df(g):
    # sort inside episode
    g = g.sort_values(["time_seconds", "action_id"]).reset_index(drop=True)

    # fill categories
    g["type_name"] = g["type_name"].fillna("__NA_TYPE__")
    g["result_name"] = g["result_name"].fillna("__NA_RES__")

    # handle unseen labels safely
    g.loc[~g["type_name"].isin(le_type.classes_), "type_name"] = "__NA_TYPE__"
    g.loc[~g["result_name"].isin(le_res.classes_), "result_name"] = "__NA_RES__"

    type_id = le_type.transform(g["type_name"]).astype("int64")
    res_id  = le_res.transform(g["result_name"]).astype("int64")

    # time delta (dt)
    t = g["time_seconds"].astype("float32").values
    dt = np.zeros_like(t, dtype="float32")
    dt[1:] = t[1:] - t[:-1]
    dt[dt < 0] = 0.0

    # coordinates
    sx = g["start_x"].astype("float32").values
    sy = g["start_y"].astype("float32").values
    ex = g["end_x"].astype("float32").values
    ey = g["end_y"].astype("float32").values

    # replace nan for safety
    sx = np.nan_to_num(sx, nan=0.0)
    sy = np.nan_to_num(sy, nan=0.0)
    ex = np.nan_to_num(ex, nan=0.0)
    ey = np.nan_to_num(ey, nan=0.0)

    # mask last end (target leakage prevention consistency)
    is_start = np.ones(len(g), dtype="float32")
    is_end = np.ones(len(g), dtype="float32")
    is_end[-1] = 0.0

    ex_mask = ex.copy()
    ey_mask = ey.copy()
    ex_mask[-1] = 0.0
    ey_mask[-1] = 0.0

    # geometry to goal
    dxg = GOAL_X - sx
    dyg = GOAL_Y - sy
    dist_to_goal  = np.sqrt(dxg**2 + dyg**2).astype("float32")
    angle_to_goal = np.arctan2(dyg, dxg).astype("float32")

    # continuous features
    # cont shape: (T, 9) = [sx, sy, ex_mask, ey_mask, dt, is_start, is_end, dist_to_goal, angle_to_goal]
    cont = np.stack(
        [sx, sy, ex_mask, ey_mask, dt, is_start, is_end, dist_to_goal, angle_to_goal],
        axis=1
    ).astype("float32")

    return cont, type_id, res_id

pred_map = {}  # game_episode -> (pred_x, pred_y)

with torch.no_grad():
    for _, row in test_meta.iterrows():
        game_episode = row["game_episode"]

        # test.csv has a column "path" like "./test/153363/153363_1.csv"
        rel_path = str(row["path"])
        rel_path = rel_path[2:] if rel_path.startswith("./") else rel_path
        full_path = os.path.join(DATA_ROOT, rel_path)

        g = pd.read_csv(full_path)

        cont, type_id, res_id = build_episode_from_df(g)

        # to torch tensor and batchify (B=1)
        cont_t = torch.from_numpy(cont).unsqueeze(0).to(device)     # (1,T,F)
        type_t = torch.from_numpy(type_id).unsqueeze(0).to(device)  # (1,T)
        res_t  = torch.from_numpy(res_id).unsqueeze(0).to(device)   # (1,T)
        lengths = torch.tensor([cont.shape[0]], dtype=torch.long).to(device)

        pred = model(cont_t.float(), type_t.long(), res_t.long(), lengths)  # (1,2)
        pred_xy = pred.squeeze(0).detach().cpu().numpy().astype("float32")

        pred_map[game_episode] = pred_xy

# align predictions to sample_submission order
preds_x = []
preds_y = []
missing = []

for ge in submission["game_episode"].tolist():
    if ge not in pred_map:
        missing.append(ge)
        preds_x.append(0.0)
        preds_y.append(0.0)
        continue
    px, py = pred_map[ge]
    preds_x.append(float(px))
    preds_y.append(float(py))

if len(missing) > 0:
    print("warning: missing episodes in pred_map:", len(missing))

submission["end_x"] = preds_x
submission["end_y"] = preds_y

print("inference done:", len(submission))


inference done: 2414


In [33]:
# 8) save submission with auto-increment filename
import os

base = "LSTM_1_submit"
ext = ".csv"

i = 0
while True:
    out_name = f"{base}_{i}{ext}"
    if not os.path.exists(out_name):
        break
    i += 1

submission[["game_episode", "end_x", "end_y"]].to_csv(out_name, index=False)
print("saved:", out_name)


saved: LSTM_1_submit_1.csv


## 8. 제출 Submission 생성

In [30]:
# save submission with auto-increment filename
import os

base = "LSTM_1_submit"
ext = ".csv"

i = 0
while True:
    out_name = f"{base}_{i}{ext}"
    if not os.path.exists(out_name):
        break
    i += 1

submission[["game_episode", "end_x", "end_y"]].to_csv(out_name, index=False)
print("saved:", out_name)


saved: LSTM_1_submit_0.csv
