## 1. Import

In [1]:
# for data manipulation
import pandas as pd
import numpy as np

# split data into train and test sets
from sklearn.model_selection import train_test_split

# progress bar
from tqdm import tqdm

# deep learning framework
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence # padding to handle variable length sequences
from torch.utils.data import Dataset, DataLoader

## 2. 하이퍼파라미터 세팅

In [2]:
BATCH_SIZE = 64
EPOCHS = 100
LR = 1e-3 # learning rate
HIDDEN_DIM = 64 # memorizing capacity
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

Using device: cpu


## 3. 데이터 로드 및 전처리
### - 에피소드 별 (x,y) 시퀀스 생성

In [3]:

df = pd.read_csv("../data/train.csv")
# arrange by episode and time
df = df.sort_values(["game_episode", "time_seconds"]).reset_index(drop=True)

episodes = []
targets = []

# categorize by game_episode rows
# ex) g1 = game_episode 1 data, g2 = game_episode 2 data, ...
for _, g in tqdm(df.groupby("game_episode")):
    g = g.reset_index(drop=True)
    if len(g) < 2: # skip if less than 2 rows
        continue

    # 정규화된 좌표 준비
    # sizing based on standard soccer field dimensions (105m x 68m)
    sx = g["start_x"].values / 105.0
    sy = g["start_y"].values / 68.0
    ex = g["end_x"].values   / 105.0
    ey = g["end_y"].values   / 68.0

    # number of passes
    coords = []
    for i in range(len(g)):
        # 항상 start는 들어감
        coords.append([sx[i], sy[i]])
        # 마지막 행 이전까지만 end를 넣음 (마지막 end는 타깃이므로)
        if i < len(g) - 1:
            coords.append([ex[i], ey[i]])

    # [T, 2] # if game_episode has 4 rows -> seq has 7 rows (except last end)
    # dim = 2 (x, y)
    seq = np.array(coords, dtype="float32")
    target = np.array([ex[-1], ey[-1]], dtype="float32")  # 마지막 행 end_x, end_y as target(prediction)

    episodes.append(seq)
    targets.append(target)

print("에피소드 수 : ", len(episodes))

100%|██████████| 15435/15435 [00:01<00:00, 11342.10it/s]

에피소드 수 :  15428





## 4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [None]:
class EpisodeDataset(Dataset):
    # initialize with episodes and targets
    def __init__(self, episodes, targets):
        self.episodes = episodes
        self.targets = targets

    # return number of episodes
    def __len__(self):
        return len(self.episodes)

    # data -> tensor
    def __getitem__(self, idx):
        seq = torch.tensor(self.episodes[idx])   # [T, 2]
        tgt = torch.tensor(self.targets[idx])    # [2]
        length = seq.size(0)
        return seq, length, tgt

def collate_fn(batch):
    seqs, lengths, tgts = zip(*batch)
    lengths = torch.tensor(lengths, dtype=torch.long)

    # used to make same length for variable length due to pass counts by padding
    # standard: longest sequence in the batch
    # short sequences are padded with zeros
    # shape: [B, T, 2] = [batch_size, max_seq_length, 2]
    padded = pad_sequence(seqs, batch_first=True)

    tgts = torch.stack(tgts, dim=0)                # [B, 2]

    # to let model know which part is padding
    return padded, lengths, tgts

# 에피소드 단위 train / valid split
# every episode has answer
# 8: train the model
# 2: validate the model performance on unseen data
idx_train, idx_valid = train_test_split(
    np.arange(len(episodes)), test_size=0.2, random_state=42
)

episodes_train = [episodes[i] for i in idx_train]
targets_train  = [targets[i]  for i in idx_train]
episodes_valid = [episodes[i] for i in idx_valid]
targets_valid  = [targets[i]  for i in idx_valid]

# for training, with shuffle
train_loader = DataLoader(
    EpisodeDataset(episodes_train, targets_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# for evaluation, no shuffle
valid_loader = DataLoader(
    EpisodeDataset(episodes_valid, targets_valid),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

print("train episodes:", len(episodes_train), "valid episodes:", len(episodes_valid))

train episodes: 12342 valid episodes: 3086


## 5. LSTM 베이스라인 모델 정의

In [16]:
# use only (x, y) features
class LSTMBaseline(nn.Module):
    def __init__(self, input_dim=2, hidden_dim=64):
        super().__init__()
        # LSTM: mermorizing
        # [input_dim: 2 (x, y), hidden_dim: 64] -> memorize (x, y)'s features by 64 numbers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        # FC: by LSTM's 64 info predict (x, y)
        self.fc = nn.Linear(hidden_dim, 2)  # (x_norm, y_norm)

    def forward(self, x, lengths):
        # x: [B, T, 2], lengths: [B]
        # ignore padding parts
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        h_last = h_n[-1]      # [B, H] 마지막 layer의 hidden state which is the last compressed info
        out = self.fc(h_last) # [B, 2]
        return out

model = LSTMBaseline(input_dim=2, hidden_dim=HIDDEN_DIM).to(DEVICE)
criterion = nn.MSELoss() # avg[(Predicted - Target)^2] -> when error is large, loss is large
optimizer = torch.optim.Adam(model.parameters(), lr=LR) # gradient descent optimizer

## 6. 모델 학습 및 검증

In [None]:
best_dist = float("inf")
best_model_state = None

for epoch in range(1, EPOCHS + 1):
    # --- Train ---
    model.train()
    total_loss = 0.0

    for X, lengths, y in tqdm(train_loader):
        X, lengths, y = X.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad() # reset gradients
        pred = model(X, lengths) # forward pass
        loss = criterion(pred, y) # compute loss
        loss.backward() # backpropagation
        optimizer.step() # update parameters

        total_loss += loss.item() * X.size(0) # sum of losses

    train_loss = total_loss / len(train_loader.dataset) # average train loss

    # --- Valid: 평균 유클리드 거리 ---
    # when eval, no gradient calculation
    model.eval()
    dists = []

    with torch.no_grad():
        for X, lengths, y in tqdm(valid_loader):
            X, lengths, y = X.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)
            pred = model(X, lengths)

            pred_np = pred.cpu().numpy()
            true_np = y.cpu().numpy()

            pred_x = pred_np[:, 0] * 105.0
            pred_y = pred_np[:, 1] * 68.0
            true_x = true_np[:, 0] * 105.0
            true_y = true_np[:, 1] * 68.0

            # calculate Euclidean distance
            dist = np.sqrt((pred_x - true_x) ** 2 + (pred_y - true_y) ** 2)
            dists.append(dist)

    # for eval, use mean distance for easy comparison in human readable way
    mean_dist = np.concatenate(dists).mean()  # 평균 유클리드 거리

    print(
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} | "
        f"valid_mean_dist={mean_dist:.4f}"
    )

    # ----- BEST MODEL 업데이트 -----
    if mean_dist < best_dist:
        best_dist = mean_dist
        best_model_state = model.state_dict().copy()
        print(f" --> Best model updated! (dist={best_dist:.4f})")

## 7. 평가 데이터셋 추론

In [None]:
# Best Model Load
model.load_state_dict(best_model_state)
model.eval()

# exam
test_meta = pd.read_csv("../data/test.csv")

# answer
submission = pd.read_csv("../data/sample_submission.csv")

submission = submission.merge(test_meta, on="game_episode", how="left")

preds_x, preds_y = [], []

for _, row in tqdm(submission.iterrows(), total=len(submission)):
    g = pd.read_csv(row["path"]).sort_values("time_seconds").reset_index(drop=True)
    # 정규화된 좌표 준비
    sx = g["start_x"].values / 105.0
    sy = g["start_y"].values / 68.0
    ex = g["end_x"].values / 105.0
    ey = g["end_y"].values / 68.0

    coords = []
    for i in range(len(g)):
        # start는 항상 존재하므로 그대로 사용
        coords.append([sx[i], sy[i]])
        # 마지막 행은 end_x가 NaN이므로 자동으로 제외됨
        if i < len(g) - 1:
            coords.append([ex[i], ey[i]])

    seq = np.array(coords, dtype="float32")  # [T, 2]

    x = torch.tensor(seq).unsqueeze(0).to(DEVICE)      # [1, T, 2]
    length = torch.tensor([seq.shape[0]]).to(DEVICE)   # [1]

    with torch.no_grad():
        pred = model(x, length).cpu().numpy()[0]       # [2], 정규화 좌표

    # resize back to original scale
    preds_x.append(pred[0] * 105.0)
    preds_y.append(pred[1] * 68.0)
print("Inference Done.")

  0%|          | 0/2414 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: './test/153363/153363_1.csv'

## 8. 제출 Submission 생성

In [19]:
# submission["end_x"] = preds_x
# submission["end_y"] = preds_y
# submission[["game_episode", "end_x", "end_y"]].to_csv("./baseline_submit.csv", index=False)
# print("Saved: baseline_submit.csv")

import os

submission["end_x"] = preds_x
submission["end_y"] = preds_y

base = "baseline_submit"
i = 0
while os.path.exists(f"{base}_{i}.csv"):
    i += 1

out_path = f"{base}_{i}.csv"
submission[["game_episode", "end_x", "end_y"]].to_csv(out_path, index=False)
print(f"Saved: {out_path}")


Saved: baseline_submit_0.csv
