# Incheon Departure Congestion (Colab)

Colab end-to-end: 데이터 수집 -> EDA -> 회귀/잔차 -> LSTM 예측 + 시각화.

In [1]:
%%capture
!pip install pandas numpy matplotlib seaborn scikit-learn torch requests

## 1) 환경변수와 키 설정
- data.go.kr 서비스키를 URL 인코딩된 상태로 넣어주세요.
- Colab에서는 `os.environ["INCHEON_API_KEY"]`에 직접 할당.

In [2]:
import os
from urllib.parse import unquote

# Supply your own data.go.kr service key (URL-encoded form) via the environment.
# The key is a credential, so it must not be hard-coded in the notebook: if the
# variable is not already set, prompt for it without echoing the input.
if not os.environ.get("INCHEON_API_KEY"):
    from getpass import getpass

    os.environ["INCHEON_API_KEY"] = getpass("INCHEON_API_KEY (URL-encoded): ").strip()
assert os.environ["INCHEON_API_KEY"], "서비스키를 설정하세요"

## 2) 데이터 수집
- XML 응답을 파싱해서 DataFrame으로 변환
- 옵션: terminalId(`P01`), gateId(`DG2_E`), 페이지/행수 조정

In [3]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from typing import List, Dict

# Endpoint queried by fetch_page for departure-congestion records (apis.data.go.kr).
BASE_URL = "https://apis.data.go.kr/B551177/statusOfDepartureCongestion/getDepartureCongestion"

def parse_xml_items(text: str) -> List[Dict]:
    """Parse an XML response body into a list of per-item dictionaries.

    The first ``<items>`` element found anywhere in the document is used; each
    direct ``<item>`` child becomes a dict mapping child tag names to their
    text (``None`` for empty elements).

    Args:
        text: Raw XML document.

    Returns:
        One dict per ``<item>``, or an empty list when no ``<items>`` element
        exists.
    """
    container = ET.fromstring(text).find(".//items")
    # Compare against None explicitly: an Element with no children is falsy.
    if container is None:
        return []
    return [
        {field.tag: field.text for field in record}
        for record in container.findall("item")
    ]

def normalize_item(item: Dict) -> Dict:
    """Map one raw API item onto the flat record layout used by the notebook.

    The gate id is upper-cased once and that normalized value is used both for
    the output columns and for extracting the exit number, so a lower-case id
    such as ``dg2_e`` yields the same ``exitnumber`` as ``DG2_E``.

    Args:
        item: Dict of raw tag -> text as produced by ``parse_xml_items``.
            Missing keys and ``None`` values are tolerated.

    Returns:
        Dict with keys ``terminalid``, ``gateid``, ``exitnumber``,
        ``gatenumber``, ``regdate``, ``congestion``, ``waittime`` and
        ``operatingtime``. ``exitnumber`` is the digits following ``DG`` in an
        underscore-separated segment of the gate id, or ``""`` if none matches.
    """
    gate_id = (item.get("gateId", "") or "").upper()
    exitnumber = ""
    for part in gate_id.split("_"):
        digits = part[2:]
        if part.startswith("DG") and digits.isdigit():
            # No break: if several segments match, the last one wins.
            exitnumber = digits
    return {
        "terminalid": (item.get("terminalId", "") or "").upper(),
        "gateid": gate_id,
        "exitnumber": exitnumber,
        "gatenumber": gate_id,
        "regdate": item.get("occurtime", ""),
        "congestion": item.get("waitLength", ""),
        "waittime": item.get("waitTime", ""),
        "operatingtime": item.get("operatingTime", ""),
    }

def fetch_page(page: int = 1, rows: int = 200, terminal_id: str | None = None, gate_id: str | None = None) -> List[Dict]:
    """Request one page from BASE_URL and return its normalized items.

    The service key is read from the ``INCHEON_API_KEY`` environment variable
    and URL-decoded before being handed to ``requests`` as a query parameter.

    Args:
        page: Value sent as ``pageNo``.
        rows: Value sent as ``numOfRows``.
        terminal_id: Optional ``terminalId`` filter; omitted when falsy.
        gate_id: Optional ``gateId`` filter; omitted when falsy.

    Returns:
        List of records produced by ``normalize_item``.

    Raises:
        AssertionError: If ``INCHEON_API_KEY`` is unset or empty.
        requests.HTTPError: If the response status indicates an error.
    """
    encoded_key = os.getenv("INCHEON_API_KEY")
    assert encoded_key, "환경변수 INCHEON_API_KEY가 필요합니다"

    query = {
        "serviceKey": unquote(encoded_key),
        "pageNo": page,
        "numOfRows": rows,
        "type": "xml",
    }
    # Only send the optional filters that were actually provided.
    optional_filters = (("terminalId", terminal_id), ("gateId", gate_id))
    query.update({name: value for name, value in optional_filters if value})

    response = requests.get(BASE_URL, params=query, timeout=15)
    response.raise_for_status()
    return [normalize_item(raw) for raw in parse_xml_items(response.text)]

def fetch_all(pages: int = 3, rows: int = 200, terminal_id: str | None = None, gate_id: str | None = None) -> pd.DataFrame:
    all_rows = []
    for p in range(1, pages + 1):
        page_rows = fetch_page(p, rows, terminal_id, gate_id)
        if not page_rows:
            break
        all_rows.extend(page_rows)
    df = pd.DataFrame(all_rows)
    df["regdate"] = pd.to_datetime(df["regdate"], format="%Y%m%d%H%M%S", errors="coerce")
    df["congestion"] = pd.to_numeric(df["congestion"], errors="coerce")
    df["waittime"] = pd.to_numeric(df["waittime"], errors="coerce")
    df = df.dropna(subset=["regdate", "congestion"])
    df = df.sort_values("regdate").reset_index(drop=True)
    return df

# Request up to 5 pages of 200 rows each, with no terminal or gate filter.
df = fetch_all(pages=5, rows=200, terminal_id=None, gate_id=None)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,terminalid,gateid,exitnumber,gatenumber,regdate,congestion,waittime,operatingtime
0,P01,DG1_E,1,DG1_E,2025-12-09 13:50:00,0,6,06:30~09:30
1,P01,DG1_W,1,DG1_W,2025-12-09 13:50:00,0,6,06:30~09:30
2,P01,DG2_E,2,DG2_E,2025-12-09 13:50:00,16,8,06:00~20:00
3,P01,DG2_W,2,DG2_W,2025-12-09 13:50:00,19,8,06:00~20:00
4,P01,DG3_E,3,DG3_E,2025-12-09 13:50:00,76,17,00:00~24:00


## 3) EDA: 시계열/막대/분포

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set_theme(style="whitegrid")

# Line chart: congestion against record time, one line per terminal.
plt.figure(figsize=(12, 4))
sns.lineplot(
    x="regdate", y="congestion", hue="terminalid", data=df
).set_title("Congestion over time (by terminal)")
plt.show()

# Bar chart: congestion per gate id, with tilted tick labels.
plt.figure(figsize=(8, 4))
sns.barplot(x="gateid", y="congestion", data=df).set_title("Congestion by gate")
plt.xticks(rotation=45)
plt.show()

# Histogram with a KDE overlay of the congestion values.
plt.figure(figsize=(8, 4))
sns.histplot(df["congestion"], kde=True).set_title("Distribution of congestion")
plt.show()

## 4) 회귀 + 잔차
- 원-핫 인코딩: terminal/gate/exit
- 시간 특성: hour/dow sin-cos

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Derive the time features on a copy so the fetched frame stays untouched.
df_feat = df.copy()
df_feat["hour"] = df_feat["regdate"].dt.hour
df_feat["dow"] = df_feat["regdate"].dt.dayofweek
# Cyclical encoding: hour has period 24, day-of-week has period 7.
for _name, _period in (("hour", 24), ("dow", 7)):
    df_feat[f"{_name}_sin"] = np.sin(2*np.pi*df_feat[_name]/_period)
    df_feat[f"{_name}_cos"] = np.cos(2*np.pi*df_feat[_name]/_period)

# Categorical columns are one-hot encoded; the sin/cos columns are standardized.
cat_cols = ["terminalid", "gateid", "exitnumber"]
num_cols = ["hour_sin", "hour_cos", "dow_sin", "dow_cos"]
feature_cols = cat_cols + num_cols

target = df_feat["congestion"]
X = df_feat[feature_cols]

pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)
model = Pipeline(steps=[("prep", pre), ("lr", LinearRegression())])

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print({"rmse": rmse, "mae": mae})

# Histogram of the held-out residuals (actual minus predicted).
residuals = y_test - preds
sns.histplot(residuals, kde=True).set_title("Residual distribution (linear regression)")
plt.show()

# Predicted vs actual, with the y = x reference line dashed in red.
sns.scatterplot(x=preds, y=y_test)
_lo, _hi = y_test.min(), y_test.max()
plt.plot([_lo, _hi], [_lo, _hi], 'r--')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Pred vs Actual")
plt.show()

## 5) LSTM 예측 (시계열)
- lookback개의 연속된 행(관측치)으로 시퀀스 구성 (분 단위가 아니라 행 개수 기준)
- train/val 손실 곡선 출력
- 마지막 시퀀스로 n 스텝 예측

In [None]:
import torch
from torch import nn
from dataclasses import dataclass
from typing import Tuple, List

@dataclass
class LSTMConfig:
    """Settings read by LSTMForecaster, make_sequences and train_lstm."""

    # Number of consecutive rows in each input window built by make_sequences.
    lookback: int = 30
    # Hidden size of the LSTM and width of the first linear layer in the head.
    hidden_size: int = 64
    # Number of stacked LSTM layers.
    num_layers: int = 2
    # Learning rate passed to the Adam optimizer.
    lr: float = 1e-3
    # Number of training iterations run by train_lstm.
    epochs: int = 20
    # Fraction of the sequences (taken from the front) used for training.
    train_split: float = 0.8
    # Device the model is moved to in train_lstm.
    device: str = "cpu"

class LSTMForecaster(nn.Module):
    """Stacked LSTM followed by a two-layer MLP head that outputs one value.

    Args:
        n_features: Size of the last dimension of the input.
        config: Object providing ``hidden_size`` and ``num_layers``.
    """

    def __init__(self, n_features: int, config: LSTMConfig):
        super().__init__()
        width = config.hidden_size
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=width,
            num_layers=config.num_layers,
            batch_first=True,
        )
        self.head = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 1),
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and map its last time step through the head."""
        sequence_out, _state = self.lstm(x)
        # batch_first=True, so dim 1 is time; keep only the final step.
        return self.head(sequence_out[:, -1, :])

def make_sequences(df_in: pd.DataFrame, config: LSTMConfig):
    """Build sliding-window inputs and next-row congestion targets.

    Hour and day-of-week sin/cos features are derived from ``regdate`` on a
    copy of the input. Rows with a missing feature are dropped, then every run
    of ``config.lookback`` consecutive rows becomes one input window whose
    target is the ``congestion`` of the row immediately after it.

    Args:
        df_in: Frame with a datetime ``regdate`` column and a numeric
            ``congestion`` column. It is not modified.
        config: Object providing ``lookback``.

    Returns:
        ``(X, y)`` float32 tensors of shape ``(n, lookback, 5)`` and
        ``(n, 1)``, where ``n = usable_rows - lookback``.

    Raises:
        ValueError: If there are not more usable rows than ``lookback``, so
            no window/target pair can be formed.
    """
    feat_cols = ["congestion", "hour_sin", "hour_cos", "dow_sin", "dow_cos"]
    df_seq = df_in.copy()
    df_seq["hour"] = df_seq["regdate"].dt.hour
    df_seq["dow"] = df_seq["regdate"].dt.dayofweek
    df_seq["hour_sin"] = np.sin(2*np.pi*df_seq["hour"]/24)
    df_seq["hour_cos"] = np.cos(2*np.pi*df_seq["hour"]/24)
    df_seq["dow_sin"] = np.sin(2*np.pi*df_seq["dow"]/7)
    df_seq["dow_cos"] = np.cos(2*np.pi*df_seq["dow"]/7)
    values = df_seq[feat_cols].dropna().to_numpy()

    n_windows = len(values) - config.lookback
    if n_windows <= 0:
        # Fail with an actionable message instead of np.stack's error on an
        # empty list of windows.
        raise ValueError(
            f"need more than lookback={config.lookback} usable rows to build "
            f"sequences, got {len(values)}"
        )

    seqs = [values[i:i + config.lookback] for i in range(n_windows)]
    # Column 0 is congestion; the target of window i is row i + lookback.
    tgts = values[config.lookback:, 0]
    X = torch.tensor(np.stack(seqs), dtype=torch.float32)
    y = torch.tensor(tgts, dtype=torch.float32).unsqueeze(-1)
    return X, y

def train_lstm(df_in: pd.DataFrame, config: LSTMConfig):
    """Train an LSTMForecaster on sequences built from ``df_in``.

    The sequences are split in order: the first ``config.train_split``
    fraction is used for training and the rest for validation. Each epoch is
    one full-batch Adam step on the training part, followed by a validation
    loss evaluation; both losses are printed.

    Args:
        df_in: Frame accepted by ``make_sequences``.
        config: Provides ``lookback``, ``hidden_size``, ``num_layers``,
            ``lr``, ``epochs``, ``train_split`` and ``device``.

    Returns:
        ``(model, train_loss, val_loss, X, y)``: the trained model (on
        ``config.device``), per-epoch MSE lists, and the full sequence and
        target tensors exactly as returned by ``make_sequences``.
    """
    X, y = make_sequences(df_in, config)
    split = int(len(X) * config.train_split)
    device = config.device
    # The training/validation tensors must live on the same device as the
    # model; otherwise any non-CPU device fails on the first forward pass.
    X_tr, X_val = X[:split].to(device), X[split:].to(device)
    y_tr, y_val = y[:split].to(device), y[split:].to(device)

    model = LSTMForecaster(X.shape[2], config).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=config.lr)
    loss_fn = nn.MSELoss()
    train_loss, val_loss = [], []
    for ep in range(config.epochs):
        model.train()
        opt.zero_grad()
        loss = loss_fn(model(X_tr), y_tr)
        loss.backward()
        opt.step()

        model.eval()
        with torch.no_grad():
            v_loss = loss_fn(model(X_val), y_val)

        train_loss.append(loss.item())
        val_loss.append(v_loss.item())
        print(f"Epoch {ep+1}/{config.epochs} train={loss.item():.4f} val={v_loss.item():.4f}")
    # X and y are returned unmoved so callers can keep using them as before.
    return model, train_loss, val_loss, X, y

# Train on CPU with a 30-row lookback for 15 epochs; other settings keep their defaults.
config = LSTMConfig(lookback=30, epochs=15, device="cpu")
# X_all / y_all are the full sequence and target tensors, reused for forecasting below.
model, tr_loss, val_loss, X_all, y_all = train_lstm(df, config)

# Per-epoch training and validation MSE curves.
plt.plot(tr_loss, label="train")
plt.plot(val_loss, label="val")
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.title("LSTM loss")
plt.legend()
plt.show()

# n-step forecast starting from the most recent window.
n_steps = 20
# Run the forecast on whatever device the model's parameters live on.
_device = next(model.parameters()).device

# X_all[-1] ends one row before the last observation (its target is y_all[-1]),
# so predicting from it would only re-predict a value that is already known.
# Slide the window forward by one row using the last observed congestion.
# The temporal encodings of that row are not available here, so the previous
# row's encodings are reused, as in the loop below.
last_obs = X_all[-1, -1, :].clone()
last_obs[0] = y_all[-1, 0]
last_seq = torch.cat([X_all[-1:, 1:, :], last_obs.view(1, 1, -1)], dim=1).to(_device)

future = []
model.eval()
with torch.no_grad():
    for _ in range(n_steps):
        pred = model(last_seq)
        future.append(pred.item())
        # Shift window: drop the oldest row, append the predicted congestion
        # while keeping the last row's temporal encodings.
        new_row = last_seq[0, -1, :].clone()
        new_row[0] = pred.item()
        last_seq = torch.cat([last_seq[:, 1:, :], new_row.view(1, 1, -1)], dim=1)

recent = y_all[-100:].squeeze(-1).cpu().numpy()
plt.figure(figsize=(10,4))
plt.plot(range(len(recent)), recent, label="actual")
# The forecast continues on the x-axis right after the last actual point.
plt.plot(range(len(recent), len(recent) + n_steps), future, label="forecast")
plt.title("Recent actual vs forecast")
plt.legend(); plt.show()