## Library

In [1]:
import re
import time
import requests
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

from __future__ import annotations
from holidayskr import year_holidays
from collections import defaultdict
from statistics import mean

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

### <서울교통공사_1_8호선 역별 일별 시간대별 승객유형별 승하차인원>

- **승·하차 인원 (`승차`, `하차`)**  

- **우대권 인원수 (`우대권_승차`, `우대권_하차`)**  

- **청소년 인원수 (`청소년_승차`, `청소년_하차`)**  

In [3]:
ENCODING = "cp949"

# =========================
# 시간대 표준 매핑 
# =========================
TIME_MAP = {
    "06시이전": "05시",
    "06시-07시": "06시",
    "07시-08시": "07시",
    "08시-09시": "08시",
    "09시-10시": "09시",
    "10시-11시": "10시",
    "11시-12시": "11시",
    "12시-13시": "12시",
    "13시-14시": "13시",
    "14시-15시": "14시",
    "15시-16시": "15시",
    "16시-17시": "16시",
    "17시-18시": "17시",
    "18시-19시": "18시",
    "19시-20시": "19시",
    "20시-21시": "20시",
    "21시-22시": "21시",
    "22시-23시": "22시",
    "23시이후": "00시",
}

# 2022~2023 시간대 컬럼명 -> 2021형(=TIME_MAP key)로 맞추기
TIME_COL_RENAME_2022_2023 = {
    "06시간대이전": "06시이전",
    "06-07시간대": "06시-07시",
    "07-08시간대": "07시-08시",
    "08-09시간대": "08시-09시",
    "09-10시간대": "09시-10시",
    "10-11시간대": "10시-11시",
    "11-12시간대": "11시-12시",
    "12-13시간대": "12시-13시",
    "13-14시간대": "13시-14시",
    "14-15시간대": "14시-15시",
    "15-16시간대": "15시-16시",
    "16-17시간대": "16시-17시",
    "17-18시간대": "17시-18시",
    "18-19시간대": "18시-19시",
    "19-20시간대": "19시-20시",
    "20-21시간대": "20시-21시",
    "21-22시간대": "21시-22시",
    "22-23시간대": "22시-23시",
    "23-24시간대": "23시이후",
    # 24시간대이후는 누락
}


# =========================
# 0) 파일 스키마를 2021형으로 통일
# =========================
def normalize_ridership_schema(df: pd.DataFrame) -> pd.DataFrame:

    out = df.copy()

    # 1) 날짜
    if "수송일자" in out.columns and "날짜" not in out.columns:
        out = out.rename(columns={"수송일자": "날짜"})

    # 2) 호선
    if "호선명" in out.columns and "호선" not in out.columns:
        out = out.rename(columns={"호선명": "호선"})

    # 3) 역번호
    if "고유역번호(외부역코드)" in out.columns and "역번호" not in out.columns:
        out = out.rename(columns={"고유역번호(외부역코드)": "역번호"})

    # 4) 승/하차 구분
    if "승하차구분" in out.columns and "구분" not in out.columns:
        out = out.rename(columns={"승하차구분": "구분"})

    # 5) 24시간대이후는 제거 (연도 불일치 방지)
    if "24시간대이후" in out.columns:
        out = out.drop(columns=["24시간대이후"])

    # 6) 2022~2023 시간대 컬럼명 -> 2021형으로 rename
    rename_candidates = {c: TIME_COL_RENAME_2022_2023[c] for c in out.columns if c in TIME_COL_RENAME_2022_2023}
    if rename_candidates:
        out = out.rename(columns=rename_candidates)

    # 7) 타입 정리 (호선/역번호 float로 들어오는 케이스 방지)
    out["날짜"] = pd.to_datetime(out["날짜"])
    out["호선"] = pd.to_numeric(out["호선"], errors="coerce").astype("Int64")
    out["역번호"] = pd.to_numeric(out["역번호"], errors="coerce").astype("Int64")

    # 8) 필수 컬럼 체크
    required = ["날짜", "호선", "역번호", "역명", "구분", "승객유형"]
    missing = [c for c in required if c not in out.columns]
    if missing:
        raise KeyError(f"필수 컬럼 누락: {missing}\n현재 컬럼={list(out.columns)}")

    return out


def map_user_type(x: str) -> str:
    if x == "일반":
        return "일반"
    if x in {"어린이", "중고생", "청소년"}:
        return "청소년"
    if x == "우대권":
        return "우대권"
    return "외국인"


def get_time_cols(df: pd.DataFrame) -> list[str]:
    # TIME_MAP 키(= 표준화 가능한 시간대)만 사용
    return [c for c in df.columns if c in TIME_MAP]


# =========================
# 1) concat
# =========================
def load_ridership_all(paths: list[str], encoding: str = ENCODING) -> pd.DataFrame:
    dfs = []
    for p in paths:
        df = pd.read_csv(p, encoding=encoding)
        df = normalize_ridership_schema(df)
        dfs.append(df)

    out = pd.concat(dfs, ignore_index=True)
    out = out.sort_values(["날짜", "호선", "역번호", "역명"], kind="mergesort").reset_index(drop=True)
    return out


# =========================
# 2) (승차/하차) -> 표준시간 -> tm
# =========================
def preprocess_passenger_data(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["승객유형"] = out["승객유형"].apply(map_user_type)

    time_cols = get_time_cols(out)
    if not time_cols:
        raise KeyError("시간대 컬럼을 찾지 못했습니다. TIME_MAP과 호환되는 컬럼이 없습니다.")

    long = out.melt(
        id_vars=["날짜", "호선", "역번호", "역명", "구분", "승객유형"],
        value_vars=time_cols,
        var_name="시간대",
        value_name="인원수",
    )

    # 인원수 NaN -> 0 
    long["인원수"] = pd.to_numeric(long["인원수"], errors="coerce").fillna(0)

    pivoted = (
        long.pivot_table(
            index=["날짜", "호선", "역번호", "역명", "시간대", "승객유형"],
            columns="구분",
            values="인원수",
            aggfunc="sum",
        )
        .reset_index()
    )
    pivoted.columns.name = None
    pivoted["승차"] = pivoted.get("승차", 0)
    pivoted["하차"] = pivoted.get("하차", 0)

    return pivoted.sort_values(
        ["날짜", "호선", "역번호", "역명", "시간대", "승객유형"],
        kind="mergesort",
    ).reset_index(drop=True)


def standardize_time(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["표준시간"] = out["시간대"].map(TIME_MAP)

    base_cols = ["날짜", "호선", "역번호", "역명", "승객유형"]
    extra = out[base_cols].drop_duplicates().assign(표준시간="01시", 승차=0, 하차=0)

    out2 = pd.concat(
        [out[base_cols + ["표준시간", "승차", "하차"]], extra],
        ignore_index=True,
    )

    return out2.sort_values(
        ["날짜", "호선", "역번호", "역명", "표준시간", "승객유형"],
        kind="mergesort",
    ).reset_index(drop=True)


def add_tm(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["hour"] = out["표준시간"].str.replace("시", "", regex=False).str.zfill(2)
    out["tm"] = out["날짜"].dt.strftime("%Y%m%d") + out["hour"]
    return out.drop(columns=["표준시간"])


# =========================
# 3) 실행 함수
# =========================
def build_ridership_long_allinone(paths: list[str]) -> pd.DataFrame:
    df_raw = load_ridership_all(paths)
    df_pre = preprocess_passenger_data(df_raw)
    df_std = standardize_time(df_pre)
    df_final = add_tm(df_std)
    return df_final

In [4]:
# =========================
# RUN
# =========================
paths_ridership = [
    # 2021
    "./data/서울교통공사_1-8호선 역별 일별 시간대별 승객유형별 승하차인원_20210630.csv",
    "./data/서울교통공사_1-8호선 역별 일별 시간대별 승객유형별 승하차인원_20211231.csv",
    # 2022
    "./data/서울교통공사_1-8호선 역별 일별 시간대별 승객유형별 승하차인원_20220630.csv",
    "./data/서울교통공사_1_8호선 역별 일별 시간대별 승객유형별 승하차인원_20221231.csv",
    # 2023
    "./data/서울교통공사_1_8호선 역별 일별 시간대별 승객유형별 승하차인원_20230630.csv",
    "./data/서울교통공사_1_8호선 역별 일별 시간대별 승객유형별 승하차인원_20231231.csv",
]

df_ridership_long = build_ridership_long_allinone(paths_ridership)
df_ridership_long

Unnamed: 0,날짜,호선,역번호,역명,승객유형,승차,하차,hour,tm
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101
...,...,...,...,...,...,...,...,...,...
22355495,2023-12-31,8,2828,남위례,청소년,23,6,21,2023123121
22355496,2023-12-31,8,2828,남위례,외국인,0,1,22,2023123122
22355497,2023-12-31,8,2828,남위례,우대권,10,17,22,2023123122
22355498,2023-12-31,8,2828,남위례,일반,152,151,22,2023123122


### <서울교통공사_지하철혼잡도정보>

- **혼잡도 (`congestion`)**  

- **상·하행 구분 (`direction`)**  

- **요일 구분 (`wd`)**  

- **시간 식별자 (`tm`)**  

- **노선 정보 (`line`)**  

- **역 식별 정보 (`station_number`, `station_name`)**  


In [5]:
# =========================
# 혼잡도 
# =========================
def load_and_process_congestion(
    paths: list[str],
    encoding: str = "cp949",
) -> pd.DataFrame:

    dfs_all = []

    for path in paths:
        df = pd.read_csv(path, encoding=encoding)

        # -------------------------
        # 1) 컬럼 자동 매핑
        # -------------------------
        col_wd = "요일구분" if "요일구분" in df.columns else "조사일자"
        col_station = "출발역" if "출발역" in df.columns else "역명"
        col_dir = "상하구분" if "상하구분" in df.columns else "구분"

        required = ["호선", "역번호", col_station, col_dir, col_wd]
        for c in required:
            if c not in df.columns:
                raise KeyError(f"필수 컬럼 누락: {c}")

        # -------------------------
        # 2) 시간대 컬럼 탐색
        # -------------------------
        time_cols = [
            c for c in df.columns
            if re.fullmatch(r"\d+시\d+분", str(c))
        ]

        # -------------------------
        # 3) 파일명에서 날짜 추출 (YYYYMMDD)
        # -------------------------
        m = re.search(r"_(\d{8})\.csv$", path)
        if not m:
            raise ValueError(f"파일명에서 날짜 추출 실패: {path}")
        base_date = m.group(1)

        # -------------------------
        # 4) wide → long
        # -------------------------
        df_long = df.melt(
            id_vars=[col_wd, "호선", "역번호", col_station, col_dir],
            value_vars=time_cols,
            var_name="시간대",
            value_name="congestion",
        )

        # -------------------------
        # 5) 시간대 → HH
        # -------------------------
        df_long["HH"] = df_long["시간대"].apply(
            lambda x: f"{int(str(x).split('시')[0]):02d}"
        )

        # -------------------------
        # 6) 컬럼명 통일
        # -------------------------
        out = (
            df_long.assign(
                tm=base_date + df_long["HH"],
                line=pd.to_numeric(df_long["호선"], errors="coerce"),
                station_number=pd.to_numeric(df_long["역번호"], errors="coerce"),
                station_name=df_long[col_station].astype(str),
                direction=df_long[col_dir].astype(str),
                wd=df_long[col_wd].astype(str),
            )
            [
                [
                    "tm",
                    "line",
                    "station_number",
                    "station_name",
                    "direction",
                    "wd",
                    "congestion",
                ]
            ]
        )

        # -------------------------
        # 7) 30분 → 1시간 평균
        # -------------------------
        out = (
            out.groupby(
                [
                    "tm",
                    "line",
                    "station_number",
                    "station_name",
                    "direction",
                    "wd",
                ],
                as_index=False,
            )["congestion"]
            .mean()
        )

        dfs_all.append(out)

    # =========================
    # 8) 연도별 concat
    # =========================
    df_all = pd.concat(dfs_all, ignore_index=True)

    # =========================
    # 9) 결측치 처리
    # =========================
    df_all["congestion"] = df_all["congestion"].fillna(0)

    return df_all


# =========================
# 실행
# =========================
paths = [
    "./data/서울교통공사_지하철혼잡도정보_20211231.csv",
    "./data/서울교통공사_지하철혼잡도정보_20221231.csv",
    "./data/서울교통공사_지하철혼잡도정보_20231231.csv",
]

df_congestion_all = load_and_process_congestion(paths)
df_congestion_all

Unnamed: 0,tm,line,station_number,station_name,direction,wd,congestion
0,2021123105,1,150,서울역,상선,일요일,2.40
1,2021123105,1,150,서울역,상선,토요일,6.80
2,2021123105,1,150,서울역,상선,평일,6.20
3,2021123105,1,150,서울역,하선,일요일,3.90
4,2021123105,1,150,서울역,하선,토요일,9.40
...,...,...,...,...,...,...,...
98851,2023123123,8,2828,남위례,상선,토요일,3.50
98852,2023123123,8,2828,남위례,상선,평일,5.55
98853,2023123123,8,2828,남위례,하선,일요일,12.30
98854,2023123123,8,2828,남위례,하선,토요일,22.25


### <서울교통공사_월별 환승유입인원>

- **환승 유입 인원 (`transfer_in`)**  

- **월 식별자 (`ym`)**  

- **노선 정보 (`line`)**  

- **역 식별 정보 (`station_number`, `station_name`)**  

- **시간 해상도**  

In [6]:
ENCODING = "cp949"

# =========================
# 역명 정리
# =========================
def clean_station_name(name: str) -> str:
    return re.sub(r"\(\d+\)$", "", str(name)).strip()


# =========================
# 1) 2021: wide(2021년1월~12월)
# =========================
def load_transfer_2021(path: str, encoding: str = ENCODING) -> pd.DataFrame:
    df = pd.read_csv(path, encoding=encoding)

    # 월 컬럼 추출: "2021년1월" 형태
    month_cols = [c for c in df.columns if re.fullmatch(r"2021년\d{1,2}월", str(c))]
    if not month_cols:
        raise KeyError("2021 월 컬럼(예: '2021년1월')을 찾지 못했습니다.")

    df_long = df.melt(
        id_vars=["호선", "역번호", "역명"],
        value_vars=month_cols,
        var_name="ym_raw",
        value_name="transfer_in",
    )

    # "2021년1월" -> "202101"
    def ym_from_kor(x: str) -> str:
        m = re.fullmatch(r"(2021)년(\d{1,2})월", str(x))
        y, mo = m.group(1), int(m.group(2))
        return f"{y}{mo:02d}"

    df_long["ym"] = df_long["ym_raw"].apply(ym_from_kor)

    out = df_long.rename(columns={"역번호": "station_number"}).copy()
    out["line"] = pd.to_numeric(out["호선"], errors="coerce").astype("Int64")
    out["station_number"] = pd.to_numeric(out["station_number"], errors="coerce").astype("Int64")
    out["station_name"] = out["역명"].apply(clean_station_name)

    out["transfer_in"] = pd.to_numeric(out["transfer_in"], errors="coerce").fillna(0)

    return out[["ym", "line", "station_number", "station_name", "transfer_in"]]


# =========================
# 2) 2022/2023: 스키마 통일
# =========================
def load_transfer_2022_2023(path: str, encoding: str = ENCODING) -> pd.DataFrame:
    df = pd.read_csv(path, encoding=encoding)

    # 역번호 컬럼명이 다름: "고유역번호(외부역코드)" -> station_number
    if "고유역번호(외부역코드)" in df.columns:
        df = df.rename(columns={"고유역번호(외부역코드)": "station_number"})
    elif "역번호" in df.columns:
        df = df.rename(columns={"역번호": "station_number"})
    else:
        raise KeyError("역번호 컬럼을 찾지 못했습니다. (고유역번호/역번호)")

    # 수송연월: "2022-01" -> "202201"
    if "수송연월" not in df.columns:
        raise KeyError("수송연월 컬럼이 없습니다.")
    df["ym"] = df["수송연월"].astype(str).str.replace("-", "", regex=False)

    # 환승유입인원수 컬럼
    if "환승유입인원수" not in df.columns:
        raise KeyError("환승유입인원수 컬럼이 없습니다.")

    out = pd.DataFrame(
        {
            "ym": df["ym"],
            "line": pd.to_numeric(df["호선"], errors="coerce").astype("Int64"),
            "station_number": pd.to_numeric(df["station_number"], errors="coerce").astype("Int64"),
            "station_name": df["역명"].apply(clean_station_name),
            "transfer_in": pd.to_numeric(df["환승유입인원수"], errors="coerce").fillna(0),
        }
    )
    return out


# =========================
# 3) 세 파일 통합
# =========================
paths_transfer = [
    "./data/서울교통공사_월별 환승유입인원_20211231.csv",
    "./data/서울교통공사_월별 환승유입인원_20221231.csv",
    "./data/서울교통공사_월별 환승유입인원_20231231.csv",
]

df_t21 = load_transfer_2021(paths_transfer[0])
df_t22 = load_transfer_2022_2023(paths_transfer[1])
df_t23 = load_transfer_2022_2023(paths_transfer[2])

df_transfer_monthly = pd.concat([df_t21, df_t22, df_t23], ignore_index=True)

# 월/역 기준으로 합산
df_transfer_monthly = (
    df_transfer_monthly
    .groupby(["ym", "line", "station_number"], as_index=False)
    .agg(
        station_name=("station_name", "first"),
        transfer_in=("transfer_in", "sum"),
    )
)

df_transfer_monthly

Unnamed: 0,ym,line,station_number,station_name,transfer_in
0,202101,1,150,서울역,555352.0
1,202101,1,151,시청,326347.0
2,202101,1,152,종각,555805.0
3,202101,1,153,종로3가,277482.0
4,202101,1,154,종로5가,379510.0
...,...,...,...,...,...
10363,202312,9,4134,송파나루,136207.0
10364,202312,9,4135,한성백제,52776.0
10365,202312,9,4136,올림픽공원,235870.0
10366,202312,9,4137,둔촌오륜,21021.0


In [7]:
# =========================
# 0) ridership 컬럼명/타입 정규화
#    - ridership에 line이 없으면(=호선/역번호) 자동 rename
# =========================
df_r = df_ridership_long.copy()

if "line" not in df_r.columns:
    df_r = df_r.rename(columns={"호선": "line", "역번호": "station_number"})

df_r["tm"] = df_r["tm"].astype(str)
df_r["line"] = df_r["line"].astype("Int64")
df_r["station_number"] = df_r["station_number"].astype("Int64")

# 월 키 생성 (YYYYMM)
df_r["ym"] = df_r["tm"].str.slice(0, 6)


# =========================
# 1) transfer(월별) 정규화 + ridership에 병합
# =========================
df_t = df_transfer_monthly.copy()
df_t["ym"] = df_t["ym"].astype(str)
df_t["line"] = df_t["line"].astype("Int64")
df_t["station_number"] = df_t["station_number"].astype("Int64")

df_rt = df_r.merge(
    df_t[["ym", "line", "station_number", "transfer_in"]],
    on=["ym", "line", "station_number"],
    how="left",
)

# 환승유입 결측 -> 0
df_rt["transfer_in"] = df_rt["transfer_in"].fillna(0)


# =========================
# 2) congestion(시간별) 상/하 분리 wide 생성
# =========================
df_c = df_congestion_all.copy()
df_c["tm"] = df_c["tm"].astype(str)
df_c["line"] = df_c["line"].astype("Int64")
df_c["station_number"] = df_c["station_number"].astype("Int64")
df_c["direction"] = df_c["direction"].astype(str).str.strip()

# 상선/하선만 사용
df_c = df_c[df_c["direction"].isin(["상선", "하선"])]

# (tm, line, station_number) 기준으로 상/하 혼잡도 wide
df_c_wide = (
    df_c.pivot_table(
        index=["tm", "line", "station_number"],
        columns="direction",
        values="congestion",
        aggfunc="mean",
    )
    .reset_index()
    .rename(columns={"상선": "congestion_up", "하선": "congestion_down"})
)

# 더미는 "존재 여부(결측 아님)" 기준이 더 안전
df_c_wide["is_up"] = df_c_wide["congestion_up"].notna().astype(int)
df_c_wide["is_down"] = df_c_wide["congestion_down"].notna().astype(int)

# 혼잡도 결측 -> 0
df_c_wide["congestion_up"] = df_c_wide["congestion_up"].fillna(0)
df_c_wide["congestion_down"] = df_c_wide["congestion_down"].fillna(0)


# =========================
# 3) 최종 병합
# =========================
df_final = df_rt.merge(
    df_c_wide[["tm", "line", "station_number", "congestion_up", "congestion_down", "is_up", "is_down"]],
    on=["tm", "line", "station_number"],
    how="left",
)

# 병합 후에도 결측은 0 처리
for c in ["congestion_up", "congestion_down", "is_up", "is_down"]:
    if c.startswith("is_"):
        df_final[c] = df_final[c].fillna(0).astype(int)
    else:
        df_final[c] = df_final[c].fillna(0)

# 결과
df_final

Unnamed: 0,날짜,line,station_number,역명,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22355495,2023-12-31,8,2828,남위례,청소년,23,6,21,2023123121,202312,119265.0,8.200000,27.183333,1,1
22355496,2023-12-31,8,2828,남위례,외국인,0,1,22,2023123122,202312,119265.0,6.866667,25.350000,1,1
22355497,2023-12-31,8,2828,남위례,우대권,10,17,22,2023123122,202312,119265.0,6.866667,25.350000,1,1
22355498,2023-12-31,8,2828,남위례,일반,152,151,22,2023123122,202312,119265.0,6.866667,25.350000,1,1


In [8]:
def add_calendar_features(df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    df = df.copy()

    # 1) 날짜 컬럼 datetime 보장
    df[date_col] = pd.to_datetime(df[date_col])

    # 2) 요일(월~일) 원핫
    weekday_en = df[date_col].dt.day_name()
    order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    kor = {
        "Monday": "월요일", "Tuesday": "화요일", "Wednesday": "수요일",
        "Thursday": "목요일", "Friday": "금요일", "Saturday": "토요일", "Sunday": "일요일",
    }

    weekday_en = pd.Categorical(weekday_en, categories=order, ordered=True)
    onehot = pd.get_dummies(weekday_en).astype(int)
    onehot.columns = [kor[c] for c in onehot.columns]
    df = pd.concat([df, onehot], axis=1)

    # 3) 평일/주말 여부
    df["is_평일"] = df[["월요일", "화요일", "수요일", "목요일", "금요일"]].sum(axis=1).gt(0).astype(int)
    df["is_주말"] = df[["토요일", "일요일"]].sum(axis=1).gt(0).astype(int)

    # 4) 공휴일 여부(holidayskr)
    years = df[date_col].dt.year.unique()
    holiday_dates = set()
    for y in years:
        holiday_dates.update([d for d, _ in year_holidays(int(y))])

    df["is_공휴일"] = df[date_col].dt.date.apply(lambda x: 1 if x in holiday_dates else 0).astype(int)

    # 5) 몇 주차(ISO week)
    df["week_number"] = df[date_col].dt.isocalendar().week.astype(int)

    return df


# =========================
# 사용
# =========================
df_final = add_calendar_features(df_final, date_col="날짜")
df_final

Unnamed: 0,날짜,line,station_number,역명,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22355495,2023-12-31,8,2828,남위례,청소년,23,6,21,2023123121,202312,119265.0,8.200000,27.183333,1,1,0,0,0,0,0,0,1,0,1,0,52
22355496,2023-12-31,8,2828,남위례,외국인,0,1,22,2023123122,202312,119265.0,6.866667,25.350000,1,1,0,0,0,0,0,0,1,0,1,0,52
22355497,2023-12-31,8,2828,남위례,우대권,10,17,22,2023123122,202312,119265.0,6.866667,25.350000,1,1,0,0,0,0,0,0,1,0,1,0,52
22355498,2023-12-31,8,2828,남위례,일반,152,151,22,2023123122,202312,119265.0,6.866667,25.350000,1,1,0,0,0,0,0,0,1,0,1,0,52


### <종관기상관측>
- 서울특별시 2021-2023 사이의 일시, 평균기온(°C), 일강수량(mm), 평균 풍속(m/s), 평균 상대습도(%), 일 최심적설(cm) 를 추출

In [9]:
df_asos = pd.read_csv(
    "./data/OBS_ASOS_DD_20251223022333.csv",
    encoding="cp949"
)

In [10]:
# 날짜 컬럼 datetime
df_asos["일시"] = pd.to_datetime(df_asos["일시"])

# 서울(108)만 사용
df_asos = df_asos[df_asos["지점"] == 108]

# 컬럼명 통일
df_asos = df_asos.rename(
    columns={
        "일시": "날짜",
        "평균기온(°C)": "temp_avg",
        "일강수량(mm)": "rain_day",
        "평균 풍속(m/s)": "wind_avg",
        "평균 상대습도(%)": "humid_avg",
        "일 최심적설(cm)": "snow_day",
    }
)

# 필요한 컬럼만 유지
df_asos = df_asos[
    ["날짜", "temp_avg", "rain_day", "wind_avg", "humid_avg", "snow_day"]
]

# 숫자형 변환 + 결측 방어
for c in ["temp_avg", "rain_day", "wind_avg", "humid_avg", "snow_day"]:
    df_asos[c] = pd.to_numeric(df_asos[c], errors="coerce")

In [11]:
df_final["날짜"] = pd.to_datetime(df_final["날짜"])

df_final = df_final.merge(
    df_asos,
    on="날짜",
    how="left"
)

In [12]:
weather_cols = ["temp_avg", "rain_day", "wind_avg", "humid_avg", "snow_day"]
df_final[weather_cols] = df_final[weather_cols].fillna(0)

In [13]:
# 상위 30개 역 필터링
top30_stations = [
    "홍대입구", "잠실", "강남", "서울역", "신림", "고속터미널", "신도림", "역삼", "선릉",
    "을지로입구", "종각", "가산디지털단지", "서울대입구", "성수", "혜화", "양재",
    "사당", "명동", "압구정", "수유", "연신내", "건대입구", "합정", "신사", "광화문",
    "시청", "여의도", "경복궁", "안국", "종로3가"
]

df_final = df_final[df_final["역명"].isin(top30_stations)].copy()
df_final

Unnamed: 0,날짜,line,station_number,역명,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22353735,2023-12-31,7,2748,가산디지털단지,청소년,13,5,21,2023123121,202312,771655.0,17.466667,41.433333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8
22353736,2023-12-31,7,2748,가산디지털단지,외국인,0,0,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8
22353737,2023-12-31,7,2748,가산디지털단지,우대권,17,12,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8
22353738,2023-12-31,7,2748,가산디지털단지,일반,247,247,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8


In [14]:
df_final.isna().sum().sort_values(ascending=False).head(10)

날짜                0
line              0
station_number    0
역명                0
승객유형              0
승차                0
하차                0
hour              0
tm                0
ym                0
dtype: int64

### <서울시 역코드로 지하철역별 열차 시간표 정보 검색>

- **평균운행간격 (`상행 평균운행간격`, `하행 평균운행간격`)**  

- **운행횟수 (`상행 운행횟수`, `하행 운행횟수`)**  

In [15]:
# =========================================================
# 0) CONFIG
# =========================================================
SEOUL_API_KEY = "446e7658546a616d3831654b44584a"
BASE_URL = f"http://openapi.seoul.go.kr:8088/{SEOUL_API_KEY}/xml/SearchSTNTimeTableByIDService"

STATION_CODE_CSV_PATH = "./data/서울교통공사_역명 지하철역 검색.csv"
STATION_CODE_ENCODING = "cp949"

WEEK_TAG_MAP = {"평일": "1", "토요일": "2", "공휴일": "3"}  # 공휴일/일요일=3
INOUT_TAG_MAP = {"up": "1", "down": "2"}

TIME_MAP = {
    "06시이전": "05시",
    "06시-07시": "06시",
    "07시-08시": "07시",
    "08시-09시": "08시",
    "09시-10시": "09시",
    "10시-11시": "10시",
    "11시-12시": "11시",
    "12시-13시": "12시",
    "13시-14시": "13시",
    "14시-15시": "14시",
    "15시-16시": "15시",
    "16시-17시": "16시",
    "17시-18시": "17시",
    "18시-19시": "18시",
    "19시-20시": "19시",
    "20시-21시": "20시",
    "21시-22시": "21시",
    "22시-23시": "22시",
    "23시이후": "00시",
}


# =========================================================
# 1) UTILS
# =========================================================
def _pick_col(df: pd.DataFrame, candidates: list[str]) -> str:
    for c in candidates:
        if c in df.columns:
            return c
    raise KeyError(f"Cannot find any of {candidates} in columns={df.columns.tolist()}")


def map_hour_to_time_range(hour_str: str) -> str:
    """00~23 -> TIME_RANGE"""
    h = int(hour_str)
    if h in (23, 0, 1, 2):
        return "23시이후"
    if 3 <= h <= 5:
        return "06시이전"
    return f"{str(h).zfill(2)}시-{str(h+1).zfill(2)}시"


def weighted_mean(x: pd.Series, w: pd.Series) -> float:
    wsum = float(w.sum())
    if wsum == 0:
        return 0.0
    return float(np.average(x.astype(float), weights=w.astype(float)))


# =========================================================
# 2) LOAD STATION MAP (외부코드 -> 전철역코드)
# =========================================================
def build_station_map() -> pd.DataFrame:
    df_code = pd.read_csv(STATION_CODE_CSV_PATH, encoding=STATION_CODE_ENCODING)

    col_line = _pick_col(df_code, ["호선", "LINE_NUM", "line"])
    col_ext = _pick_col(df_code, ["외부코드", "FR_CODE", "외부코드(역번호)", "station_number"])
    col_stcd = _pick_col(df_code, ["전철역코드", "STATION_CD", "역코드", "station_cd"])
    col_name = _pick_col(df_code, ["전철역명", "STATION_NM", "역명", "station_name"])

    df_code["line"] = df_code[col_line].astype(str).str.extract(r"(\d+)", expand=False)
    df_code["line"] = pd.to_numeric(df_code["line"], errors="coerce").astype("Int16")

    df_code["station_number"] = pd.to_numeric(df_code[col_ext], errors="coerce").astype("Int32")

    # API용 station_cd (앞 0 유지 위해 zfill)
    df_code["station_cd"] = df_code[col_stcd].astype(str).str.extract(r"(\d+)", expand=False)
    df_code["station_cd"] = df_code["station_cd"].astype(str).str.zfill(4)

    df_code["역명"] = df_code[col_name].astype(str).str.strip()

    df_map = (
        df_code[df_code["line"].between(1, 9)]
        [["line", "station_number", "station_cd", "역명"]]
        .dropna(subset=["line", "station_number", "station_cd", "역명"])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return df_map


# =========================================================
# 3) df_final에 day_label/tm_label 생성
# =========================================================
def prepare_df_for_headway(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    
    # 타입 정리
    out["line"] = pd.to_numeric(out["line"], errors="coerce").astype("Int16")
    out["station_number"] = pd.to_numeric(out["station_number"], errors="coerce").astype("Int32")
    out["hour"] = out["hour"].astype(str).str.zfill(2)

    # day_label이 없다면 생성 (일요일도 공휴일로 가정)
    if "day_label" not in out.columns:
        if "is_공휴일" in out.columns and "일요일" in out.columns:
            is_holiday = (out["is_공휴일"].astype(int) == 1) | (out["일요일"].astype(int) == 1)
        elif "is_공휴일" in out.columns:
            is_holiday = (out["is_공휴일"].astype(int) == 1)
        else:
            # 최후 fallback: 토요일/일요일 더미가 있으면 활용, 없으면 평일로
            is_holiday = pd.Series(False, index=out.index)

        if "토요일" in out.columns:
            is_sat = (out["토요일"].astype(int) == 1)
        else:
            is_sat = pd.Series(False, index=out.index)

        # 우선순위: 공휴일/일요일 > 토요일 > 평일
        out["day_label"] = np.where(is_holiday, "공휴일", np.where(is_sat, "토요일", "평일"))

    # time_range/tm_label이 없다면 생성
    if "time_range" not in out.columns:
        out["time_range"] = out["hour"].apply(map_hour_to_time_range)
    if "tm_label" not in out.columns:
        out["tm_label"] = out["time_range"].map(TIME_MAP)

    return out


# =========================================================
# 4) STATION_CD 정규화
# =========================================================
def normalize_station_cd(df: pd.DataFrame, df_map: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # 충돌 방지
    for c in ["station_cd", "station_cd_x", "station_cd_y", "station_cd_map"]:
        if c in out.columns:
            out = out.drop(columns=[c])

    # (line, station_number) -> station_cd 매핑 (외부코드 케이스)
    df_map2 = df_map.copy()
    df_map2["line"] = pd.to_numeric(df_map2["line"], errors="coerce").astype("Int16")
    df_map2["station_number"] = pd.to_numeric(df_map2["station_number"], errors="coerce").astype("Int32")
    df_map2["station_cd"] = df_map2["station_cd"].astype(str).str.extract(r"(\d+)", expand=False).str.zfill(4)

    out = out.merge(
        df_map2[["line", "station_number", "station_cd"]].rename(columns={"station_cd": "station_cd_map"}),
        on=["line", "station_number"],
        how="left",
    )

    # station_number가 4자리 이상이면 그 자체가 station_cd
    sn = out["station_number"].astype("Int64")
    mask_cd_like = sn.notna() & (sn >= 1000)

    out["station_cd"] = out["station_cd_map"]
    out.loc[mask_cd_like, "station_cd"] = sn[mask_cd_like].astype(int).astype(str).str.zfill(4)

    # 최종 체크
    miss = out["station_cd"].isna().mean()
    if miss > 0:
        raise RuntimeError(f"station_cd missing rate is not zero: {miss:.4f}")

    out["station_cd"] = out["station_cd"].astype(str).str.zfill(4)

    # 역명 복원: station_cd로 역명 붙이기
    name_map = df_map2[["line", "station_cd", "역명"]].drop_duplicates()
    out = out.merge(name_map, on=["line", "station_cd"], how="left")

    return out


# =========================================================
# 5) API FETCH + HEADWAY 계산
# =========================================================
def fetch_timetable_arrivals(station_cd: str, week_tag: str, inout_tag: str,
                            start: int = 1, end: int = 1000, timeout: int = 20,
                            max_retries: int = 3, sleep_sec: float = 0.12) -> list[str]:
    url = f"{BASE_URL}/{start}/{end}/{station_cd}/{week_tag}/{inout_tag}/"
    for attempt in range(max_retries):
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            root = ET.fromstring(r.content)

            code = root.findtext(".//RESULT/CODE")
            if code and code.strip() == "INFO-200":
                time.sleep(sleep_sec)
                return []

            arrivals = []
            for row in root.findall(".//row"):
                t = row.findtext("ARRIVETIME")
                if t:
                    arrivals.append(t.strip())

            time.sleep(sleep_sec)
            return arrivals
        except Exception:
            time.sleep(sleep_sec + (attempt + 1) * 0.4)
    return []


def group_minutes_by_hour(arrival_times: list[str]) -> dict[str, list[int]]:
    buckets = defaultdict(list)
    for t in arrival_times:
        parts = t.split(":")
        if len(parts) < 2:
            continue
        try:
            hh_int = int(parts[0])
            mm_int = int(parts[1])
        except ValueError:
            continue
        hh = str(hh_int % 24).zfill(2)
        buckets[hh].append(mm_int)
    for hh in buckets:
        buckets[hh].sort()
    return dict(buckets)


def calc_hour_stats(minutes: list[int]) -> tuple[int, float]:
    cnt = len(minutes)
    if cnt <= 1:
        return cnt, 0.0
    diffs = [minutes[i] - minutes[i-1] for i in range(1, cnt) if minutes[i] >= minutes[i-1]]
    if not diffs:
        return cnt, 0.0
    return cnt, float(mean(diffs))


def build_hourly_headway(station_cds, day_labels=("평일", "토요일", "공휴일"), sleep_sec=0.12) -> pd.DataFrame:
    station_cds = pd.Series(station_cds).dropna().astype(str).str.zfill(4).unique()
    rows = []

    for st_cd in station_cds:
        for dlab in day_labels:
            week_tag = WEEK_TAG_MAP[dlab]

            up_arr = fetch_timetable_arrivals(st_cd, week_tag, INOUT_TAG_MAP["up"], sleep_sec=sleep_sec)
            down_arr = fetch_timetable_arrivals(st_cd, week_tag, INOUT_TAG_MAP["down"], sleep_sec=sleep_sec)

            up_by = group_minutes_by_hour(up_arr)
            down_by = group_minutes_by_hour(down_arr)

            hours = sorted(set(up_by.keys()) | set(down_by.keys()))
            for hh in hours:
                up_cnt, up_hw = calc_hour_stats(up_by.get(hh, []))
                down_cnt, down_hw = calc_hour_stats(down_by.get(hh, []))
                rows.append({
                    "station_cd": st_cd,
                    "day_label": dlab,
                    "hour": str(hh).zfill(2),
                    "up_trip_cnt": up_cnt,
                    "down_trip_cnt": down_cnt,
                    "up_mean_headway": up_hw,
                    "down_mean_headway": down_hw,
                })

    return pd.DataFrame(rows)


def aggregate_to_tm(hourly: pd.DataFrame) -> pd.DataFrame:
    tmp = hourly.copy()
    tmp["time_range"] = tmp["hour"].apply(map_hour_to_time_range)
    tmp["tm_label"] = tmp["time_range"].map(TIME_MAP)

    group_cols = ["station_cd", "day_label", "tm_label"]
    agg = (
        tmp.groupby(group_cols, as_index=False)
        .apply(lambda g: pd.Series({
            "up_trip_cnt": g["up_trip_cnt"].sum(),
            "down_trip_cnt": g["down_trip_cnt"].sum(),
            "up_mean_headway": weighted_mean(g["up_mean_headway"], g["up_trip_cnt"]),
            "down_mean_headway": weighted_mean(g["down_mean_headway"], g["down_trip_cnt"]),
        }))
        .reset_index(drop=True)
    )
    return agg


# =========================================================
# 6) 헤드웨이 추가 실행
# =========================================================
# df_final 준비
df_final = prepare_df_for_headway(df_final)

# station_cd 정규화(혼재 해결) + 역명 복원
df_map = build_station_map()
df_final = normalize_station_cd(df_final, df_map)

# API headway 생성
unique_station_cds = df_final["station_cd"].unique()
print("[CHECK] unique station_cd:", len(unique_station_cds))

hourly = build_hourly_headway(unique_station_cds, day_labels=("평일", "토요일", "공휴일"))
headway_tm = aggregate_to_tm(hourly)

# merge
df_final = df_final.merge(
    headway_tm,
    on=["station_cd", "day_label", "tm_label"],
    how="left"
)

# NaN -> 0
for c in ["up_trip_cnt", "down_trip_cnt", "up_mean_headway", "down_mean_headway"]:
    df_final[c] = pd.to_numeric(df_final[c], errors="coerce").fillna(0)

print("[CHECK] shape:", df_final.shape)
print("[CHECK] day_label dist:\n", df_final["day_label"].value_counts())
df_final

[CHECK] unique station_cd: 33


  .apply(lambda g: pd.Series({


[CHECK] shape: (2773960, 41)
[CHECK] day_label dist:
 day_label
평일     1902200
공휴일     495140
토요일     376620
Name: count, dtype: int64


Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,7,2748,가산디지털단지,청소년,13,5,21,2023123121,202312,771655.0,17.466667,41.433333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,21시-22시,21시,,2748,가산디지털단지,9.0,9.0,7.000000,6.625
2773956,2023-12-31,7,2748,가산디지털단지,외국인,0,0,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750
2773957,2023-12-31,7,2748,가산디지털단지,우대권,17,12,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750
2773958,2023-12-31,7,2748,가산디지털단지,일반,247,247,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750


### <지하철 요금, 환율>
- 성인요금 기준 2023-10-01 이전에는 1250, 이후에는 1400원

In [16]:
# =========================
# 1) Fare feature
# =========================
def add_subway_fare(df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    cutoff = pd.Timestamp("2023-10-01")
    out["subway_fare_adult"] = np.where(out[date_col] < cutoff, 1250, 1400).astype("int32")
    return out


# =========================
# 2) USDKRW (yfinance)
# =========================
def _to_datetime_index(dates) -> pd.DatetimeIndex:
    idx = pd.to_datetime(pd.Index(dates), errors="coerce")
    idx = idx.dropna()
    idx = pd.DatetimeIndex(idx).normalize()
    idx = pd.DatetimeIndex(sorted(idx.unique()))
    return idx


def fetch_usdkrw_yfinance(dates, primary="KRW=X", fallback="USDKRW=X") -> pd.Series:

    try:
        import yfinance as yf
    except ImportError as e:
        raise ImportError("yfinance 미설치 에러") from e

    date_index = _to_datetime_index(dates)
    if len(date_index) == 0:
        raise ValueError("유효한 날짜 없음")

    start = date_index.min().date()
    end = (date_index.max() + pd.Timedelta(days=1)).date()  

    def _download(ticker: str) -> pd.DataFrame:
        return yf.download(
            ticker,
            start=str(start),
            end=str(end),
            progress=False,
            auto_adjust=False,
            group_by="column"
        )

    fx = _download(primary)
    if fx is None or fx.empty:
        fx = _download(fallback)

    if fx is None or fx.empty:
        raise RuntimeError("yfinance에서 USD/KRW 데이터를 못 가져옴.")

    col_candidates = ["Adj Close", "Close"]
    price = None

    if isinstance(fx.columns, pd.MultiIndex):
        for c in col_candidates:
            if c in fx.columns.get_level_values(0):
                price = fx[c]
                if isinstance(price, pd.DataFrame):
                    price = price.iloc[:, 0]
                break
    else:
        for c in col_candidates:
            if c in fx.columns:
                price = fx[c]
                break

    if price is None:
        raise RuntimeError(f"USD/KRW에서 Close/Adj Close 컬럼을 찾지 못했습니다. columns={fx.columns}")

    price = price.copy()
    price.index = pd.to_datetime(price.index).normalize()
    price = price.sort_index()

    # 요청 날짜에 맞춰 리인덱싱 + 휴장일 ffill
    price = price.reindex(date_index).ffill()
    price.name = "usdkrw"
    return price


def add_usdkrw(df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    date_only = out[date_col].dt.normalize()
    fx = fetch_usdkrw_yfinance(date_only)

    out = out.assign(date_only=date_only).merge(
        fx.rename_axis("date_only").reset_index(),
        on="date_only",
        how="left",
    ).drop(columns=["date_only"])

    out["usdkrw"] = out["usdkrw"].bfill()
    return out


# =========================
# 3) Run
# =========================
df_final = add_subway_fare(df_final, date_col="날짜")
df_final = add_usdkrw(df_final, date_col="날짜")

# checks
print(df_final[["날짜", "subway_fare_adult", "usdkrw"]].head(10))
print(df_final[["날짜", "subway_fare_adult"]].drop_duplicates().sort_values("날짜").tail(5))
print("[CHECK] usdkrw missing rate:", df_final["usdkrw"].isna().mean())
df_final

          날짜  subway_fare_adult       usdkrw
0 2021-01-01               1250  1084.650024
1 2021-01-01               1250  1084.650024
2 2021-01-01               1250  1084.650024
3 2021-01-01               1250  1084.650024
4 2021-01-01               1250  1084.650024
5 2021-01-01               1250  1084.650024
6 2021-01-01               1250  1084.650024
7 2021-01-01               1250  1084.650024
8 2021-01-01               1250  1084.650024
9 2021-01-01               1250  1084.650024
                날짜  subway_fare_adult
2761160 2023-12-27               1400
2763720 2023-12-28               1400
2766280 2023-12-29               1400
2768840 2023-12-30               1400
2771400 2023-12-31               1400
[CHECK] usdkrw missing rate: 0.0


Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,7,2748,가산디지털단지,청소년,13,5,21,2023123121,202312,771655.0,17.466667,41.433333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,21시-22시,21시,,2748,가산디지털단지,9.0,9.0,7.000000,6.625,1400,1277.839966
2773956,2023-12-31,7,2748,가산디지털단지,외국인,0,0,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966
2773957,2023-12-31,7,2748,가산디지털단지,우대권,17,12,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966
2773958,2023-12-31,7,2748,가산디지털단지,일반,247,247,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966


### <한국 CPI>
- 각 행(시간별) 기준 dt = 날짜 + hour
- CPI는 release_dt = 발표일 + 08:00
- 각 dt에 대해 release_dt <= dt 인 것 중 가장 최근 발표값을 붙임(누설 방지)
- cpi_yoy_actual : “발표된 실제값”
- cpi_yoy_prev : “이전 발표값”
- cpi_yoy_actual_filled : 실제가 없으면 이전으로 채운 “항상 존재하는 CPI 피처

In [17]:
# =========================
# 1) CPI CSV 파일 경로
# =========================
CPI_CSV_PATH = "./data/korea_cpi_releases_filtered.csv"


# =========================
# 2) CSV에서 CPI 데이터 로드 함수
# =========================
def load_cpi_from_csv(csv_path: str = CPI_CSV_PATH) -> pd.DataFrame:

    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    df["release_date"] = pd.to_datetime(df["release_date"])
    df["release_dt"] = pd.to_datetime(df["release_dt"])
    df["ref_month"] = pd.PeriodIndex(df["ref_month"], freq="M")
    return df


# =========================
# 3) CPI 텍스트 파서 함수 (레거시 호환성 유지, 필요시 사용)
# =========================
def _to_float_pct(x: str):
    x = str(x).strip()
    if x == "" or x.lower() == "nan":
        return np.nan
    x = x.replace("%", "").strip()
    try:
        return float(x)
    except ValueError:
        return np.nan

def parse_korea_cpi_release_table(text: str) -> pd.DataFrame:
    rows = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("발표일"):
            continue

        # 탭/복수 공백 기준 분리
        parts = re.split(r"\s{2,}|\t+", line)
        if len(parts) < 2:
            continue

        # 예: "2024년 12월 31일 (12월)"
        m = re.search(r"(\d{4})년\s*(\d{1,2})월\s*(\d{1,2})일\s*\((\d{1,2})월\)", parts[0])
        if not m:
            continue

        y, mo, d, ref_mo = map(int, m.groups())
        time_str = parts[1].strip() if len(parts) > 1 else "08:00"

        actual = _to_float_pct(parts[2]) if len(parts) > 2 else np.nan
        forecast = _to_float_pct(parts[3]) if len(parts) > 3 else np.nan
        prev = _to_float_pct(parts[4]) if len(parts) > 4 else np.nan

        release_date = pd.Timestamp(year=y, month=mo, day=d)
        release_dt = pd.to_datetime(f"{release_date.date()} {time_str}")

        rows.append({
            "release_date": release_date,
            "release_time": time_str,
            "release_dt": release_dt,     # 발표일시 (08:00)
            "ref_month": pd.Period(f"{y}-{ref_mo:02d}", freq="M"),  # 표에 적힌 (n월)
            "cpi_yoy_actual": actual,
            "cpi_yoy_forecast": forecast,
            "cpi_yoy_prev": prev,
        })

    df = pd.DataFrame(rows).sort_values("release_dt").reset_index(drop=True)
    return df

# CSV에서 CPI 데이터 로드
cpi_df = load_cpi_from_csv(CPI_CSV_PATH)

In [18]:
# CSV에서 CPI 데이터 로드
cpi_df = load_cpi_from_csv(CPI_CSV_PATH)

# =========================
# 3) df_final 범위만 필터링해서 CPI CSV 저장
# =========================
IN_PATH = "./data/df_final_top30_stations_with_headway_tm_v3.csv"
CPI_OUT_PATH = "./data/korea_cpi_releases_filtered.csv"
OUT_PATH = "./data/df_final_top30_stations_with_headway_tm_v5.csv"

df = pd.read_csv(IN_PATH, encoding="utf-8-sig")
df["날짜"] = pd.to_datetime(df["날짜"], errors="coerce")
df["hour"] = df["hour"].astype(str).str.zfill(2)

min_date = df["날짜"].min()
max_date = df["날짜"].max()

# 발표일시 기준으로 “df 기간에 영향을 줄 수 있는” CPI만 남김
df_dt_min = pd.to_datetime(min_date.date()) + pd.Timedelta(hours=0)
df_dt_max = pd.to_datetime(max_date.date()) + pd.Timedelta(hours=23)

cpi_filtered = cpi_df[(cpi_df["release_dt"] <= df_dt_max)].copy()
cpi_filtered.to_csv(CPI_OUT_PATH, index=False, encoding="utf-8-sig")
print(f"[INFO] saved CPI releases -> {CPI_OUT_PATH} (rows={len(cpi_filtered)})")

# =========================
# 4) 발표일 08:00 기준으로 df에 CPI 붙이기 (누설 방지)
# =========================
# df의 관측시각 만들기: 날짜 + hour:00:00
df["dt"] = pd.to_datetime(df["날짜"].dt.date.astype(str) + " " + df["hour"] + ":00:00")

# asof merge: dt 시점에 이미 발표된 최신 CPI를 붙임
cpi_for_merge = cpi_filtered[["release_dt", "cpi_yoy_actual", "cpi_yoy_forecast", "cpi_yoy_prev"]].sort_values("release_dt")

df = df.sort_values("dt")
df = pd.merge_asof(
    df,
    cpi_for_merge,
    left_on="dt",
    right_on="release_dt",
    direction="backward",
)

df["cpi_yoy_actual_filled"] = df["cpi_yoy_actual"].fillna(df["cpi_yoy_prev"])

# 정리
df = df.drop(columns=["release_dt"]) 
df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print(f"[INFO] saved merged df -> {OUT_PATH}")

print("[CHECK] CPI missing rate:", df["cpi_yoy_actual"].isna().mean())
print(df[["날짜","hour","cpi_yoy_actual","cpi_yoy_prev","cpi_yoy_actual_filled"]].head(20))

[INFO] saved CPI releases -> ./data/korea_cpi_releases_filtered.csv (rows=37)
[INFO] saved merged df -> ./data/df_final_top30_stations_with_headway_tm_v5.csv
[CHECK] CPI missing rate: 0.0
           날짜 hour  cpi_yoy_actual  cpi_yoy_prev  cpi_yoy_actual_filled
0  2021-01-01   00             0.5           0.6                    0.5
1  2021-01-01   00             0.5           0.6                    0.5
2  2021-01-01   00             0.5           0.6                    0.5
3  2021-01-01   00             0.5           0.6                    0.5
4  2021-01-01   00             0.5           0.6                    0.5
5  2021-01-01   00             0.5           0.6                    0.5
6  2021-01-01   00             0.5           0.6                    0.5
7  2021-01-01   00             0.5           0.6                    0.5
8  2021-01-01   00             0.5           0.6                    0.5
9  2021-01-01   00             0.5           0.6                    0.5
10 2021-01-01   00  

### <경유가격>
- 고급휘발유 
- 보통휘발유 
- 자동차용경유

In [19]:
def load_and_prepare_fuel_price(csv_path: str) -> pd.DataFrame:

    # 1) 인코딩 안전 로드
    try:
        df = pd.read_csv(csv_path, encoding="utf-8-sig")
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding="cp949")

    # 2) 컬럼명 정리
    df = df.rename(columns={
        "구분": "날짜",
        "고급휘발유": "fuel_premium_gasoline",
        "보통휘발유": "fuel_regular_gasoline",
        "자동차용경유": "fuel_diesel",
    })

    # 3) 날짜 파싱 (YYYY년MM월DD일)
    df["날짜"] = (
        df["날짜"]
        .astype(str)
        .str.replace("년", "-", regex=False)
        .str.replace("월", "-", regex=False)
        .str.replace("일", "", regex=False)
    )
    df["날짜"] = pd.to_datetime(df["날짜"], errors="coerce")

    # 4) 가격 컬럼: 콤마 제거 → float
    price_cols = [
        "fuel_premium_gasoline",
        "fuel_regular_gasoline",
        "fuel_diesel",
    ]
    for c in price_cols:
        df[c] = (
            df[c]
            .astype(str)
            .str.replace(",", "", regex=False)
            .astype(float)
        )

    df = df.sort_values("날짜").reset_index(drop=True)
    return df


def add_fuel_price(df: pd.DataFrame, fuel_df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    fuel = fuel_df.copy()
    fuel["fuel_date"] = pd.to_datetime(fuel["날짜"], errors="coerce")
    fuel = fuel.drop(columns=["날짜"])

    out = out.merge(
        fuel,
        left_on=date_col,
        right_on="fuel_date",
        how="left"
    ).drop(columns=["fuel_date"])

    # 결측일은 직전 값으로 보간
    price_cols = ["fuel_premium_gasoline", "fuel_regular_gasoline", "fuel_diesel"]
    out[price_cols] = out[price_cols].ffill()

    return out


FUEL_PATH = "./data/주유소_평균판매가격_제품별.csv"

fuel_df = load_and_prepare_fuel_price(FUEL_PATH)
df_final = add_fuel_price(df_final, fuel_df, date_col="날짜")

print(df_final[[
    "날짜",
    "fuel_premium_gasoline",
    "fuel_regular_gasoline",
    "fuel_diesel"
]].head())

print("[CHECK] fuel missing rate:")
print(df_final[[
    "fuel_premium_gasoline",
    "fuel_regular_gasoline",
    "fuel_diesel"
]].isna().mean())
df_final

          날짜  fuel_premium_gasoline  fuel_regular_gasoline  fuel_diesel
0 2021-01-01                1662.86                 1424.0      1224.64
1 2021-01-01                1662.86                 1424.0      1224.64
2 2021-01-01                1662.86                 1424.0      1224.64
3 2021-01-01                1662.86                 1424.0      1224.64
4 2021-01-01                1662.86                 1424.0      1224.64
[CHECK] fuel missing rate:
fuel_premium_gasoline    0.0
fuel_regular_gasoline    0.0
fuel_diesel              0.0
dtype: float64


Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024,1662.86,1424.00,1224.64
1,2021-01-01,1,150,서울역,우대권,5,7,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024,1662.86,1424.00,1224.64
2,2021-01-01,1,150,서울역,일반,93,66,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024,1662.86,1424.00,1224.64
3,2021-01-01,1,150,서울역,청소년,1,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024,1662.86,1424.00,1224.64
4,2021-01-01,1,150,서울역,외국인,0,0,01,2021010101,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625,1250,1084.650024,1662.86,1424.00,1224.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,7,2748,가산디지털단지,청소년,13,5,21,2023123121,202312,771655.0,17.466667,41.433333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,21시-22시,21시,,2748,가산디지털단지,9.0,9.0,7.000000,6.625,1400,1277.839966,1862.17,1579.31,1495.21
2773956,2023-12-31,7,2748,가산디지털단지,외국인,0,0,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966,1862.17,1579.31,1495.21
2773957,2023-12-31,7,2748,가산디지털단지,우대권,17,12,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966,1862.17,1579.31,1495.21
2773958,2023-12-31,7,2748,가산디지털단지,일반,247,247,22,2023123122,202312,771655.0,15.600000,38.650000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2748,가산디지털단지,7.0,9.0,8.833333,6.750,1400,1277.839966,1862.17,1579.31,1495.21


### <실업률 지표>
- 월별 실업률 및 청년 실업률

In [20]:
# =========================================================
# 0) CSV 파일에서 실업률 데이터 로드
# =========================================================
UNEMP_CSV_PATH = "./data/korea_unemployment_monthly_release_from_tsv.csv"


# =========================================================
# 1) CSV에서 실업률 데이터 로드 함수
# =========================================================
def load_unemp_monthly_from_csv(csv_path: str = UNEMP_CSV_PATH) -> pd.DataFrame:
    """
    CSV 파일에서 실업률 데이터를 로드합니다.
    CSV 파일이 이미 release_dt까지 포함하고 있으므로 그대로 반환합니다.
    """
    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    df["month"] = pd.PeriodIndex(df["month_str"], freq="M")
    df["release_dt"] = pd.to_datetime(df["release_dt"])
    return df


# =========================================================
# 2) 파서 함수 (레거시 호환성 유지, 필요시 사용)
# =========================================================
def _split_tokens(line: str) -> list[str]:
    # 탭/여러 공백을 모두 분리자로 처리
    return [t for t in re.split(r"\s+|\t+", line.strip()) if t]

def _parse_months_from_header(year_line: str, month_line: str) -> list[pd.Period]:
    years = _split_tokens(year_line)
    months = _split_tokens(month_line)

    # 첫 칸이 공백으로 시작하면 years/months 길이가 한 칸 더 짧아지는 경우가 있어 보정
    years = [y for y in years if re.fullmatch(r"\d{4}", y)]
    # month는 "12월" 형태만 남김
    months = [m for m in months if re.fullmatch(r"\d{1,2}월", m)]

    n = min(len(years), len(months))
    years, months = years[:n], months[:n]

    periods = []
    for y, m in zip(years, months):
        mm = int(m.replace("월", ""))
        periods.append(pd.Period(f"{int(y)}-{mm:02d}", freq="M"))
    return periods

def _parse_values_row(line: str) -> tuple[str, list[float]]:
    # "실업률(%) 3.4 4.1 ..." 형태
    toks = _split_tokens(line)
    label = toks[0]
    vals = []
    for x in toks[1:]:
        # 숫자만
        if re.fullmatch(r"-?\d+(?:\.\d+)?", x):
            vals.append(float(x))
    return label, vals

def build_unemp_monthly_from_tsv(text: str) -> pd.DataFrame:
    lines = [ln for ln in text.splitlines() if ln.strip()]
    if len(lines) < 4:
        raise ValueError("입력 텍스트 라인이 너무 적습니다. (헤더 2줄 + 지표 2줄 필요)")

    # year/month 헤더 2줄 찾기: 숫자(연도) 많이 있는 줄, '월' 많이 있는 줄
    year_line = None
    month_line = None
    for ln in lines[:5]:
        if len(re.findall(r"\b20\d{2}\b", ln)) >= 5 and year_line is None:
            year_line = ln
        if len(re.findall(r"\d{1,2}월", ln)) >= 5 and month_line is None:
            month_line = ln
    if year_line is None or month_line is None:
        raise ValueError("year/month 헤더 라인을 찾지 못했습니다. 표 원문을 다시 확인하세요.")

    months = _parse_months_from_header(year_line, month_line)

    # 지표 라인: "실업률(%)", "청년실업률(%)" 포함된 줄을 사용
    u_line = next((ln for ln in lines if ln.startswith("실업률(%)")), None)
    y_line = next((ln for ln in lines if ln.startswith("청년실업률(%)")), None)
    if u_line is None or y_line is None:
        raise ValueError("지표 라인('실업률(%)', '청년실업률(%)')을 찾지 못했습니다.")

    _, u_vals = _parse_values_row(u_line)
    _, y_vals = _parse_values_row(y_line)

    # 길이 정렬(마지막 값 하나 빠진 케이스 등 방어)
    n = min(len(months), len(u_vals), len(y_vals))
    months = months[:n]
    u_vals = u_vals[:n]
    y_vals = y_vals[:n]

    df = pd.DataFrame({
        "month": months,
        "month_str": [str(p) for p in months],  # "YYYY-MM"
        "unemployment_rate": u_vals,
        "youth_unemployment_rate": y_vals,
    })
    return df


# =========================================================
# 2) 누설 방지용 발표일(가정) 생성
#    - 해당 월 지표는 "다음달 release_day일 release_hour:00"부터 사용 가능
# =========================================================
def add_release_datetime(monthly_df: pd.DataFrame, release_day: int = 10, release_hour: int = 8) -> pd.DataFrame:
    out = monthly_df.copy()
    month_start = out["month"].dt.to_timestamp(how="start")
    next_month = month_start + pd.offsets.MonthBegin(1)
    release_date = next_month + pd.to_timedelta(release_day - 1, unit="D")
    out["release_dt"] = release_date + pd.to_timedelta(release_hour, unit="h")
    return out


# =========================================================
# 3) df_final(시간별) 데이터에 asof merge로 붙이기 (누설 방지)
# =========================================================
def attach_unemp_asof(df: pd.DataFrame,
                      monthly_release_df: pd.DataFrame,
                      date_col: str = "날짜",
                      hour_col: str = "hour") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out[hour_col] = out[hour_col].astype(str).str.zfill(2)

    out["dt"] = pd.to_datetime(out[date_col].dt.strftime("%Y-%m-%d") + " " + out[hour_col] + ":00:00")

    rel = monthly_release_df.sort_values("release_dt")[[
        "release_dt", "month_str", "unemployment_rate", "youth_unemployment_rate"
    ]].copy()

    out = out.sort_values("dt")
    out = pd.merge_asof(
        out,
        rel,
        left_on="dt",
        right_on="release_dt",
        direction="backward"
    )
    return out


# =========================================================
# 4) 실행
# =========================================================
# CSV에서 실업률 데이터 로드 
monthly_rel = load_unemp_monthly_from_csv(UNEMP_CSV_PATH)  

# df에 붙이기
df_final = attach_unemp_asof(df_final, monthly_rel, date_col="날짜", hour_col="hour")

# sanity check
print("[CHECK] unemployment_rate missing rate:", df_final["unemployment_rate"].isna().mean())
print(df_final[["날짜","hour","month_str","unemployment_rate","youth_unemployment_rate"]].head(30))
df_final

[CHECK] unemployment_rate missing rate: 0.0
           날짜 hour month_str  unemployment_rate  youth_unemployment_rate
0  2021-01-01   00   2020-11                3.4                      8.1
1  2021-01-01   00   2020-11                3.4                      8.1
2  2021-01-01   00   2020-11                3.4                      8.1
3  2021-01-01   00   2020-11                3.4                      8.1
4  2021-01-01   00   2020-11                3.4                      8.1
5  2021-01-01   00   2020-11                3.4                      8.1
6  2021-01-01   00   2020-11                3.4                      8.1
7  2021-01-01   00   2020-11                3.4                      8.1
8  2021-01-01   00   2020-11                3.4                      8.1
9  2021-01-01   00   2020-11                3.4                      8.1
10 2021-01-01   00   2020-11                3.4                      8.1
11 2021-01-01   00   2020-11                3.4                      8.1
12 2021

Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel,dt,release_dt,month_str,unemployment_rate,youth_unemployment_rate
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1
1,2021-01-01,6,2623,합정,청소년,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1
2,2021-01-01,6,2623,합정,일반,32,51,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1
3,2021-01-01,6,2623,합정,우대권,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1
4,2021-01-01,2,212,건대입구,외국인,0,0,00,2021010100,202101,275513.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0212,0212,건대입구,4.0,8.0,13.000000,7.285714,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,1,152,종각,일반,2055,3532,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3
2773956,2023-12-31,1,152,종각,우대권,125,126,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3
2773957,2023-12-31,1,152,종각,외국인,40,81,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3
2773958,2023-12-31,7,2729,건대입구,청소년,20,81,22,2023123122,202312,170602.0,59.683333,24.150000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2729,건대입구,8.0,8.0,6.857143,7.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3


### <연도별 서울 인구와 서울 인구밀도, 수도권 인구, 수도권 인구밀도>

In [21]:
def load_population_yearly(csv_path: str = "./data/korea_population_yearly.csv") -> pd.DataFrame:

    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    return df

In [22]:
def add_population_features(df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out["year"] = out[date_col].dt.year

    pop_df = load_population_yearly()

    out = out.merge(
        pop_df,
        on="year",
        how="left"
    )

    return out

# df에 붙이기
df_final = add_population_features(df_final, date_col="날짜")

# 체크
print(df_final[
    ["날짜", "seoul_population", "seoul_population_density",
     "metro_population", "metro_population_density"]
].drop_duplicates().sort_values("날짜").head(10))

print("[CHECK] population missing rate:")
print(df_final[
    ["seoul_population", "seoul_population_density",
     "metro_population", "metro_population_density"]
].isna().mean())
df_final

              날짜  seoul_population  seoul_population_density  \
0     2021-01-01              9508                     15709   
2440  2021-01-02              9508                     15709   
4960  2021-01-03              9508                     15709   
7360  2021-01-04              9508                     15709   
9840  2021-01-05              9508                     15709   
12240 2021-01-06              9508                     15709   
14680 2021-01-07              9508                     15709   
17140 2021-01-08              9508                     15709   
19520 2021-01-09              9508                     15709   
22000 2021-01-10              9508                     15709   

       metro_population  metro_population_density  
0                 26069                      2196  
2440              26069                      2196  
4960              26069                      2196  
7360              26069                      2196  
9840              26069            

Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel,dt,release_dt,month_str,unemployment_rate,youth_unemployment_rate,year,seoul_population,seoul_population_density,metro_population,metro_population_density
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196
1,2021-01-01,6,2623,합정,청소년,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196
2,2021-01-01,6,2623,합정,일반,32,51,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196
3,2021-01-01,6,2623,합정,우대권,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196
4,2021-01-01,2,212,건대입구,외국인,0,0,00,2021010100,202101,275513.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0212,0212,건대입구,4.0,8.0,13.000000,7.285714,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,1,152,종각,일반,2055,3532,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206
2773956,2023-12-31,1,152,종각,우대권,125,126,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206
2773957,2023-12-31,1,152,종각,외국인,40,81,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206
2773958,2023-12-31,7,2729,건대입구,청소년,20,81,22,2023123122,202312,170602.0,59.683333,24.150000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2729,건대입구,8.0,8.0,6.857143,7.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206


### <자동차 등록 대수>
- 자동차 등록대수(만대)
- 전년대비 증가대수(천대)
- 전년대비 증감비(%)

In [23]:
def load_vehicle_yearly(csv_path: str = "./data/korea_vehicle_yearly.csv") -> pd.DataFrame:

    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    return df

def add_vehicle_features(df: pd.DataFrame, date_col: str = "날짜") -> pd.DataFrame:
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out["year"] = out[date_col].dt.year

    vehicle_df = load_vehicle_yearly()

    out = out.merge(
        vehicle_df,
        on="year",
        how="left"
    )

    return out

# df에 붙이기
df_final = add_vehicle_features(df_final, date_col="날짜")

# 체크
print(
    df_final[
        [
            "날짜",
            "vehicle_registered_total_10k",
            "vehicle_yoy_increase_1k",
            "vehicle_yoy_growth_rate",
        ]
    ]
    .drop_duplicates()
    .sort_values("날짜")
    .head(10)
)

print("[CHECK] vehicle feature missing rate:")
print(
    df_final[
        [
            "vehicle_registered_total_10k",
            "vehicle_yoy_increase_1k",
            "vehicle_yoy_growth_rate",
        ]
    ].isna().mean()
)
df_final

              날짜  vehicle_registered_total_10k  vehicle_yoy_increase_1k  \
0     2021-01-01                          2491                      545   
2440  2021-01-02                          2491                      545   
4960  2021-01-03                          2491                      545   
7360  2021-01-04                          2491                      545   
9840  2021-01-05                          2491                      545   
12240 2021-01-06                          2491                      545   
14680 2021-01-07                          2491                      545   
17140 2021-01-08                          2491                      545   
19520 2021-01-09                          2491                      545   
22000 2021-01-10                          2491                      545   

       vehicle_yoy_growth_rate  
0                          2.2  
2440                       2.2  
4960                       2.2  
7360                       2.2  
9840     

Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel,dt,release_dt,month_str,unemployment_rate,youth_unemployment_rate,year,seoul_population,seoul_population_density,metro_population,metro_population_density,vehicle_registered_total_10k,vehicle_yoy_increase_1k,vehicle_yoy_growth_rate
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2
1,2021-01-01,6,2623,합정,청소년,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2
2,2021-01-01,6,2623,합정,일반,32,51,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2
3,2021-01-01,6,2623,합정,우대권,0,0,00,2021010100,202101,77723.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2
4,2021-01-01,2,212,건대입구,외국인,0,0,00,2021010100,202101,275513.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0212,0212,건대입구,4.0,8.0,13.000000,7.285714,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,1,152,종각,일반,2055,3532,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7
2773956,2023-12-31,1,152,종각,우대권,125,126,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7
2773957,2023-12-31,1,152,종각,외국인,40,81,22,2023123122,202312,949097.0,23.400000,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7
2773958,2023-12-31,7,2729,건대입구,청소년,20,81,22,2023123122,202312,170602.0,59.683333,24.150000,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,,2729,건대입구,8.0,8.0,6.857143,7.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7


### <기준금리>

In [24]:
def load_base_rate_monthly(csv_path: str = "./data/korea_base_rate_monthly.csv") -> pd.DataFrame:
    """
    적용시점: 해당 월 1일 00:00
    """
    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    df["apply_dt"] = pd.to_datetime(df["apply_dt"])
    return df.sort_values("apply_dt")

def add_base_rate_asof(
    df: pd.DataFrame,
    base_rate_df: pd.DataFrame,
    date_col: str = "날짜",
    hour_col: str = "hour",
) -> pd.DataFrame:
    out = df.copy()

    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out[hour_col] = out[hour_col].astype(str).str.zfill(2)

    # datetime 생성
    out["dt"] = pd.to_datetime(
        out[date_col].dt.strftime("%Y-%m-%d") + " " +
        out[hour_col] + ":00:00"
    )

    base = base_rate_df[["apply_dt", "base_rate"]].sort_values("apply_dt")

    out = out.sort_values("dt")
    out = pd.merge_asof(
        out,
        base,
        left_on="dt",
        right_on="apply_dt",
        direction="backward"
    )

    return out


BASE_RATE_CSV = "./data/korea_base_rate_monthly.csv"

# 기준금리 데이터 로드
base_rate_df = load_base_rate_monthly(BASE_RATE_CSV)

# df에 붙이기
df_final = add_base_rate_asof(df_final, base_rate_df, date_col="날짜", hour_col="hour")

# 체크
print(df_final[["날짜", "hour", "base_rate"]].drop_duplicates().sort_values(["날짜","hour"]).head(20))
print("[CHECK] base_rate missing rate:", df_final["base_rate"].isna().mean())
df_final

             날짜 hour  base_rate
0    2021-01-01   00        0.5
122  2021-01-01   01        0.5
244  2021-01-01   05        0.5
366  2021-01-01   06        0.5
488  2021-01-01   07        0.5
610  2021-01-01   08        0.5
732  2021-01-01   09        0.5
854  2021-01-01   10        0.5
976  2021-01-01   11        0.5
1098 2021-01-01   12        0.5
1220 2021-01-01   13        0.5
1342 2021-01-01   14        0.5
1464 2021-01-01   15        0.5
1586 2021-01-01   16        0.5
1708 2021-01-01   17        0.5
1830 2021-01-01   18        0.5
1952 2021-01-01   19        0.5
2074 2021-01-01   20        0.5
2196 2021-01-01   21        0.5
2318 2021-01-01   22        0.5
[CHECK] base_rate missing rate: 0.0


Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel,dt,release_dt,month_str,unemployment_rate,youth_unemployment_rate,year,seoul_population,seoul_population_density,metro_population,metro_population_density,vehicle_registered_total_10k,vehicle_yoy_increase_1k,vehicle_yoy_growth_rate,apply_dt,base_rate
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5
1,2021-01-01,2,226,사당,일반,88,173,00,2021010100,202101,285035.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0226,0226,사당,7.0,5.0,8.333333,8.750000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5
2,2021-01-01,2,226,사당,우대권,1,9,00,2021010100,202101,285035.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0226,0226,사당,7.0,5.0,8.333333,8.750000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5
3,2021-01-01,2,226,사당,외국인,0,0,00,2021010100,202101,285035.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0226,0226,사당,7.0,5.0,8.333333,8.750000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5
4,2021-01-01,4,424,명동,외국인,0,0,00,2021010100,202101,227930.0,0.000000,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0424,0424,명동,6.0,5.0,9.600000,12.250000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,2,201,시청,일반,812,535,22,2023123122,202312,352955.0,0.000000,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0201,0201,시청,8.0,10.0,7.000000,6.333333,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5
2773956,2023-12-31,2,201,시청,청소년,42,34,22,2023123122,202312,352955.0,0.000000,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0201,0201,시청,8.0,10.0,7.000000,6.333333,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5
2773957,2023-12-31,3,319,종로3가,청소년,6,14,22,2023123122,202312,214865.0,41.516667,15.666667,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0309,0309,지축,5.0,5.0,12.750000,10.750000,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5
2773958,2023-12-31,2,234,신도림,우대권,92,92,22,2023123122,202312,786991.0,0.000000,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0234,0234,신도림,9.0,15.0,6.125000,3.857143,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5


### <월별 동행지수 순환변동치와 월별 선행지수 순환변동치>

In [25]:
# =========================================================
# 1) CSV 파일에서 월별 동행지수/선행지수 데이터 로드
#    적용시점: 해당 월 1일 00:00 (월별 정리 데이터로 사용)
# =========================================================
def load_cli_lei_monthly(csv_path: str = "./data/korea_cli_lei_monthly.csv") -> pd.DataFrame:

    df = pd.read_csv(csv_path, encoding="utf-8-sig")
    df["apply_dt"] = pd.to_datetime(df["apply_dt"])
    return df.sort_values("apply_dt")


# =========================================================
# 2) 누설 방지 asof merge (월초 apply_dt 기준 backward)
# =========================================================
def add_cli_lei_asof(df: pd.DataFrame,
                     cli_lei_df: pd.DataFrame,
                     date_col: str = "날짜",
                     hour_col: str = "hour") -> pd.DataFrame:
    out = df.copy()

    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")
    out[hour_col] = out[hour_col].astype(str).str.zfill(2)
    out["dt"] = pd.to_datetime(out[date_col].dt.strftime("%Y-%m-%d") + " " + out[hour_col] + ":00:00")

    ref = cli_lei_df[["apply_dt","coincident_index","leading_index"]].sort_values("apply_dt")

    out = out.sort_values("dt")
    out = pd.merge_asof(out, ref, left_on="dt", right_on="apply_dt", direction="backward")

    return out


# =========================================================
# 3) 실행 + 최종 저장
# =========================================================
CLI_LEI_CSV = "./data/korea_cli_lei_monthly.csv"
FINAL_OUT_PATH = "./data/df_final_top30_stations_with_headway_tm_final.csv"

# CSV에서 동행지수/선행지수 데이터 로드
cli_lei_df = load_cli_lei_monthly(CLI_LEI_CSV)

df_final = add_cli_lei_asof(df_final, cli_lei_df, date_col="날짜", hour_col="hour")

# 최종 저장
df_final.to_csv(FINAL_OUT_PATH, index=False, encoding="utf-8-sig")
print(f"[INFO] 최종 데이터 저장 완료 -> {FINAL_OUT_PATH}")

print("[CHECK] coincident_index missing rate:", df_final["coincident_index"].isna().mean())
print("[CHECK] leading_index missing rate:", df_final["leading_index"].isna().mean())
print(df_final[["날짜","hour","coincident_index","leading_index"]].head(20))
print(f"\n[FINAL] 최종 데이터 shape: {df_final.shape}")
df_final

[INFO] 최종 데이터 저장 완료 -> ./data/df_final_top30_stations_with_headway_tm_final.csv
[CHECK] coincident_index missing rate: 0.0
[CHECK] leading_index missing rate: 0.0
           날짜 hour  coincident_index  leading_index
0  2021-01-01   00              98.1          101.2
1  2021-01-01   00              98.1          101.2
2  2021-01-01   00              98.1          101.2
3  2021-01-01   00              98.1          101.2
4  2021-01-01   00              98.1          101.2
5  2021-01-01   00              98.1          101.2
6  2021-01-01   00              98.1          101.2
7  2021-01-01   00              98.1          101.2
8  2021-01-01   00              98.1          101.2
9  2021-01-01   00              98.1          101.2
10 2021-01-01   00              98.1          101.2
11 2021-01-01   00              98.1          101.2
12 2021-01-01   00              98.1          101.2
13 2021-01-01   00              98.1          101.2
14 2021-01-01   00              98.1          101.2
15 20

Unnamed: 0,날짜,line,station_number,역명_x,승객유형,승차,하차,hour,tm,ym,transfer_in,congestion_up,congestion_down,is_up,is_down,월요일,화요일,수요일,목요일,금요일,토요일,일요일,is_평일,is_주말,is_공휴일,week_number,temp_avg,rain_day,wind_avg,humid_avg,snow_day,day_label,time_range,tm_label,station_cd_map,station_cd,역명_y,up_trip_cnt,down_trip_cnt,up_mean_headway,down_mean_headway,subway_fare_adult,usdkrw,fuel_premium_gasoline,fuel_regular_gasoline,fuel_diesel,dt,release_dt,month_str,unemployment_rate,youth_unemployment_rate,year,seoul_population,seoul_population_density,metro_population,metro_population_density,vehicle_registered_total_10k,vehicle_yoy_increase_1k,vehicle_yoy_growth_rate,apply_dt_x,base_rate,apply_dt_y,coincident_index,leading_index
0,2021-01-01,1,150,서울역,외국인,0,0,00,2021010100,202101,555352.0,0.0,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,1805,1805,송내,7.0,10.0,6.514286,5.625000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5,2021-01-01,98.1,101.2
1,2021-01-01,4,420,혜화,일반,93,81,00,2021010100,202101,393273.0,0.0,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,0420,0420,혜화,7.0,5.0,9.333333,12.250000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5,2021-01-01,98.1,101.2
2,2021-01-01,6,2623,합정,청소년,0,0,00,2021010100,202101,77723.0,0.0,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5,2021-01-01,98.1,101.2
3,2021-01-01,6,2623,합정,일반,32,51,00,2021010100,202101,77723.0,0.0,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5,2021-01-01,98.1,101.2
4,2021-01-01,6,2623,합정,우대권,0,0,00,2021010100,202101,77723.0,0.0,0.000000,0,0,0,0,0,0,1,0,0,1,0,1,53,-4.2,0.0,2.0,64.0,0.0,공휴일,23시이후,00시,,2623,합정,5.0,6.0,11.500000,10.600000,1250,1084.650024,1662.86,1424.00,1224.64,2021-01-01 00:00:00,2020-12-10 08:00:00,2020-11,3.4,8.1,2021,9508,15709,26069,2196,2491,545,2.2,2021-01-01,0.5,2021-01-01,98.1,101.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2773955,2023-12-31,1,152,종각,일반,2055,3532,22,2023123122,202312,949097.0,23.4,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5,2023-12-01,100.2,100.4
2773956,2023-12-31,1,152,종각,우대권,125,126,22,2023123122,202312,949097.0,23.4,18.533333,1,1,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,1806,1806,부평,9.0,8.0,6.125000,6.571429,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5,2023-12-01,100.2,100.4
2773957,2023-12-31,2,234,신도림,청소년,48,63,22,2023123122,202312,786991.0,0.0,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0234,0234,신도림,9.0,15.0,6.125000,3.857143,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5,2023-12-01,100.2,100.4
2773958,2023-12-31,2,234,신도림,일반,896,1254,22,2023123122,202312,786991.0,0.0,0.000000,0,0,0,0,0,0,0,0,1,0,1,0,52,2.1,4.7,2.0,95.5,8.8,공휴일,22시-23시,22시,0234,0234,신도림,9.0,15.0,6.125000,3.857143,1400,1277.839966,1862.17,1579.31,1495.21,2023-12-31 22:00:00,2023-12-10 08:00:00,2023-11,2.3,5.3,2023,9400,15533,26190,2206,2595,446,1.7,2023-12-01,3.5,2023-12-01,100.2,100.4


### 변수 정리 및 최종 저장

In [26]:
def clean_station_columns(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # 역명 통합
    if "역명_x" in out.columns:
        out["역명"] = out["역명_x"]
    elif "역명_y" in out.columns:
        out["역명"] = out["역명_y"]

    drop_cols = [c for c in ["역명_x", "역명_y"] if c in out.columns]
    out = out.drop(columns=drop_cols)

    # station_cd 통합
    if "station_cd" in out.columns:
        out["station_cd"] = out["station_cd"].astype("Int64")

    if "station_cd_map" in out.columns:
        out = out.drop(columns=["station_cd_map"])

    return out


def drop_intermediate_columns(df: pd.DataFrame) -> pd.DataFrame:
    drop_cols = [
        "dt",
        "apply_dt_x", "apply_dt_y",
        "release_dt", "month_str"
    ]
    drop_cols = [c for c in drop_cols if c in df.columns]
    return df.drop(columns=drop_cols)


def clean_cpi_columns(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # 최종 CPI 변수만 유지
    keep = ["cpi_yoy_actual_filled"]
    drop = [c for c in out.columns if c.startswith("cpi_yoy_") and c not in keep]
    out = out.drop(columns=drop)

    out = out.rename(columns={
        "cpi_yoy_actual_filled": "cpi_yoy"
    })

    return out


def reorder_columns(df: pd.DataFrame) -> pd.DataFrame:
    priority = [
        "날짜", "line", "station_number", "station_cd", "역명",
        "승객유형", "승차", "하차",
        "hour", "tm", "ym",
        "up_trip_cnt", "down_trip_cnt",
        "up_mean_headway", "down_mean_headway"
    ]

    rest = [c for c in df.columns if c not in priority]
    return df[priority + rest]

OUT_PATH = "./data/df_final_top30_stations_with_headway_tm_real_final.csv"

df = pd.read_csv(
    "./data/df_final_top30_stations_with_headway_tm_final.csv",
    encoding="utf-8-sig"
)

df = clean_station_columns(df)
df = drop_intermediate_columns(df)
df = clean_cpi_columns(df)
df = reorder_columns(df)

df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")

print(f"[INFO] FINAL dataset saved -> {OUT_PATH}")
print(df.shape)
print(df.columns.tolist())

[INFO] FINAL dataset saved -> ./data/df_final_top30_stations_with_headway_tm_real_final.csv
(2773960, 57)
['날짜', 'line', 'station_number', 'station_cd', '역명', '승객유형', '승차', '하차', 'hour', 'tm', 'ym', 'up_trip_cnt', 'down_trip_cnt', 'up_mean_headway', 'down_mean_headway', 'transfer_in', 'congestion_up', 'congestion_down', 'is_up', 'is_down', '월요일', '화요일', '수요일', '목요일', '금요일', '토요일', '일요일', 'is_평일', 'is_주말', 'is_공휴일', 'week_number', 'temp_avg', 'rain_day', 'wind_avg', 'humid_avg', 'snow_day', 'day_label', 'time_range', 'tm_label', 'subway_fare_adult', 'usdkrw', 'fuel_premium_gasoline', 'fuel_regular_gasoline', 'fuel_diesel', 'unemployment_rate', 'youth_unemployment_rate', 'year', 'seoul_population', 'seoul_population_density', 'metro_population', 'metro_population_density', 'vehicle_registered_total_10k', 'vehicle_yoy_increase_1k', 'vehicle_yoy_growth_rate', 'base_rate', 'coincident_index', 'leading_index']
