In [None]:
!pip install requests

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time
from datetime import datetime
import os
import re
import matplotlib.pyplot as plt
import numpy as np

## Data collection

In [None]:
# Enter your API keys here, one per slot.
# The collection loop below cycles through this list, using one key per request.
keys = [
    '', '', '', '',
    '', '', '', '',
    '', '', '', '',
]

In [None]:
# --- Collection settings ---
time_of_day = 'morning'  # label embedded in the output file name; change per session
duration_min, interval_sec = 120, 10  # total run length (minutes) / pause between requests (seconds)
today = f'{datetime.now():%m%d}'  # MMDD stamp embedded in the output file name
key_idx = 0  # position in `keys` of the API key used for the next request
dfs = pd.DataFrame()  # holds the collected rows

def collect_all_data(key, timeout=10):
    """Fetch one snapshot of real-time arrival records for all stations.

    Parameters
    ----------
    key : str
        API key placed in the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole collection run.

    Returns
    -------
    pandas.DataFrame
        One row per ``<row>`` element of the XML response, with one column per
        child tag. An empty DataFrame is returned (and a message printed) when
        the request fails, the HTTP status signals an error, the body is not
        valid XML, or the response contains no ``<row>`` elements.
    """
    url = f'http://swopenAPI.seoul.go.kr/api/subway/{key}/xml/realtimeStationArrival/ALL'
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of trying to parse an error page.
        response.raise_for_status()
        # Parse the raw bytes: the XML parser reads the encoding from the document
        # itself, so setting `response.encoding` (which only affects `.text`) is unnecessary.
        root = ET.fromstring(response.content)
        rows = root.findall('row')
    except Exception as e:
        # Best-effort collection: report and carry on with the next round.
        # The key is part of the URL and may appear in the error text, so mask
        # it before printing to keep it out of saved notebook output.
        message = str(e)
        if key:
            message = message.replace(str(key), '***')
        print(f'API request failed ({message})')
        return pd.DataFrame()

    if not rows:
        print('No data received')
        return pd.DataFrame()

    # One record per <row>: child tag name -> child text.
    records = [{child.tag: child.text for child in row} for row in rows]
    df_new = pd.DataFrame(records)
    print(f'Collected {len(records)} records')
    return df_new

# Skip blank placeholder slots so unfilled entries of `keys` do not waste rounds
# on requests that are bound to fail.
active_keys = [k for k in keys if k and str(k).strip()]
if not active_keys:
    raise ValueError('No API key configured: fill in at least one entry of `keys`')

print(f'Starting {interval_sec} seconds interval data collection for {duration_min} minutes...\n')
frames = []  # one DataFrame per successful round; concatenated once at the end
interrupted = False
try:
    total_rounds = (duration_min * 60) // interval_sec

    for i in range(total_rounds):
        round_started = time.monotonic()
        current_key = active_keys[key_idx % len(active_keys)]
        df_round = collect_all_data(current_key)
        if not df_round.empty:
            frames.append(df_round)
        key_idx = (key_idx + 1) % len(active_keys)

        # Sleep only for the rest of the interval so the time spent on the
        # request does not stretch the run beyond `duration_min`; no pause is
        # needed after the final round.
        remaining = interval_sec - (time.monotonic() - round_started)
        if i < total_rounds - 1 and remaining > 0:
            time.sleep(remaining)

except KeyboardInterrupt:
    interrupted = True
    print('\nData collection interrupted by user')

# Build the result once (avoids re-copying all rows every round) and save it
# even after an interruption, so a partial run is not lost.
dfs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

if dfs.empty:
    print('No data collected; nothing saved')
else:
    filename = f'all_{today}_{time_of_day}.csv'
    dfs.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f'Data saved as {filename}')

if not interrupted:
    print('\nAll data collection complete')

## Preprocessing

### 서울 도시철도 열차운행시각표 전처리

In [None]:
# --- Timetable filter settings ---
stations = ['서울']  # stations to extract; edit as needed
time_of_day = 'morning'  # label embedded in output file names; edit as needed
schedule_filename = '서울교통공사_서울 도시철도 열차운행시각표.csv'

# Filtered timetables are written to this folder; create it up front.
output_folder = './schedule_filtered'
os.makedirs(output_folder, exist_ok=True)

# Arrival-time window to keep; edit as needed.
start_time, end_time = (
    datetime.strptime(clock, '%H:%M:%S').time()
    for clock in ('07:30:00', '09:30:00')
)

# Maps an entry of `stations` to the station name looked up in the timetable;
# entries without a mapping are used unchanged.
station_name_map = {'서울': '서울역'}

# Load the timetable and parse the arrival time; values that do not match
# HH:MM:SS become NaT and those rows are dropped.
schedule_df = pd.read_csv(schedule_filename, encoding='cp949')
schedule_df['열차도착시간(dt)'] = pd.to_datetime(
    schedule_df['열차도착시간'], format='%H:%M:%S', errors='coerce'
)
schedule_df = schedule_df.dropna(subset=['열차도착시간(dt)']).copy()

# Time-of-day of every arrival and the (inclusive) window test are the same
# for all stations, so evaluate them once.
arrival_clock = schedule_df['열차도착시간(dt)'].dt.time
in_window = arrival_clock.between(start_time, end_time)

for station in stations:
    station_name = station_name_map.get(station, station)

    # Rows of this station inside the window, ordered by line, direction, arrival time.
    filtered = (
        schedule_df
        .loc[(schedule_df['역사명'] == station_name) & in_window]
        .sort_values(by=['호선', '방향', '열차도착시간(dt)'])
    )

    output_filename = f'{station_name}_schedule_{time_of_day}.csv'
    output_filepath = os.path.join(output_folder, output_filename)
    filtered.to_csv(output_filepath, index=False, encoding='utf-8-sig')

    print(f'{station} saved as {output_filename}')

### 수집 데이터 전처리

In [None]:
# --- Collected-data preprocessing settings ---
date = '0524'  # MMDD of the collection run to clean; edit as needed
time_of_day = 'morning'  # label embedded in file names; edit as needed
filename = f'all_{date}_{time_of_day}.csv'

# Reception-time window to keep; edit as needed.
start_time, end_time = (
    datetime.strptime(clock, '%H:%M:%S').time()
    for clock in ('07:30:00', '09:30:00')
)

try:
    # Load the raw collection; reception timestamps that cannot be parsed become NaT.
    df_all = pd.read_csv(filename)
    df_all['recptnDt'] = pd.to_datetime(df_all['recptnDt'], errors='coerce')

    # Keep only the records reported for station '서울'.
    df_station = df_all.loc[df_all['statnNm'] == '서울'].copy()

    # Filters are applied one after another (each sees only the rows kept by
    # the previous one), then the earliest record per train is retained.
    arrived_df = (
        df_station
        # arvlCd == 1 — presumably the "arrived" status code; confirm against the API spec.
        .loc[lambda d: d['arvlCd'] == 1]
        # Reception time-of-day inside the configured window (inclusive).
        .loc[lambda d: d['recptnDt'].dt.time.between(start_time, end_time)]
        # First four characters of subwayId must be a number in 1001..1009.
        .loc[lambda d: d['subwayId'].astype(str).str[:4].astype(int).between(1001, 1009)]
        .sort_values('recptnDt')
        .drop_duplicates(
            subset=['btrainNo', 'subwayId', 'updnLine', 'statnNm'], keep='first'
        )
    )

    filename_new = f'서울_{date}_{time_of_day}_clean.csv'
    arrived_df.to_csv(filename_new, index=False, encoding='utf-8-sig')
    print(f'서울 saved as {filename_new}')

except Exception as e:
    # Report the problem instead of stopping the notebook.
    print(f'Preprocessing failed ({e})')

## Data analysis

In [None]:
# --- Analysis settings ---
date = '0524'  # MMDD of the cleaned collection to analyse; edit as needed
time_of_day = 'morning'  # label embedded in file names; edit as needed

# Inputs: cleaned real-time records and the filtered timetable.
schedule_folder = './schedule_filtered'
realtime_path = f'서울_{date}_{time_of_day}_clean.csv'
schedule_path = os.path.join(schedule_folder, f'서울역_schedule_{time_of_day}.csv')

# Lookup tables that translate real-time values into the codes the
# timetable columns are joined on.
subway_map = {1000 + number: number for number in range(1, 10)}  # subwayId 1001..1009 -> line 1..9
direction_map = {
    '상행': 'UP',
    '하행': 'DOWN',
    '내선': 'IN',
    '외선': 'OUT',
}
express_map = {'일반': 0, '급행': 1}

def get_day_type(dt):
    """Classify a date by its weekday.

    `dt` is any object exposing `weekday()`. Returns 'SAT' when `weekday()`
    is 5, 'END' when it is 6, and 'DAY' otherwise.
    """
    weekday = dt.weekday()
    if weekday == 5:
        return 'SAT'
    if weekday == 6:
        return 'END'
    return 'DAY'

def _normalize_train_code(codes):
    """Reduce train identifiers to their digits without leading zeros ('K0123' -> '123').

    A trailing '.0' is removed first so a code that pandas loaded as a float
    ('123.0') does not gain a spurious zero. Values that contain no digit at
    all (including missing values) become None instead of raising.
    """
    digits = (
        codes.astype(str)
        .str.replace(r'\.0$', '', regex=True)
        .str.replace(r'\D', '', regex=True)
    )
    return digits.map(lambda d: str(int(d)) if d else None)


try:
    realtime_df = pd.read_csv(realtime_path)
    schedule_df = pd.read_csv(schedule_path)

    # Real-time side: parse timestamps, drop unparseable rows, and derive the
    # columns that are joined against the timetable.
    realtime_df['recptnDt'] = pd.to_datetime(realtime_df['recptnDt'], errors='coerce')
    realtime_df = realtime_df[realtime_df['recptnDt'].notna()].copy()
    realtime_df['btrainNo'] = realtime_df['btrainNo'].astype(str)
    realtime_df['btrainNo_num'] = _normalize_train_code(realtime_df['btrainNo'])
    # Rows without a usable train number cannot be matched; drop them so that
    # missing keys on both sides are never paired with each other by the merge.
    realtime_df = realtime_df[realtime_df['btrainNo_num'].notna()].copy()
    realtime_df['direction'] = realtime_df['updnLine'].map(direction_map)
    realtime_df['express'] = realtime_df['btrainSttus'].map(express_map)
    realtime_df['line'] = realtime_df['subwayId'].map(subway_map)
    realtime_df['day_type'] = realtime_df['recptnDt'].apply(get_day_type)

    # Timetable side: same train-number normalisation and arrival-time parsing.
    schedule_df['열차코드'] = schedule_df['열차코드'].astype(str)
    schedule_df['열차코드_num'] = _normalize_train_code(schedule_df['열차코드'])
    schedule_df = schedule_df[schedule_df['열차코드_num'].notna()].copy()
    schedule_df['열차도착시간(dt)'] = pd.to_datetime(schedule_df['열차도착시간'], format='%H:%M:%S', errors='coerce')
    schedule_df = schedule_df[schedule_df['열차도착시간(dt)'].notna()].copy()

    # Single reference date on which both clock times are placed. Taking it
    # once (instead of calling datetime.today() for every row) keeps all rows
    # on the same date even if the cell runs across midnight.
    reference_day = pd.Timestamp(datetime.today().date())

    all_results = []

    for line in sorted(realtime_df['line'].dropna().unique()):
        realtime_line = realtime_df[realtime_df['line'] == line]
        schedule_line = schedule_df[schedule_df['호선'] == line]

        if realtime_line.empty or schedule_line.empty:
            continue

        # Pair every observed arrival with its timetable entry.
        merged = pd.merge(
            realtime_line,
            schedule_line,
            left_on=['btrainNo_num', 'direction', 'express', 'line', 'day_type', 'bstatnNm'],
            right_on=['열차코드_num', '방향', '급행여부', '호선', '주중주말', '도착역'],
            how='inner'
        )

        if merged.empty:
            print(f'No matches for 서울 line {line}')
            continue

        # Compare time-of-day only: strip each timestamp's own date and
        # re-anchor it on the shared reference day.
        scheduled = merged['열차도착시간(dt)']
        actual = merged['recptnDt']
        merged['scheduled_dt'] = reference_day + (scheduled - scheduled.dt.normalize())
        merged['actual_dt'] = reference_day + (actual - actual.dt.normalize())
        # Positive values mean the train was recorded later than scheduled.
        merged['delay_min'] = (merged['actual_dt'] - merged['scheduled_dt']).dt.total_seconds() / 60

        output_name = f'서울_line{line}_{date}_{time_of_day}_delay.csv'

        merged[['statnNm', 'line', 'btrainNo', 'recptnDt', 'updnLine', '열차도착시간(dt)', 'delay_min']].to_csv(
            output_name, index=False, encoding='utf-8-sig'
        )

        summary = merged['delay_min'].agg(['count', 'mean', 'std', 'min', 'max']).round(2)
        summary['station'] = '서울'
        summary['line'] = line
        all_results.append(summary)
        print(f'서울 line {line} saved as {output_name}')

    if all_results:
        final_summary = pd.DataFrame(all_results)[['station', 'line', 'count', 'mean', 'std', 'min', 'max']]
        print('\n서울역 delay summary')
        print(final_summary.to_string(index=False))
    else:
        print('No analysis result')

except Exception as e:
    # Report the problem instead of stopping the notebook.
    print(f'Analysis failed ({e})')

## Visualization

In [None]:
# --- Visualization settings ---
line = 1  # subway line to plot; edit as needed
time_of_day = 'morning'  # label embedded in the delay file names; edit as needed

# Legend label -> MMDD of the delay file to plot.
dates = dict([
    ('Weekday (0519)', '0519'),
    ('Weekend (0525)', '0525'),
])

# Direction value found in the data -> English label used in the plot title.
direction_label_map = dict([
    ('상행', 'Up'),
    ('하행', 'Down'),
])

for direction in ['상행', '하행']:
    # Load each date's delays for this direction once; the same data is used
    # both to choose the bins and to draw the curves.
    delays_by_label = {}
    for label, date in dates.items():
        filename = f'서울_line{line}_{date}_{time_of_day}_delay.csv'

        if not os.path.exists(filename):
            continue

        df = pd.read_csv(filename)
        if 'updnLine' not in df.columns or df.empty:
            continue

        delays = df.loc[df['updnLine'] == direction, 'delay_min'].dropna()
        if delays.empty:
            continue

        delays_by_label[label] = delays

    if not delays_by_label:
        print(f'No delay data found for direction {direction}')
        continue

    # One-minute bins spanning every plotted date so the curves share an x-axis.
    dmin = int(np.floor(min(d.min() for d in delays_by_label.values())))
    dmax = int(np.ceil(max(d.max() for d in delays_by_label.values())))
    # When every delay is the same whole number the range collapses to a single
    # edge, which would give the histogram no bin at all; keep at least one bin.
    dmax = max(dmax, dmin + 1)
    bins = np.arange(dmin, dmax + 1)

    # Create the figure only once there is something to draw, so directions
    # without data do not leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(8, 5))

    for label, delays in delays_by_label.items():
        counts, edges = np.histogram(delays, bins=bins)
        centers = (edges[:-1] + edges[1:]) / 2
        ax.plot(centers, counts, label=label, marker='o')

    label_eng = direction_label_map.get(direction, direction)
    ax.axvline(0, color='gray', linestyle='--', label='Scheduled Time')
    ax.set_title(f'Seoul Station Line {line} - {label_eng}', fontsize=14)
    ax.set_xlabel('Delay (minutes)')
    ax.set_ylabel('Number of subways')
    ax.set_xticks(bins)
    ax.grid(axis='y', linestyle=':', alpha=0.6)
    ax.legend()
    fig.tight_layout()
    plt.show()