In [None]:
import pandas as pd
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Font setup for the Korean labels used in every chart below; with
# axes.unicode_minus disabled matplotlib draws an ASCII hyphen for negative
# ticks instead of the Unicode minus sign.
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

base_dir = 'data'
time_slots = ['morning', 'afternoon', 'evening']
holiday_dates = {'0603', '0606'}  # directory names that are labelled as public holidays
year = 2025  # directory names carry no year, so this one is prepended when parsing

all_records = []

for date in sorted(os.listdir(base_dir)):
    # Only all-digit entries are treated as per-day data directories.
    if not date.isdigit():
        continue

    # Collect the slot files that actually exist for this day.
    slot_paths = [
        (slot, os.path.join(base_dir, date, f'delay_{date}_{slot}.csv'))
        for slot in time_slots
    ]
    slot_paths = [(slot, path) for slot, path in slot_paths if os.path.exists(path)]
    if not slot_paths:
        continue

    # The day type depends only on the date, so derive it once per directory
    # rather than once per time slot. Holidays take precedence over the
    # weekday/weekend split.
    dt = datetime.strptime(f'{year}{date}', '%Y%m%d')
    if date in holiday_dates:
        day_type = '공휴일'
    elif dt.weekday() < 5:
        day_type = '평일'
    else:
        day_type = '주말'

    for slot, path in slot_paths:
        df = pd.read_csv(path, parse_dates=['예정시간', '실제시간'])
        df['날짜'] = date
        df['시간대'] = slot
        df['요일구분'] = day_type
        all_records.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the real problem instead.
if not all_records:
    raise FileNotFoundError(
        f"No delay CSV files found under '{base_dir}' "
        f"(expected <date>/delay_<date>_<slot>.csv)"
    )

all_df = pd.concat(all_records, ignore_index=True)

# A record counts as delayed when its 지연시간(분) value is at least 1.
all_df['지연여부'] = all_df['지연시간(분)'] >= 1

시간대_순서 = ['morning', 'afternoon', 'evening']
요일_순서 = ['평일', '주말', '공휴일']

# Ordered categoricals pin the sort/display order of the two grouping columns.
all_df['시간대'] = pd.Categorical(all_df['시간대'], categories=시간대_순서, ordered=True)
all_df['요일구분'] = pd.Categorical(all_df['요일구분'], categories=요일_순서, ordered=True)

# observed=True restricts the result to combinations that occur in the data.
# Without it, categorical keys expand to every category combination, which
# adds zero-count rows (NaN ratio after the division below) and triggers a
# FutureWarning on recent pandas versions.
line_time_stats = (
    all_df
    .groupby(['호선', '요일구분', '시간대'], observed=True)
    .agg(지연횟수=('지연여부', 'sum'),
         전체횟수=('지연여부', 'count'),
         평균지연=('지연시간(분)', 'mean'),
         # Mean delay over delayed records only (NaN when a group has none).
         지연된평균지연=('지연시간(분)', lambda x: x[x >= 1].mean()))
    .reset_index()
)

# Share of delayed records per group, in percent.
line_time_stats['지연비율(%)'] = (line_time_stats['지연횟수'] / line_time_stats['전체횟수']) * 100

import seaborn as sns
import matplotlib.pyplot as plt

# One bar-chart facet per line: x = time slot, bar colour = day type.
facet_options = dict(
    x='시간대',
    y='지연비율(%)',
    hue='요일구분',
    col='호선',
    kind='bar',
    col_wrap=3,
    height=4,
    aspect=1.2,
)
g = sns.catplot(data=line_time_stats, **facet_options)

# Titles and axis labels.
g.fig.suptitle("요일·시간대별 지연 비율 (호선별)", fontsize=16)
g.set_titles("{col_name}호선")
g.set_axis_labels("시간대", "지연 비율 (%)")

# Horizontal tick labels and horizontal grid lines on every facet.
for ax in g.axes.flat:
    ax.grid(True, axis='y')
    ax.tick_params(axis='x', rotation=0)

# Leave room at the top for the figure-level title and space the facets out.
margins = dict(top=0.92, bottom=0.08, left=0.08, right=0.95)
spacing = dict(hspace=0.35, wspace=0.25)
g.fig.subplots_adjust(**margins, **spacing)

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

# 1. Keep only delayed records (지연시간(분) of at least 1)
df_violin = all_df[all_df['지연시간(분)'] >= 1]

# 2. Analysis targets as (station name, line number) pairs
top5_station_lines = [
    ('잠실', 2),
    ('홍대입구', 2),
    ('강남', 2),
    ('구로디지털단지', 2),
    ('서울', 1)
]

bottom5_station_lines = [
    ('도림천', 2),
    ('남태령', 4),
    ('신답', 2),
    ('동작', 4),
    ('버티고개', 6),
]


def _plot_station_violins(station_lines, title):
    """Draw one violin plot of delay minutes per day type for each station.

    Args:
        station_lines: Sequence of (station name, line number) pairs; one
            panel is drawn per pair, left to right, sharing the y axis.
        title: Figure-level title.

    Reads the module-level ``df_violin`` frame of delayed records.
    """
    n_panels = len(station_lines)
    if n_panels == 0:
        return  # nothing to draw; plt.subplots rejects zero columns

    # squeeze=False keeps `axes` a 2-D array even for a single panel.
    # The width scales with the panel count (22 inches for five panels).
    fig, axes = plt.subplots(
        1, n_panels, figsize=(22 * n_panels / 5, 5), sharey=True, squeeze=False
    )

    for i, ((station, line), ax) in enumerate(zip(station_lines, axes.flat)):
        subset = df_violin[
            (df_violin['역사명'] == station) & (df_violin['호선'] == line)
        ]

        if subset.empty:
            # No delayed records for this pair: there is no distribution to
            # draw, so label the panel instead of handing seaborn empty data.
            ax.text(0.5, 0.5, '데이터 없음', ha='center', va='center',
                    transform=ax.transAxes)
        else:
            sns.violinplot(data=subset, x='요일구분', y='지연시간(분)',
                           inner='quartile', ax=ax)

        ax.set_title(f"{station} ({line})")
        ax.set_xlabel('')
        # The y axis is shared, so only the left-most panel carries the label.
        ax.set_ylabel('지연시간 (분)' if i == 0 else '')
        ax.grid(True, axis='y')

    fig.suptitle(title, fontsize=16)
    # Reserve the top 8% of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.92])
    plt.show()


# 3. Top 5 plots
_plot_station_violins(top5_station_lines, "요일별 지연시간 분포 (Top 5: 역사 + 호선 기준)")

# 4. Bottom 5 plots
_plot_station_violins(bottom5_station_lines, "요일별 지연시간 분포 (Bottom 5: 역사 + 호선 기준)")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data summary function
def summarize_delays_with_rate(station_line_list, label):
    """Print total count, delayed count and delay rate per station, line and day type.

    Args:
        station_line_list: Iterable of (station name, line number) pairs
            selecting the rows to summarise.
        label: Text inserted into the printed heading.

    Reads the module-level ``all_df`` (all records) and ``df_violin``
    (delayed records only) frames. Prints the table and returns None.
    """
    # repr() quotes and escapes the station name, so a name containing a
    # quote character cannot break the query expression.
    query_str = " | ".join(
        f"(역사명 == {str(station)!r} and 호선 == {line})"
        for station, line in station_line_list
    )
    group_keys = ['역사명', '호선', '요일구분']

    # observed=True (as in plot_delay_rates) keeps only combinations present
    # in the data instead of expanding categorical keys to every category.
    total_counts = (
        all_df.query(query_str)
        .groupby(group_keys, observed=True)
        .size()
        .reset_index(name='전체횟수')
    )

    delay_counts = (
        df_violin.query(query_str)
        .groupby(group_keys, observed=True)
        .size()
        .reset_index(name='지연횟수')
    )

    # Left merge: groups without any delayed record get a delayed count of 0.
    summary = pd.merge(total_counts, delay_counts, on=group_keys, how='left')
    summary['지연횟수'] = summary['지연횟수'].fillna(0).astype(int)
    summary['지연비율(%)'] = (summary['지연횟수'] / summary['전체횟수'] * 100).round(1)
    summary = summary.sort_values(group_keys)
    # Defensive: drop any group with no records (its rate would be 0/0).
    summary = summary[summary['전체횟수'] > 0]

    print(f"\n📊 {label} 지연 횟수 및 비율 요약:")
    print(summary.to_string(index=False))


# 2. Plotting function
def plot_delay_rates(station_line_list, label, y_max=30):
    """Draw a grouped bar chart of delay rate (%) per day type for the given stations.

    Args:
        station_line_list: Sequence of (station name, line number) pairs.
            Their order fixes the order of the legend entries.
        label: Prefix of the chart title.
        y_max: Upper limit of the y axis in percent. Defaults to 30, the
            previously hard-coded limit; pass None to let matplotlib choose
            the upper limit so that rates above 30% are not clipped.

    Reads the module-level ``all_df`` (all records) and ``df_violin``
    (delayed records only) frames. Sets the seaborn palette to 'Set2'
    and shows the figure.
    """
    # repr() quotes and escapes the station name, so a name containing a
    # quote character cannot break the query expression.
    query_str = " | ".join(
        f"(역사명 == {str(station)!r} and 호선 == {line})"
        for station, line in station_line_list
    )
    group_keys = ['역사명', '호선', '요일구분']

    total_counts = (
        all_df.query(query_str)
        .groupby(group_keys, observed=True)
        .size()
        .reset_index(name='전체횟수')
    )

    delay_counts = (
        df_violin.query(query_str)
        .groupby(group_keys, observed=True)
        .size()
        .reset_index(name='지연횟수')
    )

    # Left merge: groups without any delayed record get a delayed count of 0.
    summary = pd.merge(total_counts, delay_counts, on=group_keys, how='left')
    summary['지연횟수'] = summary['지연횟수'].fillna(0).astype(int)
    summary['지연비율(%)'] = (summary['지연횟수'] / summary['전체횟수'] * 100).round(1)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not raise SettingWithCopyWarning.
    summary = summary[summary['전체횟수'] > 0].copy()

    # Combine station and line into one label
    summary['역_호선'] = summary['역사명'] + ' (' + summary['호선'].astype(str) + '호선)'

    # Fix the legend order to the order of station_line_list
    station_order = [f"{station} ({line}호선)" for station, line in station_line_list]
    summary['역_호선'] = pd.Categorical(summary['역_호선'], categories=station_order, ordered=True)

    # Plot
    sns.set_palette('Set2')
    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=summary,
        x='요일구분',
        y='지연비율(%)',
        hue='역_호선'
    )
    plt.title(f'{label} - 요일별 지연비율(%)', fontsize=15)
    plt.ylabel('지연 비율 (%)')
    plt.xlabel('요일 구분')
    if y_max is None:
        plt.ylim(bottom=0)
    else:
        plt.ylim(0, y_max)
    # Place the legend outside the axes, to the right.
    plt.legend(title='역사 (호선)', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.grid(True, axis='y')
    plt.show()

# Top 5 & Bottom 5 targets as (station name, line number) pairs
top5_station_lines = [('잠실', 2), ('홍대입구', 2), ('강남', 2), ('구로디지털단지', 2), ('서울', 1)]
bottom5_station_lines = [('도림천', 2), ('남태령', 4), ('신답', 2), ('동작', 4), ('버티고개', 6)]

station_groups = (
    ("Top 5", top5_station_lines),
    ("Bottom 5", bottom5_station_lines),
)

# Printed tables first, for both groups...
for group_label, group_stations in station_groups:
    summarize_delays_with_rate(group_stations, group_label)

# ...then the bar charts, in the same group order.
for group_label, group_stations in station_groups:
    plot_delay_rates(group_stations, group_label)