In [1]:
import os
import pandas as pd
import geopandas as gpd

## Calculate Annual average of daily traffic

In [2]:
output_dir = '../../../data/subway_traffic/processed'

# import data
df_traffic_hourly = pd.read_csv('../../../data/subway_traffic/raw_hourly/서울시 지하철 호선별 역별 시간대별 승하차 인원 정보.csv', encoding='cp949')

In [3]:
traffic_columns = [c for c in df_traffic_hourly.columns if c.endswith('승차인원')]
df_traffic_hourly.loc[:,'hourly_avg'] =  df_traffic_hourly.loc[:,traffic_columns].mean(axis=1)

df_traffic_hourly

Unnamed: 0,사용월,호선명,지하철역,04시-05시 승차인원,04시-05시 하차인원,05시-06시 승차인원,05시-06시 하차인원,06시-07시 승차인원,06시-07시 하차인원,07시-08시 승차인원,...,00시-01시 승차인원,00시-01시 하차인원,01시-02시 승차인원,01시-02시 하차인원,02시-03시 승차인원,02시-03시 하차인원,03시-04시 승차인원,03시-04시 하차인원,작업일자,hourly_avg
0,202312,1호선,동대문,876,24,11117,2085,8840,5685,14001,...,993,3017,157,464,0,80,0,0,20240103,15194.000000
1,202312,1호선,동묘앞,186,2,2785,956,3330,4326,7069,...,142,2285,19,574,0,292,0,0,20240103,12123.250000
2,202312,1호선,서울역,700,35,7812,8436,12190,50415,37075,...,3170,4088,140,1884,0,283,0,0,20240103,68939.458333
3,202312,1호선,시청,73,1,2208,4356,3731,21903,7341,...,4290,1523,1053,383,0,10,0,0,20240103,33194.416667
4,202312,1호선,신설동,416,30,8628,1943,9192,8105,19443,...,382,2775,20,362,0,0,0,0,20240103,17070.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7381,202301,중앙선,원덕,43,0,257,7,504,47,646,...,0,57,0,0,0,0,0,0,20230203,405.458333
7382,202301,중앙선,중랑,93,2,5354,570,8444,1583,20180,...,34,410,0,0,0,0,0,0,20230203,6283.833333
7383,202301,중앙선,지평,0,0,0,0,0,0,231,...,0,0,0,0,0,0,0,0,20230203,61.791667
7384,202301,중앙선,팔당,0,0,124,59,443,344,1166,...,3,0,0,1,0,0,0,0,20230203,983.833333


In [4]:
df_traffic_hourly_avg = df_traffic_hourly.loc[:,['사용월','호선명','지하철역','hourly_avg']].groupby(['사용월','지하철역'], as_index=False).sum()
df_traffic_hourly_avg = df_traffic_hourly_avg.loc[:,['지하철역','hourly_avg']].groupby('지하철역', as_index=False).mean()
df_traffic_hourly_avg = df_traffic_hourly_avg.rename(columns={'지하철역':'station_name'})
df_traffic_hourly_avg.loc[:,'station_name'] = df_traffic_hourly_avg.loc[:,'station_name'].str.replace(r'\(.*\)', '', regex=True)

  df_traffic_hourly_avg = df_traffic_hourly.loc[:,['사용월','호선명','지하철역','hourly_avg']].groupby(['사용월','지하철역'], as_index=False).sum()


In [5]:
df_traffic_hourly_avg

Unnamed: 0,station_name,hourly_avg
0,4.19민주묘지,4318.423611
1,가능,8020.222222
2,가락시장,20162.673611
3,가산디지털단지,66706.829861
4,가양,26242.604167
...,...,...
528,회기,33102.246528
529,회룡,15615.156250
530,회현,31638.479167
531,효창공원앞,12702.413194


In [6]:
## merge with subway station location

# import subway station location
df_station = pd.read_csv('../../../data/subway_station/서울시 역사마스터 정보_2024.csv', encoding='cp949')
df_station

Unnamed: 0,역사_ID,역사명,호선,위도,경도
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,9010,동탄,수도권 광역급행철도,37.200340,127.095690
3,9009,구성,수도권 광역급행철도,37.299130,127.103890
4,9008,성남,수도권 광역급행철도,37.394670,127.120580
...,...,...,...,...,...
783,154,종로5가,1호선,37.570926,127.001849
784,153,종로3가,1호선,37.570406,126.991847
785,152,종각,1호선,37.570161,126.982923
786,151,시청,1호선,37.565715,126.977088


In [7]:
# Group by station name and calculate the mean of latitude and longitude
gdf_station_avg = df_station.groupby('역사명').agg({'위도': 'mean', '경도': 'mean'}).reset_index()

# Create a geometry column with the calculated mean coordinates
gdf_station_avg['geometry'] = gpd.points_from_xy(gdf_station_avg['경도'], gdf_station_avg['위도'])

# Convert to GeoDataFrame
gdf_station_avg = gpd.GeoDataFrame(gdf_station_avg, geometry='geometry')

gdf_station_avg.loc[:,'역사명'] = gdf_station_avg.loc[:,'역사명'].str.replace(r'\(.*\)', '', regex=True)

In [8]:
gdf_station = gpd.GeoDataFrame(df_traffic_hourly_avg.merge(gdf_station_avg, left_on='station_name', right_on='역사명', how='left').drop('역사명', axis=1))
gdf_station.loc[gdf_station.loc[:,'위도'].isnull()]

Unnamed: 0,station_name,hourly_avg,위도,경도,geometry
501,평택지제,5625.065972,,,


In [9]:
gdf_station = gdf_station.rename(columns={'역명': 'station_name', 'geometry': 'geometry', '위도': 'latitude', '경도': 'longitude'})
gdf_station

Unnamed: 0,station_name,hourly_avg,latitude,longitude,geometry
0,4.19민주묘지,4318.423611,37.649502,127.013684,POINT (127.01368 37.64950)
1,가능,8020.222222,37.748577,127.044213,POINT (127.04421 37.74858)
2,가락시장,20162.673611,37.492566,127.118077,POINT (127.11808 37.49257)
3,가산디지털단지,66706.829861,37.480959,126.882619,POINT (126.88262 37.48096)
4,가양,26242.604167,37.561391,126.854456,POINT (126.85446 37.56139)
...,...,...,...,...,...
529,회기,33102.246528,37.589460,127.057583,POINT (127.05758 37.58946)
530,회룡,15615.156250,37.724711,127.047216,POINT (127.04722 37.72471)
531,회현,31638.479167,37.558514,126.978246,POINT (126.97825 37.55851)
532,효창공원앞,12702.413194,37.538906,126.961797,POINT (126.96180 37.53891)


## Calculation of annual average of hourly peak time traffic

In [10]:
df_traffic_hourly_peak = df_traffic_hourly.loc[:,['사용월','호선명','지하철역','08시-09시 승차인원']].groupby(['사용월','지하철역'], as_index=False).sum()
df_traffic_hourly_peak_avg = df_traffic_hourly_peak.loc[:,['지하철역','08시-09시 승차인원']].groupby('지하철역', as_index=False).mean()
df_traffic_hourly_peak_avg = df_traffic_hourly_peak_avg.rename(columns={'지하철역':'station_name', '08시-09시 승차인원':'peak_avg'})
df_traffic_hourly_peak_avg.loc[:,'station_name'] = df_traffic_hourly_peak_avg.loc[:,'station_name'].str.replace(r'\(.*\)', '', regex=True)

  df_traffic_hourly_peak = df_traffic_hourly.loc[:,['사용월','호선명','지하철역','08시-09시 승차인원']].groupby(['사용월','지하철역'], as_index=False).sum()


In [11]:
df_traffic_hourly_peak_avg

Unnamed: 0,station_name,peak_avg
0,4.19민주묘지,10010.000000
1,가능,21737.416667
2,가락시장,35219.666667
3,가산디지털단지,41987.166667
4,가양,79651.416667
...,...,...
528,회기,75904.333333
529,회룡,47957.500000
530,회현,8638.416667
531,효창공원앞,34738.833333


In [12]:
gdf_station = gpd.GeoDataFrame(df_traffic_hourly_peak_avg.merge(gdf_station, on='station_name', how='left'))
gdf_station.loc[gdf_station.loc[:,'longitude'].isnull()]

Unnamed: 0,station_name,peak_avg,hourly_avg,latitude,longitude,geometry
501,평택지제,9777.0,5625.065972,,,


In [15]:
gdf_station = gdf_station.loc[~gdf_station.loc[:,'longitude'].isnull()]
gdf_station.drop('geometry', axis=1).to_json(f'{output_dir}/subway_traffic_hourly_avg.json', orient='records')