# 데이터 불러오기 & 전처리

In [1]:
!pip install dbfread
!pip install haversine
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dbfread import DBF
from haversine import haversine
from tqdm.notebook import tqdm
%matplotlib inline
import platform
import pickle
import re
platform.system()
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
%config InlineBackend.figure_format = 'retina'




In [2]:
# Read the seven bus boarding/alighting CSV files, using the first CSV column
# as the index of each frame.
(
    bus_date_1,
    bus_date_2,
    bus_date_3,
    bus_date_4,
    bus_date_5,
    bus_date_6,
    bus_date_7,
) = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        '2001버스 일자별 승하차인원.csv',
        '2002버스 일자별 승하차인원.csv',
        '2003버스 일자별 승하차인원.csv',
        '2004버스 일자별 승하차인원.csv',
        '2005버스 일자별 승하차인원.csv',
        '2006버스 일자별 승하차인원.csv',
        '2007버스 일자별 승하차인원.csv',
    )
)

In [3]:
# Load the pickled subway boarding table into train_up.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# use it on pickles produced by a trusted source.
with open('./지하철 승차데이터_규호.pkl','rb') as up:
    train_up = pickle.load(up)

# Load the pickled subway alighting table into train_down.
with open('./지하철 하차데이터_규호.pkl','rb') as down:
    train_down = pickle.load(down)

In [4]:
# Subway stations in Jung-gu and Nowon-gu, taken from the subway
# floating-population file.
Seoul_junggu = [
    '서대문',
    '충정로',
    '시청',
    '회현',
    '서울역',
    '을지로입구',
    '명동',
    '을지로3가',
    '을지로4가',
    '충무로',
    '동대입구',
    '약수',
    '버티고개',
    '청구',
    '신당',
    '동대문역사문화공원',
]
Seoul_nowon = [
    '수락산',
    '마들',
    '노원',
    '중계',
    '하계',
    '월계',
    '광운대',
    '석계',
    '태릉입구',
    '화랑대',
    '공릉',
    '상계',
    '당고개',
]

## 지하철역 기준 주위에 있는 버스정류장의 승하차인원을 각각 볼 수 있는 함수 

In [5]:
def up_prepro(df):
    """Sum boarding counts per (subway station, date, stop name).

    The index of ``df`` is first moved into ordinary columns. The columns
    '사용일자', '역명', '승차총승객수' and '지하철역' are then grouped by
    ('지하철역', '사용일자', '역명') and summed.

    Returns a DataFrame indexed by '지하철역' with the columns
    '사용일자', '역명' and '승차총승객수'.
    """
    group_keys = ['지하철역', '사용일자', '역명']
    flat = df.reset_index()
    boarding = flat[['사용일자', '역명', '승차총승객수', '지하철역']]
    totals = boarding.groupby(group_keys).sum()
    return totals.reset_index().set_index('지하철역')

In [6]:
def down_prepro(df):
    """Sum alighting counts per (subway station, date, stop name).

    The index of ``df`` is first moved into ordinary columns. The columns
    '사용일자', '역명', '하차총승객수' and '지하철역' are then grouped by
    ('지하철역', '사용일자', '역명') and summed.

    Returns a DataFrame indexed by '지하철역' with the columns
    '사용일자', '역명' and '하차총승객수'.
    """
    group_keys = ['지하철역', '사용일자', '역명']
    flat = df.reset_index()
    alighting = flat[['사용일자', '역명', '하차총승객수', '지하철역']]
    totals = alighting.groupby(group_keys).sum()
    return totals.reset_index().set_index('지하철역')

In [7]:
# "up" means boarding, "down" means alighting; the suffix 1-7 is the month
# of 2020. Frames are produced in the order up_1, down_1, up_2, down_2, ...
(
    bt_up_1, bt_down_1,
    bt_up_2, bt_down_2,
    bt_up_3, bt_down_3,
    bt_up_4, bt_down_4,
    bt_up_5, bt_down_5,
    bt_up_6, bt_down_6,
    bt_up_7, bt_down_7,
) = (
    prepro(monthly)
    for monthly in (
        bus_date_1, bus_date_2, bus_date_3, bus_date_4,
        bus_date_5, bus_date_6, bus_date_7,
    )
    for prepro in (up_prepro, down_prepro)
)

## 지하철역 기준 주위에 있는 버스정류장의 승하차인원을 모두 합쳐 볼 수 있는 함수

In [8]:
def up_preproT(df):
    """Sum boarding counts per (subway station, date) over all stop names.

    The index of ``df`` is first moved into ordinary columns. The columns
    '사용일자', '승차총승객수' and '지하철역' are then grouped by
    ('지하철역', '사용일자') and summed.

    Returns a DataFrame indexed by '지하철역' with the columns
    '사용일자' and '승차총승객수'.
    """
    flat = df.reset_index()
    boarding = flat[['사용일자', '승차총승객수', '지하철역']]
    totals = boarding.groupby(['지하철역', '사용일자']).sum()
    return totals.reset_index().set_index('지하철역')

In [9]:
def down_preproT(df):
    """Sum alighting counts per (subway station, date) over all stop names.

    The index of ``df`` is first moved into ordinary columns. The columns
    '사용일자', '하차총승객수' and '지하철역' are then grouped by
    ('지하철역', '사용일자') and summed.

    Returns a DataFrame indexed by '지하철역' with the columns
    '사용일자' and '하차총승객수'.
    """
    flat = df.reset_index()
    alighting = flat[['사용일자', '하차총승객수', '지하철역']]
    totals = alighting.groupby(['지하철역', '사용일자']).sum()
    return totals.reset_index().set_index('지하철역')

In [10]:
# "up" means boarding, "down" means alighting; the suffix 1-7 is the month
# of 2020. Frames are produced in the order up_1, down_1, up_2, down_2, ...
(
    btT_up_1, btT_down_1,
    btT_up_2, btT_down_2,
    btT_up_3, btT_down_3,
    btT_up_4, btT_down_4,
    btT_up_5, btT_down_5,
    btT_up_6, btT_down_6,
    btT_up_7, btT_down_7,
) = (
    prepro(monthly)
    for monthly in (
        bus_date_1, bus_date_2, bus_date_3, bus_date_4,
        bus_date_5, bus_date_6, bus_date_7,
    )
    for prepro in (up_preproT, down_preproT)
)

## 지하철 승하차인원 데이터 전처리

In [11]:
# Move the current index of both subway tables into an ordinary column.
train_up, train_down = train_up.reset_index(), train_down.reset_index()

In [12]:
# Column names applied to both subway tables: date, line, station, one
# column per time band, and the daily total.
train_col_need = [
    '사용일자', '호선', '지하철역',
    '06시 이전',
    '06 ~ 07', '07 ~ 08', '08 ~ 09', '09 ~ 10', '10 ~ 11', '11 ~ 12',
    '12 ~ 13', '13 ~ 14', '14 ~ 15', '15 ~ 16', '16 ~ 17', '17 ~ 18',
    '18 ~ 19', '19 ~ 20', '20 ~ 21', '21 ~ 22', '22 ~ 23', '23 ~ 24',
    '24시 이후',
    '합 계',
]

In [13]:
# Give both subway tables the same column labels (train_up first, then
# train_down).
train_up.columns = train_down.columns = train_col_need

In [14]:
# Strip the '-' separators from train_down's date strings and keep the
# results as integers, one entry per row of train_down.
date_list = [int(re.sub('-', '', day)) for day in train_down['사용일자']]
len(date_list)

41878

In [15]:
# Convert train_up's own date column to integers by removing the '-'
# separators. The values are derived from train_up itself rather than from
# the list built out of train_down, so the result stays correct even if the
# two tables do not share the same row order or length.
# NOTE(review): assumes train_up's dates use the same '-' separated format
# as train_down's — confirm.
train_up['사용일자'] = (
    train_up['사용일자']
    .astype(str)
    .str.replace('-', '', regex=False)
    .astype(int)
)
train_up['사용일자'][0]

20200101

In [16]:
# Replace train_down's date column with the integer dates collected in
# date_list (built from this same table), then display the table.
train_down['사용일자'] = date_list
train_down

Unnamed: 0,사용일자,호선,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,...,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계
0,20200101,1호선,서울역,235,952,828,1043,1694,1655,2047,...,2354,2261,1922,1696,1620,1181,768,426,77,30085
1,20200101,1호선,시청,81,223,334,570,611,612,822,...,783,849,602,435,293,273,174,149,26,10608
2,20200101,1호선,종각,71,191,295,511,855,1026,1323,...,1450,1352,1132,725,545,344,225,146,25,16401
3,20200101,1호선,종로3가,53,147,190,363,594,952,1515,...,1129,928,637,448,320,232,230,172,61,15360
4,20200101,1호선,종로5가,22,124,170,240,403,595,994,...,1099,1006,605,397,264,190,136,101,15,11997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41873,20200531,8호선,남한산성입구(성남법원.검찰청),30,80,102,151,273,329,206,...,389,366,490,404,407,492,453,218,46,5656
41874,20200531,8호선,단대오거리,42,101,124,111,147,176,169,...,300,372,351,373,321,374,352,234,41,4588
41875,20200531,8호선,신흥,13,51,42,63,80,117,120,...,186,211,223,200,133,200,124,92,7,2551
41876,20200531,8호선,수진,15,48,35,66,95,107,101,...,187,210,191,139,148,189,149,93,8,2327


### 호선이 달라도 같은 지하철역의 인원은 한 역으로 보고 합쳐주기

In [17]:
# A station served by several lines appears once per line; add those rows
# up so that every (date, station) pair has a single row.
# The text column '호선' is dropped explicitly before summing: older pandas
# discarded it silently, but pandas >= 2.0 would concatenate the strings and
# keep the column, which shifts the positional column slicing used later.
group_up = (
    train_up
    .drop(columns=['호선'])
    .groupby(['사용일자', '지하철역'])
    .sum()
    .reset_index()
)
group_up

Unnamed: 0,사용일자,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,...,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계
0,20200101,가락시장,126,177,163,204,283,390,427,513,...,513,534,546,365,356,328,270,85,3,6940
1,20200101,가산디지털단지,120,144,155,237,273,255,374,482,...,1203,1322,1235,967,664,965,300,136,0,11290
2,20200101,강남,1844,1465,877,763,763,908,1167,1514,...,2687,3108,3006,2676,3082,3779,3859,1761,6,39422
3,20200101,강남구청,81,123,82,102,120,123,117,157,...,202,291,265,222,251,312,286,108,1,3562
4,20200101,강동,217,299,309,408,476,579,599,693,...,700,698,681,464,396,392,214,96,0,9223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36786,20200531,홍제,134,245,313,500,656,656,659,840,...,610,552,426,324,278,218,126,65,2,8557
36787,20200531,화곡,290,379,587,992,1140,893,940,1014,...,721,658,530,451,368,326,246,91,0,12211
36788,20200531,화랑대(서울여대입구),81,126,180,330,347,372,316,413,...,292,277,193,148,121,110,69,39,0,4424
36789,20200531,회현(남대문시장),36,64,73,100,126,184,232,392,...,552,598,902,484,839,256,119,37,1,6656


In [18]:
# A station served by several lines appears once per line; add those rows
# up so that every (date, station) pair has a single row.
# The text column '호선' is dropped explicitly before summing: older pandas
# discarded it silently, but pandas >= 2.0 would concatenate the strings and
# keep the column, which shifts the positional column slicing used later.
group_down = (
    train_down
    .drop(columns=['호선'])
    .groupby(['사용일자', '지하철역'])
    .sum()
    .reset_index()
)
group_down

Unnamed: 0,사용일자,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,...,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계
0,20200101,가락시장,40,181,172,325,412,377,468,466,...,553,630,516,393,318,308,288,248,55,7220
1,20200101,가산디지털단지,72,161,191,250,549,511,806,1030,...,915,783,516,454,330,302,251,170,8,10585
2,20200101,강남,199,725,848,1054,1780,1683,1924,2592,...,3255,3360,3274,2362,1768,1520,1234,803,6,37007
3,20200101,강남구청,26,131,123,255,341,243,220,183,...,259,306,221,180,176,189,116,110,16,3818
4,20200101,강동,25,279,220,272,310,348,390,430,...,625,678,693,577,565,606,586,403,74,8719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36786,20200531,홍제,28,171,158,271,366,427,303,384,...,661,656,697,546,522,570,509,289,3,8118
36787,20200531,화곡,28,180,150,212,329,365,353,570,...,904,1009,899,891,967,961,903,501,35,11321
36788,20200531,화랑대(서울여대입구),16,60,75,153,217,183,203,171,...,253,279,270,260,253,294,234,195,13,3795
36789,20200531,회현(남대문시장),7,144,226,363,1047,951,819,748,...,342,280,190,130,89,146,95,48,4,7249


In [19]:
# Persist the per-station subway tables (boarding first, then alighting).
for pkl_name, frame in (
    ('train_group_up.pkl', group_up),
    ('train_group_down.pkl', group_down),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(frame, f)

# 지하철의 각 시간대별 승하차 인원을 합계로 나누어 비율을 구하고, 여기에 버스 승하차 인원을 곱해주는 함수

In [20]:
def extrapolating(bus_df, subway_df):
    bus_df.reset_index()
    bt_group = pd.merge(subway_df, bus_df, how='inner', on=['사용일자', '지하철역'])
    bus_df_list=[]
    for i in range(len(bt_group)):
        bus_df_list.append([bt_group.loc[i][2:-3] * bt_group.loc[i][-1] / bt_group.loc[i][-3]])
    temp_df = pd.DataFrame(bus_df_list[0][0])
    for idx, i in enumerate(bus_df_list):
        temp_df[idx] = i[0].to_frame()
    df = temp_df.T
    df['사용일자'] = bt_group['사용일자']
    df['지하철역'] = bt_group['지하철역']
    df['역명'] = bt_group['역명']
    df = df[['사용일자', '지하철역', '역명', '06시 이전', '06 ~ 07', '07 ~ 08', '08 ~ 09', '09 ~ 10', '10 ~ 11', '11 ~ 12', '12 ~ 13', '13 ~ 14', '14 ~ 15', '15 ~ 16', '16 ~ 17','17 ~ 18', '18 ~ 19', '19 ~ 20', '20 ~ 21', '21 ~ 22', '22 ~ 23', '23 ~ 24', '24시 이후']]
    return df

In [21]:
bt_down_2

Unnamed: 0_level_0,사용일자,역명,하차총승객수
지하철역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
가락시장,20200201,가락시장.가락몰,455
가락시장,20200201,가락시장.가락시장역,3990
가락시장,20200201,가락시장남문,219
가락시장,20200201,가락시장역,379
가락시장,20200201,건너말공원입구,467
...,...,...,...
효창공원앞,20200229,효창공원삼거리.윤봉길의사등묘역,68
효창공원앞,20200229,효창공원앞역,759
효창공원앞,20200229,효창공원역,149
효창공원앞,20200229,효창공원후문,291


In [22]:
group_down

Unnamed: 0,사용일자,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,...,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계
0,20200101,가락시장,40,181,172,325,412,377,468,466,...,553,630,516,393,318,308,288,248,55,7220
1,20200101,가산디지털단지,72,161,191,250,549,511,806,1030,...,915,783,516,454,330,302,251,170,8,10585
2,20200101,강남,199,725,848,1054,1780,1683,1924,2592,...,3255,3360,3274,2362,1768,1520,1234,803,6,37007
3,20200101,강남구청,26,131,123,255,341,243,220,183,...,259,306,221,180,176,189,116,110,16,3818
4,20200101,강동,25,279,220,272,310,348,390,430,...,625,678,693,577,565,606,586,403,74,8719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36786,20200531,홍제,28,171,158,271,366,427,303,384,...,661,656,697,546,522,570,509,289,3,8118
36787,20200531,화곡,28,180,150,212,329,365,353,570,...,904,1009,899,891,967,961,903,501,35,11321
36788,20200531,화랑대(서울여대입구),16,60,75,153,217,183,203,171,...,253,279,270,260,253,294,234,195,13,3795
36789,20200531,회현(남대문시장),7,144,226,363,1047,951,819,748,...,342,280,190,130,89,146,95,48,4,7249


In [None]:
# Trial run of extrapolating() on the bt_down_2 bus table against the
# per-station subway alighting table, then display the result.
bus_down_2 = extrapolating(bt_down_2, group_down)
bus_down_2

In [97]:
# Apply extrapolating() to bus tables 1-5: boarding tables are paired with
# group_up, alighting tables with group_down. bus_down_2 was already
# computed in an earlier cell and is recomputed here.
(
    bus_up_1, bus_down_1,
    bus_up_2, bus_down_2,
    bus_up_3, bus_down_3,
    bus_up_4, bus_down_4,
    bus_up_5, bus_down_5,
) = (
    extrapolating(bus_table, subway_table)
    for bus_table, subway_table in (
        (bt_up_1, group_up), (bt_down_1, group_down),
        (bt_up_2, group_up), (bt_down_2, group_down),
        (bt_up_3, group_up), (bt_down_3, group_down),
        (bt_up_4, group_up), (bt_down_4, group_down),
        (bt_up_5, group_up), (bt_down_5, group_down),
    )
)


In [98]:
# Write every per-stop result frame to its own pickle file, in the order
# up_1, down_1, ..., up_5, down_5.
for pkl_name, frame in (
    ('bus_up_1.pkl', bus_up_1),
    ('bus_down_1.pkl', bus_down_1),
    ('bus_up_2.pkl', bus_up_2),
    ('bus_down_2.pkl', bus_down_2),
    ('bus_up_3.pkl', bus_up_3),
    ('bus_down_3.pkl', bus_down_3),
    ('bus_up_4.pkl', bus_up_4),
    ('bus_down_4.pkl', bus_down_4),
    ('bus_up_5.pkl', bus_up_5),
    ('bus_down_5.pkl', bus_down_5),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(frame, f)


In [99]:
bus_down_5

Unnamed: 0,사용일자,지하철역,역명,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,...,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
0,20200501,가락시장,가락시장.가락몰,4.11457,16.1497,17.1783,33.7909,33.0195,29.6764,32.4023,...,40.1685,40.5286,45.4146,44.4374,36.5683,28.442,24.9446,25.2532,14.6582,6.58332
1,20200501,가락시장,가락시장.가락시장역,29.0758,114.123,121.392,238.785,233.334,209.71,228.972,...,283.853,286.397,320.925,314.019,258.412,200.987,176.272,178.453,103.583,46.5213
2,20200501,가락시장,가락시장남문,2.14736,8.42839,8.96523,17.6352,17.2326,15.4878,16.9105,...,20.9636,21.1515,23.7015,23.1915,19.0847,14.8436,13.0184,13.1794,7.64997,3.43578
3,20200501,가락시장,가락시장역,2.42839,9.53144,10.1385,19.9432,19.4878,17.5148,19.1236,...,23.7072,23.9197,26.8034,26.2266,21.5823,16.7863,14.7221,14.9043,8.65114,3.88543
4,20200501,가락시장,건너말공원입구,2.47163,9.70113,10.319,20.2982,19.8348,17.8266,19.4641,...,24.1293,24.3455,27.2806,26.6936,21.9666,17.0851,14.9842,15.1696,8.80517,3.9546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197640,20200531,효창공원앞,효창공원삼거리.윤봉길의사등묘역,0.418023,1.83287,2.12227,2.66892,3.85867,3.85867,3.34418,...,5.7237,5.98094,7.1707,5.56292,6.27035,5.24137,4.98412,4.50179,2.82969,0.192934
197641,20200531,효창공원앞,효창공원앞역,3.8551,16.9031,19.5721,24.6133,35.5855,35.5855,30.8408,...,52.7852,55.1576,66.1298,51.3025,57.8265,48.337,45.9647,41.5165,26.0961,1.77928
197642,20200531,효창공원앞,효창공원역,0.768956,3.37158,3.90393,4.90949,7.09805,7.09805,6.15165,...,10.5288,11.002,13.1906,10.233,11.5343,9.64152,9.16832,8.28106,5.20524,0.354903
197643,20200531,효창공원앞,효창공원후문,1.78563,7.8293,9.0655,11.4006,16.4827,16.4827,14.285,...,24.4494,25.5482,30.6304,23.7626,26.7844,22.389,21.2902,19.2299,12.0873,0.824137


# 버스 하차 가중치가 적용된 지하철 승하차 인원

In [73]:
bt_up_1

Unnamed: 0_level_0,사용일자,역명,승차총승객수
지하철역,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
가락시장,20200101,가락시장.가락몰,307
가락시장,20200101,가락시장.가락시장역,2920
가락시장,20200101,가락시장남문,127
가락시장,20200101,가락시장역,515
가락시장,20200101,건너말공원입구,92
...,...,...,...
효창공원앞,20200131,효창공원삼거리.윤봉길의사등묘역,16
효창공원앞,20200131,효창공원앞역,2279
효창공원앞,20200131,효창공원역,186
효창공원앞,20200131,효창공원후문,372


In [77]:
btT_up_1.reset_index()

Unnamed: 0,지하철역,사용일자,승차총승객수
0,가락시장,20200101,4766
1,가락시장,20200102,10665
2,가락시장,20200103,12115
3,가락시장,20200104,9038
4,가락시장,20200105,5944
...,...,...,...
7063,효창공원앞,20200127,6500
7064,효창공원앞,20200128,15955
7065,효창공원앞,20200129,16917
7066,효창공원앞,20200130,16949


In [75]:
train_up

Unnamed: 0,사용일자,호선,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,...,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계
0,20200101,1호선,서울역,356,280,313,774,923,1418,2288,...,3194,3342,3002,2857,2311,2523,1830,987,25,36641
1,20200101,1호선,시청,105,118,123,206,318,309,442,...,1019,1234,1383,1271,1061,991,572,217,2,12212
2,20200101,1호선,종각,798,366,198,236,310,379,666,...,1748,1890,1879,1827,1955,1800,1259,406,6,20523
3,20200101,1호선,종로3가,338,189,153,198,311,466,673,...,1843,1898,1561,1265,1103,1009,718,288,30,17292
4,20200101,1호선,종로5가,60,50,67,143,228,368,626,...,1473,1257,976,884,823,617,361,111,2,12741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41873,20200531,8호선,남한산성입구(성남법원.검찰청),149,190,290,508,537,500,462,...,426,382,276,207,133,133,80,48,1,6149
41874,20200531,8호선,단대오거리,133,146,244,321,411,380,319,...,290,238,232,196,123,109,76,35,0,4625
41875,20200531,8호선,신흥,75,70,114,143,174,160,131,...,183,160,125,98,112,93,45,24,1,2392
41876,20200531,8호선,수진,55,76,116,179,202,160,171,...,192,174,137,99,96,89,49,35,1,2612


In [79]:
# Inner-join the per-station subway boarding table with the btT_up_1 bus
# totals on date and station, then display the merged frame.
bt_group = pd.merge(group_up, btT_up_1, how='inner', on=['사용일자', '지하철역'])
bt_group

Unnamed: 0,사용일자,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,...,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후,합 계,승차총승객수
0,20200101,가락시장,126,177,163,204,283,390,427,513,...,534,546,365,356,328,270,85,3,6940,4766
1,20200101,가산디지털단지,120,144,155,237,273,255,374,482,...,1322,1235,967,664,965,300,136,0,11290,37263
2,20200101,강남,1844,1465,877,763,763,908,1167,1514,...,3108,3006,2676,3082,3779,3859,1761,6,39422,23452
3,20200101,강남구청,81,123,82,102,120,123,117,157,...,291,265,222,251,312,286,108,1,3562,5678
4,20200101,강동,217,299,309,408,476,579,599,693,...,698,681,464,396,392,214,96,0,9223,3747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6970,20200131,홍제,315,788,2590,3260,1805,1090,949,950,...,1100,1210,672,430,355,327,159,31,19689,63618
6971,20200131,화곡,852,1605,5173,6855,2927,1599,1370,1386,...,1422,1799,1049,653,594,512,326,48,32910,38015
6972,20200131,화랑대(서울여대입구),193,766,2936,2580,1102,601,449,552,...,628,592,326,187,155,133,69,10,13255,19373
6973,20200131,회현(남대문시장),78,127,221,384,442,633,921,1144,...,4536,5286,2484,2250,986,653,295,43,28075,22482


In [None]:
# Scratch version of the row loop used in extrapolatingT(): for every merged
# row take the columns at positions 2..-3 and add the same values scaled by
# (last column / second-to-last column).
# NOTE(review): the [-1] / [-2] lookups rely on positional fallback of
# label-indexed Series, which newer pandas versions deprecate — confirm the
# pandas version before re-running.
bus_df_list=[]
for i in range(len(bt_group)):
    bus_df_list.append([bt_group.loc[i][2:-2]+(bt_group.loc[i][2:-2] * bt_group.loc[i][-1] / bt_group.loc[i][-2])])
# Seed a frame from the first computed row.
temp_df = pd.DataFrame(bus_df_list[0][0])

# 지하철 승하차 인원을 바탕으로 버스 승하차인원을 분배해주는 함수 짜기

In [80]:
def extrapolatingT(bus_df, subway_df):
    bus_df.reset_index()
    bt_group = pd.merge(subway_df, bus_df, how='inner', on=['사용일자', '지하철역'])
    bus_df_list=[]
    for i in range(len(bt_group)):
        bus_df_list.append([bt_group.loc[i][2:-2]+(bt_group.loc[i][2:-2] * bt_group.loc[i][-1] / bt_group.loc[i][-2])])
    temp_df = pd.DataFrame(bus_df_list[0][0])
    for idx, i in enumerate(bus_df_list):
        temp_df[idx] = i[0].to_frame()
    df = temp_df.T
    df['사용일자'] = bt_group['사용일자']
    df['지하철역'] = bt_group['지하철역']
    df = df[['사용일자', '지하철역', '06시 이전', '06 ~ 07', '07 ~ 08', '08 ~ 09', '09 ~ 10', '10 ~ 11', '11 ~ 12', '12 ~ 13', '13 ~ 14', '14 ~ 15', '15 ~ 16', '16 ~ 17','17 ~ 18', '18 ~ 19', '19 ~ 20', '20 ~ 21', '21 ~ 22', '22 ~ 23', '23 ~ 24', '24시 이후']]
    return df

In [81]:
# Trial run of extrapolatingT() on the btT_up_1 bus totals against the
# per-station subway boarding table.
busT_up_1 = extrapolatingT(btT_up_1, group_up)

In [83]:
busT_up_1

Unnamed: 0,사용일자,지하철역,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,...,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
0,20200101,가락시장,212.53,298.554,274.939,344.096,477.348,657.83,720.239,865.299,...,1050.84,865.299,900.721,920.962,615.661,600.481,553.252,455.421,143.373,5.06023
1,20200101,가산디지털단지,516.064,619.277,666.582,1019.23,1174.05,1096.64,1608.4,2072.86,...,4485.45,5173.54,5685.3,5311.16,4158.61,2855.55,4150.01,1290.16,584.872,0
2,20200101,강남,2940.99,2336.52,1398.72,1216.91,1216.91,1448.17,1861.24,2414.67,...,3786.28,4285.49,4956.94,4794.26,4267.94,4915.47,6027.11,6154.7,2808.61,9.56938
3,20200101,강남구청,210.118,319.068,212.712,264.593,311.286,319.068,303.504,407.266,...,671.859,523.998,754.868,687.423,575.879,651.106,809.343,741.898,280.157,2.59405
4,20200101,강동,305.16,420.474,434.536,573.757,669.383,814.229,842.354,974.543,...,968.918,984.387,981.574,957.668,652.508,556.882,551.257,300.941,135.002,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6970,20200131,홍제,1332.81,3334.14,10958.7,13793.5,7637.22,4611.95,4015.36,4019.59,...,3892.65,3964.58,4654.26,5119.68,2843.33,1819.39,1502.06,1383.58,672.752,131.165
6971,20200131,화곡,1836.16,3458.97,11148.4,14773.3,6308.04,3446.04,2952.51,2987,...,2362.01,2480.54,3064.58,3877.06,2260.72,1407.29,1280.14,1103.42,702.569,103.446
6972,20200131,화랑대(서울여대입구),475.081,1885.56,7227.15,6350.83,2712.64,1479.4,1105.24,1358.78,...,1294.78,1432.63,1545.86,1457.24,802.469,460.312,381.542,327.388,169.848,24.6156
6973,20200131,회현(남대문시장),140.461,228.7,397.973,691.501,795.946,1139.9,1658.52,2060.1,...,3408.88,4673.03,8168.35,9518.94,4473.15,4051.76,1775.57,1175.91,531.231,77.4337


In [84]:
# Apply extrapolatingT() to bus tables 1-5: boarding totals are paired with
# group_up, alighting totals with group_down.
(
    busT_up_1, busT_down_1,
    busT_up_2, busT_down_2,
    busT_up_3, busT_down_3,
    busT_up_4, busT_down_4,
    busT_up_5, busT_down_5,
) = (
    extrapolatingT(bus_table, subway_table)
    for bus_table, subway_table in (
        (btT_up_1, group_up), (btT_down_1, group_down),
        (btT_up_2, group_up), (btT_down_2, group_down),
        (btT_up_3, group_up), (btT_down_3, group_down),
        (btT_up_4, group_up), (btT_down_4, group_down),
        (btT_up_5, group_up), (btT_down_5, group_down),
    )
)

In [85]:
# Write every per-station result frame to its own pickle file, in the order
# up_1, down_1, ..., up_5, down_5.
for pkl_name, frame in (
    ('busT_up_1.pkl', busT_up_1),
    ('busT_down_1.pkl', busT_down_1),
    ('busT_up_2.pkl', busT_up_2),
    ('busT_down_2.pkl', busT_down_2),
    ('busT_up_3.pkl', busT_up_3),
    ('busT_down_3.pkl', busT_down_3),
    ('busT_up_4.pkl', busT_up_4),
    ('busT_down_4.pkl', busT_down_4),
    ('busT_up_5.pkl', busT_up_5),
    ('busT_down_5.pkl', busT_down_5),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(frame, f)

In [None]:
def pklopen(route):
    """Return the object stored in the pickle file at ``route``.

    ``route`` is formatted into a string and opened in binary mode.
    Unpickling can execute arbitrary code, so only open trusted files.
    """
    with open(f'{route}', 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Reload the per-station result frames from their pickle files, in the
# order up_1, down_1, ..., up_5, down_5.
(
    busT_up_1, busT_down_1,
    busT_up_2, busT_down_2,
    busT_up_3, busT_down_3,
    busT_up_4, busT_down_4,
    busT_up_5, busT_down_5,
) = (
    pklopen(pkl_path)
    for pkl_path in (
        './busT_up_1.pkl', './busT_down_1.pkl',
        './busT_up_2.pkl', './busT_down_2.pkl',
        './busT_up_3.pkl', './busT_down_3.pkl',
        './busT_up_4.pkl', './busT_down_4.pkl',
        './busT_up_5.pkl', './busT_down_5.pkl',
    )
)

### 열고 찾기 편하게 함수 짜주자

In [100]:
def pklopen(route):
    """Unpickle and return the contents of the file named by ``route``.

    Unpickling can execute arbitrary code, so only open trusted files.
    """
    pkl_path = '{}'.format(route)
    with open(pkl_path, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded

In [None]:
# Reload the per-stop result frames. pklopen() needs the file path, so each
# call gets the file name under which the corresponding frame was dumped
# earlier in this notebook (calling it with no argument raises TypeError).
bus_up_1 = pklopen('bus_up_1.pkl')
bus_down_1 = pklopen('bus_down_1.pkl')
bus_up_2 = pklopen('bus_up_2.pkl')
bus_down_2 = pklopen('bus_down_2.pkl')
bus_up_3 = pklopen('bus_up_3.pkl')
bus_down_3 = pklopen('bus_down_3.pkl')
bus_up_4 = pklopen('bus_up_4.pkl')
bus_down_4 = pklopen('bus_down_4.pkl')
bus_up_5 = pklopen('bus_up_5.pkl')
bus_down_5 = pklopen('bus_down_5.pkl')

### 불러온것 토대로 인덱스, 서브인덱스로 해서 날짜/역/시간 찾을 수 있게 하는 함수짜기
### + 똑같이 하되, 지하철역 기준으로 모두 합친 groupby_sum도 짜기

In [None]:
def eachbus(df, date, station, *time):
    """Look up summed values for one date and one subway station.

    ``df`` is grouped by ('사용일자', '지하철역') and summed; the row for
    ``date`` / ``station`` is then selected.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns '사용일자' and '지하철역'.
    date, station :
        Labels of the date and the station to select.
    *time :
        Optional column labels (time bands) to pick from the selected row.

    Returns
    -------
    * no ``time`` given: a one-row DataFrame with every summed column;
    * exactly one ``time``: the single value for that column;
    * several ``time`` labels: a Series with those columns (previously this
      case failed because the labels were passed to ``.loc`` as one tuple).

    Raises
    ------
    KeyError
        If the date, station or a time label is not present.
    """
    # Shared by all modes: one summed row per (date, station).
    df_sum = df.groupby(['사용일자', '지하철역']).sum()
    # Two-step lookup keeps the station label as the name of the row.
    df_station = df_sum.loc[date].loc[station]

    if not time:
        return df_station.to_frame().T
    if len(time) == 1:
        return df_station.loc[time[0]]
    return df_station.loc[list(time)]

In [101]:
def gbs(df):
    """Sum all remaining columns per (date, subway station).

    The stop-name column '역명' is left out of the aggregation. It is
    removed on a copy, so the caller's DataFrame is no longer modified and
    the function can be called repeatedly on the same frame (the previous
    ``del df['역명']`` raised KeyError on a second call).

    Returns a DataFrame indexed by ('사용일자', '지하철역').
    """
    without_stop_name = df.drop(columns=['역명'], errors='ignore')
    return without_stop_name.groupby(['사용일자', '지하철역']).sum()

In [105]:
# Collapse each per-stop frame to one row per (date, station) with gbs(),
# in the order up_1, down_1, ..., up_5, down_5. This rebinds the busT_*
# names.
(
    busT_up_1, busT_down_1,
    busT_up_2, busT_down_2,
    busT_up_3, busT_down_3,
    busT_up_4, busT_down_4,
    busT_up_5, busT_down_5,
) = (
    gbs(per_stop)
    for per_stop in (
        bus_up_1, bus_down_1,
        bus_up_2, bus_down_2,
        bus_up_3, bus_down_3,
        bus_up_4, bus_down_4,
        bus_up_5, bus_down_5,
    )
)

In [111]:
# Stack the five boarding frames row-wise into one table and display it.
busT_up_total = pd.concat([busT_up_1,busT_up_2, busT_up_3, busT_up_4, busT_up_5])
busT_up_total

Unnamed: 0_level_0,Unnamed: 1_level_0,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,13 ~ 14,14 ~ 15,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
사용일자,지하철역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200101,가락시장,86.529683,121.553602,111.939193,140.095677,194.348415,267.829971,293.239481,352.299424,332.383862,377.708934,427.841210,352.299424,366.721037,374.961960,250.661383,244.480692,225.251873,185.420749,58.373199,2.060231
20200101,가산디지털단지,396.063773,475.276528,511.582374,782.225952,901.045084,841.635518,1234.398760,1590.856156,2000.122055,2670.129938,3442.454296,3970.539327,4363.302569,4076.156333,3191.613906,2191.552879,3185.012843,990.159433,448.872276,0.000000
20200101,강남,1096.988687,871.523007,521.724012,453.905839,453.905839,540.165796,694.243925,900.672924,1086.280554,1164.211963,1412.283699,1598.486226,1848.937548,1788.258130,1591.942367,1833.470245,2248.112932,2295.704632,1047.612298,3.569378
20200101,강남구청,129.117911,196.067939,130.711960,162.592925,191.285794,196.067939,186.503650,250.265581,349.096575,384.165637,412.858506,321.997754,463.868052,422.422796,353.878720,400.106120,497.343066,455.897810,172.157215,1.594048
20200101,강동,88.159926,121.473815,125.536485,165.756912,193.383064,235.228559,243.353898,281.542990,280.730456,252.698038,279.917923,284.386859,283.574325,276.667787,188.507861,160.881709,159.256641,86.941125,39.001626,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200531,홍제,477.808578,873.605235,1116.075260,1782.867828,2339.122590,2339.122590,2349.819797,2995.217950,2517.409372,2321.293911,2125.178450,2175.098750,1968.286082,1519.003389,1155.298352,991.274512,777.330373,449.282693,231.772818,7.131471
20200531,화곡,410.906560,537.012366,831.731554,1405.583818,1615.287855,1265.308820,1331.904021,1436.756040,1391.414626,1197.296700,1074.024732,1021.598722,932.332815,750.967161,639.030546,521.426255,461.915650,348.562116,128.939645,0.000000
20200531,화랑대(서울여대입구),152.314421,236.933544,338.476492,620.540235,652.507459,699.518083,594.214286,776.615506,718.322333,631.822785,549.084087,549.084087,520.877712,362.922016,278.302893,227.531420,206.846745,129.749322,73.336573,0.000000
20200531,회현(남대문시장),38.785457,68.951923,78.648287,107.737380,135.749099,198.236779,249.950721,422.330529,522.526292,586.091346,680.900240,594.710337,644.269531,971.791166,521.448918,903.916617,275.807692,128.207482,39.862831,1.077374


In [112]:
# Stack the five alighting frames row-wise into one table and display it.
busT_down_total = pd.concat([busT_down_1,busT_down_2, busT_down_3, busT_down_4, busT_down_5])
busT_down_total

Unnamed: 0_level_0,Unnamed: 1_level_0,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,13 ~ 14,14 ~ 15,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
사용일자,지하철역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200101,가락시장,26.858726,121.535734,115.492521,218.227147,276.644875,253.143490,314.247091,312.904155,325.662050,306.860942,354.535180,371.321884,423.024931,346.477562,263.886981,213.526870,206.812188,193.382825,166.524100,36.930748
20200101,가산디지털단지,240.807180,538.471611,638.807936,836.136042,1836.154747,1709.062069,2695.702598,3444.880491,3745.889466,3879.671233,3364.611431,3060.257912,2618.778082,1725.784790,1518.423051,1103.699575,1010.052338,839.480586,568.572508,26.756353
20200101,강남,111.246845,405.296295,474.056908,589.216959,995.072284,940.846434,1075.572513,1449.004134,1606.650417,1500.434837,1711.747940,1819.640609,1878.338693,1830.262167,1320.427379,988.363931,849.724647,689.842246,448.900586,3.354176
20200101,강남구청,36.793609,185.383185,174.062074,360.860398,482.562336,343.878732,311.330540,258.970403,336.803038,329.727344,356.614982,366.520953,433.032478,312.745678,254.724987,249.064432,267.461236,164.156103,155.665270,22.642221
20200101,강동,12.303590,137.308063,108.271591,133.863058,152.564514,171.265971,191.936002,211.621746,222.448905,290.856864,292.825439,307.589747,333.673357,341.055511,283.966854,278.061131,298.239018,288.396146,198.333869,36.418626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200531,홍제,93.060852,568.335920,525.129096,900.696107,1216.438285,1419.178000,1007.051367,1276.263119,1651.830131,1724.949372,1798.068613,2196.900838,2180.282828,2316.550505,1814.686622,1734.920177,1894.453067,1691.713353,960.520941,9.970806
20200531,화곡,42.683862,274.396255,228.663546,323.177811,501.535377,556.414628,538.121544,868.921473,963.435739,1030.510379,1152.464270,1378.078968,1538.143450,1370.456850,1358.261461,1474.117657,1464.971116,1376.554545,763.736242,53.354827
20200531,화랑대(서울여대입구),36.165481,135.620553,169.525692,345.832411,490.494335,413.642688,458.849539,386.518577,490.494335,488.233992,526.659816,571.866667,630.635573,610.292490,587.689065,571.866667,664.540711,528.920158,440.766798,29.384453
20200531,회현(남대문시장),6.522003,134.166920,210.567527,338.212443,975.505311,886.060698,763.074355,696.922610,595.365706,495.672231,418.339909,318.646434,260.880121,177.025797,121.122914,82.922610,136.030349,88.512898,44.722307,3.726859


In [113]:
# Bare names: evaluated but not assigned to anything.
busT_up_total
busT_down_total

# Persist both combined tables as pickle files.
with open('busT_up_total.pkl', 'wb') as f:
    pickle.dump(busT_up_total, f)
with open('busT_down_total.pkl', 'wb') as f:
    pickle.dump(busT_down_total, f)

In [115]:
import pandas as pd
import pickle

def pklopen(route):
    """Return the object pickled in ``./<route>``.

    ``route`` is appended to './' and the resulting path is opened in
    binary mode. Unpickling can execute arbitrary code, so only open
    trusted files.
    """
    with open(f'./{route}', 'rb') as handle:
        return pickle.load(handle)

In [119]:
# Reload the combined boarding and alighting tables from their pickle files.
bus_up = pklopen('busT_up_total.pkl')
bus_down = pklopen('busT_down_total.pkl')

In [120]:
bus_up

Unnamed: 0_level_0,Unnamed: 1_level_0,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,13 ~ 14,14 ~ 15,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
사용일자,지하철역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200101,가락시장,86.529683,121.553602,111.939193,140.095677,194.348415,267.829971,293.239481,352.299424,332.383862,377.708934,427.841210,352.299424,366.721037,374.961960,250.661383,244.480692,225.251873,185.420749,58.373199,2.060231
20200101,가산디지털단지,396.063773,475.276528,511.582374,782.225952,901.045084,841.635518,1234.398760,1590.856156,2000.122055,2670.129938,3442.454296,3970.539327,4363.302569,4076.156333,3191.613906,2191.552879,3185.012843,990.159433,448.872276,0.000000
20200101,강남,1096.988687,871.523007,521.724012,453.905839,453.905839,540.165796,694.243925,900.672924,1086.280554,1164.211963,1412.283699,1598.486226,1848.937548,1788.258130,1591.942367,1833.470245,2248.112932,2295.704632,1047.612298,3.569378
20200101,강남구청,129.117911,196.067939,130.711960,162.592925,191.285794,196.067939,186.503650,250.265581,349.096575,384.165637,412.858506,321.997754,463.868052,422.422796,353.878720,400.106120,497.343066,455.897810,172.157215,1.594048
20200101,강동,88.159926,121.473815,125.536485,165.756912,193.383064,235.228559,243.353898,281.542990,280.730456,252.698038,279.917923,284.386859,283.574325,276.667787,188.507861,160.881709,159.256641,86.941125,39.001626,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200531,홍제,477.808578,873.605235,1116.075260,1782.867828,2339.122590,2339.122590,2349.819797,2995.217950,2517.409372,2321.293911,2125.178450,2175.098750,1968.286082,1519.003389,1155.298352,991.274512,777.330373,449.282693,231.772818,7.131471
20200531,화곡,410.906560,537.012366,831.731554,1405.583818,1615.287855,1265.308820,1331.904021,1436.756040,1391.414626,1197.296700,1074.024732,1021.598722,932.332815,750.967161,639.030546,521.426255,461.915650,348.562116,128.939645,0.000000
20200531,화랑대(서울여대입구),152.314421,236.933544,338.476492,620.540235,652.507459,699.518083,594.214286,776.615506,718.322333,631.822785,549.084087,549.084087,520.877712,362.922016,278.302893,227.531420,206.846745,129.749322,73.336573,0.000000
20200531,회현(남대문시장),38.785457,68.951923,78.648287,107.737380,135.749099,198.236779,249.950721,422.330529,522.526292,586.091346,680.900240,594.710337,644.269531,971.791166,521.448918,903.916617,275.807692,128.207482,39.862831,1.077374


In [122]:
bus_down

Unnamed: 0_level_0,Unnamed: 1_level_0,06시 이전,06 ~ 07,07 ~ 08,08 ~ 09,09 ~ 10,10 ~ 11,11 ~ 12,12 ~ 13,13 ~ 14,14 ~ 15,15 ~ 16,16 ~ 17,17 ~ 18,18 ~ 19,19 ~ 20,20 ~ 21,21 ~ 22,22 ~ 23,23 ~ 24,24시 이후
사용일자,지하철역,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20200101,가락시장,26.858726,121.535734,115.492521,218.227147,276.644875,253.143490,314.247091,312.904155,325.662050,306.860942,354.535180,371.321884,423.024931,346.477562,263.886981,213.526870,206.812188,193.382825,166.524100,36.930748
20200101,가산디지털단지,240.807180,538.471611,638.807936,836.136042,1836.154747,1709.062069,2695.702598,3444.880491,3745.889466,3879.671233,3364.611431,3060.257912,2618.778082,1725.784790,1518.423051,1103.699575,1010.052338,839.480586,568.572508,26.756353
20200101,강남,111.246845,405.296295,474.056908,589.216959,995.072284,940.846434,1075.572513,1449.004134,1606.650417,1500.434837,1711.747940,1819.640609,1878.338693,1830.262167,1320.427379,988.363931,849.724647,689.842246,448.900586,3.354176
20200101,강남구청,36.793609,185.383185,174.062074,360.860398,482.562336,343.878732,311.330540,258.970403,336.803038,329.727344,356.614982,366.520953,433.032478,312.745678,254.724987,249.064432,267.461236,164.156103,155.665270,22.642221
20200101,강동,12.303590,137.308063,108.271591,133.863058,152.564514,171.265971,191.936002,211.621746,222.448905,290.856864,292.825439,307.589747,333.673357,341.055511,283.966854,278.061131,298.239018,288.396146,198.333869,36.418626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200531,홍제,93.060852,568.335920,525.129096,900.696107,1216.438285,1419.178000,1007.051367,1276.263119,1651.830131,1724.949372,1798.068613,2196.900838,2180.282828,2316.550505,1814.686622,1734.920177,1894.453067,1691.713353,960.520941,9.970806
20200531,화곡,42.683862,274.396255,228.663546,323.177811,501.535377,556.414628,538.121544,868.921473,963.435739,1030.510379,1152.464270,1378.078968,1538.143450,1370.456850,1358.261461,1474.117657,1464.971116,1376.554545,763.736242,53.354827
20200531,화랑대(서울여대입구),36.165481,135.620553,169.525692,345.832411,490.494335,413.642688,458.849539,386.518577,490.494335,488.233992,526.659816,571.866667,630.635573,610.292490,587.689065,571.866667,664.540711,528.920158,440.766798,29.384453
20200531,회현(남대문시장),6.522003,134.166920,210.567527,338.212443,975.505311,886.060698,763.074355,696.922610,595.365706,495.672231,418.339909,318.646434,260.880121,177.025797,121.122914,82.922610,136.030349,88.512898,44.722307,3.726859
