# 데이터 불러오기

In [1]:
!pip install dbfread
!pip install haversine
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dbfread import DBF
from haversine import haversine
from tqdm.notebook import tqdm
%matplotlib inline
import platform
platform.system()
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
%config InlineBackend.figure_format = 'retina'



# 날짜별 버스 사용하기 위해 ARS라벨링
버스정류소 위치:https://data.seoul.go.kr/dataList/OA-15067/S/1/datasetView.do

일자별버스:http://115.84.165.39/dataList/OA-12912/S/1/datasetView.do

### 불러오기

In [2]:
bus_loc = pd.read_excel(r'./서울시버스정류소좌표데이터(2020.03.06).xlsx')
train_loc = pd.read_csv(r'./서울 지하철역 좌표.csv')
bus_date = pd.read_csv('./BUS_STATION_BOARDING_MONTH_202002.csv',encoding = 'cp949')

## 라벨링을 위한 기본 전처리

In [3]:
train_loc

Unnamed: 0,역명,상세주소,경도,위도,상태
0,서울,서울특별시 중구 봉래동2가 122 서울역(1호선),126.969563,37.558156,정좌표
1,시청,서울특별시 중구 정동 5-5 시청역(1호선),126.976919,37.565597,정좌표
2,종각,서울특별시 종로구 종로1가 54 종각지하철역사,126.981390,37.570222,정좌표
3,종로5가,서울특별시 종로구 종로5가 82-1 1호선 종로5가역,127.001925,37.571020,정좌표
4,동대문,서울특별시 종로구 창신동 492-1 1호선 동대문역(1호선),127.011233,37.571759,정좌표
...,...,...,...,...,...
273,부천종합운동장,경기도 부천시 원미구 춘의동 15-1 부천종합운동장역,126.797466,37.505639,실패
274,춘의,경기도 부천시 원미구 춘의동 145 춘의역,126.787091,37.503859,실패
275,신중동,경기도 부천시 원미구 중동 1140-6 신중동역,126.775952,37.503189,실패
276,부천시청,경기도 부천시 원미구 중동 1243 부천시청역,126.764025,37.504748,실패


In [4]:
#지하철 위치 전처리(역명, 경도, 위도)
train_loc = train_loc[['역명', '경도', '위도']]
train_loc=train_loc.drop_duplicates(['역명'], keep = 'first')
train_loc.set_index('역명', drop=True, inplace=True)
train_loc

Unnamed: 0_level_0,경도,위도
역명,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,126.969563,37.558156
시청,126.976919,37.565597
종각,126.981390,37.570222
종로5가,127.001925,37.571020
동대문,127.011233,37.571759
...,...,...
부천종합운동장,126.797466,37.505639
춘의,126.787091,37.503859
신중동,126.775952,37.503189
부천시청,126.764025,37.504748


In [5]:
bus_col = bus_loc.columns
bus_col

Index(['표준ID', 'ARS-ID', '정류장명', 'X좌표', 'Y좌표', '비고'], dtype='object')

In [6]:
bus_loc_need = bus_loc[['ARS-ID', '정류장명', 'X좌표', 'Y좌표']]
bus_loc_need.columns =  ['ARS', '역명', '경도', '위도']
bus_loc_need=bus_loc_need.drop_duplicates(['ARS'], keep = 'first')
bus_loc_need.set_index('ARS', drop=True, inplace=True)
bus_loc_need

Unnamed: 0_level_0,역명,경도,위도
ARS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,종로2가사거리,126.987750,37.569765
1002,창경궁.서울대학교병원,126.996566,37.579183
1003,명륜3가.성대입구,126.998340,37.582671
1004,종로2가.삼일교,126.987613,37.568579
1005,혜화동로터리.여운형활동터,127.001744,37.586243
...,...,...,...
25995,우성아파트,127.139339,37.550386
25996,우성아파트,127.140046,37.550643
25997,조일약국,127.123596,37.533630
25998,성내시장,127.125497,37.536155


## 버스 일별 데이터 전처리

In [7]:
#버스 일별 데이터 확인
bus_date

Unnamed: 0,사용일자,노선번호,노선명,버스정류장ARS번호,역명,승차총승객수,하차총승객수,등록일자
0,20200201,100,100번(하계동~용산구청),11428,한성여객종점,14,4,20200204
1,20200201,100,100번(하계동~용산구청),11387,노원평생학습관,7,0,20200204
2,20200201,100,100번(하계동~용산구청),11373,중계역,27,2,20200204
3,20200201,100,100번(하계동~용산구청),11381,중계목화아파트4단지,120,13,20200204
4,20200201,100,100번(하계동~용산구청),11311,인덕대학,172,123,20200204
...,...,...,...,...,...,...,...,...
1118042,20200229,1111,1111번(번동~성북동),08128,KT월곡지사,43,38,20200303
1118043,20200229,1111,1111번(번동~성북동),08130,종암SK아파트,41,73,20200303
1118044,20200229,1111,1111번(번동~성북동),08134,동덕여대앞,75,58,20200303
1118045,20200229,1111,1111번(번동~성북동),08153,종암동주민센터,134,188,20200303


In [8]:
#ARS 갯수 확인
len(bus_date['버스정류장ARS번호'].unique())

12754

In [9]:
#역명 갯수 확인
len(bus_date['역명'].unique())

8065

In [10]:
bus_date[bus_date['버스정류장ARS번호']=='01001']

Unnamed: 0,사용일자,노선번호,노선명,버스정류장ARS번호,역명,승차총승객수,하차총승객수,등록일자
8862,20200201,N37,N37번(송파공영차고지~진관공영차고지),01001,종로2가사거리,9,14,20200204
9169,20200201,N37,N37번(진관공영차고지~송파공영차고지),01001,종로2가사거리,25,23,20200204
11257,20200201,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,114,189,20200204
35474,20200201,470,470번(상암차고지~안골마을),01001,종로2가사거리,178,220,20200204
52300,20200202,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,78,98,20200205
...,...,...,...,...,...,...,...,...
1072093,20200228,N37,N37번(진관공영차고지~송파공영차고지),01001,종로2가사거리,15,20,20200302
1092430,20200229,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,69,100,20200303
1104430,20200229,470,470번(상암차고지~안골마을),01001,종로2가사거리,83,128,20200303
1110375,20200229,N37,N37번(송파공영차고지~진관공영차고지),01001,종로2가사거리,4,10,20200303


In [11]:
bus_date[bus_date['역명']=='종로2가사거리']

Unnamed: 0,사용일자,노선번호,노선명,버스정류장ARS번호,역명,승차총승객수,하차총승객수,등록일자
8862,20200201,N37,N37번(송파공영차고지~진관공영차고지),01001,종로2가사거리,9,14,20200204
9169,20200201,N37,N37번(진관공영차고지~송파공영차고지),01001,종로2가사거리,25,23,20200204
11257,20200201,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,114,189,20200204
35474,20200201,470,470번(상암차고지~안골마을),01001,종로2가사거리,178,220,20200204
52300,20200202,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,78,98,20200205
...,...,...,...,...,...,...,...,...
1072093,20200228,N37,N37번(진관공영차고지~송파공영차고지),01001,종로2가사거리,15,20,20200302
1092430,20200229,741,741번(진관차고지~헌인릉입구),01001,종로2가사거리,69,100,20200303
1104430,20200229,470,470번(상암차고지~안골마을),01001,종로2가사거리,83,128,20200303
1110375,20200229,N37,N37번(송파공영차고지~진관공영차고지),01001,종로2가사거리,4,10,20200303


In [12]:
bus_col = bus_date.columns

In [13]:
bus_col = list(bus_col)
bus_col

['사용일자', '노선번호', '노선명', '버스정류장ARS번호', '역명', '승차총승객수', '하차총승객수', '등록일자']

In [14]:
#ARS기준으로 묶어줘야 할듯
bus_date_need = bus_date[['사용일자', '버스정류장ARS번호', '역명', '승차총승객수', '하차총승객수']]
bus_date_need.columns =  ['사용일자', 'ARS', '역명', '승차총승객수', '하차총승객수']
bus_date_need

Unnamed: 0,사용일자,ARS,역명,승차총승객수,하차총승객수
0,20200201,11428,한성여객종점,14,4
1,20200201,11387,노원평생학습관,7,0
2,20200201,11373,중계역,27,2
3,20200201,11381,중계목화아파트4단지,120,13
4,20200201,11311,인덕대학,172,123
...,...,...,...,...,...
1118042,20200229,08128,KT월곡지사,43,38
1118043,20200229,08130,종암SK아파트,41,73
1118044,20200229,08134,동덕여대앞,75,58
1118045,20200229,08153,종암동주민센터,134,188


In [15]:
bus_date_need['ARS']

0          11428
1          11387
2          11373
3          11381
4          11311
           ...  
1118042    08128
1118043    08130
1118044    08134
1118045    08153
1118046    08151
Name: ARS, Length: 1118047, dtype: object

## 같은날짜, 같은 역의 승하차 총 승객수로 묶어주자. 그러려면 ARS만 가지고 묶고 더하면 사용일자가 뭉개지니까 사용일자, ARS, 역명 기준으로 groupby랑 sum해주기

In [16]:
bus_date_updown = bus_date_need.groupby(['사용일자', 'ARS']).sum()
bus_date_updown

Unnamed: 0_level_0,Unnamed: 1_level_0,승차총승객수,하차총승객수
사용일자,ARS,Unnamed: 2_level_1,Unnamed: 3_level_1
20200201,01001,326,446
20200201,01002,1063,859
20200201,01003,3076,1839
20200201,01004,491,440
20200201,01005,1135,2253
...,...,...,...
20200229,68271,7,10
20200229,68290,34,5
20200229,68351,60,12
20200229,68352,11,50


## 우선, ARS와 지하철역 거리기준으로 매칭

In [17]:
bus_date_need

Unnamed: 0,사용일자,ARS,역명,승차총승객수,하차총승객수
0,20200201,11428,한성여객종점,14,4
1,20200201,11387,노원평생학습관,7,0
2,20200201,11373,중계역,27,2
3,20200201,11381,중계목화아파트4단지,120,13
4,20200201,11311,인덕대학,172,123
...,...,...,...,...,...
1118042,20200229,08128,KT월곡지사,43,38
1118043,20200229,08130,종암SK아파트,41,73
1118044,20200229,08134,동덕여대앞,75,58
1118045,20200229,08153,종암동주민센터,134,188


In [18]:
bus_loc_need

Unnamed: 0_level_0,역명,경도,위도
ARS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,종로2가사거리,126.987750,37.569765
1002,창경궁.서울대학교병원,126.996566,37.579183
1003,명륜3가.성대입구,126.998340,37.582671
1004,종로2가.삼일교,126.987613,37.568579
1005,혜화동로터리.여운형활동터,127.001744,37.586243
...,...,...,...
25995,우성아파트,127.139339,37.550386
25996,우성아파트,127.140046,37.550643
25997,조일약국,127.123596,37.533630
25998,성내시장,127.125497,37.536155


In [19]:
#혹시 에러뜰까봐 일단 이것도 만들어 둠
bus_loc_need_ = bus_loc_need[['경도', '위도']]
bus_loc_need_

Unnamed: 0_level_0,경도,위도
ARS,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,126.987750,37.569765
1002,126.996566,37.579183
1003,126.998340,37.582671
1004,126.987613,37.568579
1005,127.001744,37.586243
...,...,...
25995,127.139339,37.550386
25996,127.140046,37.550643
25997,127.123596,37.533630
25998,127.125497,37.536155


In [20]:
train_loc

Unnamed: 0_level_0,경도,위도
역명,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,126.969563,37.558156
시청,126.976919,37.565597
종각,126.981390,37.570222
종로5가,127.001925,37.571020
동대문,127.011233,37.571759
...,...,...
부천종합운동장,126.797466,37.505639
춘의,126.787091,37.503859
신중동,126.775952,37.503189
부천시청,126.764025,37.504748


In [90]:
close_station = {'지하철역' : [],
                'ARS' : []}

for b_station in tqdm(bus_loc_need.index):
    start = (float(bus_loc_need.loc[b_station]['위도']), float(bus_loc_need.loc[b_station]['경도']))
    distance_dic = {}
    for t_station in train_loc.index:
        goal = (train_loc.loc[t_station]['위도'].mean(), train_loc.loc[t_station]['경도'].mean())
        distance_dic[t_station] = float(haversine(start, goal))
    close_ = sorted((value, key) for key, value in distance_dic.items())[0][1]
    close_station['ARS'].append(b_station)
    close_station['지하철역'].append(close_)

HBox(children=(FloatProgress(value=0.0, max=11279.0), HTML(value='')))




In [96]:
station_mapping = pd.DataFrame(close_station)
station_mapping = station_mapping[['ARS', '지하철역']]
station_mapping['ARS']

0         1001
1         1002
2         1003
3         1004
4         1005
         ...  
11274    25995
11275    25996
11276    25997
11277    25998
11278    25999
Name: ARS, Length: 11279, dtype: int64

In [92]:
station_mapping.to_csv("busARS_train_station_mapping.csv")

In [147]:
station_mapping = pd.read_csv('busARS_Train_station_mapping.csv', index_col=0)
station_mapping

Unnamed: 0,ARS,지하철역
0,1001,종로3가(탑골공원)
1,1002,혜화
2,1003,혜화
3,1004,을지로3가
4,1005,한성대입구(삼선교)
...,...,...
11274,25995,명일
11275,25996,명일
11276,25997,강동구청
11277,25998,천호(풍납토성)


## 해결해야 하는것
1. ARS가 4자리로 되어있는걸 5자리로 만들어줘야 한다.-앞에 0붙여서 해결
2. 날짜/ARS로 하니까 인덱스가 겹쳐버리는데, 날짜로만 인덱스 하고 ARS는 그냥 나중에 붙이는 식으로 해야할 듯.-더하고 인덱스 초기화 해서 해결 

#### 1번 왜 문제되었는지 알겠네... 서울시에서 좌표 제공 할 때 ARS를 int로넣어서 앞의 0이 사라졌나보다. 그러면 그냥 append이런거 안하고 그걸int로 바꿔도 되었겠군 해봐야지

In [148]:
#1번 해결
ARS_list = []
for ars in list(station_mapping['ARS']):
    if len(str(ars)) == 5:
        ARS_list.append(ars)
    else:
        str_ars = '0' + str(ars)
        ARS_list.append(str_ars)
station_mapping['ARS'] = ARS_list
station_mapping

Unnamed: 0,ARS,지하철역
0,01001,종로3가(탑골공원)
1,01002,혜화
2,01003,혜화
3,01004,을지로3가
4,01005,한성대입구(삼선교)
...,...,...
11274,25995,명일
11275,25996,명일
11276,25997,강동구청
11277,25998,천호(풍납토성)


In [149]:
group = bus_date_need.groupby(['사용일자', 'ARS']).sum()
group.reset_index(inplace=True)
group=group.dropna()
group

Unnamed: 0,사용일자,ARS,승차총승객수,하차총승객수
0,20200201,01001,326,446
1,20200201,01002,1063,859
2,20200201,01003,3076,1839
3,20200201,01004,491,440
4,20200201,01005,1135,2253
...,...,...,...,...
368008,20200229,68271,7,10
368009,20200229,68290,34,5
368010,20200229,68351,60,12
368011,20200229,68352,11,50


In [150]:
busARS_train_num = pd.merge(group, station_mapping, how='inner', on='ARS')
busARS_train_num.set_index('ARS', drop=True, inplace=True)
busARS_train_num = busARS_train_num.dropna()
busARS_train_num

Unnamed: 0_level_0,사용일자,승차총승객수,하차총승객수,지하철역
ARS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
01001,20200201,326,446,종로3가(탑골공원)
01001,20200202,204,226,종로3가(탑골공원)
01001,20200203,569,526,종로3가(탑골공원)
01001,20200204,539,508,종로3가(탑골공원)
01001,20200205,595,518,종로3가(탑골공원)
...,...,...,...,...
08563,20200218,2,0,성신여대입구(돈암)
08563,20200219,1,0,성신여대입구(돈암)
08563,20200223,1,0,성신여대입구(돈암)
08563,20200224,3,0,성신여대입구(돈암)


In [151]:
busARS_train_num.head(60)

Unnamed: 0_level_0,사용일자,승차총승객수,하차총승객수,지하철역
ARS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,20200201,326,446,종로3가(탑골공원)
1001,20200202,204,226,종로3가(탑골공원)
1001,20200203,569,526,종로3가(탑골공원)
1001,20200204,539,508,종로3가(탑골공원)
1001,20200205,595,518,종로3가(탑골공원)
1001,20200206,547,533,종로3가(탑골공원)
1001,20200207,601,594,종로3가(탑골공원)
1001,20200208,276,423,종로3가(탑골공원)
1001,20200209,182,230,종로3가(탑골공원)
1001,20200210,607,541,종로3가(탑골공원)


In [152]:
#ARS로 살아남은 정류장 갯수
len(busARS_train_num.index.unique())

3276

## 3천개밖에 못 살림.
## 역명으로 묶기

In [125]:
bus_date_updown = bus_date_need.groupby(['사용일자', '역명']).sum()
bus_date_updown

Unnamed: 0_level_0,Unnamed: 1_level_0,승차총승객수,하차총승객수
사용일자,역명,Unnamed: 2_level_1,Unnamed: 3_level_1
20200201,(구)단대동주민센터,107,148
20200201,11단지,138,27
20200201,123전자타운.2001아울렛,545,541
20200201,12번지건영아파트,262,297
20200201,13단지아파트입구,159,233
...,...,...,...
20200229,힐스테이트서울숲리버,53,78
20200229,힐스테이트송파.송파한라비발디,6,110
20200229,힐스테이트입구,23,33
20200229,힐탑,0,4


In [126]:
bus_date_need

Unnamed: 0,사용일자,ARS,역명,승차총승객수,하차총승객수
0,20200201,11428,한성여객종점,14,4
1,20200201,11387,노원평생학습관,7,0
2,20200201,11373,중계역,27,2
3,20200201,11381,중계목화아파트4단지,120,13
4,20200201,11311,인덕대학,172,123
...,...,...,...,...,...
1118042,20200229,08128,KT월곡지사,43,38
1118043,20200229,08130,종암SK아파트,41,73
1118044,20200229,08134,동덕여대앞,75,58
1118045,20200229,08153,종암동주민센터,134,188


In [130]:
bus_loc_need = bus_loc[['ARS-ID', '정류장명', 'X좌표', 'Y좌표']]
bus_loc_need.columns =  ['ARS', '역명', '경도', '위도']
bus_loc_need=bus_loc_need.drop_duplicates(['역명'], keep = 'first')
bus_loc_need.set_index('역명', drop=True, inplace=True)
bus_loc_need

Unnamed: 0_level_0,ARS,경도,위도
역명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
종로2가사거리,1001,126.987750,37.569765
창경궁.서울대학교병원,1002,126.996566,37.579183
명륜3가.성대입구,1003,126.998340,37.582671
종로2가.삼일교,1004,126.987613,37.568579
혜화동로터리.여운형활동터,1005,127.001744,37.586243
...,...,...,...
강일리버파크1단지정문,25753,127.172870,37.568097
굽은다리역,25781,127.142663,37.544322
천호우체국.로데오거리,25994,127.127274,37.540556
조일약국,25997,127.123596,37.533630


In [131]:
#혹시 에러뜰까봐 일단 이것도 만들어 둠
bus_loc_need_ = bus_loc_need[['경도', '위도']]
bus_loc_need_

Unnamed: 0_level_0,경도,위도
역명,Unnamed: 1_level_1,Unnamed: 2_level_1
종로2가사거리,126.987750,37.569765
창경궁.서울대학교병원,126.996566,37.579183
명륜3가.성대입구,126.998340,37.582671
종로2가.삼일교,126.987613,37.568579
혜화동로터리.여운형활동터,127.001744,37.586243
...,...,...
강일리버파크1단지정문,127.172870,37.568097
굽은다리역,127.142663,37.544322
천호우체국.로데오거리,127.127274,37.540556
조일약국,127.123596,37.533630


In [132]:
train_loc

Unnamed: 0_level_0,경도,위도
역명,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,126.969563,37.558156
시청,126.976919,37.565597
종각,126.981390,37.570222
종로5가,127.001925,37.571020
동대문,127.011233,37.571759
...,...,...
부천종합운동장,126.797466,37.505639
춘의,126.787091,37.503859
신중동,126.775952,37.503189
부천시청,126.764025,37.504748


In [133]:
close_station = {'지하철역' : [],
                '역명' : []}

for b_station in tqdm(bus_loc_need.index):
    start = (float(bus_loc_need.loc[b_station]['위도']), float(bus_loc_need.loc[b_station]['경도']))
    distance_dic = {}
    for t_station in train_loc.index:
        goal = (train_loc.loc[t_station]['위도'].mean(), train_loc.loc[t_station]['경도'].mean())
        distance_dic[t_station] = float(haversine(start, goal))
    close_ = sorted((value, key) for key, value in distance_dic.items())[0][1]
    close_station['역명'].append(b_station)
    close_station['지하철역'].append(close_)

HBox(children=(FloatProgress(value=0.0, max=7004.0), HTML(value='')))




In [134]:
station_mapping = pd.DataFrame(close_station)
station_mapping = station_mapping[['역명', '지하철역']]
station_mapping['역명']

0             종로2가사거리
1         창경궁.서울대학교병원
2           명륜3가.성대입구
3            종로2가.삼일교
4       혜화동로터리.여운형활동터
            ...      
6999      강일리버파크1단지정문
7000            굽은다리역
7001      천호우체국.로데오거리
7002             조일약국
7003             성내시장
Name: 역명, Length: 7004, dtype: object

In [135]:
station_mapping.to_csv("busNAME_train_station_mapping.csv")

In [136]:
station_mapping = pd.read_csv('busNAME_Train_station_mapping.csv', index_col=0)
station_mapping

Unnamed: 0,역명,지하철역
0,종로2가사거리,종로3가(탑골공원)
1,창경궁.서울대학교병원,혜화
2,명륜3가.성대입구,혜화
3,종로2가.삼일교,을지로3가
4,혜화동로터리.여운형활동터,한성대입구(삼선교)
...,...,...
6999,강일리버파크1단지정문,상일동
7000,굽은다리역,굽은다리(강동구민회관앞)
7001,천호우체국.로데오거리,천호(풍납토성)
7002,조일약국,강동구청


In [137]:
group = bus_date_need.groupby(['사용일자', '역명']).sum()
group.reset_index(inplace=True)
group=group.dropna()
group

Unnamed: 0,사용일자,역명,승차총승객수,하차총승객수
0,20200201,(구)단대동주민센터,107,148
1,20200201,11단지,138,27
2,20200201,123전자타운.2001아울렛,545,541
3,20200201,12번지건영아파트,262,297
4,20200201,13단지아파트입구,159,233
...,...,...,...,...
232268,20200229,힐스테이트서울숲리버,53,78
232269,20200229,힐스테이트송파.송파한라비발디,6,110
232270,20200229,힐스테이트입구,23,33
232271,20200229,힐탑,0,4


In [140]:
busNAME_train_num = pd.merge(group, station_mapping, how='inner', on='역명')
busNAME_train_num.set_index('역명', drop=True, inplace=True)
busNAME_train_num = busARS_train_num.dropna()
busNAME_train_num

Unnamed: 0_level_0,사용일자,승차총승객수,하차총승객수,지하철역
역명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
123전자타운.2001아울렛,20200201,545,541,양천구청
123전자타운.2001아울렛,20200202,322,351,양천구청
123전자타운.2001아울렛,20200203,509,638,양천구청
123전자타운.2001아울렛,20200204,637,814,양천구청
123전자타운.2001아울렛,20200205,664,777,양천구청
...,...,...,...,...
별장쉐르빌앞,20200229,6,9,천왕
동자경로당,20200226,69,219,서울역
동자경로당,20200227,60,239,서울역
동자경로당,20200228,74,202,서울역


In [177]:
#역명으로 살아남은 정류장 갯수
len(busNAME_train_num.index.unique())

6501

## 6000개 구함

### 인덱스에서 중복제거해주고 리스트로 뽑기

In [165]:
busNAME_index = list(set(list(busNAME_train_num.index)))
busNAME_index

['하계극동아파트',
 '흑석동한강현대아파트',
 '고은초등학교.홍제센트럴아이파크',
 '한마음아파트',
 '경부고속터미널',
 '역촌센트레빌',
 '약수역3번출구.약수하이츠',
 '신정1동주민센터',
 '신트리공원.목동아파트11단지',
 '신촌전철역',
 '유원하나아파트정문',
 '송파파인타운7단지',
 '두산.벽산아파트.서울관광고등학교',
 '롯데캐슬3차',
 '몽촌토성역.한성백제역',
 '서울다원학교.한용운활동터',
 '서울지방병무청',
 '서울스퀘어앞',
 '삼환아파트후문',
 '마곡엠밸리7단지',
 '시흥유통센터',
 '종암SK아파트',
 '생명의전화',
 '엘지아파트앞',
 '훌랄라치킨',
 '금천폭포공원',
 '서울시품질시험소한국교원단체총연합회',
 '홈플러스중계점',
 '고척중학교',
 '성우맨션',
 '신림중.삼성고.관악문화관도서관',
 '건영아파트앞',
 '신월동남부주유소',
 '영천시장',
 '장지동주민센터.송파파인타운1단지',
 '무애사',
 '창3동염광아트빌',
 '서울과학기술대학교후문',
 '송파한라비발디.힐스테이트송파',
 '하나은행망우동지점',
 '현대2.3차아파트',
 '문성골',
 '강북문화정보도서관.해모로아파트',
 '정독도서관',
 '이화동(이화장)',
 '강일리버파크9단지후문',
 '구로역.구로기계공구상가',
 '행당역',
 '원효2동산호아파트후문',
 '봉우재고개',
 '신길4동주민센터',
 'KT용산지사',
 '왕십리로교차로',
 '남성역',
 '삼전동현대APT',
 '강동첨단업무단지.상일여고입구',
 '쌍문1치안센터',
 '위례중앙푸르지오.신안인스빌',
 '서울신문사',
 '벽산아파트1단지.호압사입구',
 '강서농수산물도매시장',
 '시흥2동주민센터.동일여고',
 '옛고을길입구',
 '중화중학교삼거리',
 '현대아파트입구',
 '신동아아파트정문',
 '대림역',
 '둘리뮤지엄.극동대우이안아파트',
 '하나은행신월동지점',
 '둔촌동프라자아파트.GS강동자이아파트',
 '동원베네스트',
 '서울항공비즈니스고등학교',
 '

In [173]:
for idx in busNAME_index:
    if len(busNAME_train_num.loc[idx]) > 27:
        pass
    else:
        busNAME_train_num = busNAME_train_num.drop([idx])

In [174]:
busNAME_train_num

Unnamed: 0_level_0,사용일자,승차총승객수,하차총승객수,지하철역
역명,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
123전자타운.2001아울렛,20200201,545,541,양천구청
123전자타운.2001아울렛,20200202,322,351,양천구청
123전자타운.2001아울렛,20200203,509,638,양천구청
123전자타운.2001아울렛,20200204,637,814,양천구청
123전자타운.2001아울렛,20200205,664,777,양천구청
...,...,...,...,...
힐튼호텔,20200225,187,412,회현(남대문시장)
힐튼호텔,20200226,198,454,회현(남대문시장)
힐튼호텔,20200227,185,433,회현(남대문시장)
힐튼호텔,20200228,182,399,회현(남대문시장)


In [176]:
#역명으로 살아남고 날짜 다 있는 정류장 갯수
len(busNAME_train_num.index.unique())

6501

In [178]:
busNAME_train_num.to_csv('2002버스일자별승하차인원.csv')