# 🚆Train_timetable
지하철 시간표 데이터 전처리

### TODO
- [X] 서울시 공공데이터를 통해 가능한 호선 데이터 불러오기
- [X] 가공하기
  - [X] 기본 가공
- [X] 부족한 시간표 채우기
  - [X] 부족한 데이터셋 구하기
  - [X] 샘플 코드 작성
  - [X] 같은 형식으로 변형
  - [X] 급행 정보 추가
  - [X] 경의중앙선_임진강, 경춘선_광운대 시간표 통합 (in excel)
  - [X] 나머지 부족 시간표 크롤링으로 생성 및 추가
    - [X] 신분당선
    - [X] 신림선
    - [X] 우이신설선
    - [X] 결측값 수동 처리
    - [X] 신분당선 출발 및 도착역 정보 수동 맞추기 (in excel)
    - [X] 도착역 시간표 수동 구축 (in excel)
- [X] 시간표 형태 변경  
  - [X] 열차번호 있는 시간표
  - [X] 열차번호 없는 시간표 
    - [X] 열차번호 수동 추가 (in excel)
    - [X] 함수 적용
  - [X] 공항철도선
- [X] 역번호로 변경
  - [X] 역사명 통일
  - [X] 역번호로 컬럼 변경
- [X] 세부 사항 변경
  - [X] 도착시간 형식 변경
  - [X] 필요 없는 컬럼 제거
  - [X] 인코딩 변경
  - [X] 역 순서 맞추기
  - [X] 정렬 맞추기
  - [X] 9시 이전 차 없애기 
  - [X] result에 대해서도 전부 수행
- [X] result 시간표 전부 통합 후 저장

### 0. import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import set_matplotlib_formats
import koreanize_matplotlib
import folium
import urllib.request
import json
import time

### 1. 서울시 공공데이터를 통해 가능한 호선 데이터 불러오기

In [182]:
# 작업했던 역번호 데이터 읽어오기
station = pd.read_csv('result_train_station.csv')

station

Unnamed: 0.1,Unnamed: 0,역사명,노선명,주소,위도,경도,역번호,외부코드
0,0,계양,인천1호선,인천광역시 계양구 다남로 24,37.571449,126.735780,3110,I110
1,1,귤현,인천1호선,인천광역시 계양구 장제로 1136,37.566379,126.742654,3111,I111
2,2,박촌,인천1호선,인천광역시 계양구 장제로 992,37.553703,126.745077,3112,I112
3,3,임학,인천1호선,인천광역시 계양구 장제로 875,37.545059,126.738665,3113,I113
4,4,계산,인천1호선,인천광역시 계양구 경명대로 1089,37.543238,126.728128,3114,I114
...,...,...,...,...,...,...,...,...
764,764,옥수,경의중앙선,서울시 성동구 동호로 지하21(옥수동),37.540446,127.018672,,
765,765,응봉,경의중앙선,서울시 성동구 고산자로 123(응봉동),37.549946,127.034538,,
766,766,까치산,2호선,서울특별시 강서구 강서로 54,37.531394,126.846987,0200,
767,767,도라산,경의중앙선,경기도 파주시 장단면 희망로 307,37.898307,126.709193,,


In [183]:
# key 가져오기
from dotenv import load_dotenv
import os

load_dotenv()

url = os.environ.get('train_seoul_timetable_url')

In [184]:
# 함수 정의
# 호선명을 입력받아 해당하는 시간표 호출해옴
def getTimetableByRouteName(station, err) :
    result = pd.DataFrame({
        'LINE_NUM': [],
        'FR_CODE': [],
        'STATION_CD': [],
        'STATION_NM': [],
        'TRAIN_NO': [],
        'ARRIVETIME': [],
        'LEFTTIME': [],
        'ORIGINSTATION': [],
        'DESTSTATION': [],
        'SUBWAYSNAME': [],
        'SUBWAYENAME': [],
        'WEEK_TAG': [], 
        'INOUT_TAG': [], 
        'FL_FLAG': [], 
        'DESTSTATION2': [], 
        'EXPRESS_YN': [], 
        'BRANCH_LINE': []
    })
    
    codes = list(station["역번호"])
    
    for code in codes:
        for week in range(1, 4): # 요일
            for inout in range(1, 3): # 상/하행
                
                response = urllib.request.urlopen(url.format(code, week, inout))
                json_str = response.read().decode("utf-8")
                json_obj = json.loads(json_str)
                
                if 'RESULT' in json_obj and json_obj['RESULT']['CODE'] == 'INFO-000':
                    # 정상
                    temp = pd.json_normalize(json_obj['row'])
                    result = pd.concat([result, temp], axis=0)
                elif 'SearchSTNTimeTableByIDService' in json_obj and json_obj['SearchSTNTimeTableByIDService']['RESULT']['CODE'] == 'INFO-000':
                    # 정상
                    temp = pd.json_normalize(
                        json_obj['SearchSTNTimeTableByIDService']['row'])
                    result = pd.concat([result, temp], axis=0)
                else :
                    # 없는 경우
                    err.append(
                        {"code": code, "err": json_obj['RESULT']['CODE']})
                
    return result

In [185]:
# 노선 개수 확인
routes = sorted(list(set(station["노선명"])))
cntRoutes = len(station["노선명"].value_counts())

errs = {}
results = {}

print(routes)

['1호선', '2호선', '3호선', '4호선', '5호선', '6호선', '7호선', '8호선', '9호선', '경강선', '경의중앙선', '경춘선', '공항철도선', '김포골드라인', '서해선', '수인분당선', '신림선', '신분당선', '에버라인선', '우이신설선', '의정부선', '인천1호선', '인천2호선']


In [186]:
# # API 호출
# for route in routes:    
#     err = []
    
#     results[route] = getTimetableByRouteName(station[station['노선명'] == route], err)
#     errs[route] = err
    
#     time.sleep(10)

# # 시간표 엑셀로 저장 (반드시 필요, 호출 시간 오래 걸리므로)
# now_routes = ["1호선", "2호선", "3호선", "4호선", "5호선", "6호선", "7호선", "8호선", "9호선"]

# for route in now_routes:
#     results[route].to_csv("timetable/" + route + ".csv")

In [187]:
# # 에러 여부
# errs

In [188]:
# routes = sorted(routes)

# for route in routes:
#     station_cnt = len(set(station['역사명'][station['노선명'] == route]))
#     err_cnt = len(errs[route])
#     if err_cnt != 0:
#         print(route, "역 개수 :", station_cnt)
#         print(route, "오류 개수 :", err_cnt, ", 비교값 :", station_cnt * 6) # 한 역에 대해 API 6번 호출하므로
#         print()
    
# # 확인 결과, 6호선을 제외하고는 다 API가 제공하지 않는 호선임
# # 6호선 몇몇 역들에 대해서만 API 호출하면 됨

In [189]:
# for obj in errs["6호선"]:
#     print(station['역사명'][station['역번호'] == obj['code']])

# # 확인 결과, 해당 방향으로는 원래 열차가 없는 역 -> 응암루프선
# # 그래도, 역 도착 시간을 알기 위해 시간표 정보가 필요

In [190]:
# 엑셀 파일에서 불러오기
now_routes = ["1호선", "2호선", "3호선", "4호선", "5호선", "6호선", "7호선", "8호선", "9호선"]

for route in now_routes:
    results[route] = pd.read_csv("timetable/origin/" + route + ".csv")

  results[route] = pd.read_csv("timetable/origin/" + route + ".csv")
  results[route] = pd.read_csv("timetable/origin/" + route + ".csv")
  results[route] = pd.read_csv("timetable/origin/" + route + ".csv")


In [191]:
# 확인 및 데이터 형식 살펴보기 위한 파일 저장
results["1호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,01호선,124,158,청량리,K1940,18:54:00,00:00:00,1728,158,천안,청량리,1,1,,,D,
1,1,01호선,124,158,청량리,K630,11:48:00,00:00:00,1728,158,천안,청량리,1,1,,,G,
2,2,01호선,124,158,청량리,K1906,09:40:30,00:00:00,1408,158,신창,청량리,1,1,,,D,
3,3,01호선,124,158,청량리,K1908,10:16:30,00:00:00,1408,158,신창,청량리,1,1,,,D,
4,4,01호선,124,158,청량리,K1910,11:11:30,00:00:00,1728,158,천안,청량리,1,1,,,D,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81814,104,01호선,161,1812,인천,K107,15:05:30,00:00:00,1916,1812,소요산,인천,3,2,,,G,
81815,105,01호선,161,1812,인천,K105,14:56:30,00:00:00,1909,1812,양주,인천,3,2,,,G,
81816,106,01호선,161,1812,인천,K103,14:49:30,00:00:00,1701,1812,구로,인천,3,2,,,G,
81817,107,01호선,161,1812,인천,K101,14:42:30,00:00:00,1916,1812,소요산,인천,3,2,,,G,


In [192]:
results["1호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81819 entries, 0 to 81818
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     81819 non-null  int64  
 1   LINE_NUM       81819 non-null  object 
 2   FR_CODE        81819 non-null  object 
 3   STATION_CD     81819 non-null  int64  
 4   STATION_NM     81819 non-null  object 
 5   TRAIN_NO       81819 non-null  object 
 6   ARRIVETIME     81819 non-null  object 
 7   LEFTTIME       81819 non-null  object 
 8   ORIGINSTATION  81819 non-null  int64  
 9   DESTSTATION    81819 non-null  int64  
 10  SUBWAYSNAME    81819 non-null  object 
 11  SUBWAYENAME    81819 non-null  object 
 12  WEEK_TAG       81819 non-null  int64  
 13  INOUT_TAG      81819 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     81819 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [193]:
results["2호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,02호선,201,201,시청,2004,05:39:00,05:39:30,239,211,홍대입구,성수,1,1,,,G,
1,1,02호선,201,201,시청,2006,05:50:30,05:51:00,234,211,신도림,성수,1,1,,,G,
2,2,02호선,201,201,시청,2008,05:57:30,05:58:00,234,211,신도림,성수,1,1,,,G,
3,3,02호선,201,201,시청,2010,06:04:00,06:04:30,228,211,서울대입구,성수,1,1,,,G,
4,4,02호선,201,201,시청,2012,06:10:30,06:11:00,234,211,신도림,성수,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61864,101,02호선,234-4,200,까치산,5705,00:00:00,23:02:00,200,234,까치산,신도림,3,2,,,G,
61865,102,02호선,234-4,200,까치산,5707,00:00:00,23:17:00,200,234,까치산,신도림,3,2,,,G,
61866,103,02호선,234-4,200,까치산,5711,00:00:00,23:33:00,200,234,까치산,신도림,3,2,,,G,
61867,104,02호선,234-4,200,까치산,5713,00:00:00,23:47:00,200,234,까치산,신도림,3,2,,,G,


In [194]:
results["2호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61869 entries, 0 to 61868
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     61869 non-null  int64  
 1   LINE_NUM       61869 non-null  object 
 2   FR_CODE        61869 non-null  object 
 3   STATION_CD     61869 non-null  int64  
 4   STATION_NM     61869 non-null  object 
 5   TRAIN_NO       61869 non-null  int64  
 6   ARRIVETIME     61869 non-null  object 
 7   LEFTTIME       61869 non-null  object 
 8   ORIGINSTATION  61869 non-null  int64  
 9   DESTSTATION    61869 non-null  int64  
 10  SUBWAYSNAME    61869 non-null  object 
 11  SUBWAYENAME    61869 non-null  object 
 12  WEEK_TAG       61869 non-null  int64  
 13  INOUT_TAG      61869 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     61869 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [195]:
results["3호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,03호선,319,309,지축,3012,05:59:00,05:59:30,323,1958,약수,대화,1,1,,,G,
1,1,03호선,319,309,지축,3016,06:16:30,06:17:00,334,1958,도곡,대화,1,1,,,G,
2,2,03호선,319,309,지축,3022,06:32:00,06:32:30,339,1958,수서,대화,1,1,,,G,
3,3,03호선,319,309,지축,3024K,06:38:30,06:39:00,342,1958,오금,대화,1,1,,,G,
4,4,03호선,319,309,지축,3026,06:45:30,06:46:00,342,1958,오금,대화,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42768,103,03호선,309,1958,대화,3353,00:00:00,22:34:00,1958,339,대화,수서,3,2,,,G,
42769,104,03호선,309,1958,대화,3355,00:00:00,22:44:00,1958,334,대화,도곡,3,2,,,G,
42770,105,03호선,309,1958,대화,3357,00:00:00,23:03:00,1958,323,대화,약수,3,2,,,G,
42771,106,03호선,309,1958,대화,3359,00:00:00,23:20:00,1958,310,대화,구파발,3,2,,,G,


In [196]:
results["3호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42773 entries, 0 to 42772
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     42773 non-null  int64  
 1   LINE_NUM       42773 non-null  object 
 2   FR_CODE        42773 non-null  int64  
 3   STATION_CD     42773 non-null  int64  
 4   STATION_NM     42773 non-null  object 
 5   TRAIN_NO       42773 non-null  object 
 6   ARRIVETIME     42773 non-null  object 
 7   LEFTTIME       42773 non-null  object 
 8   ORIGINSTATION  42773 non-null  int64  
 9   DESTSTATION    42773 non-null  int64  
 10  SUBWAYSNAME    42773 non-null  object 
 11  SUBWAYENAME    42773 non-null  object 
 12  WEEK_TAG       42773 non-null  int64  
 13  INOUT_TAG      42773 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     42773 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [197]:
results["4호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,04호선,408,408,별내별가람,S4014,05:37:00,05:37:30,409,405,당고개,진접,1,1,,,G,
1,1,04호선,408,408,별내별가람,S4020,05:59:00,05:59:30,409,405,당고개,진접,1,1,,,G,
2,2,04호선,408,408,별내별가람,S4002,06:15:00,06:15:30,426,405,서울역,진접,1,1,,,G,
3,3,04호선,408,408,별내별가람,S4004,06:31:00,06:31:30,434,405,남태령,진접,1,1,,,G,
4,4,04호선,408,408,별내별가람,S4032,06:48:00,06:48:30,409,405,당고개,진접,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47842,106,04호선,443,1458,금정,K4685,23:11:00,23:11:30,409,1762,당고개,오이도,3,2,,,G,
47843,107,04호선,443,1458,금정,K4687,23:27:00,23:27:30,409,1762,당고개,오이도,3,2,,,G,
47844,108,04호선,443,1458,금정,K4329,23:40:00,23:40:30,409,1759,당고개,안산,3,2,,,G,
47845,109,04호선,443,1458,금정,K4331,24:03:00,24:03:30,409,1759,당고개,안산,3,2,,,G,


In [198]:
results["4호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47847 entries, 0 to 47846
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     47847 non-null  int64  
 1   LINE_NUM       47847 non-null  object 
 2   FR_CODE        47847 non-null  int64  
 3   STATION_CD     47847 non-null  int64  
 4   STATION_NM     47847 non-null  object 
 5   TRAIN_NO       47847 non-null  object 
 6   ARRIVETIME     47847 non-null  object 
 7   LEFTTIME       47847 non-null  object 
 8   ORIGINSTATION  47847 non-null  int64  
 9   DESTSTATION    47847 non-null  int64  
 10  SUBWAYSNAME    47847 non-null  object 
 11  SUBWAYENAME    47847 non-null  object 
 12  WEEK_TAG       47847 non-null  int64  
 13  INOUT_TAG      47847 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     47847 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [199]:
results["5호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,05호선,510,2511,방화,5620,17:44:00,00:00:00,2561,2511,마천,방화,1,1,,,G,
1,1,05호선,510,2511,방화,5618,17:31:00,00:00:00,2561,2511,마천,방화,1,1,,,G,
2,2,05호선,510,2511,방화,5616,17:18:00,00:00:00,2561,2511,마천,방화,1,1,,,G,
3,3,05호선,510,2511,방화,5614,17:05:00,00:00:00,2561,2511,마천,방화,1,1,,,G,
4,4,05호선,510,2511,방화,5612,16:52:00,00:00:00,2561,2511,마천,방화,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49957,81,05호선,P555,2561,마천,5533,09:21:30,00:00:00,2511,2561,방화,마천,3,2,,,G,
49958,82,05호선,P555,2561,마천,5531,09:08:30,00:00:00,2511,2561,방화,마천,3,2,,,G,
49959,83,05호선,P555,2561,마천,5529,08:55:30,00:00:00,2511,2561,방화,마천,3,2,,,G,
49960,84,05호선,P555,2561,마천,5527,08:42:30,00:00:00,2511,2561,방화,마천,3,2,,,G,


In [200]:
results["5호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49962 entries, 0 to 49961
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     49962 non-null  int64  
 1   LINE_NUM       49962 non-null  object 
 2   FR_CODE        49962 non-null  object 
 3   STATION_CD     49962 non-null  int64  
 4   STATION_NM     49962 non-null  object 
 5   TRAIN_NO       49962 non-null  int64  
 6   ARRIVETIME     49962 non-null  object 
 7   LEFTTIME       49962 non-null  object 
 8   ORIGINSTATION  49962 non-null  int64  
 9   DESTSTATION    49962 non-null  int64  
 10  SUBWAYSNAME    49962 non-null  object 
 11  SUBWAYENAME    49962 non-null  object 
 12  WEEK_TAG       49962 non-null  int64  
 13  INOUT_TAG      49962 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     49962 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [201]:
results["6호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,06호선,610,2611,응암,6316,23:29:30,00:00:00,2649,2611,신내,응암,1,1,,,G,
1,1,06호선,610,2611,응암,6314,23:20:00,00:00:00,2648,2611,봉화산,응암,1,1,,,G,
2,2,06호선,610,2611,응암,6312,23:10:00,00:00:00,2648,2611,봉화산,응암,1,1,,,G,
3,3,06호선,610,2611,응암,6310,23:01:00,00:00:00,2649,2611,신내,응암,1,1,,,G,
4,4,06호선,610,2611,응암,6308,22:52:00,00:00:00,2648,2611,봉화산,응암,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30496,41,06호선,648,2649,신내,6011,06:52:00,00:00:00,2611,2649,응암,신내,3,2,,,G,
30497,42,06호선,648,2649,신내,6007,06:30:20,00:00:00,2617,2649,새절,신내,3,2,,,G,
30498,43,06호선,648,2649,신내,6005,06:16:00,00:00:00,2627,2649,공덕,신내,3,2,,,G,
30499,44,06호선,648,2649,신내,6003,06:03:10,00:00:00,2632,2649,한강진,신내,3,2,,,G,


In [202]:
results["6호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30501 entries, 0 to 30500
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     30501 non-null  int64  
 1   LINE_NUM       30501 non-null  object 
 2   FR_CODE        30501 non-null  int64  
 3   STATION_CD     30501 non-null  int64  
 4   STATION_NM     30501 non-null  object 
 5   TRAIN_NO       30501 non-null  int64  
 6   ARRIVETIME     30501 non-null  object 
 7   LEFTTIME       30501 non-null  object 
 8   ORIGINSTATION  30501 non-null  int64  
 9   DESTSTATION    30501 non-null  int64  
 10  SUBWAYSNAME    30501 non-null  object 
 11  SUBWAYENAME    30501 non-null  object 
 12  WEEK_TAG       30501 non-null  int64  
 13  INOUT_TAG      30501 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     30501 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [203]:
results["7호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,07호선,751,3753,까치울,7016,05:48:00,05:48:30,3763,2711,석남,장암,1,1,,,G,
1,1,07호선,751,3753,까치울,7020,06:03:00,06:03:30,3763,2711,석남,장암,1,1,,,G,
2,2,07호선,751,3753,까치울,7024,06:16:00,06:16:30,3763,2712,석남,도봉산,1,1,,,G,
3,3,07호선,751,3753,까치울,7028,06:29:00,06:29:30,3763,2712,석남,도봉산,1,1,,,G,
4,4,07호선,751,3753,까치울,7032,06:40:30,06:41:00,3763,2712,석남,도봉산,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49436,157,07호선,750,2752,온수,7301,22:46:30,22:47:00,2711,3763,장암,석남,3,2,,,G,
49437,158,07호선,750,2752,온수,7305,23:00:00,23:00:30,2712,3763,도봉산,석남,3,2,,,G,
49438,159,07호선,750,2752,온수,7309,23:15:30,23:16:00,2712,3763,도봉산,석남,3,2,,,G,
49439,160,07호선,750,2752,온수,7313,23:31:30,23:32:00,2712,3763,도봉산,석남,3,2,,,G,


In [204]:
results["7호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49441 entries, 0 to 49440
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     49441 non-null  int64  
 1   LINE_NUM       49441 non-null  object 
 2   FR_CODE        49441 non-null  int64  
 3   STATION_CD     49441 non-null  int64  
 4   STATION_NM     49441 non-null  object 
 5   TRAIN_NO       49441 non-null  int64  
 6   ARRIVETIME     49441 non-null  object 
 7   LEFTTIME       49441 non-null  object 
 8   ORIGINSTATION  49441 non-null  int64  
 9   DESTSTATION    49441 non-null  int64  
 10  SUBWAYSNAME    49441 non-null  object 
 11  SUBWAYENAME    49441 non-null  object 
 12  WEEK_TAG       49441 non-null  int64  
 13  INOUT_TAG      49441 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     49441 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [205]:
results["8호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,08호선,810,2811,암사,8200,18:18:00,00:00:00,2827,2811,모란,암사,1,1,,,G,
1,1,08호선,810,2811,암사,8198,18:13:00,00:00:00,2827,2811,모란,암사,1,1,,,G,
2,2,08호선,810,2811,암사,8196,18:08:00,00:00:00,2827,2811,모란,암사,1,1,,,G,
3,3,08호선,810,2811,암사,8194,18:03:00,00:00:00,2827,2811,모란,암사,1,1,,,G,
4,4,08호선,810,2811,암사,8192,17:57:00,00:00:00,2827,2811,모란,암사,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14192,120,08호선,827,2827,모란,8051,09:39:00,00:00:00,2811,2827,암사,모란,3,2,,,G,
14193,121,08호선,827,2827,모란,8049,09:30:30,00:00:00,2811,2827,암사,모란,3,2,,,G,
14194,122,08호선,827,2827,모란,8047,09:22:00,00:00:00,2811,2827,암사,모란,3,2,,,G,
14195,123,08호선,827,2827,모란,8045,09:13:30,00:00:00,2811,2827,암사,모란,3,2,,,G,


In [206]:
results["8호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14197 entries, 0 to 14196
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     14197 non-null  int64  
 1   LINE_NUM       14197 non-null  object 
 2   FR_CODE        14197 non-null  int64  
 3   STATION_CD     14197 non-null  int64  
 4   STATION_NM     14197 non-null  object 
 5   TRAIN_NO       14197 non-null  int64  
 6   ARRIVETIME     14197 non-null  object 
 7   LEFTTIME       14197 non-null  object 
 8   ORIGINSTATION  14197 non-null  int64  
 9   DESTSTATION    14197 non-null  int64  
 10  SUBWAYSNAME    14197 non-null  object 
 11  SUBWAYENAME    14197 non-null  object 
 12  WEEK_TAG       14197 non-null  int64  
 13  INOUT_TAG      14197 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     14197 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

In [207]:
results["9호선"]

Unnamed: 0.1,Unnamed: 0,LINE_NUM,FR_CODE,STATION_CD,STATION_NM,TRAIN_NO,ARRIVETIME,LEFTTIME,ORIGINSTATION,DESTSTATION,SUBWAYSNAME,SUBWAYENAME,WEEK_TAG,INOUT_TAG,FL_FLAG,DESTSTATION2,EXPRESS_YN,BRANCH_LINE
0,0,09호선,901,4101,개화,C9014,00:00:00,05:30:00,4101,4138,개화,중앙보훈병원,1,1,,,G,
1,1,09호선,901,4101,개화,C9016,00:00:00,05:42:00,4101,4138,개화,중앙보훈병원,1,1,,,G,
2,2,09호선,901,4101,개화,C9018,00:00:00,05:54:00,4101,4138,개화,중앙보훈병원,1,1,,,G,
3,3,09호선,901,4101,개화,C9020,00:00:00,06:06:00,4101,4138,개화,중앙보훈병원,1,1,,,G,
4,4,09호선,901,4101,개화,C9022,00:00:00,06:14:15,4101,4138,개화,중앙보훈병원,1,1,,,G,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32889,91,09호선,926,4126,언주,C9191,23:08:50,23:09:15,4138,4101,중앙보훈병원,개화,3,2,,,G,
32890,92,09호선,926,4126,언주,C9193,23:23:50,23:24:15,4138,4110,중앙보훈병원,염창,3,2,,,G,
32891,93,09호선,926,4126,언주,C9195,23:34:45,23:35:10,4138,4115,중앙보훈병원,여의도,3,2,,,G,
32892,94,09호선,926,4126,언주,C9197,23:48:45,23:49:10,4138,4120,중앙보훈병원,동작,3,2,,,G,


In [208]:
results["9호선"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32894 entries, 0 to 32893
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     32894 non-null  int64  
 1   LINE_NUM       32894 non-null  object 
 2   FR_CODE        32894 non-null  int64  
 3   STATION_CD     32894 non-null  int64  
 4   STATION_NM     32894 non-null  object 
 5   TRAIN_NO       32894 non-null  object 
 6   ARRIVETIME     32894 non-null  object 
 7   LEFTTIME       32894 non-null  object 
 8   ORIGINSTATION  32894 non-null  int64  
 9   DESTSTATION    32894 non-null  int64  
 10  SUBWAYSNAME    32894 non-null  object 
 11  SUBWAYENAME    32894 non-null  object 
 12  WEEK_TAG       32894 non-null  int64  
 13  INOUT_TAG      32894 non-null  int64  
 14  FL_FLAG        0 non-null      float64
 15  DESTSTATION2   0 non-null      float64
 16  EXPRESS_YN     32894 non-null  object 
 17  BRANCH_LINE    0 non-null      float64
dtypes: flo

### 2. 가공하기

In [209]:
for route in now_routes:
    # 컬럼명 변경
    results[route].rename(columns={
        "LINE_NUM": "노선명", 
        "FR_CODE": "외부코드", 
        "STATION_CD": "역번호",
        "STATION_NM": "역사명",
        "TRAIN_NO": "열차번호", 
        "ARRIVETIME": "도착시간", 
        "LEFTTIME": "출발시간", 
        "ORIGINSTATION": "출발역번호",
        "DESTSTATION": "도착역번호",
        "SUBWAYSNAME": "출발역사명",
        "SUBWAYENAME": "도착역사명",
        "WEEK_TAG": "요일",
        "INOUT_TAG": "방향",
        "EXPRESS_YN": "급행선"
    }, inplace=True)
    
    # 노선명 앞의 0 제거
    results[route]['노선명'] = results[route]['노선명'].str.replace(
        pat=r'^0', repl=r'', regex=True)
    
    # 필요없는 컬럼 삭제
    results[route].drop(columns=["Unnamed: 0", "FL_FLAG",
                        "DESTSTATION2", "BRANCH_LINE"], inplace=True)
    
    # 데이터 타입 통일
    results[route] = results[route].astype({'외부코드': 'str', '열차번호': 'str'})
    
    # 시간 데이터
    # 1. 도착시간 없이 출발시간만 있는 경우, 출발시간 = 도착시간
    results[route]["도착시간"][(results[route]["도착시간"] == "00:00:00") &
                           (results[route]["출발시간"] != "00:00:00")] = results[route]["출발시간"][(results[route]["도착시간"] == "00:00:00") &
                                                                                            (results[route]["출발시간"] != "00:00:00")]
    # 2. 출발시간 삭제
    results[route].drop(columns=["출발시간"], inplace=True)
    # 3. 초단위 버림
    results[route]["도착시간"] = results[route]["도착시간"].str.slice(start=0, stop=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[route]["도착시간"][(results[route]["도착시간"] == "00:00:00") &
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[route]["도착시간"][(results[route]["도착시간"] == "00:00:00") &
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[route]["도착시간"][(results[route]["도착시간"] == "00:00:00") &
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [210]:
# # 테이블 저장
# now_routes = ["1호선", "2호선", "3호선", "4호선", "5호선", "6호선", "7호선", "8호선", "9호선"]

# for route in now_routes:
#     results[route].to_csv("timetable/result/" + route + ".csv")

### 3. 부족한 시간표 채우기

##### 3-a. 부족한 데이터셋 구하기

In [211]:
# 경의중앙선
temp = [[0] for i in range(3)]

for i in range(1, 3) :
    for j in range(1, 3) :
        temp[i].append(pd.read_csv(
            'timetable/origin/경의중앙선_{0}_{1}.csv'.format(i, j), header=None))


In [212]:
# 한 종류의 데이터만 시범삼아 처리해 보기

temp[1][1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,시발역,청량리,덕소,용문,팔당,용문,청량리,용문,용문,용문,...,청량리,용문,덕소,용문,용문,덕소,지평,덕소,용문,덕소
1,종착역,문산,문산,문산,문산,일산,문산,문산,문산,문산,...,일산,문산,일산,문산,문산,문산,능곡,능곡,용산,용산
2,열차번호,K5002,K5004,K5010,K5008,K5012,K5006,K5014,K5016,K5018,...,K5142,K5152,K5146,K5156,K5158,K5154,K5162,K5160,K5166,K5164
3,지평,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,22:02:00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,월롱,6:19:30,6:48:00,7:33:30,7:15:00,,6:59:30,7:57:30,8:08:30,8:26:00,...,,23:25:30,,23:56:00,0:15:30,23:44:00,,,,
104,,6:20:00,6:48:30,7:34:00,7:15:30,,7:00:00,7:58:00,8:09:00,8:26:30,...,,23:26:00,,23:56:30,0:16:00,23:44:30,,,,
105,파주,6:22:30,6:51:00,7:36:30,7:18:00,,7:02:30,8:00:30,8:11:30,8:29:00,...,,23:28:30,,23:59:00,0:18:30,23:47:00,,,,
106,,6:23:00,6:51:30,7:37:00,7:18:30,,7:03:00,8:01:00,8:12:00,8:29:30,...,,23:29:00,,23:59:30,0:19:00,23:47:30,,,,


In [213]:
temp[1][1].info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 92 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       56 non-null     object
 1   1       86 non-null     object
 2   2       95 non-null     object
 3   3       106 non-null    object
 4   4       97 non-null     object
 5   5       99 non-null     object
 6   6       86 non-null     object
 7   7       106 non-null    object
 8   8       106 non-null    object
 9   9       106 non-null    object
 10  10      106 non-null    object
 11  11      106 non-null    object
 12  12      106 non-null    object
 13  13      106 non-null    object
 14  14      107 non-null    object
 15  15      107 non-null    object
 16  16      91 non-null     object
 17  17      100 non-null    object
 18  18      91 non-null     object
 19  19      107 non-null    object
 20  20      107 non-null    object
 21  21      98 non-null     object
 22  22      107 non-null    ob

##### 3-b. 샘플 코드 작성

In [214]:
# # 확인 결과, 공백은 null이 아닌 것으로 취급
# # 공백을 null로 변환하기
# temp[1][1] = temp[1][1].replace(to_replace= r'^[\s]+$', value=np.NaN, regex=True)

In [215]:
# # 재확인
# temp[1][1].info(max_cols=150)

In [216]:
# # 데이터 저장할 새 데이터프레임 생성

# temp_1_1_result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호", "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])
# temp_1_1_result

In [217]:
# # 데이터 형태 보고, 기존 시간표 형식과 잘 맞춰보기

# cols = len(temp[1][1].columns)
# rows = len(temp[1][1].index)

# print(cols, rows)

# temp[1][1]

In [218]:
# cnt = 0

# for i in range(3, rows - 1, 2) :
#     name = temp[1][1][0][i]
#     time = ''
    
#     for j in range(1, cols) :
#         if pd.isnull(temp[1][1][j][i]) and pd.isnull(temp[1][1][j][i + 1]):
#             # 저장할 데이터 없음
#             continue
        
#         # 시간 데이터 저장
#         if pd.notnull(temp[1][1][j][i]) :
#             # 기본적으로, 도착  시간으로 저장
#             time = temp[1][1][j][i]
#         else :
#             # 도착 시간이 없는 경우, 출발 시간으로 저장
#             time = temp[1][1][j][i + 1]
            
#         # 시간 정보 가공
#         if len(time) < 8 :
#             time = '0' + time
#         if int(time[:2]) < 4 :
#             time = str(int(time[:2]) + 24) + time[2:]
            
#         temp_1_1_result.loc[cnt] = ['경의중앙선', np.NaN, np.NaN,
#                                     name, temp[1][1][j][2], time, np.NaN, np.NaN, temp[1][1][j][0], temp[1][1][j][1], 1, 1, 'G']
#         # print('경의중앙선', np.NaN, np.NaN,
#         #       name, temp[1][1][j][2], time, np.NaN, np.NaN, temp[1][1][j][0], temp[1][1][j][1], 1, 1, 'G')
        
#         cnt += 1

# # 확인
# temp_1_1_result

In [219]:
# temp_1_1_result.to_csv("temp.csv")

##### 3-c. 자동화 코드 작성

In [47]:
# 함수 정의
def mkKorailTableToSeoulTable(route, korail_table, weekBound, inoutBound):
    cnt = 0
    result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호",
                          "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])

    for week in range(1, weekBound) : # 요일
        for inout in range(1, inoutBound) : # 상/하행
            # 하나의 시간표 테이블에 대해 변경 코드 수행
            cols = len(korail_table[week][inout].columns)
            rows = len(korail_table[week][inout].index)
            
            for j in range(1, cols): # 각 열차
                startName = korail_table[week][inout][j][0]
                endName = korail_table[week][inout][j][1]
                trainNum = korail_table[week][inout][j][2]
                directTrainSet = set()

                for i in range(3, rows - 1, 2): # 각 역
                    time = ''
                    name = korail_table[week][inout][0][i]
                    
                    if (pd.isnull(startName) and pd.isnull(endName) and pd.isnull(trainNum)):
                        break
                    
                    # 저장할 데이터 없음
                    if pd.isnull(korail_table[week][inout][j][i]) and (trainNum in directTrainSet or pd.isnull(korail_table[week][inout][j][i + 1])):
                        continue

                    # 시간 데이터 저장
                    if (pd.notnull(korail_table[week][inout][j][i]) and pd.notnull(korail_table[week][inout][j][i + 1])) :
                        # 기본적으로, 도착 시간으로 저장
                        time = korail_table[week][inout][j][i]
                        
                    else:
                        # 도착 시간만 없고 시작역일 경우, 출발 시간으로 저장
                        if pd.isnull(korail_table[week][inout][j][i]) and pd.notnull(korail_table[week][inout][j][i + 1]):
                           time = korail_table[week][inout][j][i + 1]
                           
                           if name != startName:  # 출발역이 아닐 경우, 급행
                               directTrainSet.add(trainNum)
                               continue
                                   
                        # 출발 시간이 없고 종착역일 경우, 도착 시간으로 저장
                        elif pd.notnull(korail_table[week][inout][j][i]) and pd.isnull(korail_table[week][inout][j][i + 1]):
                            time = korail_table[week][inout][j][i]

                    # 시간 정보 가공
                    if len(time) < 8:
                        time = '0' + time
                    if int(time[:2]) < 4:
                        time = str(int(time[:2]) + 24) + time[2:]

                    result.loc[cnt] = [route, np.NaN, np.NaN,
                                       name, trainNum, time, np.NaN, np.NaN, startName, endName, week, inout, "G"]
                    
                    # print(route, np.NaN, np.NaN, name, temp[1][1][j][2], time, np.NaN, np.NaN, temp[1][1][j][0], temp[1][1][j][1], 1, 1, 'G')

                    cnt += 1
            
                # D 정보 채워넣어주기                     
                for directTrainNum in directTrainSet:
                    result.loc[(result["요일"] == week) & (result["방향"] == inout) & (
                        result["열차번호"] == directTrainNum), "급행선"] = "D"
                    
    # 초단위 버림
    result["도착시간"] = result["도착시간"].str.slice(start=0, stop=5)

    return result

In [50]:
# 변형 자동화 코드

routes_korail = ["경강선", "경의중앙선", "경의중앙선_임진강", "경춘선", "경춘선_광운대", "서해선", "수인분당선"]

for route in routes_korail:
    temp = [[0] for i in range(3)]
    
    weekBound = 3
    inoutBound = 3

    if route == "경춘선_광운대":
        weekBound = 2
    
    for i in range(1, weekBound) :    
        for j in range(1, inoutBound) :
            temp[i].append(pd.read_csv(
                'timetable/origin/{0}_{1}_{2}.csv'.format(route, i, j), header=None))            
        
            # 컬럼에 포함된 공백 null처리
            temp[i][j] = temp[i][j].replace(
                to_replace=r'^[\s]+$', value=np.NaN, regex=True)
            
            temp[i][j].dropna(how='all', axis='columns')

    result = mkKorailTableToSeoulTable("경의중앙선" if route == "경의중앙선_임진강" else "경춘선" if route == "경춘선_광운대" else route, temp, weekBound, inoutBound)
    result.to_csv("timetable/result/" + route + ".csv")

##### 3-d. 경의중앙선_임진강, 경춘선_광운대 합치기

In [None]:
GyeongChung_ImJean = pd.read_csv('timetable/result/경의중앙선_임진강.csv')
GyeongChung_ImJean = GyeongChung_ImJean.drop(columns=["Unnamed: 0"])

GyeongChung_ImJean

# 방향 뒤집기 (경의중앙선과 방향 반대임)
GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 1] = 3
GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 2] = 1
GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 3] = 2

GyeongChung_ImJean.to_csv("timetable/result/경의중앙선_임진강.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 1] = 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 2] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GyeongChung_ImJean["방향"][GyeongChung_ImJean["방향"] == 3] = 2


In [None]:
# .csv 수동으로 합침
# 합치던 중, 기존 파일에서도 시간순 정렬이 제대로 되어있지 않은 것을 확인
# 시간 순으로 재정렬 필요 -> in excel

##### 3-e. 크롤링으로 부족한 시간표 추가

In [11]:
# import
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.ie.options import Options

In [64]:
# 수인분당선
# 테스트 코드 작성

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
driver = webdriver.Chrome('../../../../../../chromeDriver/chromedriver.exe'
                          , options=options)

# 페이지 열기
driver.get(url='https://www.shinbundang.co.kr/index.jsp')
time.sleep(1)

# 페이지 이동
driver.execute_script("callPage('/dxline/dxline3_1.jsp', '1', '4', '1')")
time.sleep(1)

html = driver.page_source
soupCB = BeautifulSoup(html, 'html.parser')

timetable = soupCB.select(
    "#tab_d03 > dd:nth-child(2) > table > tbody > tr > td")

print("신사")
for i in range(0, len(timetable), 3):
    # 상행
    if len(timetable[i]) == 1:
        print(timetable[i].string)
for i in range(2, len(timetable), 3):
    # 하행
    if len(timetable[i]) == 1:
        print(timetable[i].string)
    elif len(timetable[i]) > 1:
        temp = list(timetable[i])
        for j in range(len(temp)):
            print(temp[j].get_text(), end="")
        print()

# 휴일 시간표 이동
driver.execute_script("tab('tab_d03', 1)")
time.sleep(1)

# 다른 시간표 이동
driver.execute_script("javascript:search('SB001')") # ~SB016까지 존재
time.sleep(1)

# 페이지 닫기
driver.close()

신사
30 42 55
06 18 30 42 54
04 14 21 27 33 38 43 48 53 58
03 08 13 18 23 28 33 38 43 48 53 58
03 08 13 18 23 28 33 38 43 49 55
02 08(정자) 14 21 29 37 45 53
01 08 16 24 32 40 48 56
04 12 20 28 36 44 52
00 08 16 24 32 40 48 56
04 12 20 28 36 44 52
00 08 16 24 32 40 48 56
04 12 20 28 36 44 52
00 08 16 24 32 40 48 56
04 12 18 23 28 33 38 43 48 53 58
03 08 13 18 23 28 33 38 43 48 53 58
03 08 13 18 23 28 33 38 43 49 55
01 08 15 22 29 36 43 50 57
04 11 18 25 32 39 46 53
00 07 15 24 32(정자) 40 50
00 10(정자) 21(정자)


3-e-i. 신분당선

In [137]:
# 신분당선 시간표 크롤링 코드
def sinBundangScrapper():
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    driver = webdriver.Chrome(
        '../../../../../../chromeDriver/chromedriver.exe', options=options)

    # 페이지 열기
    driver.get(url='https://www.shinbundang.co.kr')
    time.sleep(1)

    # 페이지 이동
    driver.execute_script("callPage('/dxline/dxline3_1.jsp', '1', '4', '1')")
    time.sleep(1)

    stations = ["신사", "논현", "신논현", "강남", "양재(서초구청)", "양재시민의숲(매헌)", "청계산입구", "판교(판교테크노밸리)", "정자", "미금(분당서울대병원)",
                "동천", "수지구청", "성복", "상현", "광교중앙(아주대)", "광교(경기대)"]
    nextCodes = ["SB002", "SB003", "SB004", "SB005", "SB006", "SB007", "SB008",
                 "SB009", "SB010", "SB011", "SB012", "SB013", "SB014", "SB015", "SB016", ""]
    prefixes = ["05:", "06:", "07:", "08:", "09:", "10:", "11:", "12:", "13:", "14:",
                "15:", "16:", "17:", "18:", "19:", "20:", "21:", "22:", "23:", "24:", "25:"]
    
    constants = list(zip(stations, nextCodes))

    cnt = 0
    errInd = []
    result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호",
                          "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])

    for station, nextCode in constants:
        html = driver.page_source
        soupCB = BeautifulSoup(html, 'html.parser')

        # 표에서 내용 가져오기
        timetables = ["",
                      soupCB.select(
                          "#tab_d03 > dd:nth-child(2) > table > tbody > tr > td"),
                      soupCB.select(
                          "#tab_d03 > dd:nth-child(4) > table > tbody > tr > td")]

        for week in range(1, 3):
            for i in range(0, len(timetables[week]), 3):
                # 상행
                if len(timetables[week][i]) == 0:
                    continue
                else:
                    arr = timetables[week][i].string.split(" ")

                for ele in arr:
                    timeStr = prefixes[i // 3] + ele

                    result.loc[cnt] = ["신분당선", np.NaN, np.NaN,
                                       station, np.NaN, timeStr, np.NaN, np.NaN, "광교(경기대)", "신사", week, 1, "G"]
                    cnt += 1

            for i in range(2, len(timetables[week]), 3):
                # 하행
                if len(timetables[week][i]) == 0:
                    continue
                elif len(timetables[week][i]) == 1:
                    arr = timetables[week][i].string.split(" ")
                else:
                    arr = list(timetables[week][i])
                    newArr = []
                    for ele in arr:
                        if str(type(ele)) == "<class 'bs4.element.Tag'>":
                            ele = ele.get_text()
                            newArr.append(ele[:2])
                        else:
                            ele = ele.split(" ")
                            newArr.extend(ele)

                    arr = newArr

                for ele in arr:
                    ele = ele.strip()

                    if ele == "":
                        continue

                    timeStr = prefixes[i // 3] + ele
                    if len(timeStr) != 5:
                        if len(timeStr) == 9 and timeStr[5] == '(':
                            timeStr = timeStr[:5]
                        else :
                            errInd.append(cnt)
                            
                    result.loc[cnt] = ["신분당선", np.NaN, np.NaN,
                                       station, np.NaN, timeStr, np.NaN, np.NaN, "신사", "광교(경기대)", week, 2, "G"]
                    cnt += 1

        # 다른 시간표 이동
        if nextCode != "":
            driver.execute_script(
                "search('{0}')".format(nextCode))  # ~SB016까지 존재
            time.sleep(1)

    # 페이지 닫기
    driver.close()
    
    print(errInd)
    return result

In [138]:
# # 함수 수행 후 엑셀에 저장해 보기

# sinBundang = sinBundangScrapper()
# sinBundang


[]


Unnamed: 0,노선명,외부코드,역번호,역사명,열차번호,도착시간,출발역번호,도착역번호,출발역사명,도착역사명,요일,방향,급행선
0,신분당선,,,신사,,05:30,,,신사,광교(경기대),1,2,G
1,신분당선,,,신사,,05:42,,,신사,광교(경기대),1,2,G
2,신분당선,,,신사,,05:55,,,신사,광교(경기대),1,2,G
3,신분당선,,,신사,,06:06,,,신사,광교(경기대),1,2,G
4,신분당선,,,신사,,06:18,,,신사,광교(경기대),1,2,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8849,신분당선,,,광교(경기대),,22:46,,,광교(경기대),신사,2,1,G
8850,신분당선,,,광교(경기대),,22:54,,,광교(경기대),신사,2,1,G
8851,신분당선,,,광교(경기대),,23:02,,,광교(경기대),신사,2,1,G
8852,신분당선,,,광교(경기대),,23:11,,,광교(경기대),신사,2,1,G


In [139]:
# sinBundang.to_csv("timetable/result/신분당선.csv")

3-e-ii. 신림선

In [221]:
# import

import re

In [226]:
# 신림선 시간표 크롤링 코드
def sillimScrapper():
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    driver = webdriver.Chrome(
        '../../../../../../chromeDriver/chromedriver.exe', options=options)


    codes = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11"]
    prefixes = ["05:", "06:", "07:", "08:", "09:", "10:", "11:", "12:", "13:", "14:",
                "15:", "16:", "17:", "18:", "19:", "20:", "21:", "22:", "23:", "24:", "25:"]

    cnt = 0
    errInd = []
    result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호",
                          "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])

    for code in codes:
        # 페이지 열기
        driver.get(
            url='http://www.sillimlrt.com/kr/html/sub01/010102{0}.html'.format(code))
        time.sleep(1)
    
        html = driver.page_source
        soupCB = BeautifulSoup(html, 'html.parser')

        # 표에서 내용 가져오기
        station = soupCB.select_one("#location > h2")
        station = station.get_text()
        
        timetables = ["", 
                      ["",
                      soupCB.select(
                          "#tab1 > div.rail_schedule > div:nth-child(2) > div > table > tbody"),
                      soupCB.select(
                          "#tab1 > div.rail_schedule > div:nth-child(1) > div > table > tbody")],
                      ["",
                      soupCB.select(
                          "#tab2 > div.rail_schedule > div:nth-child(2) > div > table > tbody"),
                      soupCB.select(
                          "#tab2 > div.rail_schedule > div:nth-child(1) > div > table > tbody")]]
                        
        for week in range(1, 3):
            for inout in range(1, 3):
                arr = list(timetables[week][inout][0])

                for i in range(1, len(arr), 2):
                    temp = arr[i].find("td").get_text()
                    temp.strip()
                    if temp == "" or temp == "-":
                        continue
                    else:
                        temp = re.split(r',| ', temp)

                    for ele in temp:
                        ele = ele.strip()
                        if ele == "" or ele = "-":
                            continue
                        
                        timeStr = prefixes[i // 2] + ele
                        
                        if len(timeStr) != 5:
                            errInd.append(cnt)

                        result.loc[cnt] = ["신림선", np.NaN, np.NaN,
                                           station, np.NaN, timeStr, np.NaN, np.NaN, "관악산(서울대)" if inout == 1 else "샛강", "샛강" if inout == 1 else "관악산(서울대)", week, inout, "G"]
                        cnt += 1

    # 페이지 닫기
    driver.close()

    print(errInd)
    return result


In [227]:
# sillim = sillimScrapper()
# sillim

[598, 606, 832, 1054, 1543, 1694, 6299, 6525]


Unnamed: 0,노선명,외부코드,역번호,역사명,열차번호,도착시간,출발역번호,도착역번호,출발역사명,도착역사명,요일,방향,급행선
0,신림선,,,샛강,,05:30,,,샛강,관악산(서울대),1,2,G
1,신림선,,,샛강,,05:40,,,샛강,관악산(서울대),1,2,G
2,신림선,,,샛강,,05:50,,,샛강,관악산(서울대),1,2,G
3,신림선,,,샛강,,06:00,,,샛강,관악산(서울대),1,2,G
4,신림선,,,샛강,,06:10,,,샛강,관악산(서울대),1,2,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6879,신림선,,,관악산(서울대),,23:04,,,관악산(서울대),샛강,2,1,G
6880,신림선,,,관악산(서울대),,23:14,,,관악산(서울대),샛강,2,1,G
6881,신림선,,,관악산(서울대),,23:24,,,관악산(서울대),샛강,2,1,G
6882,신림선,,,관악산(서울대),,23:34,,,관악산(서울대),샛강,2,1,G


In [228]:
# sillim.to_csv("timetable/result/신림선.csv")

3-e-iii. 우이신설선

In [237]:
# 이상값 확인
# 확인 결과, 원래부터 이상한 값이 페이지에 기록되어 있었음
# 수동으로 수정

sillim = pd.read_csv("timetable/result/신림선.csv")
sillim.info()

print("도착시간 길이가 5가 아닌 행 개수:", len(sillim[sillim["도착시간"].str.len() != 5]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6882 entries, 0 to 6881
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  6882 non-null   int64  
 1   노선명         6882 non-null   object 
 2   외부코드        0 non-null      float64
 3   역번호         0 non-null      float64
 4   역사명         6882 non-null   object 
 5   열차번호        0 non-null      float64
 6   도착시간        6882 non-null   object 
 7   출발역번호       0 non-null      float64
 8   도착역번호       0 non-null      float64
 9   출발역사명       6882 non-null   object 
 10  도착역사명       6882 non-null   object 
 11  요일          6882 non-null   int64  
 12  방향          6882 non-null   int64  
 13  급행선         6882 non-null   object 
dtypes: float64(5), int64(3), object(6)
memory usage: 752.8+ KB
도착시간 길이가 5가 아닌 행 개수: 0


In [260]:
# 우이신설선 시간표 크롤링 코드
def uiScrapper():
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    driver = webdriver.Chrome(
        '../../../../../../chromeDriver/chromedriver.exe', options=options)

    codes = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13"]
    prefixes = ["05:", "06:", "07:", "08:", "09:", "10:", "11:", "12:", "13:", "14:",
                "15:", "16:", "17:", "18:", "19:", "20:", "21:", "22:", "23:", "24:", "25:"]

    cnt = 0
    errInd = []
    result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호",
                          "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])

    for code in codes:
        # 페이지 열기
        driver.get(
            url='http://ui-line.com/html/intro/intro00/intro_00_{0}.php?pGubn=T2'.format(code))
        time.sleep(1)

        html = driver.page_source
        soupCB = BeautifulSoup(html, 'html.parser')

        # 표에서 내용 가져오기
        station = soupCB.select_one(
            "#fare > div > div > ul.route_station > li:nth-child(2) > p")
        station = station.get_text()

        timetables = ["",
                       soupCB.select(
                           "#tab1 > div:nth-child(2) > table > tbody > tr > td"),
                       soupCB.select(
                           "#tab2 > div:nth-child(2) > table > tbody > tr > td")]

        for week in range(1, 3):
            for i in range(2, len(timetables[week]), 3):
                # 상행
                temp = timetables[week][i].get_text()
                
                if len(temp) == 0:
                    continue
                else:
                    arr = temp.split(" ")

                for ele in arr:
                    if ele == "":
                        continue 
                    
                    timeStr = prefixes[i // 3] + ele
                    
                    if len(timeStr) != 5:
                        errInd.append(cnt)

                    result.loc[cnt] = ["우이신설선", np.NaN, np.NaN,
                                       station, np.NaN, timeStr, np.NaN, np.NaN, "북한산우이", "신설동", week, 1, "G"]
                    cnt += 1

            for i in range(0, len(timetables[week]), 3):
                # 하행
                temp = timetables[week][i].get_text()
                
                if len(temp) == 0:
                    continue
                else:
                    arr = temp.split(" ")

                for ele in arr:
                    if ele == "":
                        continue 
                    
                    timeStr = prefixes[i // 3] + ele
                    
                    if len(timeStr) != 5:
                        errInd.append(cnt)

                    result.loc[cnt] = ["우이신설선", np.NaN, np.NaN,
                                       station, np.NaN, timeStr, np.NaN, np.NaN, "신설동", "북한산우이", week, 2, "G"]
                    cnt += 1

    # 페이지 닫기
    driver.close()

    print(errInd)
    return result


In [261]:
# ui = uiScrapper()
# ui

[]


Unnamed: 0,노선명,외부코드,역번호,역사명,열차번호,도착시간,출발역번호,도착역번호,출발역사명,도착역사명,요일,방향,급행선
0,우이신설선,,,북한산우이,,05:30,,,북한산우이,신설동,1,1,G
1,우이신설선,,,북한산우이,,05:42,,,북한산우이,신설동,1,1,G
2,우이신설선,,,북한산우이,,05:54,,,북한산우이,신설동,1,1,G
3,우이신설선,,,북한산우이,,06:06,,,북한산우이,신설동,1,1,G
4,우이신설선,,,북한산우이,,06:13,,,북한산우이,신설동,1,1,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11277,우이신설선,,,신설동,,22:49,,,신설동,북한산우이,2,2,G
11278,우이신설선,,,신설동,,23:01,,,신설동,북한산우이,2,2,G
11279,우이신설선,,,신설동,,23:13,,,신설동,북한산우이,2,2,G
11280,우이신설선,,,신설동,,23:25,,,신설동,북한산우이,2,2,G


In [262]:
# ui.to_csv("./timetable/result/우이신설선.csv")

In [273]:
# 신분당선 역사명 요일 방향별 개수 세기
stations = ["신사", "논현", "신논현", "강남", "양재(서초구청)", "양재시민의숲(매헌)", "청계산입구", "판교(판교테크노밸리)", "정자", "미금(분당서울대병원)",
            "동천", "수지구청", "성복", "상현", "광교중앙(아주대)", "광교(경기대)"]

for week in range(1, 3):
    for inout in range(1, 3):        
        for station in stations:
            print(station, week, inout, len(sinBundang[(sinBundang["역사명"] == station) & (sinBundang["요일"] == week) & (sinBundang["방향"] == inout)]))
            
## 확인 결과, 정자/판교행을 고려할 필요 있음 

신사 1 1 0
논현 1 1 163
신논현 1 1 163
강남 1 1 163
양재(서초구청) 1 1 163
양재시민의숲(매헌) 1 1 163
청계산입구 1 1 163
판교(판교테크노밸리) 1 1 163
정자 1 1 162
미금(분당서울대병원) 1 1 160
동천 1 1 160
수지구청 1 1 160
성복 1 1 160
상현 1 1 160
광교중앙(아주대) 1 1 160
광교(경기대) 1 1 160
신사 1 2 163
논현 1 2 163
신논현 1 2 163
강남 1 2 163
양재(서초구청) 1 2 163
양재시민의숲(매헌) 1 2 163
청계산입구 1 2 163
판교(판교테크노밸리) 1 2 163
정자 1 2 160
미금(분당서울대병원) 1 2 160
동천 1 2 160
수지구청 1 2 160
성복 1 2 160
상현 1 2 160
광교중앙(아주대) 1 2 160
광교(경기대) 1 2 0
신사 2 1 0
논현 2 1 135
신논현 2 1 135
강남 2 1 135
양재(서초구청) 2 1 135
양재시민의숲(매헌) 2 1 135
청계산입구 2 1 135
판교(판교테크노밸리) 2 1 135
정자 2 1 134
미금(분당서울대병원) 2 1 132
동천 2 1 132
수지구청 2 1 132
성복 2 1 132
상현 2 1 132
광교중앙(아주대) 2 1 132
광교(경기대) 2 1 132
신사 2 2 135
논현 2 2 135
신논현 2 2 135
강남 2 2 135
양재(서초구청) 2 2 135
양재시민의숲(매헌) 2 2 135
청계산입구 2 2 135
판교(판교테크노밸리) 2 2 135
정자 2 2 132
미금(분당서울대병원) 2 2 132
동천 2 2 132
수지구청 2 2 132
성복 2 2 132
상현 2 2 132
광교중앙(아주대) 2 2 132
광교(경기대) 2 2 0


In [272]:
# 신림선 역사명 요일 방향별 개수 세기
stations = ["샛강", "대방(성애병원)", "서울지방병무청", "보라매", "보라매공원", "보라매병원(전문건설회관)", "당곡", "신림", "서원", "서울대벤처타운", "관악산(서울대)"]

cnt = 0
for week in range(1, 3):
    for inout in range(1, 3):
        for station in stations:
            cnt += len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)])
            print(station, week, inout, len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)]))

샛강 1 1 0
대방(성애병원) 1 1 192
서울지방병무청 1 1 192
보라매 1 1 192
보라매공원 1 1 193
보라매병원(전문건설회관) 1 1 192
당곡 1 1 192
신림 1 1 192
서원 1 1 192
서울대벤처타운 1 1 192
관악산(서울대) 1 1 191
샛강 1 2 182
대방(성애병원) 1 2 184
서울지방병무청 1 2 181
보라매 1 2 182
보라매공원 1 2 181
보라매병원(전문건설회관) 1 2 192
당곡 1 2 192
신림 1 2 191
서원 1 2 192
서울대벤처타운 1 2 192
관악산(서울대) 1 2 0
샛강 2 1 0
대방(성애병원) 2 1 156
서울지방병무청 2 1 156
보라매 2 1 156
보라매공원 2 1 156
보라매병원(전문건설회관) 2 1 156
당곡 2 1 156
신림 2 1 156
서원 2 1 156
서울대벤처타운 2 1 156
관악산(서울대) 2 1 156
샛강 2 2 150
대방(성애병원) 2 2 150
서울지방병무청 2 2 150
보라매 2 2 150
보라매공원 2 2 150
보라매병원(전문건설회관) 2 2 157
당곡 2 2 157
신림 2 2 156
서원 2 2 157
서울대벤처타운 2 2 156
관악산(서울대) 2 2 0


In [280]:
# 수동 수정 후 재확인
sillim = pd.read_csv("timetable/result/신림선.csv")
stations = ["샛강", "대방(성애병원)", "서울지방병무청", "보라매", "보라매공원",
            "보라매병원(전문건설회관)", "당곡", "신림", "서원", "서울대벤처타운", "관악산(서울대)"]

cnt = 0
for week in range(1, 3):
    for inout in range(1, 3):
        for station in stations:
            cnt += len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)])
            print(station, week, inout, len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)]))


샛강 1 1 0
대방(성애병원) 1 1 192
서울지방병무청 1 1 192
보라매 1 1 192
보라매공원 1 1 192
보라매병원(전문건설회관) 1 1 192
당곡 1 1 192
신림 1 1 192
서원 1 1 192
서울대벤처타운 1 1 192
관악산(서울대) 1 1 192
샛강 1 2 192
대방(성애병원) 1 2 192
서울지방병무청 1 2 192
보라매 1 2 192
보라매공원 1 2 192
보라매병원(전문건설회관) 1 2 192
당곡 1 2 192
신림 1 2 192
서원 1 2 192
서울대벤처타운 1 2 192
관악산(서울대) 1 2 0
샛강 2 1 0
대방(성애병원) 2 1 156
서울지방병무청 2 1 156
보라매 2 1 156
보라매공원 2 1 156
보라매병원(전문건설회관) 2 1 156
당곡 2 1 156
신림 2 1 156
서원 2 1 156
서울대벤처타운 2 1 156
관악산(서울대) 2 1 156
샛강 2 2 156
대방(성애병원) 2 2 156
서울지방병무청 2 2 156
보라매 2 2 156
보라매공원 2 2 156
보라매병원(전문건설회관) 2 2 156
당곡 2 2 156
신림 2 2 156
서원 2 2 156
서울대벤처타운 2 2 156
관악산(서울대) 2 2 0


In [275]:
# 우이신설선 역사명 요일 방향별 개수 세기
stations = ["북한산우이", "솔밭공원", "4·19민주묘지", "가오리", "화계", "삼양", "삼양사거리", "솔샘", "북한산보국문", "정릉", "성신여대입구", "보문", "신설동"]

cnt = 0
for week in range(1, 3):
    for inout in range(1, 3):
        for station in stations:
            cnt += len(ui[(ui["역사명"] == station) & (
                ui["요일"] == week) & (ui["방향"] == inout)])
            print(station, week, inout, len(ui[(ui["역사명"] == station) & (
                ui["요일"] == week) & (ui["방향"] == inout)]))

북한산우이 1 1 258
솔밭공원 1 1 258
4·19민주묘지 1 1 258
가오리 1 1 258
화계 1 1 258
삼양 1 1 258
삼양사거리 1 1 258
솔샘 1 1 258
북한산보국문 1 1 258
정릉 1 1 258
성신여대입구 1 1 258
보문 1 1 258
신설동 1 1 0
북한산우이 1 2 0
솔밭공원 1 2 258
4·19민주묘지 1 2 258
가오리 1 2 258
화계 1 2 258
삼양 1 2 258
삼양사거리 1 2 258
솔샘 1 2 258
북한산보국문 1 2 258
정릉 1 2 258
성신여대입구 1 2 258
보문 1 2 258
신설동 1 2 258
북한산우이 2 1 212
솔밭공원 2 1 212
4·19민주묘지 2 1 214
가오리 2 1 212
화계 2 1 212
삼양 2 1 212
삼양사거리 2 1 212
솔샘 2 1 212
북한산보국문 2 1 212
정릉 2 1 212
성신여대입구 2 1 212
보문 2 1 212
신설동 2 1 0
북한산우이 2 2 0
솔밭공원 2 2 212
4·19민주묘지 2 2 212
가오리 2 2 212
화계 2 2 212
삼양 2 2 212
삼양사거리 2 2 212
솔샘 2 2 212
북한산보국문 2 2 212
정릉 2 2 212
성신여대입구 2 2 212
보문 2 2 212
신설동 2 2 212


In [277]:
# 4·19민주묘지의 이상값 수동 해결 - 카카오 노선도

ui = pd.read_csv("./timetable/result/우이신설선.csv")
print(len(ui[(ui["역사명"] == "4·19민주묘지") & (
    ui["요일"] == 2) & (ui["방향"] == 1)]))

212


### 4. 시간표 형태 변경

##### 4-a. 열차번호 있는 시간표

In [7]:
def mkOldTableToNewTable(timetable):
    # 처음에 변경했던 형식을 받아
    # 새 형식 만들기

    cnt = 0
    trainNum = dict()
    colList = list(set(timetable["역사명"])) + ["열차번호", "출발역사명",
                                             "도착역사명", "출발시간", "도착시간", "요일", "방향", "급행선"]

    result = pd.DataFrame([], columns=colList)

    nowIndex = 0
    for i in range(len(timetable)):
        if str(timetable["열차번호"].loc[i]) + "/" + str(timetable["요일"].loc[i]) + str(timetable["방향"].loc[i]) not in trainNum:
            # 새로 추가
            trainNum[str(timetable["열차번호"].loc[i]) + "/" +
                     str(timetable["요일"].loc[i]) + str(timetable["방향"].loc[i])] = cnt

            result.loc[cnt] = [np.NaN] * len(colList)
            result["열차번호"].loc[cnt] = timetable["열차번호"].loc[i]
            result["출발역사명"].loc[cnt] = timetable["출발역사명"].loc[i]
            result["도착역사명"].loc[cnt] = timetable["도착역사명"].loc[i]
            result["요일"].loc[cnt] = timetable["요일"].loc[i]
            result["방향"].loc[cnt] = timetable["방향"].loc[i]

            if timetable["급행선"].loc[i] == "D":
                result["급행선"].loc[cnt] = True
            else:
                result["급행선"].loc[cnt] = False

            nowIndex = cnt
            cnt += 1
        else:
            # 이미 해당 열차번호에 해당하는 tuple 존재
            nowIndex = trainNum[str(timetable["열차번호"].loc[i]) + "/" +
                                str(timetable["요일"].loc[i]) + str(timetable["방향"].loc[i])]

        # nowIndex를 통해 정보 넣기
        colName = timetable["역사명"].loc[i]
        result[colName].loc[nowIndex] = timetable["도착시간"].loc[i]

        if timetable["역사명"].loc[i] == timetable["출발역사명"].loc[i]:
            result["출발시간"].loc[nowIndex] = timetable["도착시간"].loc[i]
        if timetable["역사명"].loc[i] == timetable["도착역사명"].loc[i]:
            result["도착시간"].loc[nowIndex] = timetable["도착시간"].loc[i]

    return result


In [37]:
# 테스트
pd.set_option('mode.chained_assignment', None)
              
route1 = pd.read_csv("./timetable/result/1호선.csv")
result1 = mkOldTableToNewTable(route1)

result1

# 실제 시간표는 colab을 이용하여 만듦 (속도 문제로 인해)

  route1 = pd.read_csv("./timetable/result/1호선.csv")


Unnamed: 0,가능,지행,회룡,관악,화서,금천구청,동인천,서울역,백운,외대앞,...,도원,양주,군포,출발역사명,도착역사명,출발시간,도착시간,요일,방향,급행선
0,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,True
1,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,False
2,,,,,,,,,,,...,,,,신창,청량리,,,1.0,1.0,True
3,,,,,,,,,,,...,,,,신창,청량리,,,1.0,1.0,True
4,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,True
5,,,,,,,,,,,...,,,,신창,청량리,,,1.0,1.0,True
6,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,True
7,,,,,,,,,,,...,,,,신창,청량리,,,1.0,1.0,True
8,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,True
9,,,,,,,,,,,...,,,,천안,청량리,,,1.0,1.0,True


In [5]:
import warnings

warnings.simplefilter(
    action='ignore', category=FutureWarning)  # FutureWarning 제거
pd.set_option('mode.chained_assignment', None)

In [None]:
for i in range(9):
  temp = pd.read_csv("./timetable/result/{0}호선.csv".format(i + 1))
  result = mkOldTableToNewTable(temp)
  result.to_csv("./timetable/result_2/{0}호선.csv".format(i + 1))

In [None]:
names = ["경의중앙선", "경춘선", "수인분당선"]

for name in names:
  temp = pd.read_csv("./timetable/result/" + name + ".csv")
  result = mkOldTableToNewTable(temp)
  result.to_csv("./timetable/result_2/" + name + ".csv")

##### 4-b. 열차번호 없는 시간표

In [19]:
# 신분당선 확인
sinBundang = pd.read_csv("./timetable/result/신분당선.csv")

stations = ["신사", "논현", "신논현", "강남", "양재(서초구청)", "양재시민의숲(매헌)", "청계산입구", "판교(판교테크노밸리)", "정자", "미금(분당서울대병원)",
            "동천", "수지구청", "성복", "상현", "광교중앙(아주대)", "광교(경기대)"]

for week in range(1, 3):
    for inout in range(1, 3):
        for station in stations:
            print(station, week, inout, len(sinBundang[(sinBundang["역사명"] == station) & (
                sinBundang["요일"] == week) & (sinBundang["방향"] == inout)]))
            
# 종점 엑셀 만든 뒤 확인 - 신사, 정자 문제 확인
# 다시 한 번 종점 엑셀 만든 뒤 재확인 - 정자 문제 확인

신사 1 1 163
논현 1 1 163
신논현 1 1 163
강남 1 1 163
양재(서초구청) 1 1 163
양재시민의숲(매헌) 1 1 163
청계산입구 1 1 163
판교(판교테크노밸리) 1 1 163
정자 1 1 162
미금(분당서울대병원) 1 1 160
동천 1 1 160
수지구청 1 1 160
성복 1 1 160
상현 1 1 160
광교중앙(아주대) 1 1 160
광교(경기대) 1 1 160
신사 1 2 163
논현 1 2 163
신논현 1 2 163
강남 1 2 163
양재(서초구청) 1 2 163
양재시민의숲(매헌) 1 2 163
청계산입구 1 2 163
판교(판교테크노밸리) 1 2 163
정자 1 2 164
미금(분당서울대병원) 1 2 160
동천 1 2 160
수지구청 1 2 160
성복 1 2 160
상현 1 2 160
광교중앙(아주대) 1 2 160
광교(경기대) 1 2 160
신사 2 1 135
논현 2 1 135
신논현 2 1 135
강남 2 1 135
양재(서초구청) 2 1 135
양재시민의숲(매헌) 2 1 135
청계산입구 2 1 135
판교(판교테크노밸리) 2 1 135
정자 2 1 134
미금(분당서울대병원) 2 1 132
동천 2 1 132
수지구청 2 1 132
성복 2 1 132
상현 2 1 132
광교중앙(아주대) 2 1 132
광교(경기대) 2 1 132
신사 2 2 135
논현 2 2 135
신논현 2 2 135
강남 2 2 135
양재(서초구청) 2 2 135
양재시민의숲(매헌) 2 2 135
청계산입구 2 2 135
판교(판교테크노밸리) 2 2 135
정자 2 2 136
미금(분당서울대병원) 2 2 132
동천 2 2 132
수지구청 2 2 132
성복 2 2 132
상현 2 2 132
광교중앙(아주대) 2 2 132
광교(경기대) 2 2 132


In [22]:
# 각 출발->도착역 조합 확인
ends = ["신사", "판교(판교테크노밸리)", "정자", "광교(경기대)"]
stations = ["신사", "논현", "신논현", "강남", "양재(서초구청)", "양재시민의숲(매헌)", "청계산입구", "판교(판교테크노밸리)", "정자", "미금(분당서울대병원)",
            "동천", "수지구청", "성복", "상현", "광교중앙(아주대)", "광교(경기대)"]
            
# for week in range(1, 3):
#     for inout in range(1, 3):
#         for start in ends:
#             for end in ends:
#                 if start == end: 
#                     continue
                
#                 for station in stations:
#                     count = len(sinBundang[(sinBundang["역사명"] == station) & (sinBundang["요일"] == week) & (sinBundang["방향"] == inout) & (sinBundang["출발역사명"] == start) & (sinBundang["도착역사명"] == end)])
#                     print(station + " // " + start + "->" + end + " // " + str(week) + str(inout) + " : " + str(count))
            
# 확인 결과, 신사->광교에서 문제 발견
# 맨 앞 열차는 정자->광교로 수정

In [3]:
# 신림선 확인
sillim = pd.read_csv("./timetable/result/신림선.csv")

# 신림선 역사명 요일 방향별 개수 세기
stations = ["샛강", "대방(성애병원)", "서울지방병무청", "보라매", "보라매공원",
            "보라매병원(전문건설회관)", "당곡", "신림", "서원", "서울대벤처타운", "관악산(서울대)"]

cnt = 0
for week in range(1, 3):
    for inout in range(1, 3):
        for station in stations:
            cnt += len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)])
            print(station, week, inout, len(sillim[(sillim["역사명"] == station) & (
                sillim["요일"] == week) & (sillim["방향"] == inout)]))


샛강 1 1 192
대방(성애병원) 1 1 192
서울지방병무청 1 1 192
보라매 1 1 192
보라매공원 1 1 192
보라매병원(전문건설회관) 1 1 192
당곡 1 1 192
신림 1 1 192
서원 1 1 192
서울대벤처타운 1 1 192
관악산(서울대) 1 1 192
샛강 1 2 192
대방(성애병원) 1 2 192
서울지방병무청 1 2 192
보라매 1 2 192
보라매공원 1 2 192
보라매병원(전문건설회관) 1 2 192
당곡 1 2 192
신림 1 2 192
서원 1 2 192
서울대벤처타운 1 2 192
관악산(서울대) 1 2 192
샛강 2 1 156
대방(성애병원) 2 1 156
서울지방병무청 2 1 156
보라매 2 1 156
보라매공원 2 1 156
보라매병원(전문건설회관) 2 1 156
당곡 2 1 156
신림 2 1 156
서원 2 1 156
서울대벤처타운 2 1 156
관악산(서울대) 2 1 156
샛강 2 2 156
대방(성애병원) 2 2 156
서울지방병무청 2 2 156
보라매 2 2 156
보라매공원 2 2 156
보라매병원(전문건설회관) 2 2 156
당곡 2 2 156
신림 2 2 156
서원 2 2 156
서울대벤처타운 2 2 156
관악산(서울대) 2 2 156


In [None]:
names = ["신림선", "신분당선", "우이신설선"]

for name in names:
  temp = pd.read_csv("./timetable/result/" + name + ".csv")
  result = mkOldTableToNewTable(temp)
  result.to_csv("./timetable/result_2/" + name + ".csv")

##### 4-c. 공항철도선

In [45]:
# 함수 정의
def mkPlainTableToSeoulTable(route, korail_table, weekBound, inoutBound):
    cnt = 0
    result = pd.DataFrame([], columns=["노선명", "외부코드", "역번호", "역사명", "열차번호",
                          "도착시간", "출발역번호", "도착역번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"])

    for week in range(1, weekBound):  # 요일
        for inout in range(1, inoutBound):  # 상/하행
            # 하나의 시간표 테이블에 대해 변경 코드 수행
            cols = len(korail_table[week][inout].columns)
            rows = len(korail_table[week][inout].index)

            for j in range(1, cols):  # 열차
                startName = korail_table[week][inout][j][1]
                endName = korail_table[week][inout][j][3]
                trainNum = korail_table[week][inout][j][0]
                directTrainSet = set()
                
                if (pd.notnull(korail_table[week][inout][j][34])):
                    # 직통인 경우
                    directTrainSet.add(trainNum)
                
                # print(startName, endName, trainNum,
                #       korail_table[week][inout][j][34])
                
                for i in range(5, rows - 2, 2):  # 각 역
                    time = ''
                    name = korail_table[week][inout][0][i]

                    if (pd.isnull(startName) and pd.isnull(endName) and pd.isnull(trainNum)):
                        break

                    # print(korail_table[week][inout][j][i], korail_table[week][inout][j][i + 1])
                    # 저장할 데이터 없음
                    if pd.isnull(korail_table[week][inout][j][i]) and (pd.isnull(korail_table[week][inout][j][i + 1])):
                        continue

                    # 시간 데이터 저장
                    if (pd.notnull(korail_table[week][inout][j][i]) and pd.notnull(korail_table[week][inout][j][i + 1])):
                        # 기본적으로, 도착 시간으로 저장
                        time = korail_table[week][inout][j][i]

                    else:
                        # 도착 시간만 없고 시작역일 경우, 출발 시간으로 저장
                        if pd.isnull(korail_table[week][inout][j][i]) and (pd.notnull(korail_table[week][inout][j][i + 1])):
                           time = korail_table[week][inout][j][i + 1]
                           
                           if name != startName:  # 출발역이 아닐 경우, 급행
                            continue

                        # 출발 시간이 없고 종착역일 경우, 도착 시간으로 저장
                        elif pd.notnull(korail_table[week][inout][j][i]) and pd.isnull(korail_table[week][inout][j][i + 1]):
                            time = korail_table[week][inout][j][i]

                    # 시간 정보 가공
                    if len(time) < 8:
                        time = '0' + time
                    if int(time[:2]) < 4:
                        time = str(int(time[:2]) + 24) + time[2:]

                    result.loc[cnt] = [route, np.NaN, np.NaN,
                                       name, trainNum, time, np.NaN, np.NaN, startName, endName, week, inout, "D" if trainNum in directTrainSet else "G"]

                    cnt += 1

    # 초단위 버림
    result["도착시간"] = result["도착시간"].str.slice(start=0, stop=5)

    return result

In [46]:
temp = [[0] for i in range(3)]

for i in range(1, 3):
    for j in range(1, 3):
        temp[i].append(pd.read_csv(
            'timetable/original/공항철도선_{0}_{1}.csv'.format(i, j), header=None))

        # 컬럼에 포함된 공백 null처리
        temp[i][j] = temp[i][j].replace(
            to_replace=r'^[\s]+$', value=np.NaN, regex=True)
        
        temp[i][j] = temp[i][j].replace('---', np.NaN)

        temp[i][j].dropna(how='all', axis='columns')

# print(temp)
result = mkPlainTableToSeoulTable("공항철도선", temp, 3, 3)
result.to_csv("timetable/result/공항철도선.csv")

In [None]:
temp = pd.read_csv("./timetable/result/공항철도선.csv")
result = mkOldTableToNewTable(temp)
result.to_csv("./timetable/result_2/공항철도선.csv")

### 5. 역번호로 변경

##### 5-a. result

In [81]:
result = pd.read_csv('result_train_station_withcode.csv')

In [82]:
filenames = [ "경의중앙선", "경춘선", "공항철도선", "1호선", "2호선", "3호선", "4호선", "5호선", "6호선", "7호선", "8호선", "9호선", "수인분당선", "신림선", "신분당선", "우이신설선"]
routes = sorted(set(result["노선명"]))

In [83]:
timetables = [0] * len(filenames)
for i in range(len(timetables)):
    timetables[i] = pd.read_csv(
        "./timetable/result/" + filenames[i] + ".csv")

  timetables[i] = pd.read_csv(


In [84]:
timetables[0]["역사명"] = timetables[0]["역사명"].replace({
    '1양원': '양원',
    '1양정': '양정',
    '디엠시': '디지털미디어시티',
    '홍대입': '홍대입구',
    '효창공': '효창공원앞',
    '서울': '서울역'
})

timetables[0]["출발역사명"] = timetables[0]["출발역사명"].replace({
    '1양원': '양원',
    '1양정': '양정',
    '디엠시': '디지털미디어시티',
    '홍대입': '홍대입구',
    '효창공': '효창공원앞',
    '서울': '서울역'
})

timetables[0]["도착역사명"] = timetables[0]["도착역사명"].replace({
    '1양원': '양원',
    '1양정': '양정',
    '디엠시': '디지털미디어시티',
    '홍대입': '홍대입구',
    '효창공': '효창공원앞',
    '서울': '서울역'
})

In [85]:
timetables[1]["역사명"] = timetables[1]["역사명"].replace({
    '평내호': '평내호평'
})

timetables[1]["출발역사명"] = timetables[1]["출발역사명"].replace({
    '평내호': '평내호평'
})

timetables[1]["도착역사명"] = timetables[1]["도착역사명"].replace({
    '평내호': '평내호평'
})


In [86]:
timetables[2]["역사명"] = timetables[2]["역사명"].replace({
    '서울': '서울역'
})

timetables[2]["출발역사명"] = timetables[2]["출발역사명"].replace({
    '서울': '서울역'
})

timetables[2]["도착역사명"] = timetables[2]["도착역사명"].replace({
    '서울': '서울역'
})

In [87]:
timetables[3]["역사명"] = timetables[3]["역사명"].replace({
    '쌍용': '쌍용(나사렛대)'
})

timetables[3]["출발역사명"] = timetables[3]["도착역사명"].replace({
    '쌍용': '쌍용(나사렛대)'
})

timetables[3]["도착역사명"] = timetables[3]["도착역사명"].replace({
    '쌍용': '쌍용(나사렛대)'
})

In [88]:
timetables[6]["역사명"] = timetables[6]["역사명"].replace({
    '총신대입구': '총신대입구(이수)'
})

timetables[6]["출발역사명"] = timetables[6]["출발역사명"].replace({
    '총신대입구': '총신대입구(이수)'
})

timetables[6]["도착역사명"] = timetables[6]["도착역사명"].replace({
    '총신대입구': '총신대입구(이수)'
})

In [89]:
timetables[7]["역사명"] = timetables[7]["역사명"].replace({
    '광나루': '광나루(장신대)',
    '군자': '군자(능동)',
    '아차산': '아차산(어린이대공원후문)',
    '천호': '천호(풍납토성)'
})

timetables[7]["출발역사명"] = timetables[7]["출발역사명"].replace({
    '광나루': '광나루(장신대)',
    '군자': '군자(능동)',
    '아차산': '아차산(어린이대공원후문)',
    '천호': '천호(풍납토성)'
})

timetables[7]["도착역사명"] = timetables[7]["도착역사명"].replace({
    '광나루': '광나루(장신대)',
    '군자': '군자(능동)',
    '아차산': '아차산(어린이대공원후문)',
    '천호': '천호(풍납토성)'
})

In [90]:
timetables[9]["역사명"] = timetables[9]["역사명"].replace({
    '군자': '군자(능동)',
    '어린이대공원': '어린이대공원(세종대)',
    '이수': '총신대입구(이수)'
})

timetables[9]["출발역사명"] = timetables[9]["도착역사명"].replace({
    '군자': '군자(능동)',
    '어린이대공원': '어린이대공원(세종대)',
    '이수': '총신대입구(이수)'
})

timetables[9]["도착역사명"] = timetables[9]["도착역사명"].replace({
    '군자': '군자(능동)',
    '어린이대공원': '어린이대공원(세종대)',
    '이수': '총신대입구(이수)'
})

In [91]:
timetables[10]["역사명"] = timetables[10]["역사명"].replace({
    '몽촌토성': '몽촌토성(평화의문)',
    '천호': '천호(풍납토성)'
})

timetables[10]["출발역사명"] = timetables[10]["출발역사명"].replace({
    '몽촌토성': '몽촌토성(평화의문)',
    '천호': '천호(풍납토성)'
})

timetables[10]["도착역사명"] = timetables[10]["도착역사명"].replace({
    '몽촌토성': '몽촌토성(평화의문)',
    '천호': '천호(풍납토성)'
})

In [92]:
timetables[12]["역사명"] = timetables[12]["역사명"].replace({
    '강남구': '강남구청',
    '구룡역': '구룡',
    '남동인': '남동인더스파크',
    '대모산': '대모산입구',
    '로데오': '압구정로데오',
    '매탄권': '매탄권선',
    '소래포': '소래포구',
    '수원시': '수원시청',
    '신수원': '수원',
    '신길온': '신길온천',
    '신인천': '인천',
    '인천논': '인천논현'
})

timetables[12]["출발역사명"] = timetables[12]["출발역사명"].replace({
    '강남구': '강남구청',
    '구룡역': '구룡',
    '남동인': '남동인더스파크',
    '대모산': '대모산입구',
    '로데오': '압구정로데오',
    '매탄권': '매탄권선',
    '소래포': '소래포구',
    '수원시': '수원시청',
    '신수원': '수원',
    '신길온': '신길온천',
    '신인천': '인천',
    '인천논': '인천논현'
})

timetables[12]["도착역사명"] = timetables[12]["도착역사명"].replace({
    '강남구': '강남구청',
    '구룡역': '구룡',
    '남동인': '남동인더스파크',
    '대모산': '대모산입구',
    '로데오': '압구정로데오',
    '매탄권': '매탄권선',
    '소래포': '소래포구',
    '수원시': '수원시청',
    '신수원': '수원',
    '신길온': '신길온천',
    '신인천': '인천',
    '인천논': '인천논현'
})

In [93]:
timetables[13]["역사명"] = timetables[13]["역사명"].replace({
    '대방(성애병원)': '대방',
    '보라매병원(전문건설회관)': '보라매병원'
})

timetables[13]["출발역사명"] = timetables[13]["출발역사명"].replace({
    '대방(성애병원)': '대방',
    '보라매병원(전문건설회관)': '보라매병원'
})

timetables[13]["도착역사명"] = timetables[13]["도착역사명"].replace({
    '대방(성애병원)': '대방',
    '보라매병원(전문건설회관)': '보라매병원'
})

In [94]:
timetables[14]["역사명"] = timetables[14]["역사명"].replace({
    '광교(경기대)': '광교',
    '광교중앙(아주대)': '광교중앙',
    '미금(분당서울대병원)': '미금',
    '양재(서초구청)': '양재',
    '양재시민의숲(매헌)': '양재시민의숲',
    '판교(판교테크노밸리)': '판교'
})

timetables[14]["출발역사명"] = timetables[14]["출발역사명"].replace({
    '광교(경기대)': '광교',
    '광교중앙(아주대)': '광교중앙',
    '미금(분당서울대병원)': '미금',
    '양재(서초구청)': '양재',
    '양재시민의숲(매헌)': '양재시민의숲',
    '판교(판교테크노밸리)': '판교'
})

timetables[14]["도착역사명"] = timetables[14]["도착역사명"].replace({
    '광교(경기대)': '광교',
    '광교중앙(아주대)': '광교중앙',
    '미금(분당서울대병원)': '미금',
    '양재(서초구청)': '양재',
    '양재시민의숲(매헌)': '양재시민의숲',
    '판교(판교테크노밸리)': '판교'
})

In [95]:
timetables[15]["역사명"] = timetables[15]["역사명"].replace({
    '4·19민주묘지': '4.19 민주묘지'
})

timetables[15]["출발역사명"] = timetables[15]["출발역사명"].replace({
    '4·19민주묘지': '4.19 민주묘지'
})

timetables[15]["도착역사명"] = timetables[15]["도착역사명"].replace({
    '4·19민주묘지': '4.19 민주묘지'
})

In [96]:
# 재확인

for i in range(len(timetables)):
    station_name = set(result["역사명"][result["노선명"] == routes[i]])
    timetable_name = set(timetables[i]["역사명"])
    timetable_start_name = set(timetables[i]["출발역사명"])
    timetable_end_name = set(timetables[i]["도착역사명"])
    # print(timetable_name)

    station_diff = sorted(station_name - timetable_name)
    timetable_diff = sorted(timetable_name - station_name)
    timetable_start_diff = sorted(timetable_start_name - station_name)
    timetable_end_diff = sorted(timetable_end_name - station_name)

    print(" ----- " + filenames[i] + " ----- ")
    # print(station_name)
    # print(timetable_name)
    # print(station_diff)
    print(timetable_diff)
    print(timetable_start_diff)
    print(timetable_end_diff)
    print(" ---------- ")

# 서울의 경우, 서울역으로 표기 후 따로 API 호출 시 바꾸기

 ----- 경의중앙선 ----- 
[]
[]
[]
 ---------- 
 ----- 경춘선 ----- 
[]
[]
[]
 ---------- 
 ----- 공항철도선 ----- 
[]
[]
[]
 ---------- 
 ----- 1호선 ----- 
[]
[]
[]
 ---------- 
 ----- 2호선 ----- 
[]
[]
[]
 ---------- 
 ----- 3호선 ----- 
[]
[]
[]
 ---------- 
 ----- 4호선 ----- 
[]
[]
[]
 ---------- 
 ----- 5호선 ----- 
[]
[]
[]
 ---------- 
 ----- 6호선 ----- 
[]
[]
[]
 ---------- 
 ----- 7호선 ----- 
[]
[]
[]
 ---------- 
 ----- 8호선 ----- 
[]
[]
[]
 ---------- 
 ----- 9호선 ----- 
[]
[]
[]
 ---------- 
 ----- 수인분당선 ----- 
[]
[]
[]
 ---------- 
 ----- 신림선 ----- 
[]
[]
[]
 ---------- 
 ----- 신분당선 ----- 
[]
[]
[]
 ---------- 
 ----- 우이신설선 ----- 
[]
[]
[]
 ---------- 


In [97]:
for i in range(len(timetables)):
    station_name = list(result["역사명"][result["노선명"] == routes[i]])
    station_num = list(result["역코드"][result["노선명"] == routes[i]])
    name_to_num = dict(zip(station_name, station_num))

    # print(name_to_num)

    timetables[i]["역사명"] = timetables[i]["역사명"].replace(name_to_num)
    timetables[i]["출발역사명"] = timetables[i]["출발역사명"].replace(name_to_num)
    timetables[i]["도착역사명"] = timetables[i]["도착역사명"].replace(name_to_num)
    
    timetables[i] = timetables[i].rename(columns={"역사명": "역코드"})
    timetables[i] = timetables[i].rename(columns={"출발역사명": "출발역코드"})
    timetables[i] = timetables[i].rename(columns={"도착역사명": "도착역코드"})

In [101]:
for i in range(len(timetables)):
    timetables[i]["급행선"] = timetables[i]["급행선"].replace({'G': False, 'D': True})
    
    timetables[i] = timetables[i].drop(columns=['외부코드', '역번호'])
    timetables[i] = timetables[i].drop(columns=['출발역번호', '도착역번호'])

In [102]:
# 테스트

timetables[1]

Unnamed: 0,노선명,역코드,열차번호,도착시간,출발역코드,도착역코드,요일,방향,급행선
0,경춘선,1824.0,K8008,05:27,1830.0,1806.0,1,1,False
1,경춘선,1824.0,K8010,05:53,1830.0,1806.0,1,1,False
2,경춘선,1824.0,K8012,06:20,1830.0,1810.0,1,1,False
3,경춘선,1824.0,K8402,06:36,1830.0,1806.0,1,1,True
4,경춘선,1824.0,K8016,06:52,1830.0,1810.0,1,1,False
...,...,...,...,...,...,...,...,...,...
4134,경춘선,1807.0,K8039,13:33,1806.0,1830.0,2,2,False
4135,경춘선,1807.0,K8055,16:54,1806.0,1830.0,2,2,False
4136,경춘선,1807.0,K8067,19:31,1806.0,1830.0,2,2,False
4137,경춘선,1807.0,K8079,22:09,1806.0,1830.0,2,2,False


##### 5-b. result_2

In [6]:
timetables = [0] * len(filenames)
for i in range(len(timetables)):
    timetables[i] = pd.read_csv("./timetable/result_2/" + filenames[i] + ".csv")

In [108]:
for i in range(len(timetables)):
    station_name = set(result["역사명"][result["노선명"] == routes[i]])
    timetable_name = set(timetables[i].columns) - {"열차번호", "출발역사명", "도착역사명", "출발시간", "도착시간", "요일", "방향", "급행선"}
    # print(timetable_name)
    
    station_diff = sorted(station_name - timetable_name)
    timetable_diff = sorted(timetable_name - station_name)

    print(" ----- " + filenames[i] + " ----- ")
    # print(station_name)
    # print(timetable_name)
    print(station_diff)
    print(timetable_diff)
    print(" ---------- ")

 ----- 경의중앙선 ----- 
['디지털미디어시티', '서울역', '양원', '양정', '홍대입구', '효창공원앞']
['1양원', '1양정', '디엠시', '서울', '홍대입', '효창공']
 ---------- 
 ----- 경춘선 ----- 
['평내호평']
['평내호']
 ---------- 
 ----- 공항철도선 ----- 
['서울역']
['서울']
 ---------- 
 ----- 1호선 ----- 
['쌍용(나사렛대)']
['쌍용']
 ---------- 
 ----- 2호선 ----- 
[]
[]
 ---------- 
 ----- 3호선 ----- 
[]
[]
 ---------- 
 ----- 4호선 ----- 
['총신대입구(이수)']
['총신대입구']
 ---------- 
 ----- 5호선 ----- 
['광나루(장신대)', '군자(능동)', '아차산(어린이대공원후문)', '천호(풍납토성)']
['광나루', '군자', '아차산', '천호']
 ---------- 
 ----- 6호선 ----- 
[]
[]
 ---------- 
 ----- 7호선 ----- 
['군자(능동)', '어린이대공원(세종대)', '총신대입구(이수)']
['군자', '어린이대공원', '이수']
 ---------- 
 ----- 8호선 ----- 
['몽촌토성(평화의문)', '천호(풍납토성)']
['몽촌토성', '천호']
 ---------- 
 ----- 9호선 ----- 
[]
[]
 ---------- 
 ----- 수인분당선 ----- 
['강남구청', '구룡', '남동인더스파크', '대모산입구', '매탄권선', '소래포구', '수원', '수원시청', '신길온천', '압구정로데오', '인천', '인천논현']
['강남구', '구룡역', '남동인', '대모산', '로데오', '매탄권', '소래포', '수원시', '신길온', '신수원', '신인천', '인천논']
 ---------- 
 ----- 신림선 ----- 
['대방', '보라매병원']
[

In [109]:
timetables[0] = timetables[0].rename(columns={
    '1양원': '양원', 
    '1양정': '양정', 
    '디엠시': '디지털미디어시티', 
    '홍대입': '홍대입구', 
    '효창공': '효창공원앞',
    '서울': '서울역'
})

In [110]:
timetables[1] = timetables[1].rename(columns={
    '평내호': '평내호평'
})

In [111]:
timetables[2] = timetables[2].rename(columns={
    '서울': '서울역'
})

In [112]:
timetables[3] = timetables[3].rename(columns={
    '쌍용': '쌍용(나사렛대)'
})

In [113]:
timetables[6] = timetables[6].rename(columns={
    '총신대입구': '총신대입구(이수)'
})

In [114]:
timetables[7] = timetables[7].rename(columns={
    '광나루': '광나루(장신대)',
    '군자': '군자(능동)',
    '아차산': '아차산(어린이대공원후문)',
    '천호': '천호(풍납토성)'
})

In [115]:
timetables[9] = timetables[9].rename(columns={
    '군자': '군자(능동)', 
    '어린이대공원': '어린이대공원(세종대)', 
    '이수': '총신대입구(이수)'
})

In [116]:
timetables[10] = timetables[10].rename(columns={
    '몽촌토성': '몽촌토성(평화의문)', 
    '천호': '천호(풍납토성)'
})

In [117]:
timetables[12] = timetables[12].rename(columns={
    '강남구': '강남구청',
    '구룡역': '구룡', 
    '남동인': '남동인더스파크', 
    '대모산': '대모산입구', 
    '로데오': '압구정로데오', 
    '매탄권': '매탄권선',
    '소래포': '소래포구', 
    '수원시': '수원시청',
    '신수원': '수원', 
    '신길온': '신길온천', 
    '신인천': '인천', 
    '인천논': '인천논현'
})

In [118]:
timetables[13] = timetables[13].rename(columns={
    '대방(성애병원)': '대방',
    '보라매병원(전문건설회관)': '보라매병원'
})

In [119]:
timetables[14] = timetables[14].rename(columns={
    '광교(경기대)': '광교', 
    '광교중앙(아주대)': '광교중앙', 
    '미금(분당서울대병원)': '미금',
    '양재(서초구청)': '양재', 
    '양재시민의숲(매헌)': '양재시민의숲', 
    '판교(판교테크노밸리)': '판교'
})

In [120]:
timetables[15] = timetables[15].rename(columns={
    '4·19민주묘지': '4.19 민주묘지'
})

In [121]:
# 재확인

for i in range(len(timetables)):
    station_name = set(result["역사명"][result["노선명"] == routes[i]])
    timetable_name = set(
        timetables[i].columns) - {"열차번호", "출발역사명", "도착역사명", "출발시간", "도착시간", "요일", "방향", "급행선"}
    # print(timetable_name)

    station_diff = sorted(station_name - timetable_name)
    timetable_diff = sorted(timetable_name - station_name)

    print(" ----- " + filenames[i] + " ----- ")
    # print(station_name)
    # print(timetable_name)
    print(station_diff)
    print(timetable_diff)
    print(" ---------- ")
    
# 서울의 경우, 서울역으로 표기 후 따로 API 호출 시 바꾸기

 ----- 경의중앙선 ----- 
[]
[]
 ---------- 
 ----- 경춘선 ----- 
[]
[]
 ---------- 
 ----- 공항철도선 ----- 
[]
[]
 ---------- 
 ----- 1호선 ----- 
[]
[]
 ---------- 
 ----- 2호선 ----- 
[]
[]
 ---------- 
 ----- 3호선 ----- 
[]
[]
 ---------- 
 ----- 4호선 ----- 
[]
[]
 ---------- 
 ----- 5호선 ----- 
[]
[]
 ---------- 
 ----- 6호선 ----- 
[]
[]
 ---------- 
 ----- 7호선 ----- 
[]
[]
 ---------- 
 ----- 8호선 ----- 
[]
[]
 ---------- 
 ----- 9호선 ----- 
[]
[]
 ---------- 
 ----- 수인분당선 ----- 
[]
[]
 ---------- 
 ----- 신림선 ----- 
[]
[]
 ---------- 
 ----- 신분당선 ----- 
[]
[]
 ---------- 
 ----- 우이신설선 ----- 
[]
[]
 ---------- 


In [122]:
for i in range(len(timetables)):
    station_name = list(result["역사명"][result["노선명"] == routes[i]])
    # station_num = np.array(list(result["역코드"][result["노선명"] == routes[i]])).astype(int)
    
    col_change = dict(zip(station_name, station_num))
    
    timetables[i] = timetables[i].rename(columns=col_change)

In [123]:
# 확인

for i in range(len(timetables)):
    station_name = set(result["역사명"][result["노선명"] == routes[i]])
    timetable_name = set(
        timetables[i].columns) - {"열차번호", "출발역사명", "도착역사명", "출발시간", "도착시간", "요일", "방향", "급행선"}
    # print(timetable_name)

    station_diff = sorted(station_name - timetable_name)
    timetable_diff = sorted(timetable_name - station_name)

    print(" ----- " + filenames[i] + " ----- ")
    # print(station_name)
    print(timetable_name)
    # print(station_diff)
    # print(timetable_diff)
    print(" ---------- ")


 ----- 경의중앙선 ----- 
{1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1607, 1606, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1635, 1636, 1390, 1394, 191, 192, 193, 195, 196, 197, 198}
 ---------- 
 ----- 경춘선 ----- 
{1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830}
 ---------- 
 ----- 공항철도선 ----- 
{4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008, 4009, 4010, 4011, 4012, 4013, 4020}
 ---------- 
 ----- 1호선 ----- 
{1401, 1409, 1408, 100, 101, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 1175, 157, 158, 156

### 6. 세부 사항 변경

##### 6-a. result

In [103]:
for i in range(len(timetables)):
    timetables[i]['도착시간'] = timetables[i]['도착시간'].str.replace(':', '')

In [104]:
# 확인
timetables[0]

Unnamed: 0,노선명,역코드,열차번호,도착시간,출발역코드,도착역코드,요일,방향,급행선
0,경의중앙선,1314.0,K5004,0514,1310.0,1629.0,1,1,False
1,경의중앙선,1314.0,K5008,0541,1308.0,1629.0,1,1,False
2,경의중앙선,1314.0,K5010,0559,1300.0,1629.0,1,1,False
3,경의중앙선,1314.0,K5012,0610,1300.0,1622.0,1,1,False
4,경의중앙선,1314.0,K5014,0623,1300.0,1629.0,1,1,False
...,...,...,...,...,...,...,...,...,...
16198,경의중앙선,1606.0,K5137,2237,1629.0,1300.0,2,2,False
16199,경의중앙선,1606.0,K5139,2257,1629.0,1310.0,2,2,False
16200,경의중앙선,1606.0,K5141,2310,1629.0,1310.0,2,2,False
16201,경의중앙선,1606.0,K5143,2334,1629.0,1310.0,2,2,False


In [105]:
for i in range(len(timetables)):
    timetables[i].to_csv('./timetable/result_final/' + filenames[i] +
                         '.csv', index=False, encoding="utf-8-sig")

In [106]:
# 9시 이전 도착 열차 시간표 제거

for i in range(len(timetables)):
    dropList = []
    for j in range(len(timetables[i])):
        if int(timetables[i]["도착시간"].loc[j]) < 2100:
            # 9시 이전 시간표 -> 제거해도 됨
            dropList.append(j)
            
    timetables[i].drop(dropList, inplace=True)

In [107]:
for i in range(len(timetables)):
    timetables[i]["도착시간"] = timetables[i]["도착시간"].astype(int)

In [108]:
for i in range(len(timetables)):
    print(routes[i] + " - " + str(len(timetables[i]["도착시간"][timetables[i]["도착시간"] < 2100])))

경의중앙선 - 0
경춘선 - 0
공항철도 - 0
수도권 1호선 - 0
수도권 2호선 - 0
수도권 3호선 - 0
수도권 4호선 - 0
수도권 5호선 - 0
수도권 6호선 - 0
수도권 7호선 - 0
수도권 8호선 - 0
수도권 9호선 - 0
수인분당선 - 0
신림선 - 0
신분당선 - 0
우이신설선 - 0


In [109]:
for i in range(len(timetables)):
    timetables[i] = timetables[i].sort_values(by=['요일', '방향', '역코드', '도착시간'])

In [110]:
# 테스트
timetables[0]

Unnamed: 0,노선명,역코드,열차번호,도착시간,출발역코드,도착역코드,요일,방향,급행선
12685,경의중앙선,191.0,K5144,2101,1300.0,1629.0,1,1,False
12686,경의중앙선,191.0,K5148,2119,1299.0,1629.0,1,1,False
12687,경의중앙선,191.0,K5146,2132,1310.0,1622.0,1,1,False
12688,경의중앙선,191.0,K5150,2149,1300.0,1629.0,1,1,False
12689,경의중앙선,191.0,K5152,2202,1300.0,1629.0,1,1,False
...,...,...,...,...,...,...,...,...,...
8535,경의중앙선,1636.0,K5141,2226,1629.0,1310.0,2,2,False
8536,경의중앙선,1636.0,K2234,2242,1610.0,1629.0,2,2,False
8537,경의중앙선,1636.0,K5143,2250,1629.0,1310.0,2,2,False
8538,경의중앙선,1636.0,K5145,2308,1629.0,191.0,2,2,False


In [111]:
for i in range(len(timetables)):
    timetables[i].to_csv('./timetable/result_final_after_9pm/' + filenames[i] +
                         '.csv', index=False, encoding="utf-8-sig")

##### 6-b. result_2

In [None]:
for i in range(len(timetables)):
    timetable_name = set(
        timetables[i].columns) - {"열차번호", "출발역사명", "도착역사명", "출발시간", "도착시간", "요일", "방향", "급행선"}

    for name in timetable_name:
        # 시간 형식 바꾸기
        timetables[i][name] = timetables[i][name].str.replace(':', '')
    
    timetables[i]['도착시간'] = timetables[i]['도착시간'].str.replace(':', '')
    timetables[i]['출발시간'] = timetables[i]['출발시간'].str.replace(':', '')

In [9]:
# 확인
timetables[0]

Unnamed: 0,1303,192,1312,1317,1625,1611,1623,1614,1304,1311,...,1622,1621,열차번호,출발역사명,도착역사명,출발시간,도착시간,요일,방향,급행선
0,,532.0,507.0,524.0,641.0,,633.0,607.0,,503.0,...,631.0,628.0,K5004,덕소,문산,0500,655,1.0,1.0,False
1,,559.0,534.0,551.0,708.0,,700.0,634.0,,530.0,...,658.0,655.0,K5008,팔당,문산,0520,722,1.0,1.0,False
2,518.0,617.0,552.0,609.0,727.0,,719.0,652.0,522.0,548.0,...,716.0,714.0,K5010,용문,문산,0501,740,1.0,1.0,False
3,529.0,628.0,603.0,620.0,,,,703.0,533.0,559.0,...,727.0,725.0,K5012,용문,일산,0512,727,1.0,1.0,False
4,542.0,641.0,616.0,633.0,751.0,,743.0,716.0,546.0,612.0,...,740.0,738.0,K5014,용문,문산,0525,804,1.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,,,,,,,,,,,...,,,K2607,임진강,문산,1745,1755,2.0,1.0,False
434,,,,,,,,,,,...,,,K2602,문산,임진강,0935,945,2.0,2.0,False
435,,,,,,,,,,,...,,,K2604,문산,임진강,1035,1045,2.0,2.0,False
436,,,,,,,,,,,...,,,K2606,문산,임진강,1545,1555,2.0,2.0,False


In [None]:
for i in range(len(timetables)):
    timetables[i].to_csv('./timetable/result_2/' + filenames[i] + '.csv', index=False, encoding="utf-8-sig")

In [110]:
for i in range(len(timetables)):
    timetable_time = set(
        timetables[i].columns) - {"열차번호", "출발역사명", "도착역사명", "요일", "방향", "급행선"}

    for time in timetable_time:
        # 시간 형식 바꾸기
        timetables[i][time] = timetables[i][time].fillna(-1)
        # timetables[i][time] = timetables[i][time].astype(int)
        timetables[i][time] = timetables[i][time].replace(-1, np.nan)

In [111]:
timetables[0]

Unnamed: 0,191,192,193,195,196,197,198,1299,1300,1301,...,1635,1636,열차번호,출발시간,도착시간,출발역사명,도착역사명,요일,방향,급행선
312,500.0,503.0,506.0,508.0,511.0,514.0,516.0,,,,...,545.0,607.0,K5002,500,626,청량리,문산,1,1,False
313,,,,,,,,,,,...,550.0,527.0,K2201,509,610,문산,서울,1,1,False
0,527.0,532.0,534.0,537.0,539.0,542.0,545.0,,,,...,613.0,636.0,K5004,500,655,덕소,문산,1,1,False
315,,,,,,,,,,,...,626.0,603.0,K2203,545,646,문산,서울,1,1,False
314,540.0,543.0,546.0,548.0,551.0,554.0,556.0,,,,...,625.0,647.0,K5006,540,706,청량리,문산,1,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,613.0,609.0,607.0,604.0,602.0,559.0,556.0,739.0,734.0,729.0,...,,,K5005,553,739,용산,지평,2,2,False
434,,,,,,,,,,,...,,,K2602,935,945,문산,임진강,2,2,False
435,,,,,,,,,,,...,,,K2604,1035,1045,문산,임진강,2,2,False
436,,,,,,,,,,,...,,,K2606,1545,1555,문산,임진강,2,2,False


In [112]:
result = pd.read_csv('result_train_station_withcode.csv')

filenames = ["경의중앙선", "경춘선", "공항철도선", "1호선", "2호선", "3호선", "4호선",
             "5호선", "6호선", "7호선", "8호선", "9호선", "수인분당선", "신림선", "신분당선", "우이신설선"]
routes = sorted(set(result["노선명"]))

timetables = [0] * len(filenames)

for i in range(len(timetables)):
    timetables[i] = pd.read_csv('./timetable/result_2_final/' + filenames[i] + '.csv')

In [113]:
# 9시 이전 도착 열차 시간표 제거

for i in range(len(timetables)):
    dropList = []
    for j in range(len(timetables[i])):
        if timetables[i]["도착시간"].loc[j] < 2100:
            # 9시 이전 시간표 -> 제거해도 됨
            dropList.append(j)
            
    timetables[i].drop(dropList, inplace=True)

In [114]:
for i in range(len(timetables)):
    print(routes[i] + " - " + str(len(timetables[i]["도착시간"][timetables[i]["도착시간"] < 2100])))

경의중앙선 - 0
경춘선 - 0
공항철도 - 0
수도권 1호선 - 0
수도권 2호선 - 0
수도권 3호선 - 0
수도권 4호선 - 0
수도권 5호선 - 0
수도권 6호선 - 0
수도권 7호선 - 0
수도권 8호선 - 0
수도권 9호선 - 0
수인분당선 - 0
신림선 - 0
신분당선 - 0
우이신설선 - 0


In [115]:
for i in range(len(timetables)):
    print(routes[i] + " - " + str(len(timetables[i]
          ["도착시간"][timetables[i]["도착시간"] >= 2100])))

경의중앙선 - 86
경춘선 - 45
공항철도 - 122
수도권 1호선 - 458
수도권 2호선 - 371
수도권 3호선 - 206
수도권 4호선 - 237
수도권 5호선 - 212
수도권 6호선 - 161
수도권 7호선 - 203
수도권 8호선 - 124
수도권 9호선 - 246
수인분당선 - 138
신림선 - 110
신분당선 - 113
우이신설선 - 132


In [116]:
timetables[0].info()

timetables[0] = timetables[0].sort_values(by=['요일', '방향', '1616'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86 entries, 91 to 430
Data columns (total 65 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   191     73 non-null     float64
 1   192     73 non-null     float64
 2   193     72 non-null     float64
 3   195     73 non-null     float64
 4   196     72 non-null     float64
 5   197     72 non-null     float64
 6   198     73 non-null     float64
 7   1299    6 non-null      float64
 8   1300    39 non-null     float64
 9   1301    38 non-null     float64
 10  1302    39 non-null     float64
 11  1303    38 non-null     float64
 12  1304    38 non-null     float64
 13  1305    38 non-null     float64
 14  1306    39 non-null     float64
 15  1307    38 non-null     float64
 16  1308    40 non-null     float64
 17  1309    41 non-null     float64
 18  1310    68 non-null     float64
 19  1311    67 non-null     float64
 20  1312    68 non-null     float64
 21  1313    68 non-null     float64
 22  13

In [117]:
timetables[1].info()

timetables[1] = timetables[1].sort_values(by=['요일', '방향', '1810'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45 entries, 45 to 203
Data columns (total 33 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1806    11 non-null     float64
 1   1807    11 non-null     float64
 2   1808    9 non-null      float64
 3   1809    1 non-null      float64
 4   1810    42 non-null     float64
 5   1811    40 non-null     float64
 6   1812    40 non-null     float64
 7   1813    42 non-null     float64
 8   1814    42 non-null     float64
 9   1815    42 non-null     float64
 10  1816    42 non-null     float64
 11  1817    40 non-null     float64
 12  1818    45 non-null     int64  
 13  1819    41 non-null     float64
 14  1820    43 non-null     float64
 15  1821    39 non-null     float64
 16  1822    41 non-null     float64
 17  1823    39 non-null     float64
 18  1824    41 non-null     float64
 19  1825    39 non-null     float64
 20  1826    39 non-null     float64
 21  1827    41 non-null     float64
 22  18

In [118]:
timetables[2].info()

timetables[2] = timetables[2].sort_values(by=['요일', '방향', '4001'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122 entries, 155 to 705
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   4001    120 non-null    float64
 1   4002    104 non-null    float64
 2   4003    104 non-null    float64
 3   4004    106 non-null    float64
 4   4005    106 non-null    float64
 5   4006    106 non-null    float64
 6   4007    106 non-null    float64
 7   4008    90 non-null     float64
 8   4009    90 non-null     float64
 9   4010    106 non-null    float64
 10  4011    90 non-null     float64
 11  4012    90 non-null     float64
 12  4013    106 non-null    float64
 13  4020    106 non-null    float64
 14  열차번호    122 non-null    object 
 15  출발시간    122 non-null    int64  
 16  도착시간    122 non-null    int64  
 17  출발역사명   122 non-null    object 
 18  도착역사명   122 non-null    object 
 19  요일      122 non-null    int64  
 20  방향      122 non-null    int64  
 21  급행선     122 non-null    bool   
dtype

In [119]:
timetables[3].info(max_cols=150)

timetables[3] = timetables[3].sort_values(by=['요일', '방향', '141'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458 entries, 296 to 2311
Data columns (total 107 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    100     55 non-null     float64
 1    101     78 non-null     float64
 2    102     78 non-null     float64
 3    103     78 non-null     float64
 4    104     78 non-null     float64
 5    105     78 non-null     float64
 6    106     78 non-null     float64
 7    107     120 non-null    float64
 8    108     120 non-null    float64
 9    110     120 non-null    float64
 10   111     159 non-null    float64
 11   112     159 non-null    float64
 12   113     159 non-null    float64
 13   114     159 non-null    float64
 14   115     159 non-null    float64
 15   116     159 non-null    float64
 16   117     159 non-null    float64
 17   118     159 non-null    float64
 18   119     159 non-null    float64
 19   120     260 non-null    float64
 20   121     238 non-null    float64
 21   122     238

In [120]:
timetables[4].info()

timetables[4] = timetables[4].sort_values(by=['요일', '방향', '234'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 371 entries, 305 to 2659
Data columns (total 59 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   201     122 non-null    float64
 1   202     123 non-null    float64
 2   203     120 non-null    float64
 3   204     120 non-null    float64
 4   205     120 non-null    float64
 5   206     120 non-null    float64
 6   207     120 non-null    float64
 7   208     120 non-null    float64
 8   209     120 non-null    float64
 9   210     120 non-null    float64
 10  211     268 non-null    float64
 11  212     126 non-null    float64
 12  213     126 non-null    float64
 13  214     126 non-null    float64
 14  215     126 non-null    float64
 15  216     126 non-null    float64
 16  217     126 non-null    float64
 17  218     126 non-null    float64
 18  219     129 non-null    float64
 19  220     126 non-null    float64
 20  221     126 non-null    float64
 21  222     126 non-null    float64
 22 

In [121]:
timetables[5].info()

timetables[5] = timetables[5].sort_values(by=['요일', '방향', '330'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206 entries, 153 to 1143
Data columns (total 52 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   310     134 non-null    float64
 1   311     134 non-null    float64
 2   312     134 non-null    float64
 3   313     134 non-null    float64
 4   314     134 non-null    float64
 5   315     134 non-null    float64
 6   316     134 non-null    float64
 7   317     134 non-null    float64
 8   318     134 non-null    float64
 9   319     124 non-null    float64
 10  320     188 non-null    float64
 11  321     186 non-null    float64
 12  322     186 non-null    float64
 13  323     186 non-null    float64
 14  324     186 non-null    float64
 15  325     186 non-null    float64
 16  326     189 non-null    float64
 17  327     189 non-null    float64
 18  328     189 non-null    float64
 19  329     189 non-null    float64
 20  330     189 non-null    float64
 21  331     189 non-null    float64
 22 

In [122]:
timetables[6].info()

timetables[6] = timetables[6].sort_values(by=['요일', '방향', '424'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239 entries, 187 to 1292
Data columns (total 59 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   405     70 non-null     float64
 1   406     70 non-null     float64
 2   408     70 non-null     float64
 3   409     223 non-null    float64
 4   410     220 non-null    float64
 5   411     222 non-null    float64
 6   412     223 non-null    float64
 7   413     223 non-null    float64
 8   414     223 non-null    float64
 9   415     223 non-null    float64
 10  416     223 non-null    float64
 11  417     223 non-null    float64
 12  418     223 non-null    float64
 13  419     226 non-null    float64
 14  420     226 non-null    float64
 15  421     226 non-null    float64
 16  422     226 non-null    float64
 17  423     226 non-null    float64
 18  424     226 non-null    float64
 19  425     226 non-null    float64
 20  426     226 non-null    float64
 21  427     223 non-null    float64
 22 

In [123]:
timetables[7].info()

timetables[7] = timetables[7].sort_values(by=['요일', '방향', '540'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 164 to 1105
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   510     194 non-null    float64
 1   511     194 non-null    float64
 2   512     194 non-null    float64
 3   513     194 non-null    float64
 4   514     194 non-null    float64
 5   515     194 non-null    float64
 6   516     194 non-null    float64
 7   517     196 non-null    float64
 8   518     193 non-null    float64
 9   519     193 non-null    float64
 10  520     193 non-null    float64
 11  521     193 non-null    float64
 12  522     193 non-null    float64
 13  523     193 non-null    float64
 14  524     193 non-null    float64
 15  525     193 non-null    float64
 16  526     196 non-null    float64
 17  527     196 non-null    float64
 18  528     196 non-null    float64
 19  529     196 non-null    float64
 20  530     199 non-null    float64
 21  531     196 non-null    float64
 22 

In [124]:
timetables[8].info()

timetables[8] = timetables[8].sort_values(by=['요일', '방향', '610'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161 entries, 139 to 861
Data columns (total 47 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   610     149 non-null    float64
 1   611     88 non-null     float64
 2   612     88 non-null     float64
 3   613     88 non-null     float64
 4   614     85 non-null     float64
 5   615     85 non-null     float64
 6   616     146 non-null    float64
 7   617     144 non-null    float64
 8   618     144 non-null    float64
 9   619     144 non-null    float64
 10  620     144 non-null    float64
 11  621     144 non-null    float64
 12  622     144 non-null    float64
 13  623     144 non-null    float64
 14  624     144 non-null    float64
 15  625     146 non-null    float64
 16  626     148 non-null    float64
 17  627     145 non-null    float64
 18  628     145 non-null    float64
 19  629     145 non-null    float64
 20  630     145 non-null    float64
 21  631     148 non-null    float64
 22  

In [125]:
timetables[9].info()

timetables[9] = timetables[9].sort_values(by=['요일', '방향', '750'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204 entries, 162 to 1080
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   709     83 non-null     float64
 1   710     186 non-null    float64
 2   711     186 non-null    float64
 3   712     186 non-null    float64
 4   713     186 non-null    float64
 5   714     186 non-null    float64
 6   715     186 non-null    float64
 7   716     186 non-null    float64
 8   717     189 non-null    float64
 9   718     189 non-null    float64
 10  719     189 non-null    float64
 11  720     189 non-null    float64
 12  721     189 non-null    float64
 13  722     189 non-null    float64
 14  723     189 non-null    float64
 15  724     189 non-null    float64
 16  725     189 non-null    float64
 17  726     189 non-null    float64
 18  727     192 non-null    float64
 19  728     189 non-null    float64
 20  729     192 non-null    float64
 21  730     189 non-null    float64
 22 

In [126]:
timetables[10].info()

timetables[10] = timetables[10].sort_values(by=['요일', '방향', '814'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 124 to 794
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   810     121 non-null    float64
 1   811     121 non-null    float64
 2   812     121 non-null    float64
 3   813     121 non-null    float64
 4   814     124 non-null    float64
 5   815     124 non-null    float64
 6   816     124 non-null    float64
 7   817     124 non-null    float64
 8   818     121 non-null    float64
 9   819     121 non-null    float64
 10  820     121 non-null    float64
 11  821     121 non-null    float64
 12  822     121 non-null    float64
 13  823     121 non-null    float64
 14  824     121 non-null    float64
 15  825     121 non-null    float64
 16  826     121 non-null    float64
 17  831     121 non-null    float64
 18  열차번호    124 non-null    int64  
 19  출발시간    124 non-null    int64  
 20  도착시간    124 non-null    int64  
 21  출발역사명   124 non-null    object 
 22  

In [127]:
timetables[11].info()

timetables[11] = timetables[11].sort_values(by=['요일', '방향', '926'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 93 to 1231
Data columns (total 46 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   901     133 non-null    float64
 1   902     229 non-null    float64
 2   903     133 non-null    float64
 3   904     133 non-null    float64
 4   905     230 non-null    float64
 5   906     134 non-null    float64
 6   907     230 non-null    float64
 7   908     131 non-null    float64
 8   909     131 non-null    float64
 9   910     230 non-null    float64
 10  911     134 non-null    float64
 11  912     134 non-null    float64
 12  913     231 non-null    float64
 13  914     131 non-null    float64
 14  915     231 non-null    float64
 15  916     134 non-null    float64
 16  917     231 non-null    float64
 17  918     134 non-null    float64
 18  919     134 non-null    float64
 19  920     234 non-null    float64
 20  921     134 non-null    float64
 21  922     134 non-null    float64
 22  

In [128]:
timetables[12].info()

timetables[12] = timetables[12].sort_values(by=['요일', '방향', '1534'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 147 to 783
Data columns (total 71 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1509    0 non-null      float64
 1   1510    102 non-null    float64
 2   1511    102 non-null    float64
 3   1512    102 non-null    float64
 4   1513    102 non-null    float64
 5   1514    102 non-null    float64
 6   1515    102 non-null    float64
 7   1516    102 non-null    float64
 8   1517    102 non-null    float64
 9   1518    102 non-null    float64
 10  1519    102 non-null    float64
 11  1520    102 non-null    float64
 12  1521    102 non-null    float64
 13  1522    102 non-null    float64
 14  1523    102 non-null    float64
 15  1524    102 non-null    float64
 16  1525    102 non-null    float64
 17  1526    102 non-null    float64
 18  1527    102 non-null    float64
 19  1528    102 non-null    float64
 20  1529    102 non-null    float64
 21  1530    102 non-null    float64
 22  

In [129]:
timetables[13].info()

timetables[13] = timetables[13].sort_values(by=['요일', '방향', '11711'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110 entries, 161 to 695
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   11711   110 non-null    int64 
 1   11712   110 non-null    int64 
 2   11713   110 non-null    int64 
 3   11714   110 non-null    int64 
 4   11715   110 non-null    int64 
 5   11716   110 non-null    int64 
 6   11717   110 non-null    int64 
 7   11718   110 non-null    int64 
 8   11719   110 non-null    int64 
 9   11720   110 non-null    int64 
 10  11721   110 non-null    int64 
 11  출발시간    110 non-null    int64 
 12  도착시간    110 non-null    int64 
 13  출발역사명   110 non-null    object
 14  도착역사명   110 non-null    object
 15  요일      110 non-null    int64 
 16  방향      110 non-null    int64 
 17  급행선     110 non-null    bool  
dtypes: bool(1), int64(15), object(2)
memory usage: 15.6+ KB


In [130]:
timetables[14].info()

timetables[14] = timetables[14].sort_values(by=['요일', '방향', '1907'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113 entries, 135 to 596
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1907    113 non-null    float64
 1   1908    113 non-null    float64
 2   1909    113 non-null    float64
 3   1910    113 non-null    float64
 4   1911    113 non-null    float64
 5   1912    113 non-null    float64
 6   1913    113 non-null    float64
 7   1914    113 non-null    float64
 8   1915    113 non-null    float64
 9   1916    106 non-null    float64
 10  1917    106 non-null    float64
 11  1918    106 non-null    float64
 12  1919    106 non-null    float64
 13  1920    106 non-null    float64
 14  1921    106 non-null    float64
 15  1922    106 non-null    float64
 16  출발시간    113 non-null    int64  
 17  도착시간    113 non-null    int64  
 18  출발역사명   113 non-null    object 
 19  도착역사명   113 non-null    object 
 20  요일      113 non-null    float64
 21  방향      113 non-null    float64
 22  

In [131]:
timetables[15].info()

timetables[15] = timetables[15].sort_values(by=['요일', '방향', '11321'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 217 to 939
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   11321   132 non-null    int64 
 1   11322   132 non-null    int64 
 2   11323   132 non-null    int64 
 3   11324   132 non-null    int64 
 4   11325   132 non-null    int64 
 5   11326   132 non-null    int64 
 6   11327   132 non-null    int64 
 7   11328   132 non-null    int64 
 8   11329   132 non-null    int64 
 9   11330   132 non-null    int64 
 10  11331   132 non-null    int64 
 11  11332   132 non-null    int64 
 12  11333   132 non-null    int64 
 13  출발시간    132 non-null    int64 
 14  도착시간    132 non-null    int64 
 15  출발역사명   132 non-null    object
 16  도착역사명   132 non-null    object
 17  요일      132 non-null    int64 
 18  방향      132 non-null    int64 
 19  급행선     132 non-null    bool  
dtypes: bool(1), int64(17), object(2)
memory usage: 20.8+ KB


In [133]:
for i in range(len(timetables)):
    timetables[i].to_csv('./timetable/result_2_final_after_9pm/' + filenames[i] +
                         '.csv', index=False, encoding="utf-8-sig")

### 7. result 시간표 전부 통합 후 저장

In [3]:
routes = ["1호선", "2호선", "3호선", "4호선", "5호선", "6호선", "7호선", "8호선", "9호선", "경의중앙선", "경춘선", "공항철도선", "수인분당선", "신림선", "신분당선", "우이신설선"]

temp = [0] * len(routes)
cnt = 0
for route in routes:
    temp[cnt] = pd.read_csv('./timetable/result_final_after_9pm/' + route + '.csv')
    cnt += 1

In [4]:
temp[0]

Unnamed: 0,노선명,역코드,열차번호,도착시간,출발역코드,도착역코드,요일,방향,급행선
0,1호선,100.0,K178,2126,100.0,100.0,1,1,False
1,1호선,100.0,K188,2210,100.0,100.0,1,1,False
2,1호선,100.0,K200,2303,100.0,100.0,1,1,False
3,1호선,100.0,K206,2329,100.0,100.0,1,1,False
4,1호선,100.0,K212,2359,100.0,100.0,1,1,False
...,...,...,...,...,...,...,...,...,...
12143,1호선,1416.0,K705,2158,1416.0,1416.0,3,2,False
12144,1호선,1416.0,K709,2227,1416.0,1416.0,3,2,False
12145,1호선,1416.0,K711,2252,1416.0,1416.0,3,2,False
12146,1호선,1416.0,K715,2333,1416.0,1416.0,3,2,False


In [5]:
for i in range(cnt):
    temp[i] = temp[i].rename(columns={"노선명":"route_name", "역코드": "stat_code", "열차번호": "train_code", "도착시간": "time", "요일": "weeks", "방향": "inout"})
    temp[i] = temp[i].drop(columns=["출발역코드", "도착역코드", "급행선"])
    temp[i] = temp[i].sort_values(["route_name", "weeks", "inout", "train_code", "time"])

In [8]:
result = pd.DataFrame(columns=["route_name", "stat_code", "train_code", "time", "weeks", "inout"])
for i in range(cnt):
    result = pd.concat([result, temp[i]])

In [9]:
result

Unnamed: 0,route_name,stat_code,train_code,time,weeks,inout
869,1호선,138.0,K1168,2100,1,1
827,1호선,137.0,K1168,2102,1,1
785,1호선,136.0,K1168,2104,1,1
742,1호선,135.0,K1168,2108,1,1
1000,1호선,141.0,K1170,2102,1,1
...,...,...,...,...,...,...
1405,우이신설선,11325.0,212,2353,2,2
1382,우이신설선,11324.0,212,2355,2,2
1359,우이신설선,11323.0,212,2357,2,2
1335,우이신설선,11322.0,212,2358,2,2


In [None]:
result.to_csv('./result_total_timetable_result.csv',
              index=False, encoding="utf-8-sig")