Selenium을 이용하여 시각화 자료를 크롤링하려고 한다.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time

In [4]:
# Zero-row frame that declares the column layout of the per-day traffic table.
empty_data = pd.DataFrame({
    column: []
    for column in (
        'date', 'weekday', 'collect_hour', 'traffic', 'traffic_time',
        'departure_nm', 'arrival_nm', 'diff', 'Is_holiday', 'holidays',
    )
})

In [5]:
def makeDf(DATA, str_date, day_year, holidays):
    """Turn one day's crawled records into a table of Busan-bound traffic.

    Parameters
    ----------
    DATA : list of str
        One string per crawled record, formatted as
        ``"key":"value","key":"value",...`` (outer braces already removed).
    str_date : str
        Date the records belong to, as ``YYYYMMDD``.
    day_year : str
        Date of that year's holiday itself, as ``YYYYMMDD``.
    holidays : list of str
        Every date (``YYYYMMDD``) of that year's holiday period.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``arrival_nm`` is ``'부산'``, ordered by ``collect_hour``
        (numerically) with a fresh 0..n-1 index. Columns are ``date``,
        ``weekday``, ``collect_hour``, ``traffic``, ``traffic_time``,
        ``departure_nm``, ``arrival_nm``, ``diff``, ``Is_holiday``,
        ``holidays``, any additional keys found in the records, and
        ``traffic_time_1`` (the ``traffic_time`` of the following row).

    Raises
    ------
    ValueError
        If a ``key:value`` item has no colon, or ``traffic_time`` /
        ``collect_hour`` is not numeric.
    KeyError
        If a record has no ``traffic_time`` field.
    """
    base_columns = ['date', 'weekday', 'collect_hour', 'traffic', 'traffic_time',
                    'departure_nm', 'arrival_nm', 'diff', 'Is_holiday', 'holidays']

    date = datetime.strptime(str_date, '%Y%m%d').date()
    holiday_date = datetime.strptime(day_year, '%Y%m%d').date()

    # These values are identical for every record of the day, so compute them once.
    weekday = date.weekday()  # 0 = Monday ... 6 = Sunday
    diff_days = (date - holiday_date).days  # signed distance to the holiday itself
    is_holiday = 1 if str_date in holidays else 0
    holiday_count = len(holidays)

    rows = []
    columns = list(base_columns)
    for dataset in DATA:
        row = {'date': date, 'weekday': weekday}
        for kv in dataset.split(','):
            # Split on the first colon only so a value such as "09:30" stays intact.
            key, value = kv.split(':', 1)
            row[key.replace('"', '')] = value.replace('"', '')

        # Convert by name rather than by position so the field order of the
        # crawled payload does not matter.
        row['traffic_time'] = float(row['traffic_time'])

        row['diff'] = diff_days
        row['Is_holiday'] = is_holiday
        row['holidays'] = holiday_count
        rows.append(row)

        # Keep unexpected keys as extra columns, in order of first appearance.
        for key in row:
            if key not in columns:
                columns.append(key)

    # Build the frame once instead of concatenating one-row frames in a loop.
    data = pd.DataFrame(rows, columns=columns)

    data = (
        data.loc[data['arrival_nm'] == '부산']  # only Busan-bound records are wanted
        # collect_hour is text; compare it numerically so '9' sorts before '10'.
        .sort_values(by='collect_hour', key=pd.to_numeric, kind='stable')
        .reset_index(drop=True)
    )
    # traffic_time of the next row, i.e. the next collected hour; NaN on the last row.
    data['traffic_time_1'] = data['traffic_time'].shift(-1)
    return data

In [6]:
def makeDataFrame(date_year, day_year, holidays):
    """Crawl the analysis page for each date and stack the per-day tables.

    Uses the module-level Selenium ``driver``, which must already have the
    target page open.

    Parameters
    ----------
    date_year : list of str
        Dates to crawl, each as ``YYYYMMDD``.
    day_year : str
        Date of that year's holiday itself, as ``YYYYMMDD``.
    holidays : list of str
        Every date (``YYYYMMDD``) of that year's holiday period.

    Returns
    -------
    pandas.DataFrame
        The ``makeDf`` results of all crawled dates concatenated row-wise
        (each day keeps its own 0..n-1 index). Empty when no date produced
        a table.
    """
    daily_frames = []
    for date in date_year:
        print(date)  # progress output, one line per requested date
        element = driver.find_element(By.CSS_SELECTOR, '#date')
        # Hand the date over as a script argument instead of splicing it into
        # the JavaScript source, where it would be read as a numeric literal.
        driver.execute_script(
            'arguments[0].setAttribute("value", arguments[1])', element, date
        )
        driver.find_element(By.XPATH, '//*[@id="btn_Search"]/a').click()
        time.sleep(5)  # fixed pause after clicking search before the page is read
        script = driver.find_element(By.XPATH, '/html/head/script[26]')
        # NOTE(review): the fixed script index and the [80:-38] slice presumably
        # cut the record array out of the inline script; confirm against the page.
        data_text = script.get_attribute('innerHTML')[80:-38]
        datas = data_text.split('},{')
        datas[0] = datas[0].replace('{', '')
        datas[-1] = datas[-1].replace('}', '')
        # Dates yielding 30 or fewer records are skipped.
        if len(datas) > 30:
            daily_frames.append(makeDf(datas, date, day_year, holidays))

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once at the end rather than re-copying the result per date.
    return pd.concat(daily_frames, axis=0)

In [7]:
# Address of the analysis page that the Selenium driver loads for crawling.
url = 'http://data.ex.co.kr/visual/analysis'

In [8]:
# Chuseok tables per year (all values are YYYYMMDD strings):
#   date_YYYY     - dates to crawl
#   holidays_YYYY - dates of the Chuseok holiday period
#   day_YYYY      - Chuseok day itself
date_2018 = pd.date_range('20180915', '20180926').strftime('%Y%m%d').tolist()
holidays_2018 = pd.date_range('20180922', '20180926').strftime('%Y%m%d').tolist()
day_2018 = '20180924'

date_2019 = pd.date_range('20190905', '20190915').strftime('%Y%m%d').tolist()
holidays_2019 = pd.date_range('20190912', '20190915').strftime('%Y%m%d').tolist()
day_2019 = '20190913'

date_2020 = pd.date_range('20200923', '20201004').strftime('%Y%m%d').tolist()
holidays_2020 = pd.date_range('20200930', '20201004').strftime('%Y%m%d').tolist()
day_2020 = '20201001'

date_2021 = pd.date_range('20210911', '20210922').strftime('%Y%m%d').tolist()
holidays_2021 = pd.date_range('20210918', '20210922').strftime('%Y%m%d').tolist()
day_2021 = '20210921'

In [9]:
# Chuseok 2022: dates to crawl, holiday period, and Chuseok day (YYYYMMDD strings).
date_2022 = pd.date_range('20220902', '20220912').strftime('%Y%m%d').tolist()
holidays_2022 = pd.date_range('20220909', '20220912').strftime('%Y%m%d').tolist()
day_2022 = '20220910'

In [10]:
# Lunar New Year tables per year (all values are YYYYMMDD strings):
#   Lunar_YYYY    - dates to crawl
#   Lolidays_YYYY - dates of the Lunar New Year holiday period
#   Lday_YYYY     - Lunar New Year's Day itself
Lunar_2018 = pd.date_range('20180208', '20180218').strftime('%Y%m%d').tolist()
Lolidays_2018 = pd.date_range('20180215', '20180218').strftime('%Y%m%d').tolist()
Lday_2018 = '20180216'

Lunar_2019 = pd.date_range('20190202', '20190206').strftime('%Y%m%d').tolist()
Lolidays_2019 = pd.date_range('20190202', '20190206').strftime('%Y%m%d').tolist()
Lday_2019 = '20190205'

Lunar_2020 = pd.date_range('20200117', '20200126').strftime('%Y%m%d').tolist()
Lolidays_2020 = pd.date_range('20200124', '20200126').strftime('%Y%m%d').tolist()
Lday_2020 = '20200125'

Lunar_2021 = pd.date_range('20210204', '20210214').strftime('%Y%m%d').tolist()
Lolidays_2021 = pd.date_range('20210211', '20210214').strftime('%Y%m%d').tolist()
Lday_2021 = '20210212'

Lunar_2022 = pd.date_range('20220122', '20220202').strftime('%Y%m%d').tolist()
Lolidays_2022 = pd.date_range('20220129', '20220202').strftime('%Y%m%d').tolist()
Lday_2022 = '20220201'

In [11]:
# Start a Chrome session and open the target page; makeDataFrame drives this
# module-level `driver` for every date it crawls.
driver = webdriver.Chrome()
driver.get(url)

In [12]:
# Crawl the Chuseok period of each year, 2018-2022. The calls are kept as
# separate statements so that frames already crawled stay assigned if a later
# call fails.
data_18 = makeDataFrame(date_2018, day_2018, holidays_2018)
data_19 = makeDataFrame(date_2019, day_2019, holidays_2019)
data_20 = makeDataFrame(date_2020, day_2020, holidays_2020)
data_21 = makeDataFrame(date_2021, day_2021, holidays_2021)
data_22 = makeDataFrame(date_2022, day_2022, holidays_2022)

20180915
20180916
20180917
20180918
20180919
20180920
20180921
20180922
20180923
20180924
20180925
20180926
20190905
20190906
20190907
20190908
20190909
20190910
20190911
20190912
20190913
20190914
20190915
20200923
20200924
20200925
20200926
20200927
20200928
20200929
20200930
20201001
20201002
20201003
20201004
20210911
20210912
20210913
20210914
20210915
20210916
20210917
20210918
20210919
20210920
20210921
20210922
20220902
20220903
20220904
20220905
20220906
20220907
20220908
20220909
20220910
20220911
20220912


In [13]:
# Forward-fill missing values in place (e.g. the NaN that shift(-1) leaves in
# `traffic_time_1` on each day's last row). `DataFrame.ffill` replaces the
# deprecated `fillna(method='ffill')` spelling.
for frame in (data_18, data_19, data_20, data_21, data_22):
    frame.ffill(inplace=True)

In [14]:
# Crawl the Lunar New Year period of each year, 2018-2022. The calls are kept
# as separate statements so that frames already crawled stay assigned if a
# later call fails.
Ldate_18 = makeDataFrame(Lunar_2018, Lday_2018, Lolidays_2018)
Ldate_19 = makeDataFrame(Lunar_2019, Lday_2019, Lolidays_2019)
Ldate_20 = makeDataFrame(Lunar_2020, Lday_2020, Lolidays_2020)
Ldate_21 = makeDataFrame(Lunar_2021, Lday_2021, Lolidays_2021)
Ldate_22 = makeDataFrame(Lunar_2022, Lday_2022, Lolidays_2022)

20180208
20180209
20180210
20180211
20180212
20180213
20180214
20180215
20180216
20180217
20180218
20190202
20190203
20190204
20190205
20190206
20200117
20200118
20200119
20200120
20200121
20200122
20200123
20200124
20200125
20200126
20210204
20210205
20210206
20210207
20210208
20210209
20210210
20210211
20210212
20210213
20210214
20220122
20220123
20220124
20220125
20220126
20220127
20220128
20220129
20220130
20220131
20220201
20220202


In [16]:
# Forward-fill missing values in place (e.g. the NaN that shift(-1) leaves in
# `traffic_time_1` on each day's last row). `DataFrame.ffill` replaces the
# deprecated `fillna(method='ffill')` spelling.
for frame in (Ldate_18, Ldate_19, Ldate_20, Ldate_21, Ldate_22):
    frame.ffill(inplace=True)

In [17]:
# Stack all crawled periods row-wise: Lunar New Year first, then Chuseok, year by year.
data = pd.concat(
    [
        Ldate_18, data_18,
        Ldate_19, data_19,
        Ldate_20, data_20,
        Ldate_21, data_21,
        Ldate_22, data_22,
    ],
    axis=0,
)

In [18]:
# Show (rows, columns) of the combined table as the notebook cell output.
data.shape

(1804, 11)

In [19]:
# Remove the place-name columns from the table that will be exported.
drop_cols = ['departure_nm', 'arrival_nm']
data2 = data.drop(columns=drop_cols)

In [23]:
# Cast the hour, count and flag columns to integers before exporting.
data2 = data2.astype({
    column: 'int'
    for column in ('collect_hour', 'weekday', 'traffic', 'diff', 'Is_holiday', 'holidays')
})

In [24]:
# Write the final table to highway.csv in the working directory, without the index column.
data2.to_csv('highway.csv', index=False)

In [25]:
# Crawling is finished: end the browser session started above.
driver.quit()