In [3]:
# Requires Python 3.7 or newer (asyncio support).

# K-report 사이트를 통한 사업자 기본정보 조회
- iFrame이 적용된 웹사이트

**목차**<br>
[1.라이브러리](#Import-lib)<br>
[2.타겟 데이터 class](#target-data-class)<br>
[3.크롤러 구현](#K-report-crawler)<br>
[4.유틸리티](#Util)<br>
[5.실행](#Main)<br>

### Import lib

In [4]:
import pandas as pd
import time

from concurrent.futures.thread import ThreadPoolExecutor
import asyncio
import nest_asyncio
nest_asyncio.apply()

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions as e

import requests

### target data class

In [56]:
class Dataset:
    """Row-oriented result table whose first row is the column header."""

    def __init__(self, header):
        """Start the table with ``header`` as its only row.

        Args:
            header: The column names; stored as-is (not copied) at index 0
                of ``self.dataset``.
        """
        rows = []
        rows.append(header)
        self.dataset = rows

### K-report crawler

In [6]:
class KReport:
    """Settings holder for the K-Report crawler.

    The crawl steps are attached to this class as methods in later cells.
    """

    def __init__(self,
                 base_address='http://www.kreport.co.kr/ctcssr_a30s.do',
                 webdriver_path='../webdriver/chromedriver',
                 init_delay=5,
                 webdriver_hide='--headless'):
        """Store the crawl settings; every default matches the former constants.

        Args:
            base_address: URL of the page the crawl starts from.
            webdriver_path: Location of the chromedriver executable.
            init_delay: Wait time in seconds handed to the crawl steps.
            webdriver_hide: Chrome command-line argument added to the
                browser options (``'--headless'`` by default).
        """
        self.base_address = base_address
        self.webdriver_path = webdriver_path
        self.init_delay = init_delay
        self.webdriver_hide = webdriver_hide

In [7]:
def init_webdriver(self, path, hide):
    """Create a Chrome WebDriver.

    Args:
        path: Passed to ``Chrome`` as its first positional argument.
            NOTE(review): that positional slot is the chromedriver
            executable path only in Selenium releases that still accept
            it -- confirm against the installed version.
        hide: Chrome command-line argument such as ``'--headless'``.
            An empty string or ``None`` means "add no argument".

    Returns:
        The new ``Chrome`` driver instance.
    """
    chrome_options = Options()
    # Options.add_argument rejects empty values with ValueError, so skip
    # the call when the caller asks for a visible browser.
    if hide:
        chrome_options.add_argument(hide)
    return Chrome(path, options=chrome_options)


# Attach the function to KReport so it can be called as kre.init_webdriver(...).
KReport.init_webdriver = init_webdriver

In [8]:
def on_site(self, browser, url, delay):
    """Open ``url`` in ``browser``, then set the driver's implicit wait.

    Args:
        browser: WebDriver-like object exposing ``get`` and
            ``implicitly_wait``.
        url: Address to load.
        delay: Value passed to ``browser.implicitly_wait``.
    """
    target, wait_seconds = url, delay
    # Load the page first; the implicit wait is configured afterwards.
    browser.get(target)
    browser.implicitly_wait(wait_seconds)

# Attach the module-level function to KReport so it can be called as kre.on_site(...).
KReport.on_site = on_site

In [9]:
def run_query(self, browser, cn, delay):
    """Type ``cn`` into the search box and press the search button.

    Args:
        browser: Selenium WebDriver already showing the search page.
        cn: Text to search for.
        delay: Maximum number of seconds to wait for the search box.

    Returns:
        ``True`` once the search button has been clicked, ``False`` when
        the search box did not appear within ``delay`` seconds.
        (Previously the failure path returned the imported exceptions
        module, which is truthy and could not be told apart from success
        by a plain truth test.)
    """
    query_box = (By.XPATH, "//div[@class='searcharea2 mgt0']/form/input[@id='cmQuery']")
    try:
        # until() hands back the located element, so it is reused instead
        # of being looked up again for every action.
        search_input = WebDriverWait(browser, delay).until(
            EC.presence_of_element_located(query_box))
        search_input.click()
        search_input.send_keys(cn)
        # NOTE(review): implicitly_wait() changes the driver's implicit
        # wait for later lookups; it does not pause here. Kept as-is.
        browser.implicitly_wait(0.1)
        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which recent Selenium releases no longer provide.
        browser.find_element(By.CLASS_NAME, 'btn_search').click()
    except e.TimeoutException:
        return False
    return True


# Attach the function to KReport so it can be called as kre.run_query(...).
KReport.run_query = run_query

In [10]:
def find_herf(self, browser, delay):
    """Open the page linked from the first entry of the result list.

    The name is a misspelling of "href"; it is kept because callers use it.

    Args:
        browser: Selenium WebDriver showing the search results.
        delay: Maximum number of seconds to wait for the result list.

    Returns:
        ``True`` after navigating to the link target, ``False`` when the
        result list did not appear within ``delay`` seconds. (Previously
        the failure path returned the imported exceptions module, which
        is truthy.)
    """
    try:
        WebDriverWait(browser, delay).until(EC.presence_of_element_located(
            (By.XPATH, "//ul[@class='bizlist']")))
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # recent Selenium releases no longer provide.
        first_hit = browser.find_element(By.XPATH, "//ul[@class='bizlist']/li/ul/li/a")
        browser.get(first_hit.get_attribute('href'))
    except e.TimeoutException:
        return False
    return True


# Attach the function to KReport so it can be called as kre.find_herf(...).
KReport.find_herf = find_herf

In [11]:
def crawl_data(self, browser, delay):
    """Read the text of every ``<dd>`` inside the ``businfo`` block.

    Args:
        browser: Selenium WebDriver showing a detail page.
        delay: Maximum number of seconds to wait for the ``businfo`` block.

    Returns:
        ``(True, texts)`` with one string per ``<dd>`` element on success,
        or ``(False, [])`` when the block did not appear in time.
        (Previously the failure payload was the imported exceptions
        module, on which the caller's ``len()`` raised TypeError.)
    """
    try:
        WebDriverWait(browser, delay).until(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='businfo']")))
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
        # which recent Selenium releases no longer provide.
        cells = browser.find_elements(By.XPATH, "//div[@class='businfo']/dl/dd")
        data = [cell.text for cell in cells]
    except e.TimeoutException:
        return False, []
    return True, data


# Attach the function to KReport so it can be called as kre.crawl_data(...).
KReport.crawl_data = crawl_data

In [12]:
def etl_data(self, browser, cn, condition, data):
    """Build one result row that starts with the search key.

    Args:
        browser: Unused; kept so the signature stays the same.
        cn: The search key, placed in the first column.
        condition: ``True`` when the crawl succeeded.
        data: List of crawled values on success; on failure, whatever the
            crawl step returned.

    Returns:
        ``[cn] + data`` on success. On failure, ``[cn]`` followed by one
        ``None`` per item in ``data``; a failure payload with no length
        (for example ``None`` or an error object) adds no padding instead
        of raising TypeError.
    """
    if condition == True:  # noqa: E712 -- keeps the original equality test
        return [cn] + data
    try:
        missing = len(data)
    except TypeError:
        # The failure payload is not a sized container; nothing to pad.
        missing = 0
    return [cn] + [None] * missing

# Attach the module-level function to KReport so it can be called as kre.etl_data(...).
KReport.etl_data = etl_data

In [16]:
def run_single(cn):
    """Crawl one search key and append the resulting row to ``ds.dataset``.

    Relies on the module-level ``ds`` object existing before the call.

    Args:
        cn: The search key to look up.
    """
    kre = KReport()  # carries the crawl settings
    browser = kre.init_webdriver(kre.webdriver_path, kre.webdriver_hide)
    try:
        kre.on_site(browser, kre.base_address, kre.init_delay)
        kre.run_query(browser, cn, kre.init_delay)
        kre.find_herf(browser, kre.init_delay)

        condition, data = kre.crawl_data(browser, kre.init_delay)
        data = kre.etl_data(browser, cn, condition, data)

        ds.dataset.append(data)
    finally:
        # Always end the driver session, even when a step raised.
        # quit() shuts the whole session down, whereas close() only closes
        # the current window and can leave the driver process running.
        browser.quit()


# run_single takes no ``self``; staticmethod makes both KReport.run_single(cn)
# and KReport().run_single(cn) work.
KReport.run_single = staticmethod(run_single)

In [17]:
def run_async(thread_workers_count, cns):
    """Run ``run_single`` for every key on a thread pool and wait for all.

    The earlier version gathered ``asyncio.all_tasks(loop)``, which does
    not contain the futures created by ``run_in_executor``, so it did not
    wait for the lookups it had started. It also required an already
    running event loop and never shut the executor down.

    Args:
        thread_workers_count: Maximum number of worker threads.
        cns: Iterable of search keys.

    Returns:
        None. A failed lookup is reported on stdout and does not stop the
        other lookups.
    """
    keys = list(cns)
    # Leaving the with-block calls executor.shutdown(wait=True), so every
    # submitted lookup has finished by the time it exits.
    with ThreadPoolExecutor(thread_workers_count) as executor:
        futures = [executor.submit(run_single, cn) for cn in keys]

    # Stay best-effort, but make failures visible instead of dropping them.
    for cn, future in zip(keys, futures):
        error = future.exception()
        if error is not None:
            print(f'run_single({cn!r}) failed: {error!r}')


# run_async takes no ``self``; staticmethod makes both
# KReport.run_async(...) and KReport().run_async(...) work.
KReport.run_async = staticmethod(run_async)

### Util

In [73]:
def save_csv(df):
    """Write ``df`` as a tab-separated file named after the current time.

    The file is written to ``../data/result/<YYYYmmddHHMMSS>.csv`` using
    local time.

    Args:
        df: Object with a pandas-style ``to_csv`` method.
    """
    timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    file_name = '../data/result/' + timestamp + '.csv'
    # Pass the separator by keyword: newer pandas releases deprecate
    # positional arguments to to_csv other than the path.
    df.to_csv(file_name, sep='\t', mode='w')

### Main

In [75]:
# Shared result table; the first row holds the column names, and each
# crawled row is appended after it.
ds = Dataset([
    'key',
    '대표자명',
    '법인구분',
    '사업자등록번호',
    '종교단체구분',
    '법인등록번호',
    '업종',
    '주소',
    '설립일',
])

In [76]:
# Search keys to look up.
cns = [
    '1318208689',
    '1228212229',
    '신한디에스',
    '삼성에스디에스',
]

In [77]:
# Crawl every key in cns using at most three worker threads.
run_async(thread_workers_count=3, cns=cns)

In [84]:
# Row 0 of ds.dataset holds the column names; every later row is a record.
df = pd.DataFrame(
    data=ds.dataset[1:],
    columns=ds.dataset[0],
)
df

Unnamed: 0,key,대표자명,법인구분,사업자등록번호,종교단체구분,법인등록번호,업종,주소,설립일
0,1228212229,주승중,소기업,122-82-12***,일반법인,124232-0******,그 외 기타 비거주 복지 서비스업,인천 남동구 문화로,2014년 06월 20일
1,1318208689,박유진,,131-82-08***,일반법인,124721-0******,천주교 단체,경기 김포시 하성면 월하로,2001년 12월 04일
2,신한디에스,이써니,보호대상중견기업,110-81-34***,외감,110111-0******,시스템 소프트웨어 개발 및 공급업,서울 중구 남대문로10길,1991년 05월 13일
3,삼성에스디에스,홍원표,대기업,110-81-28***,유가증권시장,110111-0******,컴퓨터시스템 통합 자문 및 구축 서비스업,서울 송파구 올림픽로35길,1985년 05월 01일


In [74]:
# Write the collected table to a timestamped file.
save_csv(df=df)