In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from io import StringIO
import re
from tqdm import tqdm

def wait():
    """Block the calling thread for half a second.

    Used as a fixed pause between browser actions and between polls of the
    result table.
    """
    pause_seconds = 0.5
    time.sleep(pause_seconds)

class Crawler:
    """Selenium-driven scraper for the DART report search page.

    Constructing an instance opens a Chrome session on the search form, types
    the report name filter, clicks the ``finalReport`` element and sets the
    page size. The remaining methods run a date-range search, page through
    the results and turn the visible result table into a ``DataFrame``.

    Attributes:
        driver: The live Selenium ``Chrome`` driver used for every request.
    """

    # Search form that every session starts from.
    SEARCH_URL = "https://dart.fss.or.kr/dsab007/main.do?option=report"

    # Rows requested per result page; also used to compute the row number
    # expected at the top of a given page.
    ROWS_PER_PAGE = 100

    # Fixed text wrapped around the value kept in the "popup_href" column.
    _POPUP_PREFIX = "javascript:openCorpInfoNew('"
    _POPUP_SUFFIX = "', 'winCorpInfo', '/dsae001/selectPopup.ax');"

    # Fixed text preceding the value kept in the "report_href" column.
    _REPORT_PREFIX = "/dsaf001/main.do?rcpNo="

    def __init__(self) -> None:
        """Open Chrome on the search form and apply the fixed filters.

        Raises:
            Exception: Whatever Selenium raises while preparing the form. The
                browser is closed first so a failed setup does not leave an
                orphaned Chrome process behind.
        """
        driver = Chrome()
        try:
            driver.get(self.SEARCH_URL)

            # Report name filter ("사업보고서" = business report).
            input_reportName = driver.find_element(By.ID, "reportName")
            input_reportName.clear()
            input_reportName.send_keys("사업보고서")
            wait()

            # Click the "final report" option.
            driver.find_element(By.ID, "finalReport").click()
            wait()

            # Ask for ROWS_PER_PAGE rows on each result page.
            num_line_per_page = driver.find_element(By.ID, "maxResultsCb")
            Select(num_line_per_page).select_by_value(str(self.ROWS_PER_PAGE))
            wait()
        except Exception:
            # No reference to the driver survives a failed __init__, so it
            # has to be shut down here.
            driver.quit()
            raise

        self.driver = driver

    def fill_startDate_endDate(self, startDate, endDate):
        """Run a search restricted to the given date range.

        Args:
            startDate: Text typed into the ``startDate`` field.
            endDate: Text typed into the ``endDate`` field.

        Returns once the first result row is numbered 1 (see
        ``wait_for_data``).
        NOTE(review): if the previous search was also showing its first page,
        that stale table already satisfies the check; only the fixed pause
        after the click guards against reading it.
        """
        for field_id, value in (("startDate", startDate), ("endDate", endDate)):
            field = self.driver.find_element(By.ID, field_id)
            field.clear()
            field.send_keys(value)
            wait()

        button_area = self.driver.find_element(By.CLASS_NAME, "btnArea")
        button_area.find_element(By.CLASS_NAME, "btnSearch").click()
        wait()

        self.wait_for_data()

    def get_soup(self):
        """Return the current page source parsed with ``html.parser``."""
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def _first_row_index(self):
        """Return the top-left cell of the first table, or ``None``.

        ``None`` means the page currently has no table, or one that cannot be
        parsed into at least one row, which can be the case while the page is
        still being refreshed.
        """
        table = self.get_soup().find("table")
        if table is None:
            return None
        try:
            df = pd.read_html(StringIO(str(table)))[0]
        except ValueError:
            # pandas raises ValueError when it finds no parsable table.
            return None
        if df.empty:
            return None
        return df.iloc[0, 0]

    def wait_for_data(self, first_index=1, timeout=60.0):
        """Poll the page until the first table row carries ``first_index``.

        Args:
            first_index: Value expected in the first cell of the first row.
            timeout: Maximum number of seconds to keep polling. ``None``
                waits indefinitely.

        Raises:
            TimeoutError: If the expected row has not appeared in time.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            if self._first_row_index() == first_index:
                return
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"result table did not start at row {first_index} "
                    f"within {timeout} seconds"
                )
            wait()

    def get_table(self):
        """Return the visible result table with two extra link columns.

        The table is parsed by pandas, then the ``href`` of the link in the
        second and third cell of every body row is added as:

        * ``popup_href``: the text between ``openCorpInfoNew('`` and the
          fixed trailing arguments of that JavaScript call.
        * ``report_href``: the value following ``rcpNo=`` in the report link.

        Raises:
            AssertionError: If the HTML rows do not line up with the parsed
                frame, or a link does not have the expected shape.
        """
        table = self.get_soup().find("table")
        df = pd.read_html(StringIO(str(table)))[0]

        list_tr = table.find("tbody").find_all("tr")
        assert len(list_tr) == len(df)

        list_popup_href = []
        list_report_href = []
        for tr in list_tr:
            list_td = tr.find_all("td")
            assert len(list_td) == df.shape[1]
            list_popup_href.append(list_td[1].find("a")["href"])
            list_report_href.append(list_td[2].find("a")["href"])

        df["popup_href"] = list_popup_href
        # One href per row; assigning the loop variable instead would stamp
        # the last row's link onto every row.
        df["report_href"] = list_report_href

        assert df["popup_href"].str.startswith(self._POPUP_PREFIX).all()
        assert df["popup_href"].str.endswith(self._POPUP_SUFFIX).all()
        df["popup_href"] = df["popup_href"].str[len(self._POPUP_PREFIX):-len(self._POPUP_SUFFIX)]

        assert df["report_href"].str.startswith(self._REPORT_PREFIX).all()
        df["report_href"] = df["report_href"].str[len(self._REPORT_PREFIX):]

        return df

    def go_next_page(self, page):
        """Navigate to result page ``page`` and wait for its rows.

        Page 1 is a no-op because a fresh search already shows it. For other
        pages a pager link is clicked: the second-to-last ``li`` when
        ``page % 10 == 1``, otherwise the ``li`` at ``(page - 1) % 10 + 2``.
        NOTE(review): this presumably relies on the pager listing two
        leading controls, ten page links, and a "next block" control in the
        second-to-last slot, and on pages being visited in order; confirm
        against the live DOM.

        Args:
            page: 1-based number of the page to show.
        """
        if page == 1:
            return

        pager = self.driver.find_element(By.CLASS_NAME, "pageSkip")
        list_li = pager.find_element(By.TAG_NAME, "ul").find_elements(By.TAG_NAME, "li")
        if page % 10 == 1:
            target = list_li[-2]
        else:
            target = list_li[(page - 1) % 10 + 2]
        target.find_element(By.TAG_NAME, "a").click()

        # The first row of page p is numbered 1 + ROWS_PER_PAGE * (p - 1).
        first_index = 1 + self.ROWS_PER_PAGE * (page - 1)
        self.wait_for_data(first_index)

In [None]:
# Crawl eight consecutive three-year windows, newest first, and write the
# results of window ``iii`` to "<iii>.csv".
crawler = Crawler()
for iii in range(8):
    # iii=0 -> 20210411..20240410, ..., iii=7 -> 20000411..20030410.
    startDate = f"{2021-3*iii}0411"
    endDate = f"{2024-3*iii}0410"
    crawler.fill_startDate_endDate(startDate, endDate)

    # The pageInfo text is parsed as two bracketed groups once whitespace is
    # removed: "<current>/<total pages>" and a group whose first number
    # (commas stripped) is read as the total count.
    soup = crawler.get_soup()
    pageInfo = soup.find("div", attrs={"class":"pageInfo"})
    temp = "".join(pageInfo.text.split()).split("][")
    total_page = int(temp[0].split("/")[1])
    # Not used below; kept so the value stays available for inspection.
    total_line = int(re.findall(r"\d+", "".join(temp[1].split(",")))[0])

    # Collect every page and concatenate once. Any failure now propagates
    # instead of silently replacing the pages gathered so far.
    total_df = None
    list_df = []
    for page in tqdm(range(1, total_page+1)):
        crawler.go_next_page(page)
        df = crawler.get_table()
        list_df.append(df)

    if not list_df:
        # No result pages for this window: nothing to write.
        continue

    total_df = pd.concat(list_df, ignore_index=True)
    total_df.to_csv(f"{iii}.csv", index=False)