In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from io import StringIO
import re
from tqdm import tqdm

def wait(seconds=0.5):
    """Pause the current thread so the browser can catch up with the last action.

    Args:
        seconds: How long to sleep, in seconds. Defaults to the 0.5 s pause
            used between UI interactions throughout this file.
    """
    time.sleep(seconds)

class Crawler:
    """Selenium-driven scraper for the DART report search page.

    On construction a Chrome window is opened, the report-name filter is set
    to "사업보고서", the "finalReport" control is clicked and the page size is
    set to 100 rows.  The instance can be used as a context manager so the
    browser is always released::

        with Crawler() as crawler:
            crawler.search_by_startDate_endDate("20220101", "20241231")
            df = crawler.get_table()
    """

    def __init__(self) -> None:
        driver = Chrome()
        try:
            driver.get("https://dart.fss.or.kr/dsab007/main.do?option=report")

            # Input report name
            reportName = driver.find_element(By.ID, "reportName")
            reportName.clear()
            reportName.send_keys("사업보고서")
            wait()

            # Click final report button
            finalReport = driver.find_element(By.ID, "finalReport")
            finalReport.click()
            wait()

            # Set number of lines per page to 100
            numLinePerPage = driver.find_element(By.ID, "maxResultsCb")
            Select(numLinePerPage).select_by_value("100")
            wait()
        except Exception:
            # Do not leave an orphaned browser behind if the setup fails.
            driver.quit()
            raise

        self.driver = driver

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Quit the underlying browser session."""
        self.driver.quit()

    def search_by_startDate_endDate(self, startDate, endDate):
        """Fill in the date range, run the search and wait for the first page.

        Args:
            startDate: Text typed into the "startDate" input (the caller in
                this file passes ``YYYYMMDD`` strings).
            endDate: Text typed into the "endDate" input, same format.

        Raises:
            TimeoutError: If the result table does not show row index 1
                within the default timeout of :meth:`wait_for_results`.
        """
        startDate_element = self.driver.find_element(By.ID, "startDate")
        startDate_element.clear()
        startDate_element.send_keys(startDate)
        endDate_element = self.driver.find_element(By.ID, "endDate")
        endDate_element.clear()
        endDate_element.send_keys(endDate)
        wait()

        # Click search button
        self.driver.find_element(By.CLASS_NAME, "btnArea").find_element(By.CLASS_NAME, "btnSearch").click()
        wait()

        # Wait for the search results to load
        self.wait_for_results()

    def get_soup(self):
        """Return the current page source parsed with BeautifulSoup."""
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def wait_for_results(self, first_index_of_table=1, timeout=60.0):
        """Poll the page until the result table starts with the expected row.

        Args:
            first_index_of_table: Value expected in the first cell of the
                first data row of the first ``<table>`` on the page.
            timeout: Maximum number of seconds to keep polling.

        Raises:
            TimeoutError: If the expected row has not appeared in time (for
                example because the search returned no rows).
        """
        deadline = time.monotonic() + timeout
        while True:
            table = self.get_soup().find("table")
            if table is not None:
                try:
                    df = pd.read_html(StringIO(str(table)))[0]
                except ValueError:
                    # pandas found no parseable table yet; keep polling.
                    df = None
                if (
                    df is not None
                    and not df.empty
                    and df.iloc[0, 0] == first_index_of_table
                ):
                    return
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    "Result table did not start with index "
                    f"{first_index_of_table} within {timeout} seconds"
                )
            wait()

    def get_table(self) -> pd.DataFrame:
        """Parse the result table of the current page into a DataFrame.

        Returns:
            The table as parsed by ``pandas.read_html`` plus two columns:
            ``popup_href`` (``href`` of the link in the 2nd cell of each row)
            and ``report_id`` (``href`` of the link in the 3rd cell).

        Raises:
            ValueError: If the ``<tbody>`` rows/cells do not line up with the
                shape of the DataFrame pandas parsed from the same table.
        """
        soup = self.get_soup()
        table = soup.find("table")
        df = pd.read_html(StringIO(str(table)))[0]
        list_tr = table.find("tbody").find_all("tr")
        # Explicit checks instead of ``assert`` so they survive ``python -O``.
        if len(list_tr) != df.shape[0]:
            raise ValueError(
                f"Table has {len(list_tr)} <tr> rows but pandas parsed "
                f"{df.shape[0]} rows"
            )

        list_popup_href = []
        list_report_id = []
        for tr in list_tr:
            list_td = tr.find_all("td")
            if len(list_td) != df.shape[1]:
                raise ValueError(
                    f"Row has {len(list_td)} <td> cells but pandas parsed "
                    f"{df.shape[1]} columns"
                )

            list_popup_href.append(list_td[1].find("a")["href"])
            list_report_id.append(list_td[2].find("a")["href"])

        df["popup_href"] = list_popup_href
        df["report_id"] = list_report_id
        return df

    def go_to_page(self, page_number):
        """Click the pagination control for ``page_number`` and wait for it.

        Page 1 is a no-op because a fresh search already shows it.

        NOTE(review): for pages 11, 21, ... the second-to-last pagination
        item is clicked, presumably the "next group" button, so this looks
        like it only works when pages are visited in increasing order --
        confirm against the site.

        Args:
            page_number: 1-based page to display.

        Raises:
            TimeoutError: If the table for that page does not load in time.
        """
        if page_number == 1:
            return

        pageSkip_element = self.driver.find_element(By.CLASS_NAME, "pageSkip")
        list_li = pageSkip_element.find_element(By.TAG_NAME, "ul").find_elements(By.TAG_NAME, "li")

        if page_number % 10 == 1:
            button_id = -2
        else:
            # Page k of the current group of ten sits at list index k + 1.
            button_id = (page_number - 1) % 10 + 2
        list_li[button_id].find_element(By.TAG_NAME, "a").click()

        # With 100 rows per page, page p starts at row 1 + 100 * (p - 1).
        first_index_of_table = 1 + 100 * (page_number - 1)
        self.wait_for_results(first_index_of_table)

In [None]:
# Scrape four consecutive three-year windows (2022-2024, 2019-2021, ...) and
# write each window's combined result table to "<window index>.csv".
crawler = Crawler()
for iii in range(4):
    startDate = f"{2022-3*iii}0101"
    endDate = f"{2024-3*iii}1231"
    crawler.search_by_startDate_endDate(startDate, endDate)

    # Read the pager summary. After whitespace is removed it is split on "]["
    # into a page part (total pages follow the "/") and a count part
    # (presumably "[current/total][total N rows]" -- confirm against the site).
    soup = crawler.get_soup()
    pageInfo = soup.find("div", attrs={"class":"pageInfo"})
    temp = "".join(pageInfo.text.split()).split("][")
    total_page = int(temp[0].split("/")[1])
    total_line = int(re.findall(r"\d+", "".join(temp[1].split(",")))[0])

    # Collect every page first and concatenate once; this avoids both the
    # bare ``except`` and the repeated re-copying of the growing frame.
    total_df = None
    page_frames = []
    for page in tqdm(range(1, total_page+1)):
        crawler.go_to_page(page)
        df = crawler.get_table()
        page_frames.append(df)

    if not page_frames:
        # Nothing was scraped for this window, so there is nothing to write.
        print(f"No result pages for {startDate}-{endDate}; skipping {iii}.csv")
        continue

    total_df = pd.concat(page_frames)
    total_df.to_csv(f"{iii}.csv", index=False)