# Dart Report Crawler

## 0. Imports

In [5]:
%load_ext jupyter_black

In [6]:
import os
import re
import random
import requests
import time

import dill
import OpenDartReader
import pandas as pd

from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [None]:
from dotenv import load_dotenv

# Load environment variables before the crawler is created; DartCrawler reads
# DART_API_KEY through os.getenv. (Presumably sourced from a local .env file —
# confirm the file location used by this project.)
load_dotenv()

## 1. Class

In [8]:
class DartCrawler:
    """Crawl DART filings through OpenDartReader and collect their text.

    Typical use is ``get_list`` to obtain the filings of a date range, then
    ``get_document`` to download and clean the text of every filing in that
    list (optionally pickling the result with ``dill``).
    """

    # Number of days added to a window's start date to get its (inclusive) end.
    _WINDOW_DAYS = 30
    _DATE_FORMAT = "%Y-%m-%d"
    # Whitespace normalisation applied to the extracted page text.
    _NEWLINE_RUNS = re.compile(r"\n+")
    _SPACE_RUNS = re.compile(r" {2,}")

    def __init__(self, api_key: str = ""):
        """Create the OpenDartReader client.

        Args:
            api_key: API key to use. When empty, the ``DART_API_KEY``
                environment variable is used instead.

        Raises:
            ValueError: If no API key is given and ``DART_API_KEY`` is unset.
        """
        api_key = api_key or os.getenv("DART_API_KEY")
        if not api_key:
            raise ValueError(
                "No DART API key: pass api_key or set the DART_API_KEY "
                "environment variable."
            )
        self.dart = OpenDartReader(api_key)
        # Set by get_list(); used by _save_data() to name the output file.
        self.start_date = None
        self.end_date = None

    def get_list(self, start_date: str, end_date: str, kind: str = "") -> pd.DataFrame:
        """Return the filings received between two dates (both inclusive).

        The range is split into consecutive, non-overlapping windows and one
        listing request is made per window; the results are concatenated.

        Args:
            start_date: First day of the range, formatted ``YYYY-MM-DD``.
            end_date: Last day of the range, formatted ``YYYY-MM-DD``.
            kind: Filing kind filter, forwarded unchanged to ``dart.list``.

        Returns:
            One DataFrame with a fresh 0..n-1 index; also stored on
            ``self.list_df``.

        Raises:
            ValueError: If a date cannot be parsed or ``start_date`` is after
                ``end_date``.
        """
        start = datetime.strptime(start_date, self._DATE_FORMAT)
        end = datetime.strptime(end_date, self._DATE_FORMAT)
        if start > end:
            raise ValueError(
                f"start_date {start_date!r} is after end_date {end_date!r}"
            )
        self.start_date, self.end_date = start_date, end_date

        frames = []
        window_start = start
        # `<=` so that a single-day range (start == end) still issues a request.
        while window_start <= end:
            window_end = min(window_start + timedelta(days=self._WINDOW_DAYS), end)
            frames.append(
                self.dart.list(
                    start=window_start.strftime(self._DATE_FORMAT),
                    end=window_end.strftime(self._DATE_FORMAT),
                    kind=kind,
                )
            )
            # Start the next window the day after this one ends, so the
            # boundary day is not requested (and its filings duplicated) twice.
            window_start = window_end + timedelta(days=1)

        self.list_df = pd.concat(frames, ignore_index=True)
        return self.list_df

    def get_document(
        self,
        list_df: pd.DataFrame,
        save_dir: str = "data",
        timeout: float = 30.0,
    ) -> list[dict]:
        """Download and clean the text of every filing in ``list_df``.

        Args:
            list_df: Filing list; must provide the ``corp_code``,
                ``corp_name``, ``rcept_no`` and ``report_nm`` columns.
            save_dir: Directory to pickle the result into. Pass an empty
                string (or None) to skip saving.
            timeout: Seconds to wait for each page request before giving up.

        Returns:
            One dict per filing with the keys ``corp_code``, ``corp_name``,
            ``rcept_no``, ``report_nm``, ``documents`` (a list of
            ``{"title", "text"}`` dicts, one per sub-document) and
            ``document`` (the last sub-document, or ``{}`` when there is none;
            kept for callers written against the earlier single-document
            shape). The list is also stored on ``self.data``.
        """
        self.data = []
        for _, filing in tqdm(list_df.iterrows(), total=len(list_df)):
            rcept_no = filing["rcept_no"]
            sub_docs = self.dart.sub_docs(rcept_no)

            documents = []
            for _, sub_doc in sub_docs.iterrows():
                # A timeout keeps one stalled connection from hanging the crawl.
                response = requests.get(sub_doc["url"], timeout=timeout)
                page = BeautifulSoup(response.text, "html.parser")
                text = page.get_text(strip=False)
                # Collapse runs of blank lines and of spaces.
                text = self._NEWLINE_RUNS.sub("\n", text)
                text = self._SPACE_RUNS.sub(" ", text)
                documents.append({"title": sub_doc["title"], "text": text})

            self.data.append(
                {
                    "corp_code": filing["corp_code"],
                    "corp_name": filing["corp_name"],
                    "rcept_no": rcept_no,
                    "report_nm": filing["report_nm"],
                    "document": documents[-1] if documents else {},
                    "documents": documents,
                }
            )

            # Random pause between filings to avoid hammering the server.
            time.sleep(random.uniform(0.3, 0.9))

        if save_dir:
            self._save_data(self.data, save_dir)

        return self.data

    def _save_data(self, data: list[dict], save_dir: str = "data") -> None:
        """Pickle ``data`` with dill into ``save_dir``.

        The file is named ``dart_report_<start>_<end>.pkl`` after the date
        range of the last ``get_list`` call; ``unknown`` is substituted when
        ``get_list`` has not been called on this instance, so a finished crawl
        is never lost to a missing attribute.
        """
        os.makedirs(save_dir, exist_ok=True)

        start = getattr(self, "start_date", None) or "unknown"
        end = getattr(self, "end_date", None) or "unknown"
        save_path = os.path.join(save_dir, f"dart_report_{start}_{end}.pkl")
        with open(save_path, "wb") as f:
            dill.dump(data, f)

In [9]:
# Build the crawler; its constructor reads DART_API_KEY from the environment.
crawler = DartCrawler()

# Fetch the filing list for the date range 2024-01-01 .. 2024-01-02.
list_df = crawler.get_list(start_date="2024-01-01", end_date="2024-01-02")

In [11]:
# data = crawler.get_document(list_df, save_dir="../data")

In [1]:
import sys

# Put the parent directory on the import path so the `src` package
# (imported in the next cell) can be resolved from this notebook.
sys.path.append("..")

In [2]:
from src.dart_crawler import DartCrawler

In [3]:
# Rebinds `crawler` using the DartCrawler imported from src.dart_crawler,
# which replaces the class defined inline earlier in this notebook.
crawler = DartCrawler()

# Fetch the filing list for the date range 2024-01-01 .. 2024-01-02.
list_df = crawler.get_list(start_date="2024-01-01", end_date="2024-01-02")

In [None]:
# Crawl every filing in list_df; save_dir is passed as "../data".
data = crawler.get_document(list_df, save_dir="../data")

In [None]:
# NOTE(review): the DartCrawler class defined inline in this notebook has no
# `user_agent` attribute; this line only works if the DartCrawler imported
# from src.dart_crawler defines one — confirm before relying on this cell.
crawler.user_agent.random


In [7]:
from fake_useragent import UserAgent

# Module-level UserAgent instance, created once and reused by later cells.
USER_AGENT = UserAgent()

In [None]:
# Display one value of the `random` attribute (presumably a randomly chosen
# User-Agent string — confirm against the fake_useragent documentation).
USER_AGENT.random
