# In [1]:
from bs4 import BeautifulSoup
from requests import Session
from docx import Document
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
from urllib.parse import urljoin
import os
import json
from concurrent.futures import ThreadPoolExecutor
import logging

# Work list for the run. Each entry is read below for the keys "title1",
# "title2", "title3", "url" and "index".
# JSON text is UTF-8; naming the encoding keeps decoding independent of the
# platform's locale default.
with open("stage3.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The root logger is limited to CRITICAL, so only records logged at that level
# end up in new_logs.log. filemode="w" truncates the log on every run.
logging.basicConfig(
    filename="new_logs.log",
    format="%(asctime)s - %(message)s",
    filemode="w",
    level=logging.CRITICAL,
)
logger = logging.getLogger()


# One HTTP session and one set of request headers are shared by all downloads.
session = Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}
# Root folder under which the per-title output folders are created.
dir_name = "optimized_documents"


def create_document_styles(document):
    """Set up the paragraph styles used by the generated documents.

    Registers a new paragraph style called "MyHead" (bold 14 pt Tahoma with
    any explicit font colour cleared) and adjusts the existing "Normal" style
    to 12 pt Tahoma with 1.15 line spacing and alignment value 3.

    Args:
        document: Document object whose ``styles`` collection is modified
            in place.
    """
    style_table = document.styles
    style_table.add_style("MyHead", WD_STYLE_TYPE.PARAGRAPH)

    # Body text.
    body_style = style_table["Normal"]
    body_font = body_style.font
    body_font.name = "Tahoma"
    body_font.size = Pt(12)
    body_format = body_style.paragraph_format
    body_format.line_spacing = 1.15
    # 3 is the numeric value of WD_ALIGN_PARAGRAPH.JUSTIFY in python-docx.
    body_format.alignment = 3

    # Heading text.
    title_font = style_table["MyHead"].font
    title_font.name = "Tahoma"
    title_font.size = Pt(14)
    title_font.color.rgb = None
    title_font.bold = True

def make_safe_filename(s):
    """Return *s* reduced to a name that is safe to use as a file name.

    Every character that is not alphanumeric (as judged by ``str.isalnum``)
    is replaced with an underscore. The result is limited to 150 characters
    and never ends with an underscore.

    Args:
        s: Arbitrary text, for example a page heading.

    Returns:
        The sanitised name. It is an empty string when *s* contains no
        alphanumeric character at all.
    """
    safe_string = "".join(c if c.isalnum() else "_" for c in s)
    # Cut first, strip second: stripping before the cut could leave a
    # trailing underscore exactly at the 150-character boundary.
    return safe_string[:150].rstrip("_")

def fetch_and_save_document(url, folder_path, url_index, timeout=30):
    """Download one page and save it as a .docx, or save the PDF it links to.

    The page is fetched through the shared ``session``. Starting at the
    ``h2.make-database`` heading, every following tag is visited in document
    order:

    * a ``div`` whose class list is exactly ``["make-database"]`` is expected
      to contain a link to a PDF; that file is downloaded into *folder_path*
      and the function returns without writing a .docx;
    * an ``hr`` with the same class list ends the walk;
    * the stripped text of any other tag becomes one paragraph.

    The output file is named after the heading. A page without the heading
    is skipped without writing or logging anything.

    Args:
        url: Address of the page to fetch.
        folder_path: Existing directory that receives the output file.
        url_index: Identifier of the entry, used only in log messages and as
            a fallback file name.
        timeout: Value passed to ``requests`` as the timeout (in seconds) for
            both the page request and the PDF request.

    Raises:
        requests.RequestException: If a request fails at the network level
            or times out.
        OSError: If the output file cannot be written.
    """
    # Without a timeout a stalled server would block this worker forever.
    response = session.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        logger.critical("Failed (HTTP %s): %s", response.status_code, url_index)
        return

    soup = BeautifulSoup(response.text, "lxml")
    start_elem = soup.select_one("h2.make-database")
    if start_elem is None:
        return

    # One pass that removes line breaks/tabs and the characters ? * < > | "
    # while turning : / \ into "-".
    heading_table = str.maketrans(
        {
            "\n": None,
            "\r": None,
            "\t": None,
            "?": None,
            "*": None,
            "<": None,
            ">": None,
            "|": None,
            '"': None,
            ":": "-",
            "/": "-",
            "\\": "-",
        }
    )
    whitespace_table = str.maketrans("", "", "\n\r\t")

    heading = start_elem.text.strip().translate(heading_table)
    # A heading without any alphanumeric character sanitises to "", which
    # would produce a file called just ".pdf"/".docx"; fall back to the index.
    file_name = make_safe_filename(heading) or f"document_{url_index}"

    document = Document()
    create_document_styles(document)
    document.add_paragraph(heading).style = "MyHead"

    # NOTE(review): find_all_next() yields nested tags as well as their
    # containers, so text inside nested markup is added once per enclosing
    # tag. Confirm this duplication is intended.
    for elem in start_elem.find_all_next():
        is_marker = elem.get("class") == ["make-database"]

        if is_marker and elem.name == "div":
            anchor = elem.find("a")
            pdf_link = anchor.get("href") if anchor is not None else None
            if not pdf_link:
                logger.critical("Failed (no PDF link): %s", url_index)
                return

            if pdf_link.startswith("..") and pdf_link.endswith(".pdf"):
                # For a link of the form "../x.pdf", dropping the leading ".."
                # leaves "/x.pdf", which urljoin resolves from the site root.
                temp = url if url.endswith("/") else "/".join(url.split("/")[:-1])
                pdf_link = urljoin(temp, pdf_link[2:])
            else:
                pdf_link = urljoin(url, pdf_link)

            pdf_file_path = os.path.join(folder_path, file_name + ".pdf")
            # The context manager releases the streamed connection even when
            # the download is interrupted.
            with session.get(
                pdf_link, stream=True, headers=headers, timeout=timeout
            ) as res:
                if not res.ok:
                    # Do not store an HTTP error page under a .pdf name.
                    logger.critical(
                        "Failed (HTTP %s): %s", res.status_code, url_index
                    )
                    return
                with open(pdf_file_path, "wb") as pdf:
                    for chunk in res.iter_content(chunk_size=1024):
                        if chunk:
                            pdf.write(chunk)
            logger.critical("Fetched: %s", url_index)
            return

        if is_marker and elem.name == "hr":
            break

        text = elem.text.strip().translate(whitespace_table)
        if text:
            document.add_paragraph(text)

    docx_file_path = os.path.join(folder_path, file_name + ".docx")
    document.save(docx_file_path)
    logger.critical("Fetched: %s", url_index)
        
        
# Applied to each title before it is used as a folder name: removes the
# characters ? * < > | " and turns : / \ into "-".
_title_table = str.maketrans(
    {
        "?": None,
        "*": None,
        "<": None,
        ">": None,
        "|": None,
        '"': None,
        ":": "-",
        "/": "-",
        "\\": "-",
    }
)

# Maps every submitted task to the index of its entry so that failures can be
# reported once the pool has finished.
submitted = {}

with ThreadPoolExecutor(max_workers=100) as executor:
    for d in data:
        title1 = d.get("title1", "").translate(_title_table)
        title2 = d.get("title2", "").translate(_title_table)
        # title3 is sanitised like the others but is not part of the folder path.
        title3 = d.get("title3", "").translate(_title_table)
        url = d.get("url", "")
        url_index = d.get("index")

        folder_path = os.path.join(dir_name, title1, title2)
        os.makedirs(folder_path, exist_ok=True)

        future = executor.submit(fetch_and_save_document, url, folder_path, url_index)
        submitted[future] = url_index

# Leaving the "with" block waits for all tasks. An exception raised inside a
# worker is stored on its future and would otherwise disappear unnoticed, so
# report each one. CRITICAL is used because the logger drops lower levels.
for future, failed_index in submitted.items():
    error = future.exception()
    if error is not None:
        logger.critical("Failed: %s (%r)", failed_index, error)