In [None]:
import httpx
from bs4 import BeautifulSoup
from loguru import logger
import loguru
import logging
import sys
import os
import pathlib
import aiofiles
import datetime as dt
import asyncio
from tqdm import tqdm

In [None]:
# Root URL of the site being scraped; relative hrefs from the index page are appended to it.
BASE_URL = "https://www.smclinic.ru"
# Local directory that is passed to PageStorage as the root for downloaded pages.
DATA_PATH = pathlib.Path("./") / "data"
# Connection-pool limits for the shared client: at most 400 connections, all of which may be kept alive.
limits = httpx.Limits(max_keepalive_connections=400, max_connections=400)
# One shared async HTTP client for the whole notebook: 300 s timeout, redirects followed.
client = httpx.AsyncClient(timeout=300, limits=limits, follow_redirects=True)

In [None]:
# Replace the default loguru handler so that only INFO and above reach stderr.
logger.remove()
logger.add(sys.stderr, level="INFO")

# Report where pages will be stored. A bare `DATA_PATH.absolute()` expression
# in the middle of a cell is evaluated and thrown away, so log it instead.
logger.info(f"data directory: {DATA_PATH.absolute()}")

In [None]:
class PageStorage:
    """On-disk store for downloaded pages of a single website.

    Files are kept under ``<base_path>/<YYYY-MM-DD>/<website>/``: raw HTML in
    the ``html`` subdirectory and extracted text in the ``txt`` subdirectory.
    The date stamp is computed once, when the instance is created, and all
    three directories are created eagerly.
    """

    # Every file is read and written as UTF-8 so that the result does not
    # depend on the platform's default locale encoding.
    _ENCODING = "utf-8"

    def __init__(self, base_path: pathlib.Path, website: str):
        """Create the dated directory tree for ``website`` below ``base_path``."""
        self.__website = website
        self.__date_stamp = dt.datetime.now().strftime("%Y-%m-%d")
        self.__path = base_path / self.__date_stamp / self.__website
        self.__path.mkdir(exist_ok=True, parents=True)
        self.__html_path = self.__path / "html"
        self.__html_path.mkdir(exist_ok=True, parents=True)
        self.__txt_path = self.__path / "txt"
        self.__txt_path.mkdir(exist_ok=True, parents=True)

    @staticmethod
    def _page_stem(url: str) -> str:
        """Flatten ``url`` into a file-name stem.

        The URL is split on "/" and the first two pieces (for an absolute URL
        these are the scheme and the empty string after it) are dropped; the
        remaining pieces are joined with underscores.
        """
        return "_".join(url.split("/")[2:])

    def get_html_path(self, url: str) -> pathlib.Path:
        """Return the path of the HTML file that stores ``url``."""
        new_filename = self._page_stem(url)
        logger.debug(f"filename on save {new_filename}")
        return self.__html_path / f"{new_filename}.html"

    def get_txt_path(self, url: str) -> pathlib.Path:
        """Return the path of the text file that stores the extract of ``url``."""
        return self.__txt_path / f"{self._page_stem(url)}.txt"

    def check_page(self, url: str) -> bool:
        """Tell whether an HTML file for ``url`` already exists."""
        return self.get_html_path(url).exists()

    async def save_page(self, url: str, content: str):
        """Write ``content`` as the HTML file for ``url`` (overwriting any existing file)."""
        filename = self.get_html_path(url)
        async with aiofiles.open(filename, "w", encoding=self._ENCODING) as f:
            await f.write(content)
        logger.info(f"Page saved: {filename}")

    async def load_page(self, url: str) -> str:
        """Read and return the stored HTML for ``url``.

        Raises:
            FileNotFoundError: if the page has not been saved.
        """
        filename = self.get_html_path(url)
        async with aiofiles.open(filename, "r", encoding=self._ENCODING) as f:
            return await f.read()

    async def save_page_content(self, content: str, url: str):
        """Write the extracted text ``content`` for ``url`` into the ``txt`` directory."""
        # Build the path from the txt directory directly. Deriving it from the
        # HTML path with str.replace("html", "txt") rewrites every "html"
        # substring, including ones in the base path or in the URL itself.
        filename = self.get_txt_path(url)
        async with aiofiles.open(filename, "w", encoding=self._ENCODING) as f:
            await f.write(content)

In [None]:
# Page store for www.smclinic.ru rooted at DATA_PATH; shared by the cells below.
storage = PageStorage(base_path=DATA_PATH, website="www.smclinic.ru")

In [None]:
async def fetch_page(
    client: httpx.AsyncClient, url: str, retries: int, delay: int
) -> str:
    """Download ``url`` and return the response body, retrying on failure.

    An attempt fails when the response status is >= 400 or when the request
    itself fails with ``httpx.RequestError`` (timeouts, connection errors and
    the like). Transport errors are handled the same way as bad statuses so a
    single flaky connection neither skips the retry logic nor raises out of a
    batch of concurrent downloads.

    Args:
        client: HTTP client used to issue the GET request.
        url: Address to download.
        retries: Number of extra attempts after the first one; zero or a
            negative value means a single attempt.
        delay: Seconds to sleep before each retry.

    Returns:
        The response text, or ``""`` when every attempt failed (an error is
        logged in that case).
    """
    retries_left = max(retries, 0)
    while True:
        try:
            response = await client.get(url)
        except httpx.RequestError as exc:
            failure = f"{type(exc).__name__}: {exc}"
        else:
            if response.status_code < 400:
                return response.text
            failure = str(response.status_code)

        if retries_left <= 0:
            logger.error(f"unable to get page {failure} {url}")
            return ""

        logger.warning(f"Retrying {url} ({retries_left} retries left)...")
        await asyncio.sleep(delay)
        retries_left -= 1


async def get_page_content(
    client: httpx.AsyncClient,
    storage: PageStorage,
    url: str,
    retries: int = 3,
    delay: int = 2,
) -> str:
    """Return the HTML of ``url``, preferring the copy held in ``storage``.

    When ``storage`` already has the page it is loaded from there and no
    request is made. Otherwise the page is downloaded with ``fetch_page``
    (passing ``retries`` and ``delay`` through) and, if the body is non-empty,
    saved to ``storage`` before being returned.

    Returns:
        The page HTML; an empty string when the download produced nothing.
    """
    if not storage.check_page(url):
        downloaded = await fetch_page(client, url, retries, delay)
        # Only persist real content, so a failed download is retried next time.
        if downloaded:
            await storage.save_page(url, downloaded)
        return downloaded

    logger.info(f"Page {url} already exists")
    return await storage.load_page(url)

In [None]:
async def get_diseases_list(client: httpx.AsyncClient, storage: PageStorage) -> list:
    """Return the ``<a>`` tags listed on the site's ``/diseases`` index page.

    The index page is obtained through ``get_page_content`` (so a stored copy
    is reused when present) and the anchors inside the element
    ``div.diseases-list`` are returned.

    Returns:
        The anchors found inside ``div.diseases-list``, or an empty list when
        the page could not be obtained or the container is missing. Each
        failure is logged.
    """
    try:
        page_data = await get_page_content(client, storage, BASE_URL + "/diseases")
    except Exception as e:
        logger.error(f"unable to get page {e}")
        return []

    if not page_data:
        # get_page_content signals a failed download with an empty string.
        logger.error("unable to get page: empty disease index")
        return []

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments and from get_disease_info, which uses html.parser.
    soup = BeautifulSoup(page_data, "html.parser")

    disease_div = soup.find("div", {"class": "diseases-list"})
    if disease_div is None:
        logger.error("unable to find disease list")
        return []
    return disease_div.find_all("a")

In [None]:
from typing import Any, Coroutine


async def download_pages(client: httpx.AsyncClient, storage: PageStorage, pages: list[str]):
    """Download all ``pages`` concurrently into ``storage``.

    Each URL is handled by ``get_page_content``, so pages that are already
    stored are not requested again. A failure on one page does not interrupt
    the others: exceptions are collected, logged per URL, and a summary line
    is logged at the end.

    Args:
        client: HTTP client shared by all downloads.
        storage: Page store that receives the downloaded HTML.
        pages: URLs to download.

    Returns:
        None. The page bodies are deliberately not returned; they are
        available from ``storage``.
    """
    urls = list(pages)
    if not urls:
        return

    # return_exceptions=True keeps one bad page from raising out of the whole
    # batch and discarding the outcome of every other download.
    outcomes = await asyncio.gather(
        *(get_page_content(client, storage, url) for url in urls),
        return_exceptions=True,
    )

    failed = 0
    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, BaseException):
            failed += 1
            logger.error(f"unable to download {url}: {outcome!r}")
        elif not outcome:
            # Empty body: the download failed and was already logged upstream.
            failed += 1
    logger.info(f"downloaded {len(urls) - failed} of {len(urls)} pages")


In [None]:
def get_disease_info(content: str):
    """Extract the text of every ``div.b-text-block-6`` element in an HTML page.

    Args:
        content: HTML source of a disease page.

    Returns:
        The stripped text of each matching block, in document order, joined
        by a line of 80 dashes with a newline on either side. An empty string
        when no block matches.
    """
    separator = "\n{}\n".format("-" * 80)
    document = BeautifulSoup(content, "html.parser")

    sections = []
    for div in document.find_all("div", attrs={"class": "b-text-block-6"}):
        sections.append(div.text.strip())

    return separator.join(sections)

In [None]:
array = await get_diseases_list(client, storage)
if not array is None:
    print(len(array), array)
pages = [
    BASE_URL + elem.get("href")
    for elem in array
    if elem.get("href").startswith("/diseases")
]
logger.info(f"found: {len(pages)} pages in index page")

In [None]:
# Download every disease page concurrently; pages already on disk are read from storage instead of requested.
await download_pages(client, storage, pages)

In [None]:
for page_url in tqdm(pages):
    try:
        raw_content = await storage.load_page(page_url)
    except FileNotFoundError as e:
        raw_content = await get_page_content(client, storage, page_url)
    page_content = get_disease_info(raw_content)
    await storage.save_page_content(page_content, page_url)

In [None]:
# Tidy-up step: move .html and .txt files that lie directly in
# data/www.smclinic.ru into its html/ and txt/ subfolders.
CURRENT_PATH = pathlib.Path("./") / "data" / "www.smclinic.ru"

NEW_HTML_PATH = CURRENT_PATH / "html"
NEW_TXT_PATH = CURRENT_PATH / "txt"

for pattern, target_dir in (("*.html", NEW_HTML_PATH), ("*.txt", NEW_TXT_PATH)):
    # Take a snapshot of the listing first so that moving files cannot
    # interfere with the directory scan.
    for file in sorted(CURRENT_PATH.glob(pattern)):
        # rename() does not create missing directories, so make sure the
        # destination exists; only done when there is something to move.
        target_dir.mkdir(parents=True, exist_ok=True)
        new_file = target_dir / file.name
        file.rename(new_file)
        logger.info(f"moved {file} to {new_file}")