In [9]:
!pip install loguru beautifulsoup4 pandas httpx



In [10]:
import requests
import pandas as pd
import asyncio
import httpx
from bs4 import BeautifulSoup
from loguru import logger

In [11]:
async def get_data_to_dataframe(start_page: int, end_page: int) -> dict:
    """
    Collect article data from the listing pages ``start_page``..``end_page``
    (inclusive) of https://scitechdaily.com into a dict of column lists.

    :param start_page: first listing page to download
    :param end_page: last listing page to download (inclusive)
    :return: dict with the keys "category_tag", "article_title",
             "article_link" and "article_content", each mapping to a list
    """
    collected = {
        "category_tag": [],
        "article_title": [],
        "article_link": [],
        "article_content": [],
    }

    next_page = start_page
    if start_page == 1:
        # Page 1 keeps its articles in a different container than later pages,
        # so it is fetched on its own with a different selector.
        await extract_data(collected, start_page, "class", "archive-list")
        next_page = 2

    # Every remaining page shares the same container selector.
    for page_num in range(next_page, end_page + 1):
        await extract_data(collected, page_num, "id", "main-content")

    return collected

In [12]:
async def extract_data(data_dict: dict, page_num: int, main_content_attribute: str, main_content_tag: str):
    """
    Download one listing page plus every article linked from it and append
    the parsed fields to ``data_dict``.

    For each article on the listing page one value is appended to each of the
    lists stored under "category_tag", "article_title", "article_link" and
    "article_content". The four values of an article are appended together,
    only after the article body has been fetched and parsed, so the lists
    always stay the same length even if a later article fails.

    :param data_dict: dict of lists to dump data into (mutated in place)
    :param page_num: number of page to download
    :param main_content_attribute: div attribute of main content (e.g. "class" or "id")
    :param main_content_tag: value of that attribute (e.g. class name)
    :return: None
    :raises AttributeError: if an expected container element is not present
        in the downloaded HTML (no recovery is attempted here)
    """
    # NOTE(review): verify=False turns off TLS certificate verification and
    # timeout=None turns off all timeouts; both are kept as in the original
    # but should be confirmed as intentional.
    # `async with` closes the client (and its connection pool) when the page
    # is done; previously a new client was leaked on every call.
    async with httpx.AsyncClient(verify=False, follow_redirects=True, timeout=None) as client:
        content = await client.get(f"https://scitechdaily.com/page/{page_num}")
        # NOTE(review): no parser is named, so bs4 chooses one itself — consider pinning it.
        bs = BeautifulSoup(content.text)
        articles = bs.find("div", {main_content_attribute: main_content_tag}).find_all(
            "article", class_=["content-list", "clearfix"]
        )
        for article in articles:
            category_tag = article.find_all("span", {"class": "entry-meta-cats"})[0]
            article_title = article.find_all("h3", class_=["entry-title", "content-list-title"])[0]

            # Resolve every field first ...
            category = category_tag.a.text
            title = article_title.a["title"]
            link = article_title.a["href"]

            article_content = await client.get(link)
            article_bs = BeautifulSoup(article_content.text)
            article_text = article_bs.find("div", {"class": "entry-content"}).find_all("p")
            body = "\n".join(p.text for p in article_text)

            # ... then append all columns at once so a failure above cannot
            # leave the column lists with different lengths.
            data_dict["category_tag"].append(category)
            data_dict["article_title"].append(title)
            data_dict["article_link"].append(link)
            data_dict["article_content"].append(body)

    logger.info(f"Data from page {page_num=} is downloaded")

In [13]:
def dump_data_to_csv(data: dict, path_to_file: str) -> None:
    """
    Write column-oriented data to a semicolon-separated CSV file.

    Missing parent directories of ``path_to_file`` are created first, so the
    result of a long download is not lost just because the output folder
    does not exist yet.

    :param data: dict with keys as column names and values as lists
    :param path_to_file: destination path of the CSV file
    :return: None
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # Build the frame before touching the file system: a malformed dict
    # (e.g. columns of unequal length) fails here without side effects.
    df = pd.DataFrame(data)
    Path(path_to_file).parent.mkdir(parents=True, exist_ok=True)
    # No index column; ";" is the separator, so readers must pass sep=";" too.
    df.to_csv(path_to_file, index=False, sep=";")

In [14]:
# Let later cells use top-level `await`, with asyncio as the runner.
%autoawait asyncio

In [16]:
async def main():
    """
    Download both datasets one after the other and store each as a CSV file.

    Listing pages 1-100 are saved as the test set, pages 101-300 as the
    train set; a log line is emitted after each file is written.
    :return:
    """
    # (first page, last page, output file, log message when finished)
    jobs = (
        (1, 100, "./../data/test/scitechdaily_test.csv", "Test data download completed"),
        (101, 300, "./../data/train/scitechdaily.csv", "Train data download completed"),
    )
    for first_page, last_page, csv_path, done_message in jobs:
        scraped: dict = await get_data_to_dataframe(first_page, last_page)
        dump_data_to_csv(scraped, csv_path)
        logger.info(done_message)

# Top-level `await` relies on IPython's autoawait support, which an earlier
# cell switches on explicitly with `%autoawait asyncio`.
await main()
logger.info("Download completed")

2023-10-02 17:37:43.497 | INFO     | __main__:extract_data:26 - Data from page page_num=1 is downloaded


CancelledError: 

In [None]:
# The file is written by dump_data_to_csv with sep=";", so it must be read
# with the same separator; the default "," would mis-parse every row.
df = pd.read_csv("./../data/train/scitechdaily.csv", sep=";")
df.info()