In [1]:
import requests
from bs4 import BeautifulSoup

# Download the portal's root page; the following cells parse this response.
url = "https://med-portal.az/"
# requests has no default timeout, so an unresponsive server would block this cell forever.
response = requests.get(url, timeout=30)
response.status_code

200

In [2]:
# Parse the HTML of the downloaded root page with Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [3]:
import pathlib
# Directory that receives every text file written by this notebook.
files_dir = pathlib.Path("files")
files_dir.mkdir(exist_ok=True)

# Save the introductory text shown on the root page.
main_text_block = soup.find('div',class_='elave-melumatlar')
if main_text_block is None:
    # find() returns None when nothing matches; fail with a message that names the missing element.
    raise RuntimeError("root page has no div.elave-melumatlar - the page layout may have changed")
main_text = main_text_block.text

file_name = files_dir / "main_text.txt"
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(main_text)

# Collect (url, title) for every category link inside the first div.bolumler of the root page.
sections = soup.select('div.bolumler')
if not sections:
    raise RuntimeError("root page has no div.bolumler - the page layout may have changed")
main_links = []

for section in sections[0].find_all('a',class_='kateqoriya-ucun-link'):
    link = section.get('href')
    title = section.find('div',class_='esas-kateqoriya-basligi').text
    main_links.append((link, title))
main_links

[('https://med-portal.az/./allergik-xestelikler/', 'Allergik xəstəliklər'),
 ('https://med-portal.az/./androloji-xestelikler/', 'Androloji xəstəliklər'),
 ('https://med-portal.az/./deri-xestelikleri/', 'Dəri xəstəlikləri'),
 ('https://med-portal.az/./dish-xestelikleri/', 'Diş xəstəlikləri'),
 ('https://med-portal.az/./endokrinoloji-xestelikler/',
  'Endokrinoloji xəstəliklər'),
 ('https://med-portal.az/./estetik-problemler/', 'Estetik problemlər'),
 ('https://med-portal.az/./goz-xestelikleri/', 'Göz xəstəlikləri'),
 ('https://med-portal.az/./hezm-sistemi-xestelikleri/',
  'Həzm sistemi xəstəlikləri'),
 ('https://med-portal.az/./immunoloji-xestelikler/', 'İmmunoloji xəstəliklər'),
 ('https://med-portal.az/./infeksion-xestelikler/', 'İnfeksion xəstəliklər'),
 ('https://med-portal.az/./irsi-xestelikler/', 'İrsi xəstəliklər'),
 ('https://med-portal.az/./kosmetik-problemler/', 'Kosmetik problemlər'),
 ('https://med-portal.az/./narkoloji-xestelikler/', 'Narkoloji xəstəliklər'),
 ('https://me

In [4]:
import tqdm

def get_articles_links_from_soup(soup:"BeautifulSoup") -> list:
    """Collect the article links listed on one category page.

    Args:
        soup: Parsed category page (the first page or one of its paginated pages).

    Returns:
        A list of ``(href, title)`` tuples, one for each ``<a>`` inside
        ``div.kateqoriya-yazilari`` that has a non-empty ``href``; ``title`` is the
        anchor text with surrounding whitespace stripped. The list is empty when the
        page has no such container.
    """
    container = soup.find('div',class_='kateqoriya-yazilari')
    if container is None:
        # find() returns None when nothing matches; report "no articles" instead of
        # raising AttributeError and aborting the caller's crawl.
        return []

    articles_links = []
    for article in container.find_all('a'):
        article_link = article.get('href')
        if not article_link:
            # An anchor without a target cannot be downloaded later.
            continue
        articles_links.append((article_link, article.text.strip()))
    return articles_links

def look_for_further_articles_pages(soup:"BeautifulSoup") -> list:
    """Return the URLs of the other result pages linked from a page's pagination.

    Every ``a.page-numbers`` element except the one that also carries the ``next``
    class is considered.

    Args:
        soup: Parsed category page.

    Returns:
        The ``href`` values in document order, each URL once and anchors without an
        ``href`` skipped; an empty list when the page has no pagination links.
    """
    # NOTE(review): only pages linked directly from this page are returned; if the
    # site abbreviates long page ranges, the pages in the gap are not visited - confirm.
    further_pages_links = []
    for page in soup.select('a.page-numbers:not(.next)'):
        page_link = page.get('href')
        # Skip missing targets and repeated links so no page is downloaded twice.
        if page_link and page_link not in further_pages_links:
            further_pages_links.append(page_link)
    return further_pages_links

# Crawl every category found on the root page: save the category description,
# then download and save each article listed in that category.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given


def _fetch_soup(page_url):
    """Download ``page_url`` and return the parsed page, or None when it is unavailable.

    A failed request or a non-200 answer is reported through ``tqdm.tqdm.write``
    (which keeps the progress bars intact) and the page is skipped, so one bad
    page does not abort the whole crawl.
    """
    try:
        page_response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        tqdm.tqdm.write(f"skipped {page_url}: {error}")
        return None
    if page_response.status_code != 200:
        tqdm.tqdm.write(f"skipped {page_url}: HTTP {page_response.status_code}")
        return None
    return BeautifulSoup(page_response.text, 'html.parser')


def _file_slug(title):
    """Turn a page title into a file-name fragment: lower case, dashes for spaces and path separators."""
    return title.lower().replace(' ', '-').replace('/', '-').replace('\\', '-')


def _save_page(page_soup, container_class, title_class, text_class, file_stem):
    """Save the title and text found in ``page_soup`` under ``files_dir``.

    The title is read from ``div.<title_class>`` and the text from ``div.<text_class>``,
    both inside ``div.<container_class>``. They are written, separated by a newline,
    to ``<file_stem>_<slug of title>.txt``. When extraction or writing fails, the
    reason is reported and an empty ``<file_stem>.txt`` is written instead.
    """
    try:
        container = page_soup.find('div', class_=container_class)
        page_title = container.find('div', class_=title_class).text.strip()
        page_text = container.find('div', class_=text_class).text

        file_name = files_dir / f"{file_stem}_{_file_slug(page_title)}.txt"
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(page_title + '\n' + page_text)
    except Exception as error:
        # Deliberate best effort: keep crawling and leave an empty placeholder file.
        # Exception (not a bare except) so that interrupting the kernel still works.
        tqdm.tqdm.write(f"could not save {file_stem}: {error!r}")
        file_name = files_dir / f"{file_stem}.txt"
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write('')


for i, (section_link, section_title) in tqdm.tqdm(enumerate(main_links), total=len(main_links)):

    section_soup = _fetch_soup(section_link) # e.g. https://med-portal.az/./allergik-xestelikler/
    if section_soup is None:
        continue

    # article links from the first page of the category, then from its further pages
    articles_links = get_articles_links_from_soup(section_soup) # ('https://med-portal.az/allergik-xestelikler/allergik-keratit/','Allergik keratit'), etc.
    for page_link in look_for_further_articles_pages(section_soup): # e.g. https://med-portal.az/allergik-xestelikler/page/2/
        page_soup = _fetch_soup(page_link)
        if page_soup is not None:
            articles_links.extend(get_articles_links_from_soup(page_soup))

    # category title and description
    _save_page(section_soup, 'elave-kateqoriya-melumatlari', 'kateqoriya-basliq', 'archive-meta', f"category_{i}")

    # title and text of every article in this category
    for j, (article_link, article_title) in tqdm.tqdm(enumerate(articles_links), total=len(articles_links)):
        article_soup = _fetch_soup(article_link) # e.g. https://med-portal.az/allergik-xestelikler/allergik-keratit/
        if article_soup is None:
            continue
        _save_page(article_soup, 'xeberin-metni', 'yazi-basligi', 'metn', f"category_{i}_article_{j}")

  0%|          | 0/33 [00:00<?, ?it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  4%|▍         | 1/23 [00:01<00:32,  1.48s/it][A
  9%|▊         | 2/23 [00:02<00:26,  1.25s/it][A
 13%|█▎        | 3/23 [00:03<00:23,  1.19s/it][A
 17%|█▋        | 4/23 [00:04<00:22,  1.17s/it][A
 22%|██▏       | 5/23 [00:05<00:20,  1.15s/it][A
 26%|██▌       | 6/23 [00:07<00:19,  1.14s/it][A
 30%|███       | 7/23 [00:08<00:18,  1.13s/it][A
 35%|███▍      | 8/23 [00:09<00:16,  1.13s/it][A
 39%|███▉      | 9/23 [00:10<00:15,  1.12s/it][A
 43%|████▎     | 10/23 [00:11<00:14,  1.11s/it][A
 48%|████▊     | 11/23 [00:12<00:13,  1.11s/it][A
 52%|█████▏    | 12/23 [00:13<00:12,  1.11s/it][A
 57%|█████▋    | 13/23 [00:14<00:11,  1.10s/it][A
 61%|██████    | 14/23 [00:15<00:09,  1.10s/it][A
 65%|██████▌   | 15/23 [00:16<00:08,  1.09s/it][A
 70%|██████▉   | 16/23 [00:18<00:07,  1.08s/it][A
 74%|███████▍  | 17/23 [00:19<00:06,  1.12s/it][A
 78%|███████▊  | 18/23 [00:20<00:05,  1.16s/it][A
 83%|█████