# Expertise Organizations Page Parser
This notebook is designed to parse and extract data about organizations from the e-construction.gov.ua website. It includes functions for fetching, parsing, and saving data in various formats.

In [None]:
# Import necessary libraries for HTTP requests, HTML parsing, and data manipulation.
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Address of the single listing page fetched and parsed in the cells below
# (`org_type=1`, `page=5`).
url = "https://e-construction.gov.ua/organizations/org_type=1/page=5"

In [None]:
# Set up HTTP headers for the request to mimic a browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "uk-UA,uk;q=0.9",
}

# Send a GET request to the URL and check the response status.
# The timeout keeps the cell from blocking forever on a stalled connection
# (requests applies no timeout unless one is given); 10 s matches the value
# used by parse_organizations_all_pages.
response = requests.get(url, headers=headers, timeout=10)
if response.status_code != 200:
    # Raise SystemExit rather than calling exit(): exit() is a helper meant
    # for the interactive interpreter, while SystemExit stops execution with
    # the message and a non-zero status.
    raise SystemExit(f"Error: {response.status_code}")

# Parse the HTML content of the response.
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Define a function to extract data from the parsed HTML content.
def _labelled_span(block, label):
    """Return the first <span> that follows the text node *label*, or None.

    None is returned both when *block* has no text node equal to *label* and
    when no <span> comes after that node.
    """
    label_node = block.find(string=label)
    if label_node is None:
        return None
    # NOTE: find_next() keeps walking through the rest of the document, so it
    # is not limited to the inside of *block*.
    return label_node.find_next("span")


def _labelled_text(block, label):
    """Return the stripped text of the <span> that follows *label*, or None."""
    span = _labelled_span(block, label)
    return span.get_text(strip=True) if span is not None else None


def get_data(soup):
    """
    Extracts organization data from the parsed HTML content.

    Every ``div.dataset__item`` element is treated as one organization. A
    field whose label (or the value span after the label) is missing is
    reported as ``None`` (``[]`` for the consequence classes) instead of
    raising.

    Args:
        soup (BeautifulSoup): Parsed HTML content.

    Returns:
        list: A list of dictionaries containing organization data, one per
        organization block, with the keys "Назва", "Правовий статус",
        "ЄДРПОУ", "Кількість осіб" and "Класи наслідків".
    """
    data = []
    for block in soup.find_all("div", class_="dataset__item"):
        name_tag = block.find("h3", class_="opendata__name")
        name = name_tag.get_text(strip=True) if name_tag is not None else None

        # The consequence classes are nested <span> labels inside the value
        # span, so they are collected as a list rather than a single string.
        classes_span = _labelled_span(block, "Класи наслідків")
        if classes_span is not None:
            consequence_classes = [
                label.get_text(strip=True)
                for label in classes_span.find_all("span")
            ]
        else:
            consequence_classes = []

        data.append({
            "Назва": name,
            "Правовий статус": _labelled_text(block, "Правовий статус"),
            "ЄДРПОУ": _labelled_text(block, "Код ЄДРПОУ"),
            "Кількість осіб": _labelled_text(block, "Кількість атестованих осіб"),
            "Класи наслідків": consequence_classes,
        })
    return data

In [None]:
# Call the get_data function to extract data from the parsed HTML content.
# The call is the cell's final expression, so the returned list of
# organization dictionaries is what the notebook shows as the cell output.
get_data(soup)

In [None]:
# Define a function to parse organization data from multiple pages.
def _value_span(block, label):
    """Return the first <span> after the text node *label* in *block*, or None."""
    label_node = block.find(string=label)
    if label_node is None:
        return None
    # NOTE: find_next() searches the rest of the document, not only *block*.
    return label_node.find_next("span")


def _organization_record(block):
    """Build the row dictionary for one ``div.dataset__item`` block.

    Missing elements yield ``None`` (``[]`` for the consequence classes)
    instead of raising.
    """
    def value_text(label):
        span = _value_span(block, label)
        return span.get_text(strip=True) if span is not None else None

    name_tag = block.find("h3", class_="opendata__name")
    classes_span = _value_span(block, "Класи наслідків")
    link_tag = block.find("a", class_="opendata__link")

    return {
        "Назва": name_tag.get_text(strip=True) if name_tag is not None else None,
        "Правовий статус": value_text("Правовий статус"),
        "ЄДРПОУ": value_text("Код ЄДРПОУ"),
        "Кількість осіб": value_text("Кількість атестованих осіб"),
        # Consequence classes are nested <span> labels inside the value span.
        "Класи наслідків": (
            [label.get_text(strip=True) for label in classes_span.find_all("span")]
            if classes_span is not None
            else []
        ),
        # Tag.get() returns None when the anchor has no href attribute.
        "Посилання": link_tag.get("href") if link_tag is not None else None,
    }


def parse_organizations_all_pages(pages=12, delay=1, retries=3):
    """
    Parses organization data from multiple pages of the website.

    Page 1 is requested at the base URL, later pages at ``<base>/page=<n>``.
    A page whose request keeps failing is reported and skipped; the other
    pages are still parsed.

    Args:
        pages (int): Number of pages to parse.
        delay (int): Delay between requests in seconds. It is applied before
            every request except the very first one, retries included.
            Values <= 0 disable the pause.
        retries (int): Number of attempts per page before the page is
            skipped. Values below 1 are treated as a single attempt.

    Returns:
        DataFrame: A pandas DataFrame containing the parsed data, with the
        columns "Назва", "Правовий статус", "ЄДРПОУ", "Кількість осіб",
        "Класи наслідків" and "Посилання". The columns are present even when
        no organization was parsed.
    """
    base_url = "https://e-construction.gov.ua/organizations/org_type=1"
    columns = [
        "Назва",
        "Правовий статус",
        "ЄДРПОУ",
        "Кількість осіб",
        "Класи наслідків",
        "Посилання",
    ]
    attempts_per_page = max(1, retries)
    data = []
    request_sent = False  # Lets the very first request go out without a pause.

    for page_num in range(1, pages + 1):
        url = base_url if page_num == 1 else f"{base_url}/page={page_num}"

        # Reset per page so a failed page can never reuse the previous
        # page's response.
        response = None
        for attempt in range(1, attempts_per_page + 1):
            if request_sent and delay > 0:
                time.sleep(delay)
            request_sent = True
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()
                break  # Exit retry loop if successful.
            except requests.RequestException as e:
                response = None
                print(f"⚠️ Attempt {attempt} failed for page {page_num} — {e}")
        else:
            # The loop ended without `break`: every attempt failed.
            print(f"❌ Skipping page {page_num} after {attempts_per_page} failed attempts.")
            continue  # Skip to the next page.

        soup = BeautifulSoup(response.text, "html.parser")
        data.extend(
            _organization_record(block)
            for block in soup.find_all("div", class_="dataset__item")
        )

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=columns)

In [None]:
# Example usage of the parse_organizations_all_pages function.
# df_1 = parse_organizations_all_pages()
# df_2 = parse_organizations_all_pages()
# df_3 = parse_organizations_all_pages()
# df_4 = parse_organizations_all_pages()
# df_5 = parse_organizations_all_pages()
# df_6 = parse_organizations_all_pages()

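# Page contents are unstable and change while navigating through them, so a single pass may miss organizations;
# the dataset therefore has to be parsed several times. Uncomment the calls above and run them before the next
# cell, which concatenates df_1 … df_6.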

In [None]:
# Stack the six parsing runs into a single frame with a fresh 0..n-1 index,
# then count the distinct organization names.
df_all = pd.concat(
    objs=(df_1, df_2, df_3, df_4, df_5, df_6),
    ignore_index=True,
)
df_all["Назва"].nunique()  # Check if there are 128 unique names.

In [None]:
# Keep only the first row for each organization name ('Назва'); df_all is
# modified in place and its index is renumbered from 0.
df_all.drop_duplicates(subset=["Назва"], inplace=True, ignore_index=True)

In [None]:
# Save the final DataFrame to CSV and Excel formats.
# index=False: after the ignore_index=True concat/dedup steps the index is
# only a positional counter; writing it would add an unnamed first column
# that comes back as "Unnamed: 0" when the CSV is read again.
df_all.to_csv('expertise_organisation.csv', index=False)
df_all.to_excel('expertise_organisation.xlsx', index=False)

In [None]:
# Load the saved CSV file for further analysis.
# NOTE(review): CSV stores each "Класи наслідків" list as text, so that
# column presumably comes back as plain strings rather than lists — confirm
# before relying on list semantics.
df = pd.read_csv('expertise_organisation.csv')

In [None]:
# Display the column names of the loaded DataFrame
# (a quick check of what was read back from the CSV).
df.columns