# Parser for Expertise Data from the EDECCB Registry

This script automates the collection of information about expert reviews (expertise) of construction documentation from the open EDECCB registry (https://e-construction.gov.ua/document/optype=6).

### Main steps of the script:
1. Downloading HTML pages of expertise using `requests` and `BeautifulSoup` libraries.
2. Extracting key data from each expertise page:
   - Expertise name
   - Organization issuing the expertise
   - Object consequence class
   - Type of construction
   - Additional information (estimate, design stage, etc.)
3. Processing a large number of pages using pagination.
4. Creating a structured dataset (`pandas.DataFrame`) for further analysis.
5. Saving the obtained data to a file for future use.

### Libraries used:
- `requests`
- `beautifulsoup4`
- `pandas`
- `concurrent.futures`
- `re`
- `os`

### Notes:
- Implemented retries in case of temporary connection failures.
- The script extracts complete information for further anomaly detection in the expertise registry.
- Data may contain minor gaps due to site structure limitations.

---

In [43]:
# Importing required libraries for HTTP requests, HTML parsing, and data processing
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import os

In [44]:
# Browser-like HTTP headers sent with every request in this notebook, so the
# registry server is less likely to reject the scraper as a non-browser client.
headers = {
    "User-Agent": (
        "Samsung/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/110.0.0.0 Safari/537.36"
    ),
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "uk-UA,uk;q=0.9,en-US;q=0.8,en;q=0.7,de-DE;q=0.6,de;q=0.5",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
}

## Parsing the Registry
This section defines the base URL and functions for parsing the registry pages.

In [45]:
# Base URL of the expertise listing in the registry. The listing fetch_page
# defined below requests it unchanged for page 1 and appends "/page=<n>" for
# every other page.
BASE_URL = "https://e-construction.gov.ua/document/optype=6"

In [46]:
# Function to extract document IDs and names from an HTML page
def extract_doc_ids_and_names(soup, page_num=0):
    """
    Collect ``(doc_id, name, page_num)`` records from one parsed listing page.

    Every ``<div class="dataset__item">`` is inspected: the document ID is
    read from the ``href`` of its ``<a class="opendata__link">`` and the name
    from its ``<h3 class="opendata__name">``. Entries missing either value
    are left out of the result.

    Args:
        soup (BeautifulSoup): Parsed HTML of a listing page.
        page_num (int): Page number stored with every record (default is 0).

    Returns:
        list: Tuples of ``(doc_id, name, page_num)`` where ``doc_id`` and
        ``name`` are non-empty strings.
    """
    records = []

    for entry in soup.find_all("div", class_="dataset__item"):
        # The ID is the token between "doc_id=" and the next "/" in the link.
        doc_id = None
        anchor = entry.find("a", class_="opendata__link")
        if anchor and "href" in anchor.attrs:
            target = anchor["href"]
            if "doc_id=" in target:
                doc_id = target.split("doc_id=")[1].split("/")[0]

        # The visible title of the entry, with surrounding whitespace removed.
        heading = entry.find("h3", class_="opendata__name")
        title = heading.text.strip() if heading else None

        if not (doc_id and title):
            continue
        records.append((doc_id, title, page_num))

    return records

# Function to fetch the HTML content of a single page
def fetch_page(page, session, headers):
    """
    Fetch the HTML content of a single listing page.

    Page 1 lives at ``BASE_URL`` itself; every other page is requested at
    ``BASE_URL + "/page=<page>"``. The request is sent with a 5-second timeout
    and a ``Referer`` header equal to the requested URL.

    Args:
        page (int): The page number to fetch.
        session (requests.Session): The session object for making HTTP requests.
        headers (dict): HTTP headers to include in the request. The dict is
            not modified.

    Returns:
        tuple: A tuple containing:
            - page (int): The page number.
            - html (str or None): The HTML content of the page, or None if the request fails.
    """
    url = BASE_URL if page == 1 else BASE_URL + f"/page={page}"
    # Work on a copy: scrape_pages hands the same dict to every worker thread,
    # so writing "Referer" into it could send one page with another page's
    # Referer and would leave a stale value in the caller's dict.
    request_headers = {**headers, "Referer": url}
    try:
        response = session.get(url, headers=request_headers, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Problem with page {page}: {e}")
        return page, None  # Signal the failure to the caller instead of raising
    return page, response.text

# Function to scrape multiple pages and save the results to a CSV file
def scrape_pages(start_page=1, end_page=5579, save_interval=50, output_file="documents.csv", max_workers=10):
    """
    Scrape listing pages for document IDs/names and append them to a CSV file.

    Pages are fetched in parallel with ``fetch_page``, parsed with
    ``extract_doc_ids_and_names`` and appended to ``output_file`` in batches.
    Pages whose number already appears in ``output_file`` are not fetched
    again, so an interrupted run can be resumed by calling the function again.
    Pages that fail are retried once, sequentially, at the end.

    Args:
        start_page (int): The starting page number (default is 1).
        end_page (int): The ending page number, inclusive (default is 5579).
        save_interval (int): The number of successfully parsed pages between
            two saves to the file (default is 50).
        output_file (str): The name of the output CSV file (default is 'documents.csv').
        max_workers (int): The maximum number of threads to use for parallel processing (default is 10).

    Returns:
        list: A list of page numbers that could not be processed even after the retry.
    """
    column_names = ["doc_id", "name", "page_num"]

    def flush(rows):
        """Append ``rows`` to the output CSV (the header is written at file creation)."""
        if rows:
            pd.DataFrame(rows, columns=column_names).to_csv(
                output_file, mode='a', index=False, header=False
            )

    # Work out which pages are already stored. Pages complete out of order and
    # are written in batches, so "highest saved page + 1" would silently skip
    # lower pages that were never written; the set of saved pages does not.
    try:
        existing_data = pd.read_csv(output_file)
        saved_pages = {int(p) for p in existing_data["page_num"].dropna()}
    except (IndexError, FileNotFoundError, pd.errors.EmptyDataError):
        saved_pages = set()
        print("🔹 Starting from scratch!")
        pd.DataFrame(columns=column_names).to_csv(output_file, index=False)

    pages = [page for page in range(start_page, end_page + 1) if page not in saved_pages]
    if saved_pages:
        print(f"⚡ Resuming: {len(saved_pages)} pages already saved, {len(pages)} left to fetch")

    all_data = []
    skipped_pages = []
    still_skipped_pages = []
    pages_parsed = 0      # Successfully parsed pages in this run
    pages_since_save = 0  # Successfully parsed pages since the last flush

    # One session for all requests (connection reuse); closed when done.
    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_page = {
                executor.submit(fetch_page, page, session, headers): page
                for page in pages
            }

            for future in as_completed(future_to_page):
                page = future_to_page[future]
                try:
                    _, html = future.result()
                    if html is None:
                        skipped_pages.append(page)
                        continue

                    soup = BeautifulSoup(html, "html.parser")
                    all_data.extend(extract_doc_ids_and_names(soup, page_num=page))
                except Exception as e:
                    print(f"⚠️ Error processing page {page}: {e}")
                    skipped_pages.append(page)
                    continue

                pages_parsed += 1
                pages_since_save += 1

                # Save data every `save_interval` pages
                if pages_since_save >= save_interval:
                    flush(all_data)
                    print(f"📄 Saved pages: {pages_parsed}/{len(pages)}")
                    all_data = []
                    pages_since_save = 0

        # Final save if there is remaining data
        if all_data:
            flush(all_data)
            print(f"✅ Final save of {len(all_data)} records.")
            all_data = []

        print(f"🔚 Completed! Skipped pages: {len(skipped_pages)}")

        # Retry skipped pages once, one at a time, saving after each page
        if skipped_pages:
            print("🔄 Retrying failed pages...")
            for sk_page in skipped_pages:
                _, html = fetch_page(sk_page, session, headers)
                if html is None:
                    print(f"❌ Failed to retry page {sk_page}")
                    still_skipped_pages.append(sk_page)
                    continue
                try:
                    soup = BeautifulSoup(html, "html.parser")
                    flush(extract_doc_ids_and_names(soup, page_num=sk_page))
                except Exception as e:
                    # A page that cannot be parsed must not abort the whole run.
                    print(f"⚠️ Error processing page {sk_page}: {e}")
                    still_skipped_pages.append(sk_page)
                    continue
                print(f"📄 Retried page {sk_page}")
            print("🔚 Retry completed!")

    print(f"🔚 Final skipped pages: {len(still_skipped_pages)}")
    return still_skipped_pages


In [47]:
# Example of calling the scrape_pages function to parse and save data
# still_skipped_pages = scrape_pages(output_file='documents.csv', max_workers=10)

In [48]:
# session = requests.Session()  # Use a session for faster requests
# column_names = ["doc_id", "name", "page_num"]
# output_file = "documents.csv"
# all_data = []

# for sk_page in still_skipped_pages:
#     page, html = fetch_page(sk_page, session, headers)
#     if html is None:
#         print(f"❌ Failed to retry page {sk_page}")
#         continue
#     soup = BeautifulSoup(html, "html.parser")
#     data = extract_doc_ids_and_names(soup, page_num=sk_page)
#     all_data.extend(data)
#     df = pd.DataFrame(all_data, columns=column_names)
#     df.to_csv(output_file, mode='a', index=False, header=False)
#     print(f"📄 Retried page {sk_page}")
#     all_data = []

## RESULT PAGE
This section defines the document IDs and fetches the HTML content of a specific document page.

In [70]:
# Sample document IDs labelled by the case they represent. Only doc_id_normal
# is used in this cell (to build `url`); the others are not referenced here —
# presumably they are swapped in by hand when testing; confirm.
doc_id_normal = "3524292411700282436" # Labelled "normal" by the author
doc_id_empty = "3505533101101024560" # Labelled "empty" by the author
doc_id_invalid = "3524292411700000000" # Labelled "invalid" by the author
doc_id_double = "3454546322529454021" # Labelled "double" by the author
url = f"https://e-construction.gov.ua/document_detail/doc_id={doc_id_normal}/optype=6"

In [66]:
# Fetch the detail page at `url`. The timeout matches the one used by
# fetch_page, so a stalled connection cannot hang the cell indefinitely.
response = requests.get(url, headers=headers, timeout=5)
if response.status_code != 200:
    print(f"Error: {response.status_code}")
    # Stop with a non-zero status; `exit()` is the interactive-shell helper
    # installed by the `site` module and is not meant for program code.
    raise SystemExit(1)

# Parse the HTML into a tree for the extraction functions
soup = BeautifulSoup(response.text, "html.parser")

In [67]:
# Load the output column names: the 'ukrainian_name' column of the
# ';'-delimited mapping file, as a plain list. extract_expertise_data uses
# this list as the column set of the DataFrame it returns.
all_columns = pd.read_csv("column_name_mapping.csv", delimiter=";")['ukrainian_name'].tolist()

# Function to clean text by removing unnecessary characters
def clean_text(text):
    """
    Return *text* trimmed and flattened onto a single line.

    Leading/trailing whitespace is stripped first; then every remaining ';'
    becomes ',' and every newline, carriage return or tab becomes a space.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    substitutions = str.maketrans({";": ",", "\n": " ", "\r": " ", "\t": " "})
    return text.strip().translate(substitutions)

# Function to extract expertise data from the HTML content
def _first_body_row_cells(table):
    """Return the <td> cells of the first <tbody> row of *table*, or [] if any part is missing."""
    body = table.find("tbody") if table else None
    row = body.find("tr") if body else None
    return row.find_all("td") if row else []


def extract_expertise_data(soup, all_columns):
    """
    Extract expertise data from the HTML content.

    Three parts of a document page are read:

    1. every ``object-info-item`` key/value pair (the repeated key
       "Орган, що видав" is suffixed "_експертиза" on its first occurrence
       and "_ПКД" on its second);
    2. the first row of the cost-estimate table, or a "not specified" marker;
    3. the first row of the applicants table (legal status, name, EDRPOU).

    Args:
        soup (BeautifulSoup): The parsed HTML content.
        all_columns (list): A list of all column names for the resulting DataFrame.

    Returns:
        pd.DataFrame: A single-row DataFrame with exactly the columns in
        `all_columns`; values that were not found are empty strings, and
        extracted keys not listed in `all_columns` are dropped.
    """
    main_info = {}
    key_counts = {"Орган, що видав": 0}

    # --- 1. Expertise Data ---
    for item in soup.find_all("div", class_="object-info-item"):
        left = item.find("div", class_="object-info_left")
        right = item.find("div", class_="object-info_right")
        if left is None or right is None:
            # Incomplete item: skip it rather than fail the whole document.
            continue
        key = clean_text(left.text)
        value = clean_text(right.text)
        if key in key_counts:
            key_counts[key] += 1
            if key_counts[key] == 1:
                key += "_експертиза"
            elif key_counts[key] == 2:
                key += "_ПКД"
        main_info[key] = value

    # --- 2. Cost Estimate Data ---
    costs_header = soup.find("h3", id="deviations_building_codes_psm_1")
    wrapper_div = costs_header.find_next_sibling("div") if costs_header else None
    if wrapper_div:
        if wrapper_div.find("span", class_="object-no-info"):
            main_info["Кошторисна документація"] = "Інформацію не зазначено"
        else:
            cells = _first_body_row_cells(wrapper_div.find("table"))
            if len(cells) >= 3:
                raw_code = clean_text(cells[0].text)
                date = clean_text(cells[1].text)
                cost = clean_text(cells[2].text)

                # "<code> Редакція №<n>" -> code and revision number
                match = re.match(r"(.+?)\s+Редакція\s+(№\d+)", raw_code)
                code, redaction = match.groups() if match else (raw_code, "")

                main_info["Код кошторису"] = code
                main_info["Номер редакції кошторису"] = redaction
                main_info["Дата кошторису"] = date
                main_info["Заявлена кошторисна вартість, тис. грн."] = cost

    # --- 3. Customer Data ---
    section = soup.find("h3", id="doc_applicants")
    table = section.find_next("table", class_="object-table") if section else None
    cells = _first_body_row_cells(table)
    if len(cells) >= 2:
        legal_status = clean_text(cells[0].text)
        raw_name = clean_text(cells[1].text)
        # The name cell may carry an 8-digit code in parentheses. The
        # patterns must use ASCII \d and \s: the look-alike Cyrillic "\д" and
        # "\с" are treated by `re` as literal letters and never match a code.
        match = re.search(r"\((\d{8})\)", raw_name)
        edrpou = match.group(1) if match else ""
        name = re.sub(r"\s*\(\d{8}\)\s*$", "", raw_name).strip()
        main_info["Правовий статус замовника"] = legal_status
        main_info["Назва замовника"] = name
        main_info["ЄДРПОУ замовника"] = edrpou

    # --- Create Final DataFrame ---
    df = pd.DataFrame([{col: main_info.get(col, "") for col in all_columns}])
    return df


In [75]:
def extract_doc_versions(soup, doc_id):
    """
    Map every document version listed on the page to its date.

    Versions are read from the ``<div class="doc-versions">`` section: each
    ``<a class="doc-versions__item">`` whose text contains "редакція"
    contributes the doc_id found in its ``href`` and the ``dd.mm.yyyy`` date
    found in its text. The currently selected version (the
    ``doc-versions__item active`` span) is recorded under `doc_id`.

    Args:
        soup (BeautifulSoup): Parsed HTML content of the page.
        doc_id (str): The document ID of the page being parsed; used as the
            key for the active version.

    Returns:
        dict: ``{doc_id: {"date": "dd.mm.yyyy"}}`` for every version found;
        empty when the page has no versions section.
    """
    section = soup.find("div", class_="doc-versions")
    if not section:
        return {}

    found = {}

    # Other versions are links to their own detail pages.
    for anchor in section.find_all("a", class_="doc-versions__item"):
        if "редакція" not in anchor.text:
            continue
        id_hit = re.search(r'doc_id=(\d+)', anchor["href"])
        date_hit = re.search(r'(\d{2}\.\d{2}\.\d{4})', anchor.text)
        if id_hit and date_hit:
            found[id_hit.group(1)] = {"date": date_hit.group(1)}

    # The active version has no link, so it is keyed by the caller's doc_id.
    current = section.find("span", class_="doc-versions__item active")
    if current:
        current_hit = re.search(r'(\d+) редакція від (\d{2}\.\d{2}\.\d{4})', current.text)
        if current_hit:
            found[doc_id] = {"date": current_hit.group(2)}

    return found

In [76]:
# Example call: list the versions found in the `soup` parsed above; the output
# below maps each version's doc_id to its date.
# NOTE(review): the ID passed here is not doc_id_normal, which `url` above was
# built from — presumably `soup` came from a different fetch when this cell
# was last executed; confirm.
extract_doc_versions(soup, "3223037525790557843")

{'3445958426151093499': {'date': '30.08.2024'},
 '3223037525790557843': {'date': '16.12.2024'}}

## Final parsing!
This section defines the functions for fetching and parsing document data, including handling document versions.

In [55]:
# URL template of a document detail page; "{}" is filled with the doc_id by
# the fetch_page defined below.
# NOTE: this rebinds BASE_URL, which earlier held the listing URL, and the
# fetch_page below replaces the earlier listing fetch_page — re-run the
# listing cells before calling scrape_pages again.
BASE_URL = "https://e-construction.gov.ua/document_detail/doc_id={}/optype=6"
# Output column names: the 'ukrainian_name' column of the ';'-delimited mapping file
all_columns = pd.read_csv("column_name_mapping.csv", delimiter=";")['ukrainian_name'].tolist()

# Function to fetch the HTML content of a document page using the provided doc_id
def fetch_page(doc_id, session, headers):
    """
    Fetch the HTML content of a document page using the provided doc_id.

    The URL is built from the ``BASE_URL`` template; the request is sent with
    a 5-second timeout and a ``Referer`` header equal to the requested URL.

    Args:
        doc_id (str): The document ID to fetch.
        session (requests.Session): The session object for making HTTP requests.
        headers (dict): The HTTP headers to include in the request. The dict
            is not modified.

    Returns:
        tuple: A tuple containing the doc_id and the HTML content (or None if an error occurs).
    """
    url = BASE_URL.format(doc_id)
    # Work on a copy: parse_documents hands the same dict to every worker
    # thread, so writing "Referer" into it could send one document's request
    # with another document's Referer.
    request_headers = {**headers, "Referer": url}
    try:
        response = session.get(url, headers=request_headers, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Issue with doc_id {doc_id}: {e}")
        return doc_id, None  # Signal the failure to the caller instead of raising
    return doc_id, response.text

# Function to clean text by removing unnecessary characters
def clean_text(text):
    """
    Normalise a scraped text fragment into a single trimmed line.

    Surrounding whitespace is removed first. After that, each ';' is turned
    into ',' and each newline, carriage return or tab into a single space.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    trimmed = text.strip()
    replacements = str.maketrans({";": ",", "\n": " ", "\r": " ", "\t": " "})
    return trimmed.translate(replacements)

# Function to extract expertise data from the HTML content
def _first_body_row_cells(table):
    """Return the <td> cells of the first <tbody> row of *table*, or [] if any part is missing."""
    body = table.find("tbody") if table else None
    row = body.find("tr") if body else None
    return row.find_all("td") if row else []


def extract_expertise_data(soup, all_columns):
    """
    Extract expertise data from the HTML content.

    Three parts of a document page are read:

    1. every ``object-info-item`` key/value pair (the repeated key
       "Орган, що видав" is suffixed "_експертиза" on its first occurrence
       and "_ПКД" on its second);
    2. the first row of the cost-estimate table, or a "not specified" marker;
    3. the first row of the applicants table (legal status, name, EDRPOU).

    Args:
        soup (BeautifulSoup): The parsed HTML content.
        all_columns (list): A list of all column names for the resulting DataFrame.

    Returns:
        pd.DataFrame: A single-row DataFrame with exactly the columns in
        `all_columns`; values that were not found are empty strings, and
        extracted keys not listed in `all_columns` are dropped.
    """
    main_info = {}
    key_counts = {"Орган, що видав": 0}

    # --- 1. Expertise Data ---
    for item in soup.find_all("div", class_="object-info-item"):
        left = item.find("div", class_="object-info_left")
        right = item.find("div", class_="object-info_right")
        if left is None or right is None:
            # Incomplete item: skip it rather than fail the whole document.
            continue
        key = clean_text(left.text)
        value = clean_text(right.text)
        if key in key_counts:
            key_counts[key] += 1
            if key_counts[key] == 1:
                key += "_експертиза"
            elif key_counts[key] == 2:
                key += "_ПКД"
        main_info[key] = value

    # NOTE: every pattern below must use ASCII \d and \s. The look-alike
    # Cyrillic "\д" and "\с" are treated by `re` as literal letters, so the
    # patterns would never match real codes or whitespace.

    # --- 2. Cost Estimate Data ---
    costs_header = soup.find("h3", id="deviations_building_codes_psm_1")
    wrapper_div = costs_header.find_next_sibling("div") if costs_header else None
    if wrapper_div:
        if wrapper_div.find("span", class_="object-no-info"):
            main_info["Кошторисна документація"] = "Інформацію не зазначено"
        else:
            cells = _first_body_row_cells(wrapper_div.find("table"))
            if len(cells) >= 3:
                raw_code = clean_text(cells[0].text)
                date = clean_text(cells[1].text)
                cost = clean_text(cells[2].text)

                # "<code> Редакція №<n>" -> code and revision number
                match = re.match(r"(.+?)\s+Редакція\s+(№\d+)", raw_code)
                code, redaction = match.groups() if match else (raw_code, "")

                main_info["Код кошторису"] = code
                main_info["Номер редакції кошторису"] = redaction
                main_info["Дата кошторису"] = date
                main_info["Заявлена кошторисна вартість, тис. грн."] = cost

    # --- 3. Customer Data ---
    section = soup.find("h3", id="doc_applicants")
    table = section.find_next("table", class_="object-table") if section else None
    cells = _first_body_row_cells(table)
    if len(cells) >= 2:
        legal_status = clean_text(cells[0].text)
        raw_name = clean_text(cells[1].text)
        # The name cell may carry an 8-digit code in parentheses; split it off.
        match = re.search(r"\((\d{8})\)", raw_name)
        edrpou = match.group(1) if match else ""
        name = re.sub(r"\s*\(\d{8}\)\s*$", "", raw_name).strip()
        main_info["Правовий статус замовника"] = legal_status
        main_info["Назва замовника"] = name
        main_info["ЄДРПОУ замовника"] = edrpou

    # --- Create Final DataFrame ---
    df = pd.DataFrame([{col: main_info.get(col, "") for col in all_columns}])
    return df

# Function to extract all document versions from the HTML content
def extract_doc_versions(soup, doc_id):
    """
    Extract all document versions from the HTML content.

    This function parses the 'doc-versions' section of the HTML content to
    extract all versions of a document, including their IDs and dates. Other
    versions are read from ``<a class="doc-versions__item">`` links whose text
    contains "редакція"; the currently selected version (the
    ``doc-versions__item active`` span) is recorded under `doc_id`.

    Args:
        soup (BeautifulSoup): The parsed HTML content.
        doc_id (str): The primary document ID; used as the key for the active
            version.

    Returns:
        dict: A dictionary where keys are document IDs and values are dictionaries containing:
            - 'date' (str): The date of the version, as ``dd.mm.yyyy``.
        Empty when the page has no versions section.
    """
    versions = {}
    versions_section = soup.find("div", class_="doc-versions")
    if not versions_section:
        return versions

    # NOTE: the patterns must use the ASCII class \d. The look-alike Cyrillic
    # "\д" is treated by `re` as the literal letter "д", so no ID or date
    # would ever match and the function would always return {}.
    for link in versions_section.find_all("a", class_="doc-versions__item"):
        if "редакція" not in link.text:
            continue
        match = re.search(r'doc_id=(\d+)', link["href"])
        date_match = re.search(r'(\d{2}\.\d{2}\.\d{4})', link.text)
        if match and date_match:
            versions[match.group(1)] = {"date": date_match.group(1)}

    # The active version has no link, so it is keyed by the caller's doc_id.
    active_version = versions_section.find("span", class_="doc-versions__item active")
    if active_version:
        active_match = re.search(r'(\d+) редакція від (\d{2}\.\d{2}\.\d{4})', active_version.text)
        if active_match:
            versions[doc_id] = {"date": active_match.group(2)}
    return versions

# Function to parse document data from a CSV file and save the results to an output file
def parse_documents(csv_file, all_columns, output_file="parsed_documents.csv", max_workers=10, save_interval=2000):
    """
    Parse document expertise data from a CSV file and save the results to an output file.

    This function reads document IDs from a CSV file, fetches their HTML
    content in parallel, extracts the expertise data and appends it to
    `output_file` (';'-separated). Every other version listed on a document's
    page is fetched and parsed as well, unless its ID is already queued.
    IDs that could not be processed are appended to 'skipped_documents.csv'
    (primary documents) and 'skipped_versions.csv' (versions).

    NOTE: the "version" column is filled from a "version" entry in the data
    returned by extract_doc_versions; that function currently returns only
    "date", so the column is written as "Unknown".

    Args:
        csv_file (str): The input CSV file containing a `doc_id` column.
        all_columns (list): Column names passed to extract_expertise_data and
            used as the header when `output_file` is created.
        output_file (str): The output CSV file to save the results.
        max_workers (int): The maximum number of threads to use for parallel processing.
        save_interval (int): The number of buffered records that triggers a save.
    """
    def flush(frames):
        """Append the buffered single-row frames to the output CSV."""
        pd.concat(frames, ignore_index=True).to_csv(
            output_file, mode='a', index=False, header=False, sep=";"
        )

    # Create the output file if it doesn't exist
    if not os.path.exists(output_file):
        pd.DataFrame(columns=all_columns).to_csv(output_file, index=False, sep=";")

    # Read the IDs as text. Version IDs scraped from links are strings, so
    # integer IDs from the CSV would never compare equal to them and the
    # duplicate check below would let the same document be parsed twice.
    df = pd.read_csv(csv_file, dtype={"doc_id": str})
    doc_ids = df["doc_id"].dropna().unique()

    known_ids = set(doc_ids)  # Every doc_id already queued (primary or version)
    version_info = {}         # Version doc_id -> data reported by the page that listed it
    version_futures = {}      # Future -> version doc_id
    skipped_versions = []
    skipped_documents = []
    primary_docs_processed = 0  # Counter for primary documents
    versions_processed = 0      # Counter for processed versions
    buffer = []

    # The executor exits first (waiting for its requests), then the session closes.
    with requests.Session() as session, ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Add all primary documents to the queue
        primary_futures = {
            executor.submit(fetch_page, doc_id, session, headers): doc_id
            for doc_id in doc_ids
        }
        print(len(primary_futures), "primary documents in the queue")

        for future in as_completed(primary_futures):
            doc_id = primary_futures[future]
            try:
                _, html = future.result()
                if html is None:
                    skipped_documents.append(doc_id)
                    continue

                soup = BeautifulSoup(html, "html.parser")
                data_df_orig = extract_expertise_data(soup, all_columns)
                versions = extract_doc_versions(soup, doc_id)

                own_version = versions.get(doc_id, {})
                data_df_orig["doc_id"] = doc_id
                data_df_orig["date"] = own_version.get("date", "Unknown")
                data_df_orig["version"] = own_version.get("version", "Unknown")

                buffer.append(data_df_orig)
                primary_docs_processed += 1  # Primary document processed

                # Queue the document's other versions, skipping IDs that are
                # primary documents themselves or were already queued.
                for version_doc_id, version_data in versions.items():
                    if version_doc_id in known_ids:
                        continue
                    known_ids.add(version_doc_id)
                    version_info[version_doc_id] = version_data
                    version_future = executor.submit(fetch_page, version_doc_id, session, headers)
                    version_futures[version_future] = version_doc_id

                if len(buffer) >= save_interval:
                    flush(buffer)
                    buffer = []
                    print(f"📄 Primary documents processed: {primary_docs_processed}/{len(doc_ids)}")

            except Exception as e:
                skipped_documents.append(doc_id)
                print(f"⚠️ Error processing doc_id {doc_id}: {e}")

        if buffer:
            flush(buffer)
            print(f"✅ Final save of {len(buffer)} records.")
            buffer = []
        if skipped_documents:
            skipped_df = pd.DataFrame(skipped_documents)
            skipped_df.to_csv("skipped_documents.csv", mode='a', index=False, sep=";")
            print(f"⚠️ Skipped documents saved to 'skipped_documents.csv' ({len(skipped_documents)} records)")
        else:
            print("No skipped documents!")
        print("🔚 Primary documents processed!")

        # Process all additional versions queued while parsing primary documents
        versions_to_process = len(version_futures)

        for future in as_completed(version_futures):
            version_doc_id = version_futures[future]
            try:
                _, html = future.result()
                if html is None:
                    skipped_versions.append({"doc_id": version_doc_id, "reason": "No HTML response"})
                    continue

                soup = BeautifulSoup(html, "html.parser")
                data_df = extract_expertise_data(soup, all_columns)

                # Date/version come from the page that listed this version
                listed = version_info.get(version_doc_id, {})
                data_df["doc_id"] = version_doc_id
                data_df["date"] = listed.get("date", "Unknown")
                data_df["version"] = listed.get("version", "Unknown")

                buffer.append(data_df)
                versions_processed += 1  # Increment processed versions counter

                # Save and display progress
                if len(buffer) >= save_interval:
                    flush(buffer)
                    buffer = []
                    print(f"✅ Versions processed: {versions_processed}/{versions_to_process}")

            except Exception as e:
                skipped_versions.append({"doc_id": version_doc_id, "reason": str(e)})
                print(f"⚠️ Error processing version_doc_id {version_doc_id}: {e}")

        # Final save
        if buffer:
            flush(buffer)
            print(f"✅ Final save of {len(buffer)} records.")

        # Save skipped versions to CSV
        if skipped_versions:
            skipped_df = pd.DataFrame(skipped_versions)
            skipped_df.to_csv("skipped_versions.csv", mode='a', index=False, sep=";")
            print(f"⚠️ Skipped versions saved to 'skipped_versions.csv' ({len(skipped_versions)} records)")

        print(f"🔚 Completed! Skipped versions: {len(skipped_versions)}")

In [56]:
# Example of calling the parse_documents function to parse and save data
# parse_documents("documents.csv", all_columns, max_workers=10, save_interval=1000)

## DATA CLEANING
This section defines the steps for cleaning the parsed data.

In [57]:
# Load the ';'-separated output of parse_documents; index_col=False keeps the
# first column as data instead of using it as the index.
df = pd.read_csv("parsed_documents.csv", sep=";", encoding="utf-8", index_col=False)

In [58]:
# Count the rows whose doc_id repeats that of an earlier row
df.duplicated(subset=["doc_id"]).sum()

np.int64(4)

In [59]:
# Keep one row per doc_id — the last one in file order — modifying df in place
df.drop_duplicates(subset=["doc_id"], keep="last", inplace=True)

In [60]:
# column_mapping = {
#     "document_type": "Тип документу",
#     "registration_number_edessb": "Реєстраційний номер в ЄДЕССБ",
#     "document_version": "Версія документу",
#     "registration_status": "Статус реєстрації",
#     "document_status": "Статус документу",
#     "version_full_text": "version",
#     "date_of_version": "date",
#     "document_name": "Документ",
#     "issuing_body_expertise": "Орган, що видав_експертиза",
#     "issuing_body_project_documentation": "Орган, що видав_ПКД",
#     "object_name": "Назва об’єкта",
#     "project_code": "Код проектної документації",
#     "expertise_area": "Напрям експертизи",
#     "chief_project_expert": "Головний експерт проекту",
#     "registration_number": "Реєстраційний номер",
#     "construction_type": "Вид будівництва",
#     "project_works_list": "Перелік видів робіт, виконаних проектувальником (генпроектувальником)",
#     "number_of_design_stages": "Кількість стадій проектування",
#     "current_design_stage": "Поточна стадія проектування",
#     "dkbs_code": "Код ДКБС",
#     "public_funding_involved": "Об'єкт споруджуються із залученням бюджетних коштів, коштів державних і комунальних підприємств, установ та організацій, а також кредитів, наданих під державні гарантії?",
#     "intellectual_property_rights": "Належність майнових прав на проектну документацію (право змінювати проектну документацію)",
#     "contract_date": "Дата договору на розробку документації",
#     "project_documentation_number": "Номер проектної документації",
#     "document_internal_status": "Статус документа",
#     "estimate_code": "Код кошторису",
#     "estimate_version": "Номер редакції кошторису",
#     "estimate_date": "Дата кошторису",
#     "declared_estimated_cost": "Заявлена кошторисна вартість, тис. грн.",
#     "client_legal_status": "Правовий статус замовника",
#     "client_name": "Назва замовника",
#     "client_edrpou": "ЄДРПОУ замовника",
#     "doc_id": "doc_id"
# }

# df_columns = pd.DataFrame([
#     {"english_name": k, "ukrainian_name": v}
#     for k, v in column_mapping.items()
# ])

# df_columns.to_csv("column_name_mapping.csv", index=False, sep=";", encoding="utf-8")
# print("✅ Файл 'column_name_mapping.csv' створено.")


In [61]:
# Build a Ukrainian -> English column-name lookup from the ';'-delimited mapping file
column_mapping_df = pd.read_csv("column_name_mapping.csv", delimiter=";")
column_mapping = dict(zip(column_mapping_df['ukrainian_name'], column_mapping_df['english_name']))

# Rename the dataframe's columns to their English names, in place
df.rename(columns=column_mapping, inplace=True)

In [62]:
# Show how many rows there are for each document_version value
df['document_version'].value_counts()

document_version
№1     66609
№2     11127
№3      2913
№4       827
№5       286
№6        98
№7        39
№8        18
№9         7
№10        6
№11        2
№13        1
№12        1
Name: count, dtype: int64

In [63]:
# Split document_name at its first ' від ' (n=1): the part before it becomes
# document_number and the part after it document_date.
df[['document_number', 'document_date']] = df['document_name'].str.split(' від ', expand=True, n=1)
# Drop the original document_name column now that both parts are stored
df.drop(columns=['document_name'], inplace=True)

In [64]:
# Save the cleaned data to a new CSV file
# df.to_csv("expertise_english_columns.csv", index=False, sep=";", encoding="utf-8")