Script that converts the feature descriptions text file into a CSV file.

In [1]:
import pandas as pd
import re

In [2]:
def parse_metadata(file_path: str) -> pd.DataFrame:
    """
    Parse a numbered feature-description text file into a DataFrame.

    The expected layout is a numbered header line whose *last* parenthesised
    group is the column name, optionally followed by enumerated values:

        1. Description (multiple parentheses) (COLUMN_NAME):
        0 – no
        1 – yes

    Args:
        file_path: Path to the UTF-8 encoded descriptions file.

    Returns:
        A DataFrame with columns ["name", "description", "possible_values"],
        one row per header whose name could be extracted. "possible_values"
        is a dict mapping each integer code to its label, or the string
        "Continuous" when the item lists no enumerated values.

    Notes:
        - Headers in which no parenthesised name can be found are skipped,
          together with any enumerated values listed under them.
        - Lines that are neither a header nor an enumerated value are ignored.
        - An enumerated line is split on the separator (en dash or hyphen)
          that directly follows the code, so labels may contain dashes.
    """
    # Header line: "<number>. <anything>".
    item_pattern = re.compile(r"^(\d+)\.\s*(.*)$")
    # Enumerated value: "<code> – <label>" or "<code> - <label>". Capturing
    # both sides in one pattern guarantees the split happens at the separator
    # right after the code; a dash of the other kind inside the label can no
    # longer be mistaken for the separator.
    value_pattern = re.compile(r"^(\d+)\s*[–-]\s*(.*)$")

    data = []
    current_name = None
    current_description = None
    current_possible_values = {}

    def save_current_item() -> None:
        """Append the item in progress to ``data`` if it has a name."""
        if current_name is None:
            return
        # An item without enumerated values is treated as continuous.
        final_values = current_possible_values or "Continuous"
        data.append((current_name, current_description, final_values))

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip empty lines
            if not line:
                continue

            # 1) New item, e.g. "13. Some text (column_name):"
            item_match = item_pattern.match(line)
            if item_match:
                # Store the previous item before starting a new one.
                save_current_item()
                current_possible_values = {}

                remainder = item_match.group(2)

                # The name lives in the *last* pair of parentheses, so that
                # descriptions may contain parentheses of their own.
                last_open_idx = remainder.rfind('(')
                last_close_idx = remainder.rfind(')')

                if last_open_idx != -1 and last_close_idx > last_open_idx:
                    current_name = remainder[last_open_idx + 1:last_close_idx].strip()
                    # The description is everything before the name group,
                    # minus trailing colon(s) and then trailing period(s).
                    current_description = (
                        remainder[:last_open_idx].strip().rstrip(':').rstrip('.')
                    )
                else:
                    # No usable "(NAME)" group: the item will not be emitted.
                    current_name = None
                    current_description = remainder
                continue

            # 2) Enumerated possible value, e.g. "0 – No"
            value_match = value_pattern.match(line)
            if value_match:
                code = int(value_match.group(1))
                current_possible_values[code] = value_match.group(2).strip()

            # Any other line is intentionally ignored.

    # After the loop, save the final item if one is in progress.
    save_current_item()

    return pd.DataFrame(data, columns=["name", "description", "possible_values"])



In [3]:
# Path of the plain-text descriptions file to parse (relative to the working directory).
file_path = "../database/descriptions.txt"

In [4]:
# Parse the descriptions file into a DataFrame of name / description / possible values.
df_metadata = parse_metadata(file_path)

In [None]:
# Write the parsed metadata to CSV; index=False leaves the DataFrame index out of the file.
df_metadata.to_csv("../database/feature_descriptions.csv", index=False)