In [1]:
import os
import re
import pandas as pd

# Get paths
script_dir = os.getcwd()
project_root = os.path.dirname(script_dir)
folder_path = os.path.join(project_root, "Data", "ScrapeddocumentsCorneel(22-25)")
excel_path = os.path.join(project_root, "Data")

# Statistical patterns
dutch_statistical_patterns = [
    r"\b(hoeveel|aantal|percentage van|percentage|cijfer over|data over|statistieken van)\b",
    r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
    r"\b(?:verschaffen|geven|tonen|lijst|overzicht van)?\s*(de|een)?\s*(gegevens|statistieken|cijfers)\b"
]

def is_dutch_statistical(question):
    return any(re.search(pattern, question, re.IGNORECASE) for pattern in dutch_statistical_patterns)

# Metadata removal
def remove_metadata(text):
    metadata_patterns = [
        r"(?i)^title:.*", r"(?i)^thema:.*", r"(?i)^pdf link:.*", r"(?i)^thema link:.*",
        r"(?i)^SCHRIFTELIJKE VRAAG.*", r"(?i)^nr\.\s*\d+\s*$", r"(?i)^van\s+.*",
        r"(?i)^datum:.*", r"(?i)^aan\s+.*", r"(?i)^onderwerp:.*",
        r"(?i)^programma’s.*", r"(?i)^vraag van.*"
    ]
    lines = text.split("\n")

    def is_fully_uppercase(line):
        words = re.findall(r"[A-Za-zÀ-ÿ]", line)
        return words and all(word.isupper() for word in words)

    filtered_lines = [line for line in lines if not any(re.search(pattern, line) for pattern in metadata_patterns) and not is_fully_uppercase(line)]
    return "\n".join(filtered_lines).strip()

# Smart sentence splitter
def smart_sentence_split(text):
    abbreviations = [
        "t.o.v.", "a.d.h.v.", "i.v.m.", "m.b.t.", "m.a.w.", "d.w.z.",
        "z.o.z.", "o.a.", "e.d.", "e.o.", "n.a.v.", "v.w.b.", "c.q.",
        "d.d.", "m.n.", "p.m.", "r.i.p.", "s.v.p.", "t.a.v.", "t.k.",
        "t.z.t.", "z.g.a.n.", "z.s.m.", "z.n.", "z.d.", "z.m.",
    ]

    special_prefixes = [
        r"\b\d+\.", r"\bnr\.", r"\b[ivxlcdm]+\."
    ]

    abbr_map = {abbr: f"__AFKORTING_{i}__" for i, abbr in enumerate(abbreviations)}
    for abbr, placeholder in abbr_map.items():
        text = text.replace(abbr, placeholder)

    prefix_map = {}
    for i, regex in enumerate(special_prefixes):
        matches = list(re.finditer(regex, text, flags=re.IGNORECASE))
        for j, match in enumerate(matches):
            key = f"__PREFIX_{i}_{j}__"
            prefix_map[key] = match.group()
            text = text.replace(match.group(), key, 1)

    sentences = re.split(r'(?<=[.?!])\s+', text)

    restored = []
    for sentence in sentences:
        for abbr, placeholder in abbr_map.items():
            sentence = sentence.replace(placeholder, abbr)
        for key, original in prefix_map.items():
            sentence = sentence.replace(key, original)
        restored.append(sentence)

    return restored

# Main question extraction
def extract_questions_with_custom_subgrouping(text):
    text = remove_metadata(text)
    sentences = smart_sentence_split(text)

    extracted_data = []
    prev_non_question_sentences = []
    current_question_group = []

    for i, sentence in enumerate(sentences):
        is_question = re.search(r"\?", sentence)
        is_subquestion = re.match(r"^\s*([a-z]\)|[ivxlcdm]+\.)?\s*(Zo ja|Graag)\b", sentence, re.IGNORECASE)
        is_numbered_list_item = re.match(r"^\s*\d+\.", sentence)
        is_lettered_list_item = re.match(r"^\s*[a-z]\)", sentence, re.IGNORECASE)
        is_roman_item = re.match(r"^\s*[ivxlcdm]+\.", sentence, re.IGNORECASE)
        is_nr_item = re.match(r"^\s*nr\.", sentence, re.IGNORECASE)

        # Get previous sentence if it exists
        previous_sentence = sentences[i - 1] if i > 0 else ""
        previous_is_question = re.search(r"\?", previous_sentence)

        if is_question:
            if current_question_group and is_subquestion:
                current_question_group.append(sentence)
            else:
                if current_question_group:
                    context = " ".join(prev_non_question_sentences[-2:])
                    full_question = " ".join(current_question_group)
                    extracted_data.append((context, full_question))
                    current_question_group = []
                current_question_group.append(sentence)

        elif is_subquestion and not previous_is_question:
            # Graag-line not following a question → treat as standalone question
            if current_question_group:
                context = " ".join(prev_non_question_sentences[-2:])
                full_question = " ".join(current_question_group)
                extracted_data.append((context, full_question))
                current_question_group = []

            current_question_group.append(sentence)

        else:
            if (
                not is_numbered_list_item and
                not is_lettered_list_item and
                not is_roman_item and
                not is_nr_item and
                not re.search(r"\?", sentence) and
                not is_subquestion
            ):
                prev_non_question_sentences.append(sentence)
                if len(prev_non_question_sentences) > 2:
                    prev_non_question_sentences.pop(0)

            if current_question_group:
                context = " ".join(prev_non_question_sentences[-2:])
                full_question = " ".join(current_question_group)
                extracted_data.append((context, full_question))
                current_question_group = []

    return extracted_data


# Process all files
data = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            theme_match = re.search(r"thema:\s*(.+)", content, re.IGNORECASE)
            theme = theme_match.group(1).strip() if theme_match else "Unknown"
            questions_with_custom_subgrouping = extract_questions_with_custom_subgrouping(content)

            for context, question in questions_with_custom_subgrouping:
                data.append({
                    "context": context,
                    "question": question,
                    "statistical": 1 if is_dutch_statistical(question) else 0,
                    "theme": theme,
                    "file_name": file_name
                })

# Export to Excel
df = pd.DataFrame(data)
df = df.applymap(lambda x: ''.join(c for c in str(x) if c.isprintable()) if isinstance(x, str) else x)
output_excel_path = os.path.join(excel_path, "Grote_data_cleaned.xlsx")
df.to_excel(output_excel_path, index=False, engine="openpyxl")

print(f"✅ Extraction complete! File saved at: {output_excel_path}")

  df = df.applymap(lambda x: ''.join(c for c in str(x) if c.isprintable()) if isinstance(x, str) else x)


✅ Extraction complete! File saved at: C:\Users\jefva\Documents\Master\Thesis_s2\Code\Data\Grote_data_cleaned.xlsx
