**Code die questions haalt uit de txt files en ze in een excel plaatst met een thema en context zinnen. Onze definitie van een statistiche vraag: een vraag waarop een numeriek antwoord, gehaald uit een database, kan gegeven worden. De methodologie van hoe en op basis van welke woorden moet nog verder bekekenen en onderzocht worden. Momenteel plaatsen we bij elke vraag 2 contextzinnen. We gaan nog op zoek naar de ideale hoeveelheid context.**

In [1]:
#aanhef verwijderen zodat die niet mee in vraag of context komt alsook de full caption text in het midden
#Identifier vraagwoorden veranderd
import os
import re
import pandas as pd

"""
Script to extract questions from text files while:
✔ Skipping metadata (e.g., title, theme, PDF link, sender, recipient, date).
✔ Grouping sub-questions **only when they follow a question and start with "Zo ja" or "Graag".**
✔ Keeping all other questions separate.
✔ Ensuring that numbered list items (1., 2., etc.) are NOT used as context.
✔ Assigning the last two **non-question** sentences as context.
✔ Identifying statistical questions.
✔ Extracting the theme from the file.
"""



# Haal de huidige werkdirectory op (Jupyter gebruikt geen __file__)
script_dir = os.getcwd()# Ga één map omhoog om 'Identifier' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'Identifier'
folder_path = os.path.join(project_root, "Data", "ScrapeddocumentsCorneel(22-25)")
excel_path = os.path.join(project_root, "Data")


# Define Dutch patterns for identifying statistical questions
dutch_statistical_patterns = [
    r"\b(hoeveel|aantal|percentage van|percentage|cijfer over|data over|statistieken van)\b",
    r"\b(trend in|evolutie van|groei van|toename van|afname van|ontwikkeling van)\b",
    r"\b(?:verschaffen|geven|tonen|lijst|overzicht van)?\s*(de|een)?\s*(gegevens|statistieken|cijfers)\b"
]

# Function to determine if a question is statistical
def is_dutch_statistical(question):
    return any(re.search(pattern, question, re.IGNORECASE) for pattern in dutch_statistical_patterns)

# Function to remove metadata from the text
def remove_metadata(text):
    """
    Removes introductory metadata such as title, theme, PDF link, sender, recipient, date, and headers.
    Also removes any line that contains "MINISTER" in full uppercase and any fully capitalized lines.
    """
    # Define patterns to detect metadata (title, theme, sender, recipient, etc.)
    metadata_patterns = [
        r"(?i)^title:.*",  # Matches "title: ..."
        r"(?i)^thema:.*",  # Matches "thema: ..."
        r"(?i)^pdf link:.*",  # Matches "pdf link: ..."
        r"(?i)^thema link:.*",  # Matches "thema link: ..."
        r"(?i)^SCHRIFTELIJKE VRAAG.*",  # Matches "SCHRIFTELIJKE VRAAG ..."
        r"(?i)^nr\.\s*\d+\s*$",  # Matches "nr. 3 ..."
        r"(?i)^van\s+.*",  # Matches "van ANKE VAN DERMEERSCH"
        r"(?i)^datum:.*",  # Matches "datum: 23 juli 2024"
        r"(?i)^aan\s+.*",  # Matches "aan BENJAMIN DALLE"
        r"(?i)^onderwerp:.*",  # Matches "onderwerp: ..."
        r"(?i)^programma’s.*",  # Matches "Programma’s ..."
        r"(?i)^vraag van.*",  # Matches "vraag van ..."
    ]

    # Split text into lines
    lines = text.split("\n")

    # Function to check if a line is fully uppercase (ignoring numbers/symbols)
    def is_fully_uppercase(line):
        words = re.findall(r"[A-Za-zÀ-ÿ]", line)  # Get only letters (ignoring numbers/punctuation)
        return words and all(word.isupper() for word in words)  # Check if all words are uppercase

    # Remove lines that match metadata patterns or are fully uppercase
    filtered_lines = [line for line in lines if not any(re.search(pattern, line) for pattern in metadata_patterns) and not is_fully_uppercase(line)]

    # Return cleaned content (excluding metadata)
    return "\n".join(filtered_lines).strip()


# Function to extract questions while excluding metadata
def extract_questions_with_custom_subgrouping(text):
    """
    Extracts questions from text, grouping only when a question is immediately followed by 
    "Zo ja" or "Graag". Ensures other questions remain separate.
    Also ensures that numbered list items (1., 2.) are NOT used as context.

    Args:
    - text (str): The full document text.

    Returns:
    - List of tuples (context, question)
    """
    # Remove metadata first
    text = remove_metadata(text)

    import re

def smart_sentence_split(text):
    abbreviations = [
        "t.o.v.", "a.d.h.v.", "i.v.m.", "m.b.t.", "m.a.w.", "d.w.z.",
        "z.o.z.", "o.a.", "e.d.", "e.o.", "n.a.v.", "v.w.b.", "c.q.",
        "d.d.", "m.n.", "p.m.", "r.i.p.", "s.v.p.", "t.a.v.", "t.k.",
        "t.z.t.", "z.g.a.n.", "z.s.m.", "z.n.", "z.d.", "z.m.",
    ]

    # Create a mapping: abbreviation -> safe placeholder (e.g., t.o.v. -> __AFKORTING_0__)
    abbr_map = {abbr: f"__AFKORTING_{i}__" for i, abbr in enumerate(abbreviations)}
    for abbr, placeholder in abbr_map.items():
        text = text.replace(abbr, placeholder)

    # Now split sentences without worrying about abbreviation dots
    pattern = r'''
        (?<!\b\d)              # not after digit (to avoid 1., 2.)
        (?<!\bnr)              # not after "nr"
        (?<!\b[i,v,x]{1,5})    # not after roman numerals
        (?<=[?.!])             # lookbehind for end punctuation
        \s+                    # whitespace follows
    '''
    sentences = re.split(pattern, text, flags=re.IGNORECASE | re.VERBOSE)

    # Replace placeholders back to abbreviations
    restored = []
    for sentence in sentences:
        for abbr, placeholder in abbr_map.items():
            sentence = sentence.replace(placeholder, abbr)
        restored.append(sentence)

    return restored


    # Split text into sentences
    sentences = smart_sentence_split(text)


    extracted_data = []
    prev_non_question_sentences = []  # Tracks last two non-question sentences (excluding numbered list items)
    current_question_group = []  # Holds grouped main+sub-questions

    for i, sentence in enumerate(sentences):
        # Check if the sentence is a question
        is_question = re.search(r"\?", sentence)

        # Check if it starts with "Zo ja" or "Graag" (indicating a sub-question)
        is_subquestion = re.match(r"^\s*(Zo ja|Graag)\b", sentence, re.IGNORECASE)

        # Check if it's a numbered list item (e.g., "1.", "2.")
        is_numbered_list_item = re.match(r"^\s*\d+\.", sentence)

        if is_question:
            # If we're already grouping a main question and this sentence is "Zo ja"/"Graag", add it
            if current_question_group and is_subquestion:
                current_question_group.append(sentence)
            else:
                # If we already have a grouped question, finalize and store it
                if current_question_group:
                    context = " ".join(prev_non_question_sentences[-2:])  # Take last two non-question sentences
                    full_question = " ".join(current_question_group)  # Combine main question + sub-questions
                    extracted_data.append((context, full_question))
                    current_question_group = []  # Reset the group

                current_question_group.append(sentence)  # Start new question group
        else:
            # If it's a non-question sentence and NOT a numbered list item, add it to the tracking list
            # If it's a non-question sentence, not a numbered list item, and does NOT start with "Graag"
            if (
                not is_numbered_list_item and
                not re.search(r"\?", sentence) and
                not sentence.strip().lower().startswith("graag")
            ):
                prev_non_question_sentences.append(sentence)
                if len(prev_non_question_sentences) > 2:
                    prev_non_question_sentences.pop(0)  # Keep only the last two valid context sentences


            # If we were collecting a question group and hit a non-question, finalize it
            if current_question_group:
                context = " ".join(prev_non_question_sentences[-2:])
                full_question = " ".join(current_question_group)
                extracted_data.append((context, full_question))
                current_question_group = []  # Reset

    return extracted_data

# Process each file
data = []
for file_name in os.listdir(folder_path):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

            # Extract the theme (assuming it's always at the start of the file)
            theme_match = re.search(r"thema:\s*(.+)", content, re.IGNORECASE)
            theme = theme_match.group(1).strip() if theme_match else "Unknown"

            # Extract questions with updated grouping logic
            questions_with_custom_subgrouping = extract_questions_with_custom_subgrouping(content)

            # Store extracted questions and their classification
            for context, question in questions_with_custom_subgrouping:
                data.append({
                    "context": context,
                    "question": question,
                    "statistical": 1 if is_dutch_statistical(question) else 0,
                    "theme": theme,
                    "file_name": file_name  # Add the file name as a column
                })

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to Excel file
output_excel_path = os.path.join(excel_path, "Grote_data_cleaned.xlsx")
# Remove NULL bytes and control characters before saving
df = df.applymap(lambda x: ''.join(c for c in str(x) if c.isprintable()) if isinstance(x, str) else x)

df.to_excel(output_excel_path, index=False, engine="openpyxl")

print(f"Extraction complete! The file is saved at: {output_excel_path}")

TypeError: 'NoneType' object is not iterable