In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.getcwd() # Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_cleaned.xlsx")
df = pd.read_excel(file_path)

#visualize the data
print(df.head())
print(df.info())

                                             context  \
0  Een draft van deze visienota  is opgemaakt en ...   
1  Een draft van deze visienota  is opgemaakt en ...   
2  Een draft van deze visienota  is opgemaakt en ...   
3  Daarna  zal Fluvius  de visienota indienen bij...   
4  Wanneer ze zich daarna  opnieuw willen  inzett...   

                                            question  statistical    theme  \
0  1.Kan de VREG de opgestelde visienota van Fluv...            0  Energie   
1     a)Wat zijn de krijtlijnen  van deze visienota?            0  Energie   
2  b)Welke acties zullen op basis van deze visien...            0  Energie   
3  2.Wat is de reactie van de VREG op deze visien...            0  Energie   
4  1.Hoeveel 50-54, 55-59  en 60-plussers  zijn s...            1     Werk   

     file_name  
0  1919076.txt  
1  1919076.txt  
2  1919076.txt  
3  1919076.txt  
4  1919282.txt  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39125 entries, 0 to 39124
Data columns (to

In [3]:
# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])
    df = df.drop(columns=["statistical"])


# Handle missing values
df = df.dropna(subset=["question"])
df["context"].fillna("", inplace=True)

# Clean text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'\b[a-z]\)\s+', ' ', text)  # Remove patterns like 'a)', 'b)', etc.
    text = re.sub(r'\b\d+\.\b', '', text)  # Remove patterns like '1.', '2.', etc.
    text = re.sub(r'\b\d+\)\b', '', text)  # Remove patterns like '1)', '2)', etc.
    text = re.sub(r'\b[i]+[.)]\b', '', text, flags=re.IGNORECASE)  # Remove patterns like 'i.', 'ii.', 'i)', etc.
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and trim
    return text

#df["clean_text"] = (df["context"] + " " + df["question"]).apply(clean_text)
df["clean_text"] = (df["question"]).apply(clean_text)

# Group by 'clean_text' and count unique themes
duplicates_with_diff_themes = df.groupby("clean_text")["theme"].nunique().reset_index()

# Filter rows where the number of unique themes is greater than 1
duplicates_with_diff_themes = duplicates_with_diff_themes[duplicates_with_diff_themes["theme"] > 1]

# Merge back with the original dataframe to get all rows with these 'clean_text'
filtered_df = df[df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]
# Exclude rows with these 'clean_text' from the original dataframe
df = df[~df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]


# ✅ Now: drop rare themes using original theme names
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 2].index
df = df[df["theme"].isin(valid_themes)]




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Prepare features and labels
X = df["clean_text"]
y = df["theme"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# TF-IDF + SVM pipeline
svm_model = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=10000),            #maybe try binary=True also to see if it helps (Question Classification using Support Vector Machines wee sun lee)
    LinearSVC()
)

# Train
svm_model.fit(X_train, y_train)

# Predict + Evaluate
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

                                                                     precision    recall  f1-score   support

                                                      Armoedebeleid       0.57      0.52      0.54        31
                                                          Begroting       0.73      0.63      0.68        68
                                         Brussel en de Vlaamse Rand       0.72      0.50      0.59        36
                                                 Buitenlands beleid       0.69      0.65      0.67       160
                                                            Cultuur       0.53      0.45      0.48       158
                                                      Dierenwelzijn       0.67      0.56      0.61       128
                                                           Economie       0.58      0.48      0.53       256
                                                            Energie       0.57      0.55      0.56       292
                  

Question + context for train and only question for Test

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Prepare labels
y = df["theme"]

# Train-test split (stratified)
train_df, test_df = train_test_split(df, stratify=y, test_size=0.2, random_state=42)

# Use full context + question for training
X_train = train_df["clean_text"]  # this is context + question
y_train = train_df["theme"]

# Use only the question for testing
X_test = test_df["question"].fillna("")  # only the raw question
y_test = test_df["theme"]

# TF-IDF + SVM pipeline
svm_model = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=10000),
    LinearSVC()
)

# Train
svm_model.fit(X_train, y_train)

# Predict + Evaluate
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))


                                                                     precision    recall  f1-score   support

                                                      Armoedebeleid       0.00      0.00      0.00        24
                                                          Begroting       1.00      0.42      0.59        43
                                         Brussel en de Vlaamse Rand       0.73      0.31      0.43        26
                                                 Buitenlands beleid       0.70      0.54      0.61       158
                                                            Cultuur       0.42      0.16      0.23       113
                                                      Dierenwelzijn       0.69      0.29      0.41        85
                                                           Economie       0.43      0.21      0.28       179
                                                            Energie       0.31      0.29      0.30       212
                  