In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
import os


In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.path.dirname(os.getcwd())# Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_NoDupsLessThemes.xlsx")
df = pd.read_excel(file_path)

#visualize the data
print(df.head())
print(df.info())

                                             context  \
0  Ondertussen is de eerstelijnszone BruZEL al me...   
1  Ondertussen is de eerstelijnszone BruZEL al me...   
2  Ondertussen is de eerstelijnszone BruZEL al me...   
3  Ondertussen is de eerstelijnszone BruZEL al me...   
4  Ondertussen is de eerstelijnszone BruZEL al me...   

                                            question  statistical  \
0  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   
1                            a)Wat liep er moeilijk?            0   
2  Met welke  uitdagingen  werd BruZEL het afgelo...            0   
3      Hoe kunnen  die uitdagingen worden aangepakt?            0   
4                                b)Wat liep er goed?            0   

                        theme    file_name  \
0  Brussel en de Vlaamse Rand  1752898.txt   
1  Brussel en de Vlaamse Rand  1752898.txt   
2  Brussel en de Vlaamse Rand  1752898.txt   
3  Brussel en de Vlaamse Rand  1752898.txt   
4  Brussel en de V

In [3]:
# Drop unnecessary columns
# Drop unnecessary columns
# Drop unnecessary columns
columns_to_drop = ["context","file_name","question","statistical"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# ✅ Drop rare themes (appearing < 2 times)
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 100].index
df = df[df["theme"].isin(valid_themes)]



# Amount of rows
print(f"Number of rows after filtering: {len(df)}")

Number of rows after filtering: 92503


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier


# 1. Encode labels
le = LabelEncoder()
y = le.fit_transform(df["theme"])
X = df["clean_text"]  

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 3. TF-IDF vectorization with Dutch stopwords and bigrams
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")

dutch_stopwords = stopwords.words("dutch")

vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words=dutch_stopwords
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jefva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:

# Minimal parameter grid for text data
param_grid = {
    'max_depth': [4],
    'learning_rate': [ 0.3],
    'n_estimators': [250],
}

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    verbosity=1,
    random_state=42
)

grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# 4. Fit (X_train must be raw text!)
print("GridSearchCV wordt uitgevoerd... 🚀")
grid.fit(X_train_vec, y_train)

# 5. Predict on test set
best_model = grid.best_estimator_

# 7. Predict + decode
y_pred = best_model.predict(X_test_vec)
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)



print(f"Beste parameters: {grid.best_params_}")
print(classification_report(y_test, y_pred, zero_division=0))

# 7. Summary table
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=1)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})

print(baseline_results)

GridSearchCV wordt uitgevoerd... 🚀
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.



Beste parameters: {'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 250}
              precision    recall  f1-score   support

           0       0.81      0.52      0.63       254
           1       0.74      0.57      0.64      1151
           2       0.70      0.39      0.50       155
           3       0.72      0.39      0.51       636
           4       0.71      0.53      0.61      1614
           5       0.75      0.44      0.56       649
           6       0.85      0.48      0.61       224
           7       0.73      0.55      0.62       397
           8       0.83      0.36      0.50       247
           9       0.52      0.56      0.54      2851
          10       0.53      0.89      0.66      5024
          11       0.70      0.52      0.60      2120
          12       0.81      0.35      0.49       207
          13       0.82      0.26      0.39       105
          14       0.75      0.41      0.53       281
          15       0.73      0.45      0.56      1903
   