In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import os


In [2]:
# âœ… 1. Load & Preprocess Data
script_dir = os.getcwd() # Ga Ã©Ã©n map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_NoDupsLessThemesENnoWords9.xlsx")
df = pd.read_excel(file_path)

#visualize the data
print(df.head())
print(df.info())

                                             context  \
0  Ondertussen is de eerstelijnszone BruZEL al me...   
1  Ondertussen is de eerstelijnszone BruZEL al me...   
2  Ondertussen is de eerstelijnszone BruZEL al me...   
3  Ondertussen is de eerstelijnszone BruZEL al me...   
4  Ondertussen is de eerstelijnszone BruZEL al me...   

                                            question  statistical  \
0  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   
1                            a)Wat liep er moeilijk?            0   
2  Met welke  uitdagingen  werd BruZEL het afgelo...            0   
3      Hoe kunnen  die uitdagingen worden aangepakt?            0   
4                                b)Wat liep er goed?            0   

                        theme    file_name  \
0  Brussel en de Vlaamse Rand  1752898.txt   
1  Brussel en de Vlaamse Rand  1752898.txt   
2  Brussel en de Vlaamse Rand  1752898.txt   
3  Brussel en de Vlaamse Rand  1752898.txt   
4  Brussel en de V

In [3]:
# Drop unnecessary columns
# Drop unnecessary columns
columns_to_drop = [ "file_name", "statistical"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])




#amount of rows 
print(f"Number of rows after filtering: {len(df)}")


Number of rows after filtering: 92503


In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["theme"]

# âœ… Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Original dataset size: {X_train.shape[0]}")

Original dataset size: 74002
After Random Oversampling: 74002


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0]
}

grid = GridSearchCV(
    MultinomialNB(),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# 4. Fit (X_train must be raw text!)
print("GridSearchCV wordt uitgevoerd... ðŸš€")
grid.fit(X_train, y_train)

# 5. Predict on test set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)


print(f"Beste parameters: {grid.best_params_}")
print(classification_report(y_test, y_pred, zero_division=0))

# 7. Summary table
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=1)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})

print(baseline_results)


GridSearchCV wordt uitgevoerd... ðŸš€
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Beste parameters: {'alpha': 0.01}
                              precision    recall  f1-score   support

                   Begroting       0.84      0.40      0.54       272
           Bestuur en Beleid       0.72      0.52      0.61      1147
  Brussel en de Vlaamse Rand       0.91      0.26      0.41       165
     Cultuur en Communicatie       0.70      0.42      0.53       629
          Economie en Arbeid       0.66      0.61      0.64      1575
                     Energie       0.79      0.45      0.57       621
                   FinanciÃ«n       0.78      0.53      0.63       233
       Internationaal Beleid       0.81      0.50      0.62       390
      Justitie en Handhaving       0.87      0.33      0.48       260
          Milieu en Landbouw       0.54      0.68      0.60      2906
Mobiliteit en Infrastructuur       0.66      0.86      0.75      4985
    Onderwijs en Samenlevi