In [9]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import os


In [2]:
# ✅ 1. Load & Preprocess Data
# Resolve the 'Data' folder two directory levels above the current working
# directory (per the original note, this steps out of the 'baseline' folder).
script_dir = os.path.dirname(os.getcwd())  # parent of the working directory
project_root = os.path.dirname(script_dir)  # one level further up
data_folder = os.path.join(project_root, "Data")

# 1. Load the dataset
file_path = os.path.join(data_folder, "Grote_data_NoDupsLessThemesENnoWords9.xlsx")
df = pd.read_excel(file_path)

# Inspect the data
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

                                             context  \
0  Ondertussen is de eerstelijnszone BruZEL al me...   
1  Ondertussen is de eerstelijnszone BruZEL al me...   
2  Ondertussen is de eerstelijnszone BruZEL al me...   
3  Ondertussen is de eerstelijnszone BruZEL al me...   
4  Ondertussen is de eerstelijnszone BruZEL al me...   

                                            question  statistical  \
0  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   
1  2.Kan de minister toelichten op welke manier B...            0   
2  3.Kan de minister in het bijzonder toelichten ...            0   
3  4.Kan de minister in het bijzonder toelichten ...            0   
4  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   

                        theme    file_name  \
0  Brussel en de Vlaamse Rand  1752898.txt   
1  Brussel en de Vlaamse Rand  1752898.txt   
2  Brussel en de Vlaamse Rand  1752898.txt   
3  Brussel en de Vlaamse Rand  1752898.txt   
4  Brussel en de V

In [3]:
import re

# Drop columns that are not used further on; errors="ignore" skips any that
# are already absent from the frame.
columns_to_drop = ["context", "file_name", "question", "statistical"]
df = df.drop(columns=columns_to_drop, errors="ignore")

# ✅ Drop rare themes (fewer than MIN_THEME_COUNT rows)
MIN_THEME_COUNT = 100
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= MIN_THEME_COUNT].index
# Explicit copy: a column is added to the filtered frame below, so it should be
# an independent object rather than a selection of the previous frame.
df = df[df["theme"].isin(valid_themes)].copy()

# ✅ Recompute label encoding AFTER filtering (ids follow order of first appearance)
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)

# Amount of rows
print(f"Number of rows after filtering: {len(df)}")

Number of rows after filtering: 64572


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import nltk

# === Setup
# Download the NLTK stop-word corpus only when it cannot be found locally, so
# re-running the cell does not trigger a download attempt every time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words("dutch")

# === Define your custom theme-to-ID mapping
theme_to_id = {
    "Brussel en de Vlaamse Rand": 0,
    "Energie": 1,
    "Milieu en Landbouw": 2,
    "Toerisme": 3,
    "Economie en Arbeid": 4,
    "Sport": 5,
    "Bestuur en Beleid": 6,
    "Justitie en Handhaving": 7,
    "Cultuur en Communicatie": 8,
    "Mobiliteit en Infrastructuur": 9,
    "Welzijn en Gezondheid": 10,
    "Begroting": 11,
    "Wonen": 12,
    "Onderwijs en Samenleving": 13,
    "Internationaal Beleid": 14,
    "Onroerend erfgoed": 15,
    "Financiën": 16,
    "Wetenschap en Innovatie": 17
}
# Rebuild the inverse lookup from the mapping above: an id_to_theme derived
# from a different theme_to_id would decode the labels created here incorrectly.
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}

# === Apply the mapping manually
df = df.assign(label=df["theme"].map(theme_to_id))

# Sanity check: a theme without an entry in theme_to_id maps to NaN
unmapped_themes = df.loc[df["label"].isna(), "theme"].unique().tolist()
assert not unmapped_themes, (
    "Some themes in df['theme'] are missing in the theme_to_id mapping: "
    f"{unmapped_themes}"
)

# Create ID column if not present (row position after resetting the index)
if "id" not in df.columns:
    df = df.reset_index(drop=True)
    df["id"] = df.index

# === Split the data with row IDs tracked
# Texts, labels and row ids are split together so every sample keeps its id.
all_texts = df["clean_text"].tolist()
all_labels = df["label"].tolist()
all_ids = df["id"].tolist()

# Stage 1: 70 % train, 30 % held back, stratified on the label.
X_train, X_temp, y_train, y_temp, id_train, id_temp = train_test_split(
    all_texts, all_labels, all_ids,
    test_size=0.3, random_state=42, stratify=all_labels,
)

# Stage 2: halve the held-back part into validation and test, again stratified.
X_val, X_test, y_val, y_test, id_val, id_test = train_test_split(
    X_temp, y_temp, id_temp,
    test_size=0.5, random_state=42, stratify=y_temp,
)

# === Save the held-out test set (row IDs, text and theme IDs)
# id_test is written alongside each sample so a held-out row can be traced
# back to its row in df; previously the ids were split off but never saved.
pd.DataFrame({
    "id": id_test,
    "clean_text": X_test,
    "label": y_test
}).to_excel("Test_data_HeldOut_15percentsvm.xlsx", index=False)

# === TF-IDF vectorization
# Unigrams and bigrams, at most 10000 features, Dutch stop words passed in.
tfidf_settings = {
    "ngram_range": (1, 2),
    "max_features": 10000,
    "stop_words": dutch_stopwords,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training texts only; validation and test are transformed with the
# already fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(split) for split in (X_val, X_test))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jefva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from sklearn.svm import LinearSVC
import time

# 4. Define parameter grid for SVM
param_grid = {
    'C': [1],  # Regularization strength; wider sweep noted earlier: [0.1,1,10,100,1000]
    'class_weight': ['balanced']  # For imbalance handling
}

# 5. Set up GridSearchCV
# NOTE(review): X_train_vec was vectorized before cross-validation, so the
# TF-IDF statistics of every CV fold also include that fold's validation rows.
svm = LinearSVC(max_iter=10000, dual=False)  # dual=False is faster for large sparse input
grid = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)

print("GridSearchCV wordt uitgevoerd... 🚀")
# perf_counter() is a monotonic high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
grid.fit(X_train_vec, y_train)
end_time = time.perf_counter()
print(f"⏱️ Trainingstijd: {end_time - start_time:.2f} seconden")

# === 5. Predict on validation set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val_vec)

print(f"Beste parameters: {grid.best_params_}")
print("=== Evaluatie op validatieset ===")
print(classification_report(y_val, y_pred, zero_division=0))

# === 6. Summary table
# zero_division=0 matches the classification reports: an undefined per-class
# metric (e.g. precision of a class that is never predicted) counts as 0.
# With zero_division=1 it counted as 1 and inflated the weighted averages.
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, y_pred, average="weighted", zero_division=0
)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})
print(baseline_results)

# === 7. Save validation predictions to Excel (with IDs)
# One row per validation sample: its row id, the true label and the predicted label.
val_prediction_columns = {
    "id": id_val,
    "True Label": y_val,
    "Predicted Label": y_pred,
}
df_val_predictions = pd.DataFrame(val_prediction_columns)

val_predictions_file = "svm.xlsx"
df_val_predictions.to_excel(val_predictions_file, index=False)
print(f"📄 Validatievoorspellingen opgeslagen in '{val_predictions_file}'")

# === 8. Predict on test set
y_test_pred = best_model.predict(X_test_vec)

# === 9. Classification report on test set
print("=== Evaluatie op test set ===")
print(classification_report(y_test, y_test_pred, zero_division=0))

# === 10. Save test predictions to Excel (with IDs)
# id_test is included so the file really carries the row ids, in line with the
# validation export; previously only text and labels were written.
df_test_predictions = pd.DataFrame({
    "id": id_test,
    "clean_text": X_test,
    "True Label": y_test,
    "Predicted Label": y_test_pred
})
df_test_predictions.to_excel("svm_test_predictions.xlsx", index=False)
print("📄 Testvoorspellingen opgeslagen in 'svm_test_predictions.xlsx'")

GridSearchCV wordt uitgevoerd... 🚀
Fitting 5 folds for each of 1 candidates, totalling 5 fits
⏱️ Trainingstijd: 11.21 seconden
Beste parameters: {'C': 1, 'class_weight': 'balanced'}
=== Evaluatie op validatieset ===
              precision    recall  f1-score   support

           0       0.30      0.52      0.38        84
           1       0.50      0.66      0.57       332
           2       0.73      0.57      0.64      1504
           3       0.36      0.54      0.43       162
           4       0.68      0.64      0.66       779
           5       0.20      0.42      0.27        60
           6       0.60      0.66      0.63       622
           7       0.34      0.58      0.43       129
           8       0.52      0.62      0.57       348
           9       0.88      0.77      0.82      2574
          10       0.66      0.62      0.64       996
          11       0.45      0.72      0.55       128
          12       0.50      0.62      0.56       344
          13       0.70    