In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import os


In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.path.dirname(os.getcwd())# Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_NoDupsLessThemesENnoWords9.xlsx")
df = pd.read_excel(file_path)

#visualize the data
print(df.head())
print(df.info())

                                             context  \
0  Ondertussen is de eerstelijnszone BruZEL al me...   
1  Ondertussen is de eerstelijnszone BruZEL al me...   
2  Ondertussen is de eerstelijnszone BruZEL al me...   
3  Ondertussen is de eerstelijnszone BruZEL al me...   
4  Ondertussen is de eerstelijnszone BruZEL al me...   

                                            question  statistical  \
0  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   
1  2.Kan de minister toelichten op welke manier B...            0   
2  3.Kan de minister in het bijzonder toelichten ...            0   
3  4.Kan de minister in het bijzonder toelichten ...            0   
4  Zoals alle eerstelijnszones kreeg ook BruZEL h...            0   

                        theme    file_name  \
0  Brussel en de Vlaamse Rand  1752898.txt   
1  Brussel en de Vlaamse Rand  1752898.txt   
2  Brussel en de Vlaamse Rand  1752898.txt   
3  Brussel en de Vlaamse Rand  1752898.txt   
4  Brussel en de V

In [3]:
import re

# Drop unnecessary columns
columns_to_drop = ["context","file_name","question","statistical"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# ✅ Drop rare themes (appearing < 2 times)
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 100].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute label encoding AFTER filtering
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)

# Amount of rows
print(f"Number of rows after filtering: {len(df)}")

Number of rows after filtering: 64572


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import time
import nltk

# === Setup
nltk.download("stopwords")
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words("dutch")

# === Define your custom theme-to-ID mapping
theme_to_id = {
    "Brussel en de Vlaamse Rand": 0,
    "Energie": 1,
    "Milieu en Landbouw": 2,
    "Toerisme": 3,
    "Economie en Arbeid": 4,
    "Sport": 5,
    "Bestuur en Beleid": 6,
    "Justitie en Handhaving": 7,
    "Cultuur en Communicatie": 8,
    "Mobiliteit en Infrastructuur": 9,
    "Welzijn en Gezondheid": 10,
    "Begroting": 11,
    "Wonen": 12,
    "Onderwijs en Samenleving": 13,
    "Internationaal Beleid": 14,
    "Onroerend erfgoed": 15,
    "Financiën": 16,
    "Wetenschap en Innovatie": 17
}

# === Apply the mapping manually
df["label"] = df["theme"].map(theme_to_id)

# Sanity check
assert df["label"].isnull().sum() == 0, "Some themes in df['theme'] are missing in the theme_to_id mapping."

# Create ID column if not present
if "id" not in df.columns:
    df = df.reset_index(drop=True)
    df["id"] = df.index

# === Split the data with row IDs tracked
X_train, X_temp, y_train, y_temp, id_train, id_temp = train_test_split(
    df["clean_text"].tolist(),
    df["label"].tolist(),
    df["id"].tolist(),
    test_size=0.3,
    random_state=42,
    stratify=df["label"]
)

X_val, X_test, y_val, y_test, id_val, id_test = train_test_split(
    X_temp, y_temp, id_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

# === Save test set with only IDs and theme IDs
pd.DataFrame({
    "clean_text": X_test,
    "label": y_test
}).to_excel("Test_data_HeldOut_15percentxgb.xlsx", index=False)

# === TF-IDF vectorization
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words=dutch_stopwords
)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jefva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.model_selection import GridSearchCV

# # === 1. Define parameter grid
# param_grid = {
#     "learning_rate": [0.1, 0.05],            # 0.1 is standard, 0.05 often helps generalization
#     "max_depth": [3, 5],                     # 3 avoids overfitting; 5 adds flexibility
#     "gamma": [0, 1],                         # 0 = no regularization; 1 = mild pruning
#     "n_estimators": [200, 400],              # train faster, early-stopping can help later
#     "colsample_bytree": [0.5, 0.8]           # moderate feature subsampling helps regularization
# }

# # === 2. Instantiate model
# xgb = XGBClassifier(
#     use_label_encoder=False,
#     eval_metric='mlogloss',
#     n_jobs=-1,
#     verbosity=1,
#     random_state=42,
#     tree_method='hist',
#     device='cuda'      # 👈 GPU prediction
# )


# # === 3. Grid Search
# grid_search = GridSearchCV(
#     estimator=xgb,
#     param_grid=param_grid,
#     scoring="f1_weighted",
#     cv=3,
#     verbose=3,  # 👈 shows each fold of each fit
#     n_jobs=-1
# )

# Minimal parameter grid for text data
param_grid = {
    'max_depth': [4],
    'learning_rate': [ 0.5],#[ 0.5,0.3, 0.1]
    'n_estimators': [400],#[250,400]
}

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1,
    verbosity=1,
    random_state=42
)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)
print("GridSearchCV wordt uitgevoerd... 🚀")
start_time = time.time()
grid_search.fit(X_train_vec, y_train)
end_time = time.time()
print(f"⏱️ Trainingstijd: {end_time - start_time:.2f} seconden")

# === 5. Predict on validation set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val_vec)

print(f"Beste parameters: {grid_search.best_params_}")
print("=== Evaluatie op validatieset ===")
print(classification_report(y_val, y_pred, zero_division=0))

# === 6. Summary table
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average="weighted", zero_division=1)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})
print(baseline_results)

# === 7. Save validation predictions to Excel (with IDs)
df_val_predictions = pd.DataFrame({
    "id": id_val,
    "True Label": y_val,
    "Predicted Label": y_pred
})
df_val_predictions.to_excel("xgb_val.xlsx", index=False)
print("📄 Validatievoorspellingen opgeslagen in 'xgb_val.xlsx'")

# === 8. Predict on test set
y_test_pred = best_model.predict(X_test_vec)

# === 9. Classification report on test set
print("=== Evaluatie op test set ===")
print(classification_report(y_test, y_test_pred, zero_division=0))

# === 10. Save test predictions to Excel (with IDs)
df_test_predictions = pd.DataFrame({
    "clean_text": X_test,
    "True Label": y_test,
    "Predicted Label": y_test_pred
})
df_test_predictions.to_excel("xgb_test.xlsx", index=False)
print("📄 Testvoorspellingen opgeslagen in 'xgb_test_predictions.xlsx'")

GridSearchCV wordt uitgevoerd... 🚀
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Parameters: { "use_label_encoder" } are not used.



⏱️ Trainingstijd: 583.41 seconden
Beste parameters: {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 400}
=== Evaluatie op validatieset ===
              precision    recall  f1-score   support

           0       0.76      0.42      0.54        84
           1       0.73      0.51      0.60       332
           2       0.55      0.66      0.60      1504
           3       0.70      0.43      0.53       162
           4       0.71      0.61      0.65       779
           5       0.60      0.25      0.35        60
           6       0.69      0.62      0.65       622
           7       0.69      0.40      0.51       129
           8       0.76      0.53      0.62       348
           9       0.67      0.86      0.76      2574
          10       0.66      0.60      0.63       996
          11       0.81      0.59      0.68       128
          12       0.71      0.54      0.61       344
          13       0.67      0.61      0.64      1157
          14       0.71      0.55      0.62

In [None]:
import optuna
import time
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

# === 1. Define the objective function ===
def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "gamma": trial.suggest_float("gamma", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000, step=500),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "use_label_encoder": False,
        "eval_metric": "mlogloss",
        "n_jobs": -1,
        "random_state": 42,
        "verbosity": 0,
        "tree_method": "hist",
        "device": "cuda"
    }

    model = XGBClassifier(**params)
    # === 3-fold CV on training set
    score = cross_val_score(model, X_train_vec, y_train, scoring="f1_weighted", cv=3).mean()
    return score

# === 2. Run Optuna Study ===
print("🚀 Optuna hyperparameter search gestart...")
start = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)  # ⏱️ Adjust number of trials as needed
end = time.time()

print(f"⏱️ Totale zoektijd: {end - start:.2f} seconden")
print("✅ Beste parameters:", study.best_params)

# === 3. Train final model with best parameters ===
best_params = study.best_params
best_params.update({
    "use_label_encoder": False,
    "eval_metric": "mlogloss",
    "n_jobs": -1,
    "random_state": 42,
    "verbosity": 1,
    "tree_method": "hist",
    "device": "cuda"
})

best_model = XGBClassifier(**best_params)
best_model.fit(X_train_vec, y_train)

# === 4. Predict on validation set ===
y_pred = best_model.predict(X_val_vec)
y_val_labels = le.inverse_transform(y_val)
y_pred_labels = le.inverse_transform(y_pred)

print("📊 Evaluatie op validatieset:")
print(classification_report(y_val, y_pred, zero_division=0))

# === 5. Score overview ===
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average="weighted", zero_division=1)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})
print(baseline_results)

# === 6. Predict on test set ===
y_test_pred = best_model.predict(X_test_vec)
y_test_labels = le.inverse_transform(y_test)
y_test_pred_labels = le.inverse_transform(y_test_pred)

print("📊 Evaluatie op testset:")
print(classification_report(y_test, y_test_pred, zero_division=0))

# === 7. Save predictions ===
df_test_predictions = pd.DataFrame({
    "Question": X_test,
    "True Label": y_test_labels,
    "Predicted Label": y_test_pred_labels
})
df_test_predictions.to_excel("xgb_test_predictions_optuna.xlsx", index=False)
print("📄 Testvoorspellingen opgeslagen in 'xgb_test_predictions_optuna.xlsx'")
