In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os


In [2]:
# ✅ 1. Load & Preprocess Data
# Project-relative locations derived from the current working directory.
# NOTE: `script_dir` is the *parent* of the working directory and
# `project_root` is its grandparent; `data_folder` is "<project_root>/Data".
script_dir = os.path.dirname(os.getcwd())
project_root = os.path.dirname(script_dir)
data_folder = os.path.join(project_root, "Data")

# 1. Load the dataset.
# The original code wrapped this absolute path in os.path.join(data_folder, ...).
# Because the second component is absolute, os.path.join discarded `data_folder`
# and the result was this path anyway -- so state it directly. The location can
# be overridden without editing the notebook by setting TRAINING_DATA_PATH.
default_file_path = r"C:\Users\jefva\Documents\Master\Thesis_s2\Code\Identifier\Trainig_data.xlsx"
file_path = os.environ.get("TRAINING_DATA_PATH", default_file_path)
df = pd.read_excel(file_path)

# Quick look at the data.
print(df.head())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

                                            question  label
0  Hoeveel personen waren er voor de Vlaamse over...      1
1  Hoeveel dagen/uren is die delegatie er geweest...      0
2                             Hoe verklaart hij dat?      0
3  Hoeveel bedroeg de totale factuur voor de Vlaa...      0
4  Verder sprak ik de Catalaanse minister van Fin...      0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2386 entries, 0 to 2385
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  2386 non-null   object
 1   label     2386 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 37.4+ KB
None


In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import time
import nltk


# === NLTK setup: make sure the stopword corpus is present, then take the
# Dutch list that the TF-IDF vectorizer below filters on.
nltk.download("stopwords")
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words("dutch")


# === Stratified 70 / 15 / 15 split (train / validation / test)
# Step 1 holds out 30% of the questions; step 2 cuts that hold-out in half.
# Both steps stratify on the label and use the same fixed seed.
questions = df["question"].tolist()
labels = df["label"].tolist()

X_train, X_temp, y_train, y_temp = train_test_split(
    questions, labels, test_size=0.3, random_state=42, stratify=df["label"]
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# === (disabled) Export the held-out test questions together with their labels
# pd.DataFrame({
#     "clean_text": X_test,
#     "label": y_test
# }).to_excel("Test_data_HeldOut_15percentlogreg.xlsx", index=False)

# === TF-IDF features: unigrams + bigrams, vocabulary capped at 10,000 terms,
# Dutch stopwords removed.
tfidf_settings = dict(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words=dutch_stopwords,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary/IDF weights are fitted on the training questions only; the
# validation and test questions are merely transformed with that fitted state.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(texts) for texts in (X_val, X_test))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jefva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:

# === 1. Hyper-parameter grid for the random forest
param_grid = {
    'n_estimators': [200, 300, 400, 600, 800],  # number of trees in the forest
    'max_features': ['sqrt', 'log2'],           # features considered at each split
    'class_weight': ['balanced']                # re-weight classes to counter imbalance
}

# === 2. Grid search with 5-fold cross-validation on the training set
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=2
)

print("GridSearchCV wordt uitgevoerd... 🚀")
start_time = time.time()
grid.fit(X_train_vec, y_train)
end_time = time.time()
print(f"⏱️ Trainingstijd: {end_time - start_time:.2f} seconden")

# === 3. Predict on the validation set with the best estimator found
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val_vec)

print(f"Beste parameters: {grid.best_params_}")
print("=== Evaluatie op validatieset ===")
print(classification_report(y_val, y_pred, zero_division=0))

# === 4. Summary table (weighted averages) for the validation set
accuracy = accuracy_score(y_val, y_pred)
# zero_division=0 matches the classification reports in this cell. The original
# used 1 here, which would make this table disagree with the report above
# whenever a class is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, y_pred, average="weighted", zero_division=0
)

baseline_results = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Score": [accuracy, precision, recall, f1]
})
print(baseline_results)

# === 5. Save validation predictions to Excel (true vs. predicted label)
val_predictions_file = "rf_val.xlsx"
df_val_predictions = pd.DataFrame({
    "True Label": y_val,
    "Predicted Label": y_pred
})
df_val_predictions.to_excel(val_predictions_file, index=False)
print(f"📄 Validatievoorspellingen opgeslagen in '{val_predictions_file}'")

# === 6. Predict on the held-out test set
y_test_pred = best_model.predict(X_test_vec)

# === 7. Classification report on the test set
print("=== Evaluatie op test set ===")
print(classification_report(y_test, y_test_pred, zero_division=0))

# === 8. Save test predictions to Excel (question text, true and predicted label)
# The file name is kept as "rf.xlsx"; the message is now built from the same
# variable, because it previously announced 'rf_test_predictions.xlsx', a file
# that was never written.
test_predictions_file = "rf.xlsx"
df_test_predictions = pd.DataFrame({
    "clean_text": X_test,
    "True Label": y_test,
    "Predicted Label": y_test_pred
})
df_test_predictions.to_excel(test_predictions_file, index=False)
print(f"📄 Testvoorspellingen opgeslagen in '{test_predictions_file}'")



GridSearchCV wordt uitgevoerd... 🚀
Fitting 5 folds for each of 10 candidates, totalling 50 fits
⏱️ Trainingstijd: 77.70 seconden
Beste parameters: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 300}
=== Evaluatie op validatieset ===
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       282
           1       0.70      0.58      0.63        76

    accuracy                           0.86       358
   macro avg       0.79      0.76      0.77       358
weighted avg       0.85      0.86      0.85       358

      Metric     Score
0   Accuracy  0.857542
1  Precision  0.850529
2     Recall  0.857542
3         F1  0.852485
📄 Validatievoorspellingen opgeslagen in 'rf_val.xlsx'
=== Evaluatie op test set ===
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       283
           1       0.70      0.52      0.60        75

    accuracy                           0.85       358
 