In [1]:
# 🇳🇱 Dutch LSTM on Mac mini M4 Pro

import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# 1) File paths — update these to wherever you saved them
dutch_csv      = "/Users/feysal/Downloads/Dutch_sample_manually_labelled - dutch_comments_with_mapped_sentiment.csv"
fasttext_vec   = "/Users/feysal/Downloads/cc.nl.300.vec"

# 2) Load & filter your manually labeled Dutch comments
df = pd.read_csv(dutch_csv)
df = df.dropna(subset=["Cleaned Comment Text"])
df = df[df["real_sentiment"].isin([-1, 0, 1])]

texts = df["Cleaned Comment Text"].astype(str).tolist()
labels = df["real_sentiment"].map({-1:0, 0:1, 1:2}).to_numpy()  # map to 0/1/2 for sparse_categorical

# 3) Tokenize & pad
max_words = 10000
max_len   = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

# 4) Stratified train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, labels,
    test_size=0.20,
    stratify=labels,
    random_state=42
)

# 5) Compute class weights
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print("Class weights:", class_weights)

# 6) Load FastText Dutch embeddings
emb_dim = 300
emb_index = {}
with open(fasttext_vec, encoding="utf-8", errors="ignore") as f:
    # If the first line is a header (vocab + dim), skip it:
    first = f.readline().split()
    if len(first) != emb_dim+1:
        f.seek(0)  # not a header, rewind
    for line in tqdm(f, desc="Loading FastText"):
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec  = np.asarray(parts[1:], dtype="float32")
        emb_index[word] = vec

# 7) Build embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, emb_dim), dtype="float32")

for word, idx in word_index.items():
    if idx < max_words:
        vec = emb_index.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec

# 8) Construct the model
model = Sequential([
    Embedding(input_dim = max_words,
              output_dim = emb_dim,
              weights    = [embedding_matrix],
              input_length = max_len,
              trainable = True),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(3, activation="softmax")
])

model.compile(
    loss      = "sparse_categorical_crossentropy",
    optimizer = "adam",
    metrics   = ["accuracy"]
)
model.summary()

# 9) Callbacks
es = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
mc = ModelCheckpoint("best_dutch_lstm.h5", monitor="val_loss", save_best_only=True)

# 10) Train
history = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    epochs          = 15,
    batch_size      = 64,
    class_weight    = class_weights,
    callbacks       = [es, mc]
)

# 11) Evaluate
model.load_weights("best_dutch_lstm.h5")
y_pred = model.predict(X_val).argmax(axis=1)

print("\n🏁 Final Evaluation on Dutch Validation Set")
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n",
      classification_report(y_val, y_pred, target_names=["Negative","Neutral","Positive"]))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))

Class weights: {0: 0.8208955223880597, 1: 0.9683098591549296, 2: 1.3349514563106797}


Loading FastText: 2000001it [00:41, 48331.47it/s]


Epoch 1/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - accuracy: 0.3478 - loss: 1.1009



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 401ms/step - accuracy: 0.3460 - loss: 1.1008 - val_accuracy: 0.3816 - val_loss: 1.0881
Epoch 2/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step - accuracy: 0.4647 - loss: 1.0704



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 422ms/step - accuracy: 0.4652 - loss: 1.0697 - val_accuracy: 0.4541 - val_loss: 1.0554
Epoch 3/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377ms/step - accuracy: 0.5473 - loss: 0.9830



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 417ms/step - accuracy: 0.5479 - loss: 0.9821 - val_accuracy: 0.4638 - val_loss: 0.9797
Epoch 4/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 373ms/step - accuracy: 0.6141 - loss: 0.8576



[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 413ms/step - accuracy: 0.6164 - loss: 0.8547 - val_accuracy: 0.5314 - val_loss: 0.9314
Epoch 5/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 416ms/step - accuracy: 0.7801 - loss: 0.5912 - val_accuracy: 0.5507 - val_loss: 0.9656
Epoch 6/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 414ms/step - accuracy: 0.8476 - loss: 0.3859 - val_accuracy: 0.5459 - val_loss: 1.1243
Epoch 7/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 447ms/step - accuracy: 0.8973 - loss: 0.2776 - val_accuracy: 0.5845 - val_loss: 1.1646
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 85ms/step

🏁 Final Evaluation on Dutch Validation Set
Accuracy: 0.5314009661835749

Classification Report:
               precision    recall  f1-score   support

    Negative       0.53      0.48      0.50        84
     Neutral       0.48      0.58      0.53        71
    Positive       0.62      0