In [2]:
# 📦 Install dependencies (if not pre-installed)
!pip install pandas scikit-learn

# 🧠 Import libraries
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 📂 Upload your CSV in the sidebar or use the code below
from google.colab import files
uploaded = files.upload()

# 📄 Load dataset
df = pd.read_csv("Dutch_sample_manually_labelled - dutch_comments_with_mapped_sentiment.csv")

# 🧹 Clean and encode labels
df = df.dropna(subset=["Cleaned Comment Text"])
df = df[df["real_sentiment"].isin([-1, 0, 1])]
texts = df["Cleaned Comment Text"].astype(str).tolist()
labels = df["real_sentiment"].map({-1: 0, 0: 1, 1: 2}).values  # map to 0,1,2

# 🔠 Tokenize and pad
MAX_WORDS = 10000
MAX_LEN = 100
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN)

# 🎯 One-hot encode labels
categorical_labels = to_categorical(labels, num_classes=3)

# 🔀 Split dataset
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, categorical_labels, test_size=0.2, random_state=40, stratify=labels
)

# 🧱 Build CNN model
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=MAX_LEN),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

# ⚙️ Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 🚀 Train model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# 📊 Evaluate model
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_val, axis=1)

# 🧾 Classification report
print("📈 Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"]))
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix (Dutch CNN):")
print(cm)



Saving Dutch_sample_manually_labelled - dutch_comments_with_mapped_sentiment.csv to Dutch_sample_manually_labelled - dutch_comments_with_mapped_sentiment (1).csv
Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 133ms/step - accuracy: 0.3697 - loss: 1.0904 - val_accuracy: 0.4058 - val_loss: 1.0771
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4265 - loss: 1.0563 - val_accuracy: 0.4058 - val_loss: 1.0708
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5036 - loss: 1.0145 - val_accuracy: 0.4155 - val_loss: 1.0552
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6746 - loss: 0.9160 - val_accuracy: 0.4686 - val_loss: 1.0090
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8229 - loss: 0.6882 - val_accuracy: 0.4928 - val_loss: 0.9394
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step
📈 Classification Report:
              precision    recall  f1-score   support

    Negative       0.48      0.73      0.58        84
     Ne

In [None]:
# Step 1: Install & Import
!pip install -q tqdm

import pandas as pd
import numpy as np
import os
import re
import zipfile
import urllib.request

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm

# Step 2: Load Preprocessed English Data
df = pd.read_csv('/content/sample_english_with_real_sentiment - sample_english_with_real_sentiment.csv-2.csv')
df = df[df['real_sentiment'].isin([-1, 0, 1])]  # Keep only labeled data

texts = df["Cleaned Comment Text"].astype(str).tolist()
labels = df["real_sentiment"].tolist()

# Step 3: Tokenization
max_features = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=max_len, padding='post', truncating='post')

# Step 4: Labels to categorical
y = to_categorical(np.array([x + 1 for x in labels]), num_classes=3)

# Step 5: Train/Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=40)

# Step 6: Load GloVe Embeddings
glove_path = "glove.6B.100d.txt"
if not os.path.exists(glove_path):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    with zipfile.ZipFile("glove.6B.zip", 'r') as zip_ref:
        zip_ref.extractall()

embeddings_index = {}
with open("glove.6B.100d.txt", encoding='utf-8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

# Step 7: CNN Model with GloVe
model = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_dim, weights=[embedding_matrix],
              input_length=max_len, trainable=True),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 8: Train
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_val, y_val)
)

# Step 9: Evaluation
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1) - 1
y_true = np.argmax(y_val, axis=1) - 1

print("\n✅ Evaluation on English Manual Comments:")
print("Accuracy:", accuracy_score(y_true, y_pred_classes))
print("\nClassification Report:\n", classification_report(y_true, y_pred_classes, target_names=["Negative", "Neutral", "Positive"]))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred_classes))


Loading GloVe: 400000it [00:08, 47448.39it/s]


Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 81ms/step - accuracy: 0.3638 - loss: 1.2822 - val_accuracy: 0.4328 - val_loss: 1.0843
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4486 - loss: 1.0525 - val_accuracy: 0.4414 - val_loss: 1.0555
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5017 - loss: 1.0068 - val_accuracy: 0.4861 - val_loss: 1.0281
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5474 - loss: 0.9474 - val_accuracy: 0.5203 - val_loss: 1.0120
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6058 - loss: 0.8662 - val_accuracy: 0.5458 - val_loss: 0.9897
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6767 - loss: 0.7453 - val_accuracy: 0.5309 - val_loss: 0.9911
Epoch 7/10
[1m30/30[0m [32m━━━━━━━━━

In [None]:
print(f"Original rows: {len(df)}")

Original rows: 2344


In [None]:
df = df[df['real_sentiment'].isin([-1, 0, 1])]
print(f"Rows after filtering valid sentiments: {len(df)}")

Rows after filtering valid sentiments: 2344


In [None]:
texts = df["Cleaned Comment Text"].astype(str).tolist()
labels = df["real_sentiment"].tolist()
print(f"Texts available for training: {len(texts)}")

Texts available for training: 2344


In [3]:
# ✅ CNN (Combined Manual Comments) — Google Colab

# 1️⃣ Install dependencies (only if needed)
!pip install -q tensorflow scikit-learn

# 2️⃣ Imports
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection      import train_test_split
from sklearn.utils.class_weight   import compute_class_weight
from sklearn.metrics              import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models      import Model
from tensorflow.keras.layers      import (Input, Embedding, Conv1D, GlobalMaxPooling1D,
                                          Concatenate, Dense, Dropout)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks   import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# 3️⃣ Upload your combined CSV
from google.colab import files
print("▶️ Please upload combined_sentiment_training_data.csv")
uploaded = files.upload()
combined_csv = next(iter(uploaded))  # e.g. "combined_sentiment_training_data.csv"

# 4️⃣ Load & preprocess DataFrame
df = pd.read_csv(combined_csv)
df = df.dropna(subset=["Cleaned Comment Text"])
df = df[df["real_sentiment"].isin([-1,0,1])]

texts  = df["Cleaned Comment Text"].astype(str).tolist()
labels = df["real_sentiment"].map({-1:0, 0:1, 1:2}).to_numpy()  # map to 0/1/2

# 5️⃣ Tokenize & pad sequences
MAX_WORDS = 15000
MAX_LEN   = 150

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")

# 6️⃣ Stratified train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, labels, test_size=0.2, stratify=labels, random_state=42
)

# 7️⃣ Compute class weights
classes = np.unique(y_train)
weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print("Class weights:", class_weights)

# 8️⃣ Build a multi‐filter CNN
filter_sizes = [3,4,5]
num_filters  = 128
embed_dim    = 200

inp    = Input(shape=(MAX_LEN,))
embed  = Embedding(input_dim=MAX_WORDS,
                   output_dim=embed_dim,
                   input_length=MAX_LEN,
                   trainable=True)(inp)

pooled = []
for sz in filter_sizes:
    c = Conv1D(num_filters, sz, activation="relu")(embed)
    p = GlobalMaxPooling1D()(c)
    pooled.append(p)

merge = Concatenate()(pooled)
d1    = Dropout(0.5)(merge)
d2    = Dense(64, activation="relu")(d1)
d3    = Dropout(0.5)(d2)
out   = Dense(3, activation="softmax")(d3)

model = Model(inp, out)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
model.summary()

# 9️⃣ Callbacks
es  = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2)
mc  = ModelCheckpoint("best_cnn_combined.h5", monitor="val_loss", save_best_only=True)

# 🔟 Train
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[es, rlr, mc]
)

# 1️⃣1️⃣ Evaluate
model.load_weights("best_cnn_combined.h5")
preds = model.predict(X_val).argmax(axis=1)

print("🔹 Accuracy:", accuracy_score(y_val, preds))
print("\n🔹 Classification Report:\n",
      classification_report(y_val, preds, target_names=["Negative","Neutral","Positive"]))
print("\n🔹 Confusion Matrix:\n",
      confusion_matrix(y_val, preds))


▶️ Please upload combined_sentiment_training_data.csv


Saving combined_sentiment_training_data.csv to combined_sentiment_training_data.csv
Class weights: {np.int64(0): np.float64(0.994475138121547), np.int64(1): np.float64(1.1349306431273645), np.int64(2): np.float64(0.8982035928143712)}




Epoch 1/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.3450 - loss: 1.1038



[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 115ms/step - accuracy: 0.3450 - loss: 1.1037 - val_accuracy: 0.3580 - val_loss: 1.0927 - learning_rate: 0.0010
Epoch 2/15
[1m40/43[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.4366 - loss: 1.0660



[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4383 - loss: 1.0651 - val_accuracy: 0.4808 - val_loss: 1.0396 - learning_rate: 0.0010
Epoch 3/15
[1m40/43[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.5804 - loss: 0.9579



[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5817 - loss: 0.9554 - val_accuracy: 0.5251 - val_loss: 0.9575 - learning_rate: 0.0010
Epoch 4/15
[1m41/43[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.7704 - loss: 0.6789



[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7698 - loss: 0.6778 - val_accuracy: 0.5636 - val_loss: 0.9382 - learning_rate: 0.0010
Epoch 5/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8935 - loss: 0.3829 - val_accuracy: 0.5740 - val_loss: 0.9816 - learning_rate: 0.0010
Epoch 6/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9279 - loss: 0.2287 - val_accuracy: 0.5843 - val_loss: 1.1304 - learning_rate: 0.0010
Epoch 7/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9700 - loss: 0.1195 - val_accuracy: 0.6021 - val_loss: 1.1403 - learning_rate: 5.0000e-04
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
🔹 Accuracy: 0.5636094674556213

🔹 Classification Report:
               precision    recall  f1-score   support

    Negativ