In [None]:
from google.colab import files
import pandas as pd

import io

# files.upload() returns {original filename: file bytes}. Colab may save the
# file under a different name on disk when that name already exists
# (e.g. "data (1).csv"), so reading by the dict key can silently load an older
# upload. Parsing the returned bytes always uses the file just selected.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; select a CSV file and re-run this cell.")

# Only the first uploaded file is used.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

Saving combined_sentiment_training_data.csv to combined_sentiment_training_data.csv


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Clean: drop rows without comment text and keep only the three known sentiment values.
df = df.dropna(subset=["Cleaned Comment Text"])
df = df[df["real_sentiment"].isin([-1, 0, 1])]

texts = df["Cleaned Comment Text"].astype(str).tolist()
labels = [s + 1 for s in df["real_sentiment"]]  # -1,0,1 → 0,1,2

# Tokenize: cap the vocabulary at max_words and pad/truncate every comment to max_len tokens.
max_words = 10000
max_len = 200
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len)
y = to_categorical(np.array(labels), num_classes=3)  # one-hot targets for categorical_crossentropy

# Stratify on the integer labels so train and validation keep the same class
# proportions (the later BiLSTM pipeline in this notebook splits the same way).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=labels, random_state=42
)

In [None]:
!wget -q https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz

embedding_index = {}
with open("cc.en.300.vec", encoding="utf-8", newline="\n", errors="ignore") as f:
    next(f)  # Skip header
    for line in f:
        values = line.rstrip().split(" ")
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector

# Build embedding matrix
embedding_dim = 300
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i < max_words:
        vec = embedding_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

from tensorflow.keras import Input
from tensorflow.keras.initializers import Constant

# Frozen pretrained embeddings -> LSTM -> small dense head -> 3-way softmax.
# The pretrained matrix is supplied through `embeddings_initializer` rather than
# the legacy `weights=` argument, and the sequence length is declared with an
# explicit Input instead of the deprecated `input_length=`, so the model is
# built (and summary() shows real shapes) before training.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=embedding_dim,
              embeddings_initializer=Constant(embedding_matrix), trainable=False),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

history = model.fit(X_train, y_train, epochs=6, batch_size=64, validation_data=(X_val, y_val))

# Evaluate: convert softmax outputs / one-hot targets back to the original
# sentiment values (class index 0,1,2 -> sentiment -1,0,1).
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1) - 1
y_true = np.argmax(y_val, axis=1) - 1

# Pass the label set explicitly: with inferred labels, a class that appears in
# neither y_true nor y_pred_classes makes the three target_names mismatch the
# detected labels and changes the shape of the confusion matrix.
sentiment_values = [-1, 0, 1]

print("Accuracy:", accuracy_score(y_true, y_pred_classes))
print("Classification Report:\n", classification_report(
    y_true, y_pred_classes,
    labels=sentiment_values,
    target_names=["Negative", "Neutral", "Positive"],
    zero_division=0,  # a never-predicted class scores 0 instead of emitting a warning
))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred_classes, labels=sentiment_values))

Epoch 1/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step - accuracy: 0.3888 - loss: 1.0907 - val_accuracy: 0.4334 - val_loss: 1.0665
Epoch 2/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.4198 - loss: 1.0688 - val_accuracy: 0.4896 - val_loss: 1.0394
Epoch 3/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.4946 - loss: 1.0200 - val_accuracy: 0.5000 - val_loss: 0.9987
Epoch 4/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5051 - loss: 1.0147 - val_accuracy: 0.4837 - val_loss: 1.0040
Epoch 5/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5317 - loss: 0.9606 - val_accuracy: 0.5266 - val_loss: 0.9745
Epoch 6/6
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5651 - loss: 0.9193 - val_accuracy: 0.5133 - val_loss: 0.9962
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-05-19 11:42:02--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-19 11:42:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-19 11:42:02--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
Unable to establish SSL connection.
unzip:  cannot find or open glove.6B.zip, glove.6B.zip.zip or glove.6B.zip.ZIP.


In [None]:
!pip install -q kagglehub

import kagglehub

path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")

print(" Downloaded GloVe path:", path)

glove_file = f"{path}/glove.6B.100d.txt"

import numpy as np
from tqdm import tqdm

embeddings_index = {}
with open(glove_file, encoding='utf-8') as f:
    for line in tqdm(f, desc=" Loading GloVe Embeddings"):
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print(f" Loaded {len(embeddings_index)} word vectors.")



Downloading from https://www.kaggle.com/api/v1/datasets/download/danielwillgeorge/glove6b100dtxt?dataset_version_number=1...


100%|██████████| 131M/131M [00:00<00:00, 211MB/s]

Extracting files...





✅ Downloaded GloVe path: /root/.cache/kagglehub/datasets/danielwillgeorge/glove6b100dtxt/versions/1


🔤 Loading GloVe Embeddings: 400000it [00:08, 46366.28it/s]

✅ Loaded 400000 word vectors.





In [None]:
# Install & imports
!pip install -q kagglehub
!pip install -q tensorflow

import os
import numpy as np
import pandas as pd
from google.colab import files
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# 1) Upload & load your English CSV
import io

# files.upload() returns {original filename: file bytes}. When a file with the
# same name already exists, Colab saves the new upload under a renamed path
# (e.g. "name (5).csv"), so reading by the dict key would load an older upload.
# Parsing the returned bytes always uses the file that was just selected.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; select a CSV file and re-run this cell.")

# Only the first uploaded file is used.
english_csv, english_csv_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(english_csv_bytes))

# 2) Keep rows with a known sentiment and non-missing text, then derive the
#    text list and integer class ids (–1→0, 0→1, 1→2)
has_known_sentiment = df["real_sentiment"].isin([-1, 0, 1])
df = df.loc[has_known_sentiment].dropna(subset=["Cleaned Comment Text"])

comment_text = df["Cleaned Comment Text"]
texts = comment_text.astype(str).tolist()

sentiment = df["real_sentiment"]
labels = sentiment.map({-1: 0, 0: 1, 1: 2}).to_numpy()

# 3) Tokenize + pad sequences
#    Vocabulary is capped at max_words; every comment is padded/truncated at
#    the end ("post") to exactly max_len token ids.
max_words = 10000
max_len   = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

X = pad_sequences(
    tokenizer.texts_to_sequences(texts),
    maxlen=max_len,
    padding="post",
    truncating="post",
)

# 4) Train-validation split (80/20, stratified on the class ids)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# 5) Compute class weights
#    "balanced" weights are inversely proportional to class frequency in y_train.
present_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=y_train
)
# Key each weight by its actual class id rather than by position: positions
# only coincide with class ids when every class 0..k-1 occurs in y_train.
# Plain int/float values also keep the printed dict readable.
cw = {int(label): float(weight) for label, weight in zip(present_classes, class_weights)}
print("Class weights:", cw)

# 6) Fetch GloVe via KaggleHub and load embeddings
#    emb_index maps each word in the GloVe file to its float32 vector.
glove_path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")
glove_file = os.path.join(glove_path, "glove.6B.100d.txt")

emb_dim = 100
emb_index = {}
with open(glove_file, encoding="utf-8") as glove_handle:
    for raw_line in glove_handle:
        # First whitespace-separated field is the word, the rest are its coefficients.
        fields = raw_line.split()
        emb_index[fields[0]] = np.asarray(fields[1:], dtype="float32")

# 7) Build embedding matrix
#    Row i holds the GloVe vector of the word with tokenizer index i; rows for
#    words without a GloVe vector (and row 0) stay all-zero.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, emb_dim))
for token, row in word_index.items():
    if row >= max_words:
        continue  # outside the capped vocabulary
    glove_vector = emb_index.get(token)
    if glove_vector is not None:
        embedding_matrix[row] = glove_vector

# 8) Build the model
#    GloVe-initialised, fine-tunable embeddings -> BiLSTM -> dense head -> 3-way softmax.

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# shuffling are repeatable across runs (same seed as the train/val split).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declaring the input shape builds the model up front, so summary() shows
    # real shapes; this replaces the deprecated `input_length=` argument.
    tf.keras.Input(shape=(max_len,)),
    Embedding(max_words, emb_dim,
              # Pretrained matrix supplied via the initializer instead of the
              # legacy `weights=` argument.
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              # Sequences are post-padded with 0; masking keeps the LSTM from
              # consuming the trailing padding after the real tokens.
              mask_zero=True,
              trainable=True),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(3, activation="softmax")
])
# Labels are integer class ids (0/1/2), hence the sparse loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)
model.summary()

# 9) Callbacks: stop once val_loss has not improved for 3 epochs (restoring the
#    best weights), and write the best-val_loss model to best_lstm.h5
es = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)
mc = ModelCheckpoint(
    "best_lstm.h5",
    monitor="val_loss",
    save_best_only=True,
)
training_callbacks = [es, mc]

# 10) Train with class weights (up to 15 epochs, batches of 64)
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_data=(X_val, y_val),
    class_weight=cw,
    callbacks=training_callbacks,
)

# 11) Load best weights & evaluate
model.load_weights("best_lstm.h5")
preds = model.predict(X_val).argmax(axis=1)  # class id with the highest softmax probability

# Pass the class ids explicitly: with inferred labels, a class that appears in
# neither y_val nor preds makes the three target_names mismatch the detected
# labels and changes the shape of the confusion matrix.
class_ids = [0, 1, 2]

print("Accuracy:", accuracy_score(y_val, preds))
print("Classification Report:\n", classification_report(y_val, preds,
      labels=class_ids,
      target_names=["Negative","Neutral","Positive"],
      zero_division=0))  # a never-predicted class scores 0 instead of emitting a warning
print("Confusion Matrix:\n", confusion_matrix(y_val, preds, labels=class_ids))


Saving sample_english_with_real_sentiment - sample_english_with_real_sentiment.csv-2.csv to sample_english_with_real_sentiment - sample_english_with_real_sentiment.csv-2 (5).csv
Class weights: {0: np.float64(1.0964912280701755), 1: np.float64(1.2278978388998036), 2: np.float64(0.785175879396985)}




Epoch 1/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880ms/step - accuracy: 0.3618 - loss: 1.1118



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 966ms/step - accuracy: 0.3624 - loss: 1.1115 - val_accuracy: 0.3220 - val_loss: 1.0887
Epoch 2/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 866ms/step - accuracy: 0.4282 - loss: 1.0654



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 912ms/step - accuracy: 0.4293 - loss: 1.0651 - val_accuracy: 0.4286 - val_loss: 1.0688
Epoch 3/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 875ms/step - accuracy: 0.4924 - loss: 1.0379



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 919ms/step - accuracy: 0.4928 - loss: 1.0371 - val_accuracy: 0.5245 - val_loss: 0.9924
Epoch 4/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872ms/step - accuracy: 0.5443 - loss: 0.9690



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 917ms/step - accuracy: 0.5446 - loss: 0.9689 - val_accuracy: 0.5458 - val_loss: 0.9654
Epoch 5/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 926ms/step - accuracy: 0.5839 - loss: 0.9169 - val_accuracy: 0.5181 - val_loss: 0.9698
Epoch 6/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885ms/step - accuracy: 0.6081 - loss: 0.8748



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 928ms/step - accuracy: 0.6086 - loss: 0.8745 - val_accuracy: 0.5629 - val_loss: 0.9581
Epoch 7/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887ms/step - accuracy: 0.6476 - loss: 0.8090



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 931ms/step - accuracy: 0.6475 - loss: 0.8091 - val_accuracy: 0.5629 - val_loss: 0.9316
Epoch 8/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 766ms/step - accuracy: 0.6672 - loss: 0.7620



[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 808ms/step - accuracy: 0.6669 - loss: 0.7626 - val_accuracy: 0.5736 - val_loss: 0.9233
Epoch 9/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 920ms/step - accuracy: 0.6991 - loss: 0.7304 - val_accuracy: 0.5437 - val_loss: 0.9454
Epoch 10/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 924ms/step - accuracy: 0.7134 - loss: 0.6770 - val_accuracy: 0.5821 - val_loss: 0.9495
Epoch 11/15
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 923ms/step - accuracy: 0.7399 - loss: 0.6289 - val_accuracy: 0.5672 - val_loss: 1.0182
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 213ms/step
Accuracy: 0.5735607675906184
Classification Report:
               precision    recall  f1-score   support

    Negative       0.62      0.53      0.57       143
     Neutral       0.40      0.55      0.46  