In [1]:
!pip install -q torch torchvision torchaudio
!pip install -q transformers scikit-learn nltk emoji


In [2]:
import os, re, ast, random
import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x7b33463e3430>

In [3]:
TRAIN_PATH = "/content/emoHi-train.csv"
VALID_PATH = "/content/emoHi-valid.csv"
TEST_PATH  = "/content/emoHi-test.csv"

train_df = pd.read_csv(TRAIN_PATH)
valid_df = pd.read_csv(VALID_PATH)
test_df  = pd.read_csv(TEST_PATH)

for df in (train_df, valid_df, test_df):
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'], inplace=True)
    df.rename(columns={'labels':'label'}, inplace=True)

print(train_df.shape, valid_df.shape, test_df.shape)
print(train_df.head())


(43410, 3) (5426, 3) (5427, 3)
        id label                                               text
0  eebbqej  [27]  मेरा पसंदीदा खाना कुछ भी है जो मुझे खुद नहीं ब...
1  ed00q6i  [27]  अब अगर वह खुद से दूर हो जाता है, तो हर कोई सोच...
2  eezlygj   [2]                     बकवास बेअदब आईएसओिंग क्यों है?
3  ed7ypvh  [14]                        उसे खतरा महसूस कराने के लिए
4  ed0bdzj   [3]                                डर्टी सदर्न वेंकर्स


In [4]:
GOEMOTIONS = [
    "admiration","amusement","anger","annoyance","approval","caring","confusion",
    "curiosity","desire","disappointment","disapproval","disgust","embarrassment",
    "excitement","fear","gratitude","grief","joy","love","nervousness","optimism",
    "pride","realization","relief","remorse","sadness","surprise","neutral"
]
NUM_LABELS = len(GOEMOTIONS)

def parse_label(x):
    if pd.isna(x): return []
    s = str(x)
    nums = re.findall(r"\d+", s)
    return [int(v) for v in nums]

for df in (train_df, valid_df, test_df):
    df['labels_parsed'] = df['label'].apply(parse_label)

def to_multi_hot(lst):
    arr = np.zeros(NUM_LABELS, dtype=int)
    for i in lst:
        if 0 <= i < NUM_LABELS: arr[i] = 1
    return arr

y_train = np.vstack(train_df['labels_parsed'].apply(to_multi_hot))
y_valid = np.vstack(valid_df['labels_parsed'].apply(to_multi_hot))
y_test  = np.vstack(test_df['labels_parsed'].apply(to_multi_hot))

print("Multi-label shape:", y_train.shape)


Multi-label shape: (43410, 28)


In [5]:
stop_en = set(stopwords.words('english'))
stop_hi = set(['है','थे','हो','था','हूँ','यह','वे','और','पर','को','कर','से','में','कि','भी','जो','का','की'])

def clean(t):
    t = str(t).lower()
    t = re.sub(r'[^a-zA-Z\u0900-\u097F\s]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip()
    t = " ".join(w for w in t.split() if w not in stop_en and w not in stop_hi)
    return t

for df in (train_df, valid_df, test_df):
    df['clean'] = df['text'].apply(clean)


In [6]:
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
Xtr = tfidf.fit_transform(train_df['clean'])
Xva = tfidf.transform(valid_df['clean'])
Xte = tfidf.transform(test_df['clean'])

svm = OneVsRestClassifier(LogisticRegression(max_iter=200))
svm.fit(Xtr, y_train)

svm_va_pred = svm.predict(Xva)
svm_te_pred = svm.predict(Xte)

def example_f1(y_true, y_pred):
    scores=[]
    for yt, yp in zip(y_true, y_pred):
        tp = np.sum((yt==1)&(yp==1))
        fp = np.sum((yt==0)&(yp==1))
        fn = np.sum((yt==1)&(yp==0))
        prec = tp/(tp+fp+1e-9); rec = tp/(tp+fn+1e-9)
        scores.append(2*prec*rec/(prec+rec+1e-9))
    return float(np.mean(scores))

svm_valid_f1 = example_f1(y_valid, svm_va_pred)
svm_test_f1  = example_f1(y_test,  svm_te_pred)
svm_valid_acc = (y_valid == svm_va_pred).all(axis=1).mean()
svm_test_acc  = (y_test  == svm_te_pred).all(axis=1).mean()

print("SVM valid acc:", svm_valid_acc, "F1:", svm_valid_f1)
print("SVM test  acc:", svm_test_acc,  "F1:", svm_test_f1)


SVM valid acc: 0.16992259491338002 F1: 0.1931686936577553
SVM test  acc: 0.16989128431914502 F1: 0.19313924177664718


In [7]:
MAX_LEN = 64
tok = Tokenizer(num_words=40000, oov_token="<unk>")
tok.fit_on_texts(train_df['clean'])

Xtr_lstm = pad_sequences(tok.texts_to_sequences(train_df['clean']), maxlen=MAX_LEN)
Xva_lstm = pad_sequences(tok.texts_to_sequences(valid_df['clean']), maxlen=MAX_LEN)
Xte_lstm = pad_sequences(tok.texts_to_sequences(test_df['clean']),  maxlen=MAX_LEN)

lstm = Sequential([
    Embedding(len(tok.word_index)+1, 128, input_length=MAX_LEN),
    Bidirectional(LSTM(128)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(NUM_LABELS, activation='sigmoid')
])

lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm.fit(Xtr_lstm, y_train, validation_data=(Xva_lstm, y_valid), epochs=3, batch_size=64)

va_pred = (lstm.predict(Xva_lstm) >= 0.5).astype(int)
te_pred = (lstm.predict(Xte_lstm) >= 0.5).astype(int)

lstm_valid_f1 = example_f1(y_valid, va_pred)
lstm_test_f1  = example_f1(y_test,  te_pred)
lstm_valid_acc = (y_valid == va_pred).all(axis=1).mean()
lstm_test_acc  = (y_test  == te_pred).all(axis=1).mean()

print("LSTM valid acc:", lstm_valid_acc, "F1:", lstm_valid_f1)
print("LSTM test  acc:", lstm_test_acc,  "F1:", lstm_test_f1)




Epoch 1/3
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 28ms/step - accuracy: 0.2514 - loss: 0.1942 - val_accuracy: 0.4073 - val_loss: 0.1280
Epoch 2/3
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.4211 - loss: 0.1278 - val_accuracy: 0.4589 - val_loss: 0.1155
Epoch 3/3
[1m679/679[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.4807 - loss: 0.1114 - val_accuracy: 0.4642 - val_loss: 0.1135
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
LSTM valid acc: 0.3026170291190564 F1: 0.33755989629367145
LSTM test  acc: 0.2939008660401695 F1: 0.32962348701366573


In [14]:
# ======== FAST DistilBERT (PyTorch, AMP, works great on T4) ========
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

MODEL = "distilbert-base-multilingual-cased"
EPOCHS = 2           # set 1 for faster debug, 2 for better scores
BATCH_SIZE = 16      # try 24/32 if VRAM allows; drop to 8 if OOM
MAX_LEN = 96
LR = 2e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

class EmoDataset(Dataset):
    def __init__(self, texts, labels):
        self.enc = tokenizer(list(texts), truncation=True, padding=True, max_length=MAX_LEN)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k,v in self.enc.items()}
        item["labels"] = self.labels[idx]
        return item

train_ds = EmoDataset(train_df['text'], y_train)
valid_ds = EmoDataset(valid_df['text'], y_valid)
test_ds  = EmoDataset(test_df['text'],  y_test)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, pin_memory=True)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=NUM_LABELS, problem_type="multi_label_classification"
).to(device)

# Optional warm start: freeze base transformer for 1/2 epoch to speed up
# for p in model.base_model.parameters(): p.requires_grad = False

opt = AdamW(model.parameters(), lr=LR)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
loss_fn = torch.nn.BCEWithLogitsLoss()

def train_one_epoch():
    model.train()
    for batch in train_loader:
        opt.zero_grad(set_to_none=True)
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            out = model(input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"])
            loss = loss_fn(out.logits, batch["labels"])
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()

@torch.no_grad()
def predict(loader):
    model.eval()
    preds = []
    for batch in loader:
        batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(input_ids=batch["input_ids"],
                           attention_mask=batch["attention_mask"]).logits
        preds.append(torch.sigmoid(logits).float().cpu().numpy())
    return np.vstack(preds)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_one_epoch()

va_pred = (predict(valid_loader) >= 0.5).astype(int)
te_pred = (predict(test_loader)  >= 0.5).astype(int)

# metrics (reuse example_f1 from earlier cell)
bert_valid_f1 = example_f1(y_valid, va_pred)
bert_test_f1  = example_f1(y_test,  te_pred)
bert_valid_acc = (y_valid == va_pred).all(axis=1).mean()
bert_test_acc  = (y_test  == te_pred).all(axis=1).mean()

print("BERT valid acc:", round(bert_valid_acc,3), "F1:", round(bert_valid_f1,3))
print("BERT test  acc:", round(bert_test_acc,3),  "F1:", round(bert_test_f1,3))


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


Epoch 1/2
Epoch 2/2


  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


BERT valid acc: 0.388 F1: 0.44
BERT test  acc: 0.383 F1: 0.434


In [16]:
try:
    print(bert)
except NameError:
    print("BERT model not loaded.")


BERT model not loaded.


In [18]:
# ============================================
# ✅ Simple Emotion Prediction Function (BERT)
# ============================================

# Threshold for deciding if an emotion is present (0.5 default)
THRESHOLD = 0.5

def predict_text(text, top_k=None, threshold=THRESHOLD):
    enc = tokenizer([text], truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        logits = model(**enc).logits   # ✅ FIXED: model instead of bert
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    if top_k:
        idx = np.argsort(probs)[-top_k:][::-1]
        return [(GOEMOTIONS[i], float(probs[i])) for i in idx]

    result = [(GOEMOTIONS[i], float(probs[i])) for i in range(len(probs)) if probs[i] >= threshold]
    result = sorted(result, key=lambda x: x[1], reverse=True)

    if not result:
        idx = int(np.argmax(probs))
        result = [(GOEMOTIONS[idx], float(probs[idx]))]

    return result


In [19]:
print(predict_text("मुझे बहुत गुस्सा आ रहा है"))


[('anger', 0.5440922379493713)]


In [20]:
results = [
    ["SVM",  svm_valid_acc,  svm_valid_f1,  svm_test_acc,  svm_test_f1],
    ["LSTM", lstm_valid_acc, lstm_valid_f1, lstm_test_acc, lstm_test_f1],
    ["BERT", bert_valid_acc, bert_valid_f1, bert_test_acc, bert_test_f1],
]
dfres = pd.DataFrame(results, columns=["Model","Valid_Acc","Valid_F1","Test_Acc","Test_F1"])
os.makedirs("results", exist_ok=True)
dfres.to_csv("results/metrics.csv", index=False)
dfres


Unnamed: 0,Model,Valid_Acc,Valid_F1,Test_Acc,Test_F1
0,SVM,0.169923,0.193169,0.169891,0.193139
1,LSTM,0.302617,0.33756,0.293901,0.329623
2,BERT,0.3885,0.44022,0.3829,0.434439
