In [1]:
import pandas as pd
import numpy as np

In [30]:
df=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')


In [31]:
import re
import html
import nltk

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # 1. Decode HTML entities
    text = html.unescape(text)

    # 2. Lowercase
    text = text.lower()

    # 3. Remove unwanted symbols/punctuation/digits
    text = re.sub(r'["‚Äú‚Äù\'\*\`~\-=&;#\\/<>\|\[\]\(\)_¬∂]', ' ', text)  # symbols
    text = re.sub(r'\.{2,}', ' ', text)  # multiple dots


    # 4. Tokenize
    tokens = word_tokenize(text)


    # 6. Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 7. Join back to string
    return ' '.join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [32]:
df['clean_text']=df['text'].apply(preprocess_text)

In [33]:
test['clean_text']=test['text'].apply(preprocess_text)


In [34]:
df[df.duplicated(subset=['clean_text','emotions'],keep=False)].sort_values(by='clean_text')

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,emotions,clean_text
1660,1660,", the blisters on my feet ( the wound on my la...",0,1,0,1,0,['fear' 'sadness'],", the blister on my foot the wound on my large..."
2591,2591,", the blisters on my feet ( the wound on my la...",0,1,0,1,0,['fear' 'sadness'],", the blister on my foot the wound on my large..."
4270,4270,12:00 -- making lunch while a toddler hangs on...,0,1,0,1,0,['fear' 'sadness'],12:00 making lunch while a toddler hang on my ...
3884,3884,12:00 -- making lunch while a toddler hangs on...,0,1,0,1,0,['fear' 'sadness'],12:00 making lunch while a toddler hang on my ...
2085,2085,"1991, about 8:00 a.m. in the morning on a satu...",0,1,0,0,0,['fear'],"1991 , about 8:00 a.m. in the morning on a sat..."
...,...,...,...,...,...,...,...,...,...
3415,3415,your just at work.,0,0,0,0,0,[],your just at work .
3595,3595,your mom is nuts.,1,0,0,0,0,['anger'],your mom is nut .
5495,5495,your mom is nuts.,1,0,0,0,0,['anger'],your mom is nut .
1252,1252,your story is really creepy.,0,1,0,0,1,['fear' 'surprise'],your story is really creepy .


In [35]:
df=df.drop_duplicates(subset=['clean_text', 'emotions'], keep='first').reset_index()



In [36]:

import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
from tqdm import tqdm
import pandas as pd

# ---------- Reproducibility ----------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ---------- Settings ----------
EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']


X = df['clean_text'].astype(str)
Y = df[EMOTION_COLS].astype(int)

x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=SEED, shuffle=True)


Device: cuda


In [37]:
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for t in texts:
        for tok in t.split():
            counter[tok] += 1
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(x_train, min_freq=1)
vocab_size = len(vocab)
print("‚úÖ Vocab size:", vocab_size)

‚úÖ Vocab size: 6564


In [None]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
from tqdm import tqdm
import pandas as pd

# ----------------- CONFIG -----------------
EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']
MAX_LEN = 80           # üîº slightly longer for more context
MIN_FREQ = 1
BATCH_SIZE = 64        # üîº larger batch = better gradient estimates
EMBED_DIM = 256        # üîº richer word features
HIDDEN_DIM = 256       # üîº more capacity
NUM_LAYERS = 3         # reduced for efficiency
DROPOUT = 0.4          # slightly higher dropout for regularization
NUM_EPOCHS = 15
LEARNING_RATE = 0.07
WEIGHT_DECAY = 1e-5    # üî• helps avoid overfitting (L2 regularization)
GRAD_CLIP = 1.0        # üî• gradient clipping to stabilize RNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------- BUILD VOCAB -----------------
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for t in texts:
        for tok in t.split():
            counter[tok] += 1
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(x_train, min_freq=MIN_FREQ)
vocab_size = len(vocab)
print("‚úÖ Vocab size:", vocab_size)

def text_to_ids(text, vocab):
    return [vocab.get(tok, vocab['<UNK>']) for tok in text.split()]

# ----------------- DATASET -----------------
class RNNTextDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts.tolist()
        self.labels = labels.values
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = text_to_ids(self.texts[idx], self.vocab)
        label = torch.tensor(self.labels[idx]).float()
        return torch.tensor(ids, dtype=torch.long), label

def collate_batch(batch, max_len=MAX_LEN):
    seqs, labels = zip(*batch)
    lengths = [min(len(s), max_len) for s in seqs]
    padded = torch.full((len(seqs), max_len), fill_value=vocab['<PAD>'], dtype=torch.long)
    for i, s in enumerate(seqs):
        truncated = s[:max_len]
        padded[i, :len(truncated)] = truncated
    lengths = torch.tensor(lengths, dtype=torch.long)
    labels = torch.stack(labels)
    return padded, lengths, labels

train_ds = RNNTextDataset(x_train, y_train, vocab)
val_ds   = RNNTextDataset(x_val,   y_val,   vocab)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda b: collate_batch(b, MAX_LEN))
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE*2, shuffle=False, collate_fn=lambda b: collate_batch(b, MAX_LEN))

# ----------------- MODEL -----------------
class VanillaRNNClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(input_size=embed_dim,
                          hidden_size=hidden_dim,
                          num_layers=num_layers,
                          batch_first=True,
                          nonlinearity='tanh',
                          dropout=dropout if num_layers > 1 else 0.0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

        # üî• Xavier initialization for better convergence
        for name, param in self.rnn.named_parameters():
            if "weight" in name:
                nn.init.xavier_uniform_(param)
            elif "bias" in name:
                nn.init.constant_(param, 0)

    def forward(self, input_ids, lengths):
        emb = self.embed(input_ids)
        outputs, h_n = self.rnn(emb)
        last_hidden = h_n[-1]
        out = self.dropout(last_hidden)
        logits = self.fc(out)
        return logits

model = VanillaRNNClassifier(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(EMOTION_COLS),
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
).to(device)

# ----------------- LOSS / OPTIMIZER -----------------
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)  # üî• AdamW > Adam
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1)

# ----------------- TRAIN / EVAL -----------------
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for input_ids, lengths, labels in tqdm(loader, desc="Train", leave=False):
        input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)  # üî• prevent exploding gradients
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    preds_list, labels_list = [], []
    with torch.no_grad():
        for input_ids, lengths, labels in loader:
            input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            probs = torch.sigmoid(logits).cpu().numpy()
            preds_list.append(probs)
            labels_list.append(labels.cpu().numpy())

    preds = np.vstack(preds_list)
    labels = np.vstack(labels_list)
    preds_bin = (preds >= 0.5).astype(int)
    f1 = f1_score(labels, preds_bin, average='macro', zero_division=0)
    return total_loss / len(loader), f1

# ----------------- TRAINING LOOP -----------------
best_val_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_f1 = evaluate(model, val_loader, criterion)
    scheduler.step(val_f1)

    print(f"Epoch {epoch:02d}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f}")

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_vanilla_rnn.pth")
        print(f"‚úÖ Saved best model (F1={best_val_f1:.4f})")

print("üéØ Training finished. Best Val F1:", best_val_f1)


‚úÖ Vocab size: 6564




Epoch 01/15 | Train Loss: 1.0828 | Val Loss: 0.6487 | Val F1: 0.1489
‚úÖ Saved best model (F1=0.1489)




Epoch 02/15 | Train Loss: 0.8568 | Val Loss: 0.8240 | Val F1: 0.2598
‚úÖ Saved best model (F1=0.2598)




Epoch 03/15 | Train Loss: 0.8393 | Val Loss: 0.6522 | Val F1: 0.1387




Epoch 04/15 | Train Loss: 0.8935 | Val Loss: 1.0725 | Val F1: 0.2804
‚úÖ Saved best model (F1=0.2804)




Epoch 05/15 | Train Loss: 0.8400 | Val Loss: 0.7204 | Val F1: 0.1778




Epoch 06/15 | Train Loss: 0.8401 | Val Loss: 0.7843 | Val F1: 0.2182




Epoch 07/15 | Train Loss: 0.7658 | Val Loss: 0.5922 | Val F1: 0.1116




Epoch 08/15 | Train Loss: 0.6768 | Val Loss: 0.6058 | Val F1: 0.1095




Epoch 09/15 | Train Loss: 0.6426 | Val Loss: 0.6121 | Val F1: 0.2208




Epoch 10/15 | Train Loss: 0.6332 | Val Loss: 0.5724 | Val F1: 0.1468




Epoch 11/15 | Train Loss: 0.6065 | Val Loss: 0.6735 | Val F1: 0.1705




Epoch 12/15 | Train Loss: 0.6136 | Val Loss: 0.6168 | Val F1: 0.1034




Epoch 13/15 | Train Loss: 0.5943 | Val Loss: 0.5712 | Val F1: 0.1468




Epoch 14/15 | Train Loss: 0.5921 | Val Loss: 0.5683 | Val F1: 0.1468


                                                      

Epoch 15/15 | Train Loss: 0.5760 | Val Loss: 0.5600 | Val F1: 0.1468
üéØ Training finished. Best Val F1: 0.280402782084068




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import f1_score
from collections import Counter
from tqdm import tqdm

# ---------------- Hyperparameters ----------------
EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']
MAX_LEN = 80
MIN_FREQ = 1
BATCH_SIZE = 64
EMBED_DIM = 256
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.4
NUM_EPOCHS = 50
LEARNING_RATE = 5e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------- Vocabulary ----------------
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for t in texts:
        for tok in t.split():
            counter[tok] += 1
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def text_to_ids(text, vocab):
    return [vocab.get(tok, vocab['<UNK>']) for tok in text.split()]

# ---------------- Dataset ----------------
class TextDataset(Dataset):
    def __init__(self, texts, labels=None, vocab=None):
        self.texts = texts.tolist()
        self.labels = labels.values if labels is not None else None
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = text_to_ids(self.texts[idx], self.vocab)
        if self.labels is not None:
            label = torch.tensor(self.labels[idx]).float()
            return torch.tensor(ids, dtype=torch.long), label
        return torch.tensor(ids, dtype=torch.long)

def collate_batch(batch, max_len=MAX_LEN):
    if isinstance(batch[0], tuple):
        seqs, labels = zip(*batch)
    else:
        seqs = batch
        labels = None
    lengths = [min(len(s), max_len) for s in seqs]
    padded = torch.full((len(seqs), max_len), fill_value=vocab['<PAD>'], dtype=torch.long)
    for i, s in enumerate(seqs):
        truncated = s[:max_len]
        padded[i, :len(truncated)] = truncated
    lengths = torch.tensor(lengths, dtype=torch.long)
    if labels is not None:
        labels = torch.stack(labels)
        return padded, lengths, labels
    return padded, lengths

# ---------------- GRU Model ----------------
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.3, bidirectional=True):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)

    def forward(self, input_ids, lengths):
        emb = self.embed(input_ids)
        packed_output, h_n = self.gru(emb)
        # h_n: (num_layers * num_directions, batch, hidden)
        if self.gru.bidirectional:
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)  # concat last forward + backward
        else:
            last_hidden = h_n[-1]
        out = self.dropout(last_hidden)
        logits = self.fc(out)
        return logits

# ---------------- Initialize ----------------
vocab = build_vocab(x_train, min_freq=MIN_FREQ)
vocab_size = len(vocab)
model = GRUClassifier(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(EMOTION_COLS),
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    bidirectional=True
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# ---------------- Training ----------------
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for input_ids, lengths, labels in tqdm(loader, leave=False):
        input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)  # gradient clipping
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    preds_list, labels_list = [], []
    with torch.no_grad():
        for input_ids, lengths, labels in loader:
            input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            probs = torch.sigmoid(logits).cpu().numpy()
            preds_list.append(probs)
            labels_list.append(labels.cpu().numpy())
    preds = np.vstack(preds_list)
    labels = np.vstack(labels_list)
    preds_bin = (preds >= 0.5).astype(int)
    f1 = f1_score(labels, preds_bin, average='macro', zero_division=0)
    return total_loss / len(loader), f1

# ---------------- Dataloaders ----------------
train_ds = TextDataset(x_train, y_train, vocab)
val_ds   = TextDataset(x_val, y_val, vocab)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False, collate_fn=collate_batch)

# ---------------- Training Loop ----------------
best_val_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_f1 = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "gru_multi_label.pth")
        print("‚úÖ Saved best model (F1:", best_val_f1, ")")




Epoch 1/50 | Train Loss: 0.5753 | Val F1: 0.1589
‚úÖ Saved best model (F1: 0.15888636741019538 )




Epoch 2/50 | Train Loss: 0.5376 | Val F1: 0.2408
‚úÖ Saved best model (F1: 0.24082745025699032 )




Epoch 3/50 | Train Loss: 0.5065 | Val F1: 0.3186
‚úÖ Saved best model (F1: 0.31858049909342834 )




Epoch 4/50 | Train Loss: 0.4602 | Val F1: 0.3829
‚úÖ Saved best model (F1: 0.3829316184621093 )




Epoch 5/50 | Train Loss: 0.4051 | Val F1: 0.4066
‚úÖ Saved best model (F1: 0.40661196481067163 )




Epoch 6/50 | Train Loss: 0.3457 | Val F1: 0.4185
‚úÖ Saved best model (F1: 0.41851014552458016 )




Epoch 7/50 | Train Loss: 0.2836 | Val F1: 0.4420
‚úÖ Saved best model (F1: 0.44200443801832795 )




Epoch 8/50 | Train Loss: 0.2256 | Val F1: 0.4472
‚úÖ Saved best model (F1: 0.4472125304194683 )




Epoch 9/50 | Train Loss: 0.1663 | Val F1: 0.4505
‚úÖ Saved best model (F1: 0.4504866796559659 )




Epoch 10/50 | Train Loss: 0.1226 | Val F1: 0.4404




Epoch 11/50 | Train Loss: 0.0973 | Val F1: 0.4560
‚úÖ Saved best model (F1: 0.4560297798340091 )




Epoch 12/50 | Train Loss: 0.0653 | Val F1: 0.4431




Epoch 13/50 | Train Loss: 0.0470 | Val F1: 0.4282




Epoch 14/50 | Train Loss: 0.0396 | Val F1: 0.4351




Epoch 15/50 | Train Loss: 0.0316 | Val F1: 0.4485




Epoch 16/50 | Train Loss: 0.0258 | Val F1: 0.4450




Epoch 17/50 | Train Loss: 0.0209 | Val F1: 0.4547




Epoch 18/50 | Train Loss: 0.0208 | Val F1: 0.4337




Epoch 19/50 | Train Loss: 0.0210 | Val F1: 0.4388




Epoch 20/50 | Train Loss: 0.0252 | Val F1: 0.4499




Epoch 21/50 | Train Loss: 0.0237 | Val F1: 0.4474




Epoch 22/50 | Train Loss: 0.0182 | Val F1: 0.4254




Epoch 23/50 | Train Loss: 0.0132 | Val F1: 0.4352




Epoch 24/50 | Train Loss: 0.0115 | Val F1: 0.4533




Epoch 25/50 | Train Loss: 0.0105 | Val F1: 0.4443




Epoch 26/50 | Train Loss: 0.0093 | Val F1: 0.4370




Epoch 27/50 | Train Loss: 0.0065 | Val F1: 0.4486




Epoch 28/50 | Train Loss: 0.0147 | Val F1: 0.4486




Epoch 29/50 | Train Loss: 0.0115 | Val F1: 0.4466




Epoch 30/50 | Train Loss: 0.0087 | Val F1: 0.4521




Epoch 31/50 | Train Loss: 0.0097 | Val F1: 0.4577
‚úÖ Saved best model (F1: 0.4577198104055354 )




Epoch 32/50 | Train Loss: 0.0080 | Val F1: 0.4578
‚úÖ Saved best model (F1: 0.45775551972953166 )




Epoch 33/50 | Train Loss: 0.0053 | Val F1: 0.4624
‚úÖ Saved best model (F1: 0.46237754772246065 )




Epoch 34/50 | Train Loss: 0.0059 | Val F1: 0.4623




Epoch 35/50 | Train Loss: 0.0065 | Val F1: 0.4617




Epoch 36/50 | Train Loss: 0.0115 | Val F1: 0.4529




Epoch 37/50 | Train Loss: 0.0121 | Val F1: 0.4345




Epoch 38/50 | Train Loss: 0.0119 | Val F1: 0.4500




Epoch 39/50 | Train Loss: 0.0087 | Val F1: 0.4588




Epoch 40/50 | Train Loss: 0.0098 | Val F1: 0.4547




Epoch 41/50 | Train Loss: 0.0071 | Val F1: 0.4507




Epoch 42/50 | Train Loss: 0.0067 | Val F1: 0.4337




Epoch 43/50 | Train Loss: 0.0060 | Val F1: 0.4524




Epoch 44/50 | Train Loss: 0.0060 | Val F1: 0.4612




Epoch 45/50 | Train Loss: 0.0058 | Val F1: 0.4496




Epoch 46/50 | Train Loss: 0.0086 | Val F1: 0.4469




Epoch 47/50 | Train Loss: 0.0217 | Val F1: 0.4390




Epoch 48/50 | Train Loss: 0.0141 | Val F1: 0.4616




Epoch 49/50 | Train Loss: 0.0065 | Val F1: 0.4510




Epoch 50/50 | Train Loss: 0.0037 | Val F1: 0.4534


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm

# --- Dataset ---
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=80):
        self.texts = texts
        self.labels = labels.values
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts.iloc[idx].split()
        # numericalize + pad
        ids = [self.vocab.get(w, self.vocab["<unk>"]) for w in tokens[:self.max_len]]
        ids += [self.vocab["<pad>"]] * (self.max_len - len(ids))
        return torch.tensor(ids), torch.tensor(self.labels[idx], dtype=torch.float32)

# --- Model ---
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True  # bidirectional LSTM
        )
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        emb = self.embedding(x)  # (batch, seq, embed_dim)
        out, (h_n, c_n) = self.lstm(emb)  # h_n: (num_layers*2, batch, hidden)
        last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)  # concat forward + backward
        out = self.layer_norm(last_hidden)
        out = self.dropout(out)
        logits = self.fc(out)
        return logits

# --- Training Utilities ---
def train_model(model, train_loader, val_loader, epochs=10, lr=1e-3, device='cuda'):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_f1 = 0.0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            logits = model(x_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # --- Validation ---
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch = x_batch.to(device)
                logits = model(x_batch)
                preds = torch.sigmoid(logits).cpu().numpy() > 0.5
                val_preds.extend(preds)
                val_labels.extend(y_batch.numpy())

        f1 = f1_score(val_labels, val_preds, average='macro')
        print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val F1: {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), "best_lstm_model.pt")
            print(f"‚úÖ Saved best model (F1={f1:.4f})")

# --- Vocabulary Builder ---
def build_vocab(texts, min_freq=2):
    from collections import Counter
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab


In [None]:
# 1. Build vocab
vocab = build_vocab(x_train, min_freq=2)
print("‚úÖ Vocab size:", len(vocab))

# 2. Prepare datasets
train_dataset = TextDataset(x_train, y_train, vocab)
val_dataset = TextDataset(x_val, y_val, vocab)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# 3. Initialize and train
device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=128,
    num_classes=y_train.shape[1],
    num_layers=2,
    dropout=0.3
).to(device)

train_model(model, train_loader, val_loader, epochs=20, lr=1e-3, device=device)


‚úÖ Vocab size: 3043


Epoch 1/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 64.74it/s]


Epoch 01 | Train Loss: 0.5834 | Val F1: 0.1358
‚úÖ Saved best model (F1=0.1358)


Epoch 2/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 79.20it/s]


Epoch 02 | Train Loss: 0.5515 | Val F1: 0.2658
‚úÖ Saved best model (F1=0.2658)


Epoch 3/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 79.34it/s]


Epoch 03 | Train Loss: 0.5225 | Val F1: 0.2808
‚úÖ Saved best model (F1=0.2808)


Epoch 4/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.82it/s]


Epoch 04 | Train Loss: 0.4890 | Val F1: 0.3291
‚úÖ Saved best model (F1=0.3291)


Epoch 5/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.97it/s]


Epoch 05 | Train Loss: 0.4430 | Val F1: 0.3915
‚úÖ Saved best model (F1=0.3915)


Epoch 6/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 75.11it/s]


Epoch 06 | Train Loss: 0.3926 | Val F1: 0.3999
‚úÖ Saved best model (F1=0.3999)


Epoch 7/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 72.02it/s]


Epoch 07 | Train Loss: 0.3335 | Val F1: 0.4086
‚úÖ Saved best model (F1=0.4086)


Epoch 8/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 70.45it/s]


Epoch 08 | Train Loss: 0.2808 | Val F1: 0.4329
‚úÖ Saved best model (F1=0.4329)


Epoch 9/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 77.60it/s]


Epoch 09 | Train Loss: 0.2230 | Val F1: 0.4266


Epoch 10/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.14it/s]


Epoch 10 | Train Loss: 0.1911 | Val F1: 0.4344
‚úÖ Saved best model (F1=0.4344)


Epoch 11/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.00it/s]


Epoch 11 | Train Loss: 0.1408 | Val F1: 0.4258


Epoch 12/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 77.98it/s]


Epoch 12 | Train Loss: 0.1146 | Val F1: 0.4515
‚úÖ Saved best model (F1=0.4515)


Epoch 13/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.28it/s]


Epoch 13 | Train Loss: 0.0854 | Val F1: 0.4431


Epoch 14/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.38it/s]


Epoch 14 | Train Loss: 0.0756 | Val F1: 0.4378


Epoch 15/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 78.11it/s]


Epoch 15 | Train Loss: 0.0567 | Val F1: 0.4393


Epoch 16/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 77.01it/s]


Epoch 16 | Train Loss: 0.0471 | Val F1: 0.4565
‚úÖ Saved best model (F1=0.4565)


Epoch 17/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 77.30it/s]


Epoch 17 | Train Loss: 0.0421 | Val F1: 0.4494


Epoch 18/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 77.49it/s]


Epoch 18 | Train Loss: 0.0390 | Val F1: 0.4509


Epoch 19/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 73.74it/s]


Epoch 19 | Train Loss: 0.0299 | Val F1: 0.4590
‚úÖ Saved best model (F1=0.4590)


Epoch 20/20: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:00<00:00, 69.35it/s]


Epoch 20 | Train Loss: 0.0273 | Val F1: 0.4517


# GRU + ATTENTion

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import f1_score
from collections import Counter
from tqdm import tqdm

# ---------------- Hyperparameters ----------------
EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']
MAX_LEN = 80
MIN_FREQ = 1
BATCH_SIZE = 64
EMBED_DIM = 256
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.4
NUM_EPOCHS = 50
LEARNING_RATE = 5e-4
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------- Vocabulary ----------------
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for t in texts:
        for tok in t.split():
            counter[tok] += 1
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

def text_to_ids(text, vocab):
    return [vocab.get(tok, vocab['<UNK>']) for tok in text.split()]

# ---------------- Dataset ----------------
class TextDataset(Dataset):
    def __init__(self, texts, labels=None, vocab=None):
        self.texts = texts.tolist()
        self.labels = labels.values if labels is not None else None
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        ids = text_to_ids(self.texts[idx], self.vocab)
        if self.labels is not None:
            label = torch.tensor(self.labels[idx]).float()
            return torch.tensor(ids, dtype=torch.long), label
        return torch.tensor(ids, dtype=torch.long)

def collate_batch(batch, max_len=MAX_LEN):
    if isinstance(batch[0], tuple):
        seqs, labels = zip(*batch)
    else:
        seqs = batch
        labels = None
    lengths = [min(len(s), max_len) for s in seqs]
    padded = torch.full((len(seqs), max_len), fill_value=vocab['<PAD>'], dtype=torch.long)
    for i, s in enumerate(seqs):
        truncated = s[:max_len]
        padded[i, :len(truncated)] = truncated
    lengths = torch.tensor(lengths, dtype=torch.long)
    if labels is not None:
        labels = torch.stack(labels)
        return padded, lengths, labels
    return padded, lengths

# ---------------- GRU + Attention Model ----------------
class GRUAttentionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.3, bidirectional=True):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional
        )
        self.dropout = nn.Dropout(dropout)
        self.attention = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_classes)

    def forward(self, input_ids, lengths):
        emb = self.embed(input_ids)
        packed_output, h_n = self.gru(emb)  # outputs: (batch, seq_len, hidden*directions)

        # ---------------- Attention ----------------
        attn_weights = self.attention(packed_output).squeeze(-1)  # (batch, seq_len)
        mask = (input_ids != 0)  # ignore PAD tokens
        attn_weights[~mask] = float('-inf')
        attn_scores = torch.softmax(attn_weights, dim=1).unsqueeze(-1)  # (batch, seq_len, 1)
        weighted_output = (packed_output * attn_scores).sum(dim=1)      # (batch, hidden*directions)

        out = self.dropout(weighted_output)
        logits = self.fc(out)
        return logits

# ---------------- Initialize ----------------
vocab = build_vocab(x_train, min_freq=MIN_FREQ)
vocab_size = len(vocab)
model = GRUAttentionClassifier(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=len(EMOTION_COLS),
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    bidirectional=True
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# ---------------- Train / Eval ----------------
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    for input_ids, lengths, labels in tqdm(loader, leave=False):
        input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(input_ids, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0.0
    preds_list, labels_list = [], []
    with torch.no_grad():
        for input_ids, lengths, labels in loader:
            input_ids, lengths, labels = input_ids.to(device), lengths.to(device), labels.to(device)
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            probs = torch.sigmoid(logits).cpu().numpy()
            preds_list.append(probs)
            labels_list.append(labels.cpu().numpy())
    preds = np.vstack(preds_list)
    labels = np.vstack(labels_list)
    preds_bin = (preds >= 0.5).astype(int)
    f1 = f1_score(labels, preds_bin, average='macro', zero_division=0)
    return total_loss / len(loader), f1

# ---------------- Dataloaders ----------------
train_ds = TextDataset(x_train, y_train, vocab)
val_ds   = TextDataset(x_val, y_val, vocab)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False, collate_fn=collate_batch)

# ---------------- Training Loop ----------------
best_val_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_f1 = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch}/{NUM_EPOCHS} | Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "gru_attention_multi_label.pth")
        print("‚úÖ Saved best model (F1:", best_val_f1, ")")




Epoch 1/50 | Train Loss: 0.5637 | Val F1: 0.2276
‚úÖ Saved best model (F1: 0.22757210377278198 )




Epoch 2/50 | Train Loss: 0.5257 | Val F1: 0.3147
‚úÖ Saved best model (F1: 0.31468548803118307 )




Epoch 3/50 | Train Loss: 0.4823 | Val F1: 0.3633
‚úÖ Saved best model (F1: 0.3633344476727465 )




Epoch 4/50 | Train Loss: 0.4197 | Val F1: 0.4408
‚úÖ Saved best model (F1: 0.4407900749073555 )




Epoch 5/50 | Train Loss: 0.3502 | Val F1: 0.4537
‚úÖ Saved best model (F1: 0.4537360456591936 )




Epoch 6/50 | Train Loss: 0.2772 | Val F1: 0.4679
‚úÖ Saved best model (F1: 0.4678892446196373 )




Epoch 7/50 | Train Loss: 0.2020 | Val F1: 0.4665




Epoch 8/50 | Train Loss: 0.1444 | Val F1: 0.4662




Epoch 9/50 | Train Loss: 0.0921 | Val F1: 0.4697
‚úÖ Saved best model (F1: 0.46971098658003924 )




Epoch 10/50 | Train Loss: 0.0641 | Val F1: 0.4743
‚úÖ Saved best model (F1: 0.47434508481901105 )




Epoch 11/50 | Train Loss: 0.0450 | Val F1: 0.4623




Epoch 12/50 | Train Loss: 0.0284 | Val F1: 0.4571




Epoch 13/50 | Train Loss: 0.0206 | Val F1: 0.4492




Epoch 14/50 | Train Loss: 0.0175 | Val F1: 0.4499




Epoch 15/50 | Train Loss: 0.0185 | Val F1: 0.4511




Epoch 16/50 | Train Loss: 0.0159 | Val F1: 0.4669




Epoch 17/50 | Train Loss: 0.0109 | Val F1: 0.4705




Epoch 18/50 | Train Loss: 0.0096 | Val F1: 0.4598




Epoch 19/50 | Train Loss: 0.0059 | Val F1: 0.4576




Epoch 20/50 | Train Loss: 0.0055 | Val F1: 0.4671




Epoch 21/50 | Train Loss: 0.0138 | Val F1: 0.4639




Epoch 22/50 | Train Loss: 0.0278 | Val F1: 0.4626




Epoch 23/50 | Train Loss: 0.0292 | Val F1: 0.4535




Epoch 24/50 | Train Loss: 0.0197 | Val F1: 0.4581




Epoch 25/50 | Train Loss: 0.0093 | Val F1: 0.4588




Epoch 26/50 | Train Loss: 0.0054 | Val F1: 0.4617




Epoch 27/50 | Train Loss: 0.0039 | Val F1: 0.4601




Epoch 28/50 | Train Loss: 0.0023 | Val F1: 0.4654




Epoch 29/50 | Train Loss: 0.0018 | Val F1: 0.4626




Epoch 30/50 | Train Loss: 0.0013 | Val F1: 0.4614




Epoch 31/50 | Train Loss: 0.0015 | Val F1: 0.4578




Epoch 32/50 | Train Loss: 0.0015 | Val F1: 0.4588




Epoch 33/50 | Train Loss: 0.0018 | Val F1: 0.4602




Epoch 34/50 | Train Loss: 0.0016 | Val F1: 0.4531




Epoch 35/50 | Train Loss: 0.0026 | Val F1: 0.4593




Epoch 36/50 | Train Loss: 0.0034 | Val F1: 0.4690




Epoch 37/50 | Train Loss: 0.0022 | Val F1: 0.4542




Epoch 38/50 | Train Loss: 0.0028 | Val F1: 0.4545




Epoch 39/50 | Train Loss: 0.0155 | Val F1: 0.4591




Epoch 40/50 | Train Loss: 0.0454 | Val F1: 0.4590




Epoch 41/50 | Train Loss: 0.0501 | Val F1: 0.4451




Epoch 42/50 | Train Loss: 0.0160 | Val F1: 0.4696




Epoch 43/50 | Train Loss: 0.0059 | Val F1: 0.4585




Epoch 44/50 | Train Loss: 0.0028 | Val F1: 0.4580




Epoch 45/50 | Train Loss: 0.0019 | Val F1: 0.4480




Epoch 46/50 | Train Loss: 0.0015 | Val F1: 0.4593




Epoch 47/50 | Train Loss: 0.0015 | Val F1: 0.4589




Epoch 48/50 | Train Loss: 0.0015 | Val F1: 0.4560




Epoch 49/50 | Train Loss: 0.0011 | Val F1: 0.4575




Epoch 50/50 | Train Loss: 0.0010 | Val F1: 0.4576


# multihead Attention


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score

# ------------------------------------------------------------------
# üîπ Model Definition
# ------------------------------------------------------------------
class GRU_AttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2, dropout=0.4):
        super(GRU_AttentionModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.4)

        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )

        # Multi-head self-attention
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,  # bidirectional GRU
            num_heads=4,
            dropout=0.3,
            batch_first=True
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

        # MLP classifier head
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x, lengths=None):
        emb = self.embedding_dropout(self.embedding(x))  # (batch, seq, emb)
        packed_output, _ = self.gru(emb)

        # Multi-head self-attention (Q=K=V)
        attn_output, _ = self.multihead_attn(packed_output, packed_output, packed_output)
        attn_output = self.layer_norm(attn_output + packed_output)

        # Mean pooling over time dimension
        mean_output = attn_output.mean(dim=1)

        logits = self.fc(mean_output)
        return logits


# ------------------------------------------------------------------
# üîπ Dataset Class
# ------------------------------------------------------------------
class TextDataset(Dataset):
    def __init__(self, X, y, vocab, max_len):
        self.X = X
        self.y = y.values
        self.vocab = vocab
        self.max_len = max_len

    def text_to_seq(self, text):
        tokens = text.split()
        seq = [self.vocab.get(tok, self.vocab["<UNK>"]) for tok in tokens[:self.max_len]]
        seq += [self.vocab["<PAD>"]] * (self.max_len - len(seq))
        return torch.tensor(seq, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.text_to_seq(self.X.iloc[idx]), torch.tensor(self.y[idx], dtype=torch.float32)


# ------------------------------------------------------------------
# üîπ Training Utilities
# ------------------------------------------------------------------
def train_epoch(model, loader, optimizer, criterion, device, scheduler=None, clip=5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in tqdm(loader, desc="Training", leave=False):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        if scheduler:
            scheduler.step()

        total_loss += loss.item()

    return total_loss / len(loader)


def eval_epoch(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    preds, targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            total_loss += loss.item()

            preds.append(torch.sigmoid(logits).cpu())
            targets.append(y_batch.cpu())

    preds = torch.cat(preds)
    targets = torch.cat(targets)
    f1 = f1_score(targets.numpy() > 0.5, preds.numpy() > 0.5, average="macro")
    return total_loss / len(loader), f1


# ------------------------------------------------------------------
# üîπ Training Script
# ------------------------------------------------------------------
def train_model(vocab, x_train, y_train, x_val, y_val, num_classes):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    MAX_LEN = 100
    EMBED_DIM = 256
    HIDDEN_DIM = 256
    NUM_LAYERS = 2
    DROPOUT = 0.4
    BATCH_SIZE = 64
    EPOCHS = 50
    LEARNING_RATE = 5e-4

    train_ds = TextDataset(x_train, y_train, vocab, MAX_LEN)
    val_ds = TextDataset(x_val, y_val, vocab, MAX_LEN)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = GRU_AttentionModel(len(vocab), EMBED_DIM, HIDDEN_DIM, num_classes, NUM_LAYERS, DROPOUT).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=1e-3, epochs=EPOCHS, steps_per_epoch=len(train_loader)
    )

    best_f1 = 0.0
    for epoch in range(1, EPOCHS + 1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device, scheduler)
        val_loss, val_f1 = eval_epoch(model, val_loader, criterion, device)

        print(f"Epoch {epoch:02d}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pth")
            print(f"‚úÖ Saved best model (F1={val_f1:.4f})")

    print(f"Training complete. Best F1: {best_f1:.4f}")


# ------------------------------------------------------------------
# üîπ Run Training
# ------------------------------------------------------------------
# Assuming you already have:
# x_train, x_test, y_train, y_test, vocab
# and vocab contains "<PAD>" and "<UNK>"
train_model(vocab, x_train, y_train, x_val, y_val, num_classes=y_train.shape[1])




Epoch 01/50 | Train Loss: 0.5919 | Val F1: 0.1468
‚úÖ Saved best model (F1=0.1468)




Epoch 02/50 | Train Loss: 0.5737 | Val F1: 0.1468




Epoch 03/50 | Train Loss: 0.5690 | Val F1: 0.1468




Epoch 04/50 | Train Loss: 0.5618 | Val F1: 0.2492
‚úÖ Saved best model (F1=0.2492)




Epoch 05/50 | Train Loss: 0.5567 | Val F1: 0.2462




Epoch 06/50 | Train Loss: 0.5489 | Val F1: 0.2022




Epoch 07/50 | Train Loss: 0.5411 | Val F1: 0.2765
‚úÖ Saved best model (F1=0.2765)




Epoch 08/50 | Train Loss: 0.5367 | Val F1: 0.3038
‚úÖ Saved best model (F1=0.3038)




Epoch 09/50 | Train Loss: 0.5192 | Val F1: 0.3386
‚úÖ Saved best model (F1=0.3386)




Epoch 10/50 | Train Loss: 0.5080 | Val F1: 0.3485
‚úÖ Saved best model (F1=0.3485)




Epoch 11/50 | Train Loss: 0.4936 | Val F1: 0.3184




Epoch 12/50 | Train Loss: 0.4900 | Val F1: 0.3614
‚úÖ Saved best model (F1=0.3614)




Epoch 13/50 | Train Loss: 0.4686 | Val F1: 0.3161




Epoch 14/50 | Train Loss: 0.4552 | Val F1: 0.4100
‚úÖ Saved best model (F1=0.4100)




Epoch 15/50 | Train Loss: 0.4309 | Val F1: 0.4214
‚úÖ Saved best model (F1=0.4214)




Epoch 16/50 | Train Loss: 0.4072 | Val F1: 0.4135




Epoch 17/50 | Train Loss: 0.3880 | Val F1: 0.4521
‚úÖ Saved best model (F1=0.4521)




Epoch 18/50 | Train Loss: 0.3613 | Val F1: 0.4377




Epoch 19/50 | Train Loss: 0.3388 | Val F1: 0.4414




Epoch 20/50 | Train Loss: 0.3154 | Val F1: 0.4625
‚úÖ Saved best model (F1=0.4625)




Epoch 21/50 | Train Loss: 0.2913 | Val F1: 0.4627
‚úÖ Saved best model (F1=0.4627)




Epoch 22/50 | Train Loss: 0.2681 | Val F1: 0.4537




Epoch 23/50 | Train Loss: 0.2489 | Val F1: 0.4363




Epoch 24/50 | Train Loss: 0.2196 | Val F1: 0.4484




Epoch 25/50 | Train Loss: 0.1984 | Val F1: 0.4733
‚úÖ Saved best model (F1=0.4733)




Epoch 26/50 | Train Loss: 0.1833 | Val F1: 0.4563




Epoch 27/50 | Train Loss: 0.1595 | Val F1: 0.4592




Epoch 28/50 | Train Loss: 0.1493 | Val F1: 0.4482




Epoch 29/50 | Train Loss: 0.1263 | Val F1: 0.4679




Epoch 30/50 | Train Loss: 0.1226 | Val F1: 0.4674




Epoch 31/50 | Train Loss: 0.1094 | Val F1: 0.4588




Epoch 32/50 | Train Loss: 0.0973 | Val F1: 0.4438




Epoch 33/50 | Train Loss: 0.0823 | Val F1: 0.4565




Epoch 34/50 | Train Loss: 0.0834 | Val F1: 0.4615




Epoch 35/50 | Train Loss: 0.0729 | Val F1: 0.4457




Epoch 36/50 | Train Loss: 0.0674 | Val F1: 0.4666




Epoch 37/50 | Train Loss: 0.0613 | Val F1: 0.4640




Epoch 38/50 | Train Loss: 0.0548 | Val F1: 0.4626




Epoch 39/50 | Train Loss: 0.0477 | Val F1: 0.4679




Epoch 40/50 | Train Loss: 0.0451 | Val F1: 0.4679




Epoch 41/50 | Train Loss: 0.0475 | Val F1: 0.4627




Epoch 42/50 | Train Loss: 0.0411 | Val F1: 0.4710




Epoch 43/50 | Train Loss: 0.0382 | Val F1: 0.4627




Epoch 44/50 | Train Loss: 0.0384 | Val F1: 0.4678




Epoch 45/50 | Train Loss: 0.0387 | Val F1: 0.4652




Epoch 46/50 | Train Loss: 0.0322 | Val F1: 0.4674




Epoch 47/50 | Train Loss: 0.0315 | Val F1: 0.4714




KeyboardInterrupt: 

In [39]:
# pip install optuna

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score
import optuna

# ------------------------------------------------------------------
# üîπ Model Definition
# ------------------------------------------------------------------
class GRU_AttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=2, dropout=0.4):
        super(GRU_AttentionModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embedding_dropout = nn.Dropout(0.4)

        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0
        )

        # Multi-head self-attention
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,
            num_heads=4,
            dropout=0.3,
            batch_first=True
        )

        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

        # MLP classifier head
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x, lengths=None):
        emb = self.embedding_dropout(self.embedding(x))
        packed_output, _ = self.gru(emb)
        attn_output, _ = self.multihead_attn(packed_output, packed_output, packed_output)
        attn_output = self.layer_norm(attn_output + packed_output)
        mean_output = attn_output.mean(dim=1)
        logits = self.fc(mean_output)
        return logits


# ------------------------------------------------------------------
# üîπ Dataset
# ------------------------------------------------------------------
class TextDataset(Dataset):
    def __init__(self, X, y, vocab, max_len):
        self.X = X
        self.y = y.values
        self.vocab = vocab
        self.max_len = max_len

    def text_to_seq(self, text):
        tokens = text.split()
        seq = [self.vocab.get(tok, self.vocab["<UNK>"]) for tok in tokens[:self.max_len]]
        seq += [self.vocab["<PAD>"]] * (self.max_len - len(seq))
        return torch.tensor(seq, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.text_to_seq(self.X.iloc[idx]), torch.tensor(self.y[idx], dtype=torch.float32)


# ------------------------------------------------------------------
# üîπ Train / Eval
# ------------------------------------------------------------------
def train_epoch(model, loader, optimizer, criterion, device, scheduler=None, clip=5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in tqdm(loader, desc="Training", leave=False):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        if scheduler:
            scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def eval_epoch(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    preds, targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            total_loss += loss.item()
            preds.append(torch.sigmoid(logits).cpu())
            targets.append(y_batch.cpu())
    preds = torch.cat(preds)
    targets = torch.cat(targets)
    f1 = f1_score(targets.numpy() > 0.5, preds.numpy() > 0.5, average="macro")
    return total_loss / len(loader), f1


# ------------------------------------------------------------------
# üîπ Optuna Objective Function
# ------------------------------------------------------------------
def objective(trial):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # üî∏ Hyperparameters to tune
    EMBED_DIM = trial.suggest_categorical("embed_dim", [128, 256, 300])
    HIDDEN_DIM = trial.suggest_categorical("hidden_dim", [128, 256, 384])
    NUM_LAYERS = trial.suggest_int("num_layers", 1, 3)
    DROPOUT = trial.suggest_float("dropout", 0.2, 0.6)
    LR = trial.suggest_float("lr", 1e-4, 5e-3, log=True)
    BATCH_SIZE = trial.suggest_categorical("batch_size", [32, 64, 128])

    MAX_LEN = 100
    EPOCHS = 20  # keep small for tuning speed

    # Build datasets
    train_ds = TextDataset(x_train, y_train, vocab, MAX_LEN)
    val_ds = TextDataset(x_val, y_val, vocab, MAX_LEN)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = GRU_AttentionModel(len(vocab), EMBED_DIM, HIDDEN_DIM, y_train.shape[1], NUM_LAYERS, DROPOUT).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()

    best_f1 = 0.0
    for epoch in range(EPOCHS):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_f1 = eval_epoch(model, val_loader, criterion, device)
        trial.report(val_f1, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if val_f1 > best_f1:
            best_f1 = val_f1

    return best_f1


# ------------------------------------------------------------------
# üîπ Run Optuna Tuning
# ------------------------------------------------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, timeout=None)

print("\nüéØ Best Trial:")
trial = study.best_trial
print(f"  F1 Score: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# ------------------------------------------------------------------
# üîπ (Optional) Retrain best model with tuned params
# ------------------------------------------------------------------
best_params = study.best_trial.params
print("\nRetraining with best params...")

train_model(
    vocab,
    x_train,
    y_train,
    x_val,
    y_val,
    num_classes=y_train.shape[1],
)


[I 2025-10-22 06:30:39,734] A new study created in memory with name: no-name-3198626d-5a80-43ba-87b6-7e629279e4ed
[I 2025-10-22 06:31:23,255] Trial 0 finished with value: 0.4481769298200516 and parameters: {'embed_dim': 256, 'hidden_dim': 256, 'num_layers': 1, 'dropout': 0.22593063025338786, 'lr': 0.0017149289930817563, 'batch_size': 128}. Best is trial 0 with value: 0.4481769298200516.
[I 2025-10-22 06:32:01,425] Trial 1 finished with value: 0.4755266850339118 and parameters: {'embed_dim': 300, 'hidden_dim': 128, 'num_layers': 3, 'dropout': 0.24770800542716534, 'lr': 0.00029483725734426666, 'batch_size': 32}. Best is trial 1 with value: 0.4755266850339118.
[I 2025-10-22 06:32:29,399] Trial 2 finished with value: 0.4803402155373063 and parameters: {'embed_dim': 300, 'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.27394124734819464, 'lr': 0.0010634558316564187, 'batch_size': 64}. Best is trial 2 with value: 0.4803402155373063.
[I 2025-10-22 06:33:15,068] Trial 3 finished with value: 0.


üéØ Best Trial:
  F1 Score: 0.4908
  Params:
    embed_dim: 300
    hidden_dim: 128
    num_layers: 2
    dropout: 0.48683977124012534
    lr: 0.0016421109262960908
    batch_size: 128

Retraining with best params...




Epoch 01/50 | Train Loss: 0.5948 | Val F1: 0.1469
‚úÖ Saved best model (F1=0.1469)




Epoch 02/50 | Train Loss: 0.5742 | Val F1: 0.1468




Epoch 03/50 | Train Loss: 0.5698 | Val F1: 0.1465




Epoch 04/50 | Train Loss: 0.5624 | Val F1: 0.2373
‚úÖ Saved best model (F1=0.2373)




Epoch 05/50 | Train Loss: 0.5554 | Val F1: 0.2625
‚úÖ Saved best model (F1=0.2625)




Epoch 06/50 | Train Loss: 0.5468 | Val F1: 0.2347




Epoch 07/50 | Train Loss: 0.5339 | Val F1: 0.2767
‚úÖ Saved best model (F1=0.2767)




Epoch 08/50 | Train Loss: 0.5284 | Val F1: 0.2843
‚úÖ Saved best model (F1=0.2843)




Epoch 09/50 | Train Loss: 0.5126 | Val F1: 0.3224
‚úÖ Saved best model (F1=0.3224)




Epoch 10/50 | Train Loss: 0.5029 | Val F1: 0.3118




Epoch 11/50 | Train Loss: 0.4950 | Val F1: 0.3329
‚úÖ Saved best model (F1=0.3329)




Epoch 12/50 | Train Loss: 0.4783 | Val F1: 0.3834
‚úÖ Saved best model (F1=0.3834)




Epoch 13/50 | Train Loss: 0.4618 | Val F1: 0.3437




Epoch 14/50 | Train Loss: 0.4428 | Val F1: 0.4088
‚úÖ Saved best model (F1=0.4088)




Epoch 15/50 | Train Loss: 0.4191 | Val F1: 0.3390




Epoch 16/50 | Train Loss: 0.3926 | Val F1: 0.4215
‚úÖ Saved best model (F1=0.4215)




Epoch 17/50 | Train Loss: 0.3739 | Val F1: 0.4104




Epoch 18/50 | Train Loss: 0.3518 | Val F1: 0.4225
‚úÖ Saved best model (F1=0.4225)




Epoch 19/50 | Train Loss: 0.3158 | Val F1: 0.4103




Epoch 20/50 | Train Loss: 0.2907 | Val F1: 0.4503
‚úÖ Saved best model (F1=0.4503)




Epoch 21/50 | Train Loss: 0.2709 | Val F1: 0.4362




Epoch 22/50 | Train Loss: 0.2433 | Val F1: 0.4549
‚úÖ Saved best model (F1=0.4549)




Epoch 23/50 | Train Loss: 0.2174 | Val F1: 0.4774
‚úÖ Saved best model (F1=0.4774)




Epoch 24/50 | Train Loss: 0.2010 | Val F1: 0.4617




Epoch 25/50 | Train Loss: 0.1779 | Val F1: 0.4630




Epoch 26/50 | Train Loss: 0.1649 | Val F1: 0.4652




Epoch 27/50 | Train Loss: 0.1497 | Val F1: 0.4799
‚úÖ Saved best model (F1=0.4799)




Epoch 28/50 | Train Loss: 0.1285 | Val F1: 0.4685




Epoch 29/50 | Train Loss: 0.1192 | Val F1: 0.4837
‚úÖ Saved best model (F1=0.4837)




Epoch 30/50 | Train Loss: 0.1092 | Val F1: 0.4651




Epoch 31/50 | Train Loss: 0.0959 | Val F1: 0.4851
‚úÖ Saved best model (F1=0.4851)




Epoch 32/50 | Train Loss: 0.0883 | Val F1: 0.4639




Epoch 33/50 | Train Loss: 0.0824 | Val F1: 0.4757




Epoch 34/50 | Train Loss: 0.0737 | Val F1: 0.4668




Epoch 35/50 | Train Loss: 0.0615 | Val F1: 0.4674




Epoch 36/50 | Train Loss: 0.0620 | Val F1: 0.4811




Epoch 37/50 | Train Loss: 0.0587 | Val F1: 0.4761




Epoch 38/50 | Train Loss: 0.0508 | Val F1: 0.4736




Epoch 39/50 | Train Loss: 0.0506 | Val F1: 0.4767




Epoch 40/50 | Train Loss: 0.0431 | Val F1: 0.4783




Epoch 41/50 | Train Loss: 0.0398 | Val F1: 0.4745




Epoch 42/50 | Train Loss: 0.0404 | Val F1: 0.4737




Epoch 43/50 | Train Loss: 0.0368 | Val F1: 0.4774




Epoch 44/50 | Train Loss: 0.0347 | Val F1: 0.4741




Epoch 45/50 | Train Loss: 0.0349 | Val F1: 0.4785




Epoch 46/50 | Train Loss: 0.0321 | Val F1: 0.4796




Epoch 47/50 | Train Loss: 0.0332 | Val F1: 0.4793




Epoch 48/50 | Train Loss: 0.0315 | Val F1: 0.4808




Epoch 49/50 | Train Loss: 0.0283 | Val F1: 0.4827




Epoch 50/50 | Train Loss: 0.0292 | Val F1: 0.4825
Training complete. Best F1: 0.4851


In [29]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m400.9/400.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0
