<a href="https://colab.research.google.com/github/22f3000982/DL_genai_project/blob/main/milestone_4_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone 4 ‚Äî Sequence Modeling with LSTM and GRU

This milestone introduces **deep learning models (LSTM / GRU)** that are specifically designed to capture the **order and contextual relationships** between words in a sequence.

---

##  Suggested Readings
- [LSTM](https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html)
- [GRU](https://docs.pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

---

## ‚öôÔ∏è Instructions

Use the **constants and helper functions** provided in the next cell to answer all **Milestone-4 questions**.

Perform the following tasks on the **training dataset** provided as part of the Kaggle competition:

üîó **Competition Link:**  
[2025-Sep-DL-Gen-AI-Project](https://www.kaggle.com/competitions/2025-sep-dl-gen-ai-project)


# Imports

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import random
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

### Set seeds and Constants

In [15]:
#----------------------------- DON'T CHANGE THIS --------------------------
DATA_SEED = 67
TRAINING_SEED = 1234
MAX_LEN = 50
BATCH_SIZE = 64
EMB_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 5

random.seed(DATA_SEED)
np.random.seed(DATA_SEED)
torch.manual_seed(DATA_SEED)
torch.cuda.manual_seed(DATA_SEED)

# Create Vocab

In [16]:
data_path = "/content/train (2).csv"  # enter your data path here
df = pd.read_csv(data_path)  # read it and store it in df

In [17]:
# Split train df into train_df(80%) and test_df (20%) use seed
train_df, test_df = train_test_split(df, test_size=0.2, random_state=DATA_SEED)

In [18]:
# create a simple space-based tokenizer.
def tokenize(text):
    return text.split()

In [19]:
# Use counter to count all tokens in train_df
token_counter = Counter()
for text in train_df['text']:
    token_counter.update(tokenize(text))

## Create train and val dataloaders

In [21]:
#----------------------------- DON'T CHANGE THIS --------------------------
specials = ['<unk>', '<pad>']
min_freq = 2
vocab_list = specials + [token for token, freq in token_counter.items() if freq >= min_freq]
word2idx = {token: i for i, token in enumerate(vocab_list)}
UNK_IDX = word2idx['<unk>']
PAD_IDX = word2idx['<pad>']
VOCAB_SIZE = len(vocab_list)

def text_pipeline(text):
    """Converts text to a list of indices using the word2idx dict."""
    tokens = tokenize(text)
    return [word2idx.get(token, UNK_IDX) for token in tokens]

class EmotionDataset(Dataset):
    def __init__(self, dataframe):
        self.texts = dataframe['text'].values
        self.labels = dataframe[['anger', 'fear', 'joy', 'sadness', 'surprise']].values.astype(np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _labels) in batch:
        label_list.append(_labels)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)[:MAX_LEN]
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.float32)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=PAD_IDX)
    if text_list.shape[1] < MAX_LEN:
        pad_tensor = torch.full(
            (text_list.shape[0], MAX_LEN - text_list.shape[1]),
            PAD_IDX,
            dtype=torch.int64
        )
        text_list = torch.cat((text_list, pad_tensor), dim=1)

    return text_list, label_list

# Create train and val dataloaders
train_dataset = EmotionDataset(train_df)
val_dataset = EmotionDataset(test_df)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [34]:
# Training parameters
learning_rate = 0.001
num_epochs = 10

# Initialize model, loss function, and optimizer
simple_lstm_model = SimpleLSTM(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
criterion = nn.BCEWithLogitsLoss() # Use BCEWithLogitsLoss for multi-label classification
optimizer = optim.Adam(simple_lstm_model.parameters(), lr=learning_rate)

# Function to calculate F1 score
def calculate_f1(y_true, y_pred):
    y_pred = torch.sigmoid(y_pred).round().detach().cpu().numpy()
    y_true = y_true.detach().cpu().numpy()
    return f1_score(y_true, y_pred, average='macro')

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
simple_lstm_model.to(device)

for epoch in range(num_epochs):
    simple_lstm_model.train()
    running_loss = 0.0
    running_f1 = 0.0

    for text, labels in train_dataloader:
        text, labels = text.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = simple_lstm_model(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * text.size(0)
        running_f1 += calculate_f1(labels, outputs) * text.size(0)

    epoch_loss = running_loss / len(train_dataset)
    epoch_f1 = running_f1 / len(train_dataset)

    # Evaluation loop
    simple_lstm_model.eval()
    val_running_loss = 0.0
    val_running_f1 = 0.0

    with torch.no_grad():
        for text, labels in val_dataloader:
            text, labels = text.to(device), labels.to(device)
            outputs = simple_lstm_model(text)
            loss = criterion(outputs, labels)

            val_running_loss += loss.item() * text.size(0)
            val_running_f1 += calculate_f1(labels, outputs) * text.size(0)

    val_epoch_loss = val_running_loss / len(val_dataset)
    val_epoch_f1 = val_running_f1 / len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Train F1: {epoch_f1:.4f}, Val Loss: {val_epoch_loss:.4f}, Val F1: {val_epoch_f1:.4f}")

print("Training finished!")

Epoch 1/10, Train Loss: 0.5794, Train F1: 0.1454, Val Loss: 0.5663, Val F1: 0.1425
Epoch 2/10, Train Loss: 0.5664, Train F1: 0.1448, Val Loss: 0.5655, Val F1: 0.1425
Epoch 3/10, Train Loss: 0.5655, Train F1: 0.1535, Val Loss: 0.5662, Val F1: 0.1435
Epoch 4/10, Train Loss: 0.5638, Train F1: 0.1624, Val Loss: 0.5667, Val F1: 0.1513
Epoch 5/10, Train Loss: 0.5621, Train F1: 0.1682, Val Loss: 0.5638, Val F1: 0.1638


KeyboardInterrupt: 

In [31]:
# Simple LSTM Model
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        return self.fc(hidden)

# Bidirectional LSTM Model
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden)

# Stacked GRU Model (2 layers)
class StackedGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.gru(embedded)
        hidden = hidden[-1,:,:]
        return self.fc(hidden)

### Q1. What are the vocabulary size, padding token index, and unknown token index for the above dataset?

In [22]:
# ------------------- write your code here -------------------------------
print(f"Vocabulary size: {VOCAB_SIZE}")
print(f"Padding token index: {PAD_IDX}")
print(f"Unknown token index: {UNK_IDX}")
#-------------------------------------------------------------------------

Vocabulary size: 5730
Padding token index: 1
Unknown token index: 0


### Q2.What are the indices for the words "happy", "alone", and "sad" in the vocabulary?

In [23]:
# ------------------- write your code here -------------------------------
print(f"Index for 'happy': {word2idx.get('happy', UNK_IDX)}")
print(f"Index for 'alone': {word2idx.get('alone', UNK_IDX)}")
print(f"Index for 'sad': {word2idx.get('sad', UNK_IDX)}")
#-------------------------------------------------------------------------

Index for 'happy': 1578
Index for 'alone': 2525
Index for 'sad': 885


In [24]:
# Get one batch to test shapes
text_batch, labels_batch = next(iter(train_dataloader))
emb_layer = nn.Embedding(VOCAB_SIZE, EMB_DIM)
embedded_batch = emb_layer(text_batch)

# Simple LSTM layer Output Shape (Use constants defined in 2nd cell)
# lstm = # Create your lstm layer here
#read_output = lstm(embedded_batch)

### Q3. What is the output shape of the Embedding layer?


In [25]:
# ------------------- write your code here -------------------------------
print(f"Output shape of Embedding layer: {embedded_batch.shape}")
#-------------------------------------------------------------------------

Output shape of Embedding layer: torch.Size([64, 50, 100])


### Q4. What will be output shape of simple LSTM layer

In [26]:
# ------------------- write your code here -------------------------------
lstm = nn.LSTM(EMB_DIM, HIDDEN_DIM, batch_first=True)
lstm_output, (lstm_hidden, lstm_cell) = lstm(embedded_batch)
print(f"Output shape of simple LSTM layer: {lstm_output.shape}")
#-------------------------------------------------------------------------

Output shape of simple LSTM layer: torch.Size([64, 50, 256])


### Q5. What is the 'hidden' state shape from a simple LSTM?

In [27]:
# ------------------- write your code here -------------------------------
print(f"'Hidden' state shape from a simple LSTM: {lstm_hidden.shape}")
#-------------------------------------------------------------------------

'Hidden' state shape from a simple LSTM: torch.Size([1, 64, 256])


### Q6. What is the 'hidden' state shape from a simple GRU?

In [28]:
# similarly do it for gru and find hidden state shape
# ------------------- write your code here -------------------------------
gru = nn.GRU(EMB_DIM, HIDDEN_DIM, batch_first=True)
gru_output, gru_hidden = gru(embedded_batch)
print(f"'Hidden' state shape from a simple GRU: {gru_hidden.shape}")
#-------------------------------------------------------------------------

'Hidden' state shape from a simple GRU: torch.Size([1, 64, 256])


### Q7. What is the 'output' tensor shape from a bidirectional LSTM?

In [29]:
# Bidirectional LSTM Output Shape
# ------------------- write your code here -------------------------------
bidirectional_lstm = nn.LSTM(EMB_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True)
bidirectional_lstm_output, (bidirectional_lstm_hidden, bidirectional_lstm_cell) = bidirectional_lstm(embedded_batch)
print(f"'Output' tensor shape from a bidirectional LSTM: {bidirectional_lstm_output.shape}")
#-------------------------------------------------------------------------

'Output' tensor shape from a bidirectional LSTM: torch.Size([64, 50, 512])


### Q8. What is the 'hidden' state shape from a bidirectional LSTM?

In [30]:
# Bidirectional LSTM Hidden Shape
# ------------------- write your code here -------------------------------
print(f"'Hidden' state shape from a bidirectional LSTM: {bidirectional_lstm_hidden.shape}")
#-------------------------------------------------------------------------

'Hidden' state shape from a bidirectional LSTM: torch.Size([2, 64, 256])


### Q9. Create 3 sequential models using the (Simple & Bidirectional)LSTM and Stacked GRU (2 layers)For all models, follow this(Embedding layer ‚Üí [LSTM / BiLSTM / Stacked GRU] ‚Üí Linear layer) architecture. What will be the training parameters in all 3 cases?(LSTM, BiLSTM, Stacked GRU)

In [32]:
# ------------------- write your code here -------------------------------
# Simple LSTM
simple_lstm_model = SimpleLSTM(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
simple_lstm_params = sum(p.numel() for p in simple_lstm_model.parameters() if p.requires_grad)
print(f"Number of training parameters for Simple LSTM: {simple_lstm_params}")

# Bidirectional LSTM
bilstm_model = BiLSTM(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
bilstm_params = sum(p.numel() for p in bilstm_model.parameters() if p.requires_grad)
print(f"Number of training parameters for Bidirectional LSTM: {bilstm_params}")

# Stacked GRU
stacked_gru_model = StackedGRU(VOCAB_SIZE, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
stacked_gru_params = sum(p.numel() for p in stacked_gru_model.parameters() if p.requires_grad)
print(f"Number of training parameters for Stacked GRU: {stacked_gru_params}")
#-------------------------------------------------------------------------

Number of training parameters for Simple LSTM: 940877
Number of training parameters for Bidirectional LSTM: 1308749
Number of training parameters for Stacked GRU: 1243981


### Q10. If you experimented with both LSTM and GRU models using the same hyperparameters, which one achieved a better peak Macro F1-score in your W&B logs?

In [39]:
# ============================
# LSTM vs GRU (Multi-label)
# ============================
import re, time
import numpy as np
import pandas as pd
from collections import Counter

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", DEVICE)

# ---------- Load ----------
train_path = "/content/train (2).csv"
df = pd.read_csv(train_path)

text_col = "text"
# Multi-label columns present in your dataset
label_cols = ["anger", "fear", "joy", "sadness", "surprise"]
assert all(c in df.columns for c in label_cols), "Some multi-label columns missing."

# Clean text
df[text_col] = df[text_col].fillna("").astype(str)

# ---------- Tokenizer + Vocab ----------
def tokenize(x):
    return re.findall(r"[A-Za-z0-9']+", x.lower())

counter = Counter()
for t in df[text_col]:
    counter.update(tokenize(t))

# Use min_freq=3 to stabilize vocab
counter = Counter({w:c for w,c in counter.items() if c >= 3})

PAD, UNK = "<pad>", "<unk>"
itos = [PAD, UNK] + sorted(counter.keys())
stoi = {w:i for i,w in enumerate(itos)}
PAD_IDX, UNK_IDX = 0, 1
VOCAB_SIZE = len(itos)

print(f"Vocab size: {VOCAB_SIZE} | PAD={PAD_IDX} UNK={UNK_IDX}")

def encode(tokens, max_len=60):
    ids = [stoi.get(t, UNK_IDX) for t in tokens]
    ids = ids[:max_len] + [PAD_IDX] * max(0, max_len - len(ids))
    return ids

# ---------- Dataset ----------
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels.astype("float32")
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        x = torch.tensor(encode(tokenize(self.texts[i])), dtype=torch.long)
        y = torch.tensor(self.labels[i], dtype=torch.float32)
        return x, y

X_train, X_val, y_train, y_val = train_test_split(
    df[text_col].values,
    df[label_cols].values,
    test_size=0.2,
    random_state=42
)

train_loader = DataLoader(TextDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader   = DataLoader(TextDataset(X_val,   y_val),   batch_size=64, shuffle=False)

# ---------- Models ----------
EMB_DIM = 128
HIDDEN  = 128
EPOCHS  = 5
LR      = 2e-3
NUM_CLASSES = len(label_cols)

class LSTMModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb  = nn.Embedding(VOCAB_SIZE, EMB_DIM, padding_idx=PAD_IDX)
        self.rnn  = nn.LSTM(EMB_DIM, HIDDEN, batch_first=True)
        self.fc   = nn.Linear(HIDDEN, NUM_CLASSES)
    def forward(self, x):
        e = self.emb(x)
        out, (h, c) = self.rnn(e)   # h: (1, B, H)
        return self.fc(h[-1])       # (B, C)

class GRUModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb  = nn.Embedding(VOCAB_SIZE, EMB_DIM, padding_idx=PAD_IDX)
        self.rnn  = nn.GRU(EMB_DIM, HIDDEN, batch_first=True)
        self.fc   = nn.Linear(HIDDEN, NUM_CLASSES)
    def forward(self, x):
        e = self.emb(x)
        out, h = self.rnn(e)        # h: (1, B, H)
        return self.fc(h[-1])       # (B, C)

criterion = nn.BCEWithLogitsLoss()

@torch.no_grad()
def eval_macro_f1(model, loader):
    model.eval()
    preds, trues = [], []
    for x, y in loader:
        x = x.to(DEVICE)
        logits = model(x)
        probs  = torch.sigmoid(logits)
        p = (probs > 0.5).int().cpu().numpy()
        preds.append(p)
        trues.append(y.numpy())
    y_true = np.vstack(trues)
    y_pred = np.vstack(preds)
    return f1_score(y_true, y_pred, average="macro")

def train_and_eval(model):
    model = model.to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    best_f1 = 0.0
    for ep in range(1, EPOCHS+1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            opt.step()
        f1 = eval_macro_f1(model, val_loader)
        best_f1 = max(best_f1, f1)
        print(f"Epoch {ep}: val Macro F1 = {f1:.4f}")
    return best_f1

print("\n--- Training LSTM (multi-label) ---")
best_lstm = train_and_eval(LSTMModel())

print("\n--- Training GRU (multi-label) ---")
best_gru = train_and_eval(GRUModel())

print("\n=== Comparison ===")
print(f"Best LSTM Macro F1: {best_lstm:.4f}")
print(f"Best GRU  Macro F1: {best_gru:.4f}")

if best_lstm > best_gru:
    print("\n‚úÖ LSTM achieved the better peak Macro F1-score.")
elif best_gru > best_lstm:
    print("\n‚úÖ GRU achieved the better peak Macro F1-score.")
else:
    print("\n‚öñÔ∏è Both performed equally.")


Using: cpu
Vocab size: 3246 | PAD=0 UNK=1

--- Training LSTM (multi-label) ---
Epoch 1: val Macro F1 = 0.1457
Epoch 2: val Macro F1 = 0.1490
Epoch 3: val Macro F1 = 0.1535
Epoch 4: val Macro F1 = 0.1540
Epoch 5: val Macro F1 = 0.1587

--- Training GRU (multi-label) ---
Epoch 1: val Macro F1 = 0.1459
Epoch 2: val Macro F1 = 0.1510
Epoch 3: val Macro F1 = 0.2187
Epoch 4: val Macro F1 = 0.2664
Epoch 5: val Macro F1 = 0.2693

=== Comparison ===
Best LSTM Macro F1: 0.1587
Best GRU  Macro F1: 0.2693

‚úÖ GRU achieved the better peak Macro F1-score.


In [37]:
df.head()
df.columns


Index(['id', 'text', 'anger', 'fear', 'joy', 'sadness', 'surprise',
       'emotions'],
      dtype='object')

### Q11. Compare the total training time for your best sequential model against the simple averaging model from Milestone 3. How much longer (in minutes or percentage) did the more complex model (LSTM and GRU) take to train for the same number of epochs?

In [40]:
# ===========================================
# 1) Simple Averaging Baseline Model (Milestone 3)
# ===========================================
class SimpleAvgModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(VOCAB_SIZE, EMB_DIM, padding_idx=PAD_IDX)
        self.fc = nn.Linear(EMB_DIM, NUM_CLASSES)

    def forward(self, x):
        e = self.emb(x)                             # (B, L, E)
        mask = (x != PAD_IDX).unsqueeze(-1)         # (B, L, 1)
        summed = (e * mask).sum(dim=1)              # (B, E)
        denom = mask.sum(dim=1).clamp(min=1)        # (B, 1)
        avg = summed / denom                        # (B, E)
        return self.fc(avg)                         # (B, C)


def train_avg():
    model = SimpleAvgModel().to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    start = time.time()

    for ep in range(1, EPOCHS+1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            opt.step()

    end = time.time()
    return end - start


# Run baseline model
print("\n--- Timing Simple Averaging Model ---")
baseline_time = train_avg()
print(f"Baseline Training Time: {baseline_time:.2f} sec")


# ===========================================
# 2) Timing your best sequential model
# Replace GRUModel() with LSTMModel() if LSTM performed better
# ===========================================
def train_seq():
    model = GRUModel().to(DEVICE)   # <-- Change to LSTMModel() if LSTM > GRU
    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    start = time.time()

    for ep in range(1, EPOCHS+1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            opt.step()

    end = time.time()
    return end - start


print("\n--- Timing Best Sequential Model ---")
seq_time = train_seq()
print(f"Sequential Model Training Time: {seq_time:.2f} sec")


# ===========================================
# 3) Compare Times
# ===========================================
extra_minutes = (seq_time - baseline_time) / 60
percent_more = (seq_time - baseline_time) / baseline_time * 100

print("\n===== Time Comparison Result =====")
print(f"The sequential model took {extra_minutes:.2f} more minutes.")
print(f"Which is {percent_more:.1f}% longer than the simple averaging model.")



--- Timing Simple Averaging Model ---
Baseline Training Time: 6.94 sec

--- Timing Best Sequential Model ---
Sequential Model Training Time: 36.26 sec

===== Time Comparison Result =====
The sequential model took 0.49 more minutes.
Which is 422.7% longer than the simple averaging model.


### Q12. If you experimented with both LSTM and GRU models using the same hyperparameters, which one achieved a better peak Macro F1-score in your W&B logs?

### Q13 Based on your experiments, what was the most impactful hyperparameter you tuned for your sequential model (e.g., learning rate, hidden size, number of layers, dropout rate)?