#Notebook for Baseline BoW model and predictions on the EEC dataset

##Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import pandas as pd

##Uploads include IMDB Test, Train, EEC dataset

In [2]:
from google.colab import files
uploaded = files.upload()

Saving 0000.parquet to 0000.parquet
Saving test.csv to test.csv
Saving train.csv to train.csv


##Loading training and testing data

In [3]:
eec = pd.read_parquet("/content/0000.parquet")

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()

test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

## Tokenization + Vocabulary Building
###We define a very simple tokenizer (regex-based) and build a vocabulary of the top 10k most frequent words.

In [5]:
def tokenize(text):
  """
  Used to convert text to lowercase and extract words using regex
  """

  text = text.lower()
  return re.findall(r"[a-z']+", text)

In [6]:
# Count word frequencies across the training set
counter = Counter()
for txt in train_texts:
    counter.update(tokenize(txt))
# Build vocabulary
vocab_size = 10000
vocab = {w: i+1 for i, (w, _) in enumerate(counter.most_common(vocab_size-1))}
UNK = 0

In [7]:
def encode(text):
  """
  Convert a sentence into a list of integer IDs.
  Limit of length 300.
  """
  tokens = tokenize(text)
  ids = [vocab.get(t, UNK) for t in tokens]
  return torch.tensor(ids[:300])

In [8]:
#Encode texts into ID's
X_train = [encode(t) for t in train_texts]
y_train = torch.tensor(train_labels)

X_test = [encode(t) for t in test_texts]
y_test = torch.tensor(test_labels)

##Saving vocabulary for BoW Distilled

In [9]:
import pickle

extras = {
    "vocab": vocab,
    "UNK": UNK,
    "vocab_size": vocab_size
}

with open("vocab_full.pkl", "wb") as f:
    pickle.dump(extras, f)

print("Saved vocab_full.pkl")

Saved vocab_full.pkl


In [10]:
def pad_batch(batch):
  """
  Padding tensors of different lengths with zeroes to create a batch of shape (batch_size, max_len)
  """
  lengths = [len(x) for x in batch]
  max_len = max(lengths)
  padded = torch.zeros(len(batch), max_len, dtype=torch.long)
  for i, seq in enumerate(batch):
      padded[i, :len(seq)] = seq
  return padded

##Define Neural BoW model

In [11]:
class NeuralBoW(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
      """
      Forward pass
      x: tensor of shape (batch_size, max_len)
      embed tensor and then average
      """
      emb = self.embed(x)
      pooled = emb.mean(dim=1)
      logits = self.linear(pooled)
      return logits

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Training Setup

In [13]:
model = NeuralBoW(vocab_size=vocab_size, embed_dim=100).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [14]:
batch_size = 32

def iterate_batches(X, y):
  """
  Randomly shuffle data and generate batches
  """
  idx = torch.randperm(len(X))
  for i in range(0, len(X), batch_size):
      batch_idx = idx[i:i+batch_size]
      batch_x = [X[j] for j in batch_idx]
      batch_y = y[batch_idx]
      yield pad_batch(batch_x).to(device), batch_y.to(device)

##Training

In [15]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_x, batch_y in iterate_batches(X_train, y_train):
        logits = model(batch_x)
        loss = criterion(logits, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss = {total_loss:.4f}")

Epoch 1 | Loss = 487.3235
Epoch 2 | Loss = 344.4422
Epoch 3 | Loss = 265.4308
Epoch 4 | Loss = 226.3418
Epoch 5 | Loss = 200.9920


##Evaluation

In [16]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        batch = X_test[i:i+batch_size]
        padded = pad_batch(batch).to(device)
        logits = model(padded)
        preds = logits.argmax(dim=1)
        labels = y_test[i:i+batch_size].to(device)
        correct += (preds == labels).sum().item()
        total += len(labels)

accuracy = correct / total
print(f"\nTest Accuracy: {accuracy:.4f}")


Test Accuracy: 0.8748


##Store predictions and probabilities to extract as csv

In [17]:
import torch.nn.functional as F
all_preds = []
all_probs = []
all_labels = []
all_texts = []

model.eval()
with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        batch = X_test[i:i+batch_size]
        padded = pad_batch(batch).to(device)

        logits = model(padded)
        probs = F.softmax(logits, dim=1)
        preds = probs.argmax(dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_probs.extend(probs.cpu().tolist())
        all_labels.extend(y_test[i:i+batch_size].tolist())
        all_texts.extend(test_texts[i:i+batch_size])

df = pd.DataFrame({
    "text": all_texts,
    "true_label": all_labels,
    "pred_label": all_preds,
    "prob_neg": [p[0] for p in all_probs],
    "prob_pos": [p[1] for p in all_probs],
})

In [18]:
df.to_csv("imdb_bow_test_predictions.csv", index=False)

## Prediction function for the EEC Dataset

In [19]:
def predict_sentiment(texts):
  """
  Predict sentiment for a list of texts
  """
  encoded = [encode(t) for t in texts]

  padded = pad_batch(encoded).to(device)

  model.eval()
  with torch.no_grad():
      logits = model(padded)
      probs = F.softmax(logits, dim=1)

  return probs.cpu()


In [20]:
import torch.nn.functional as F

## Make a dataframe to EEC predictions

In [21]:
all_texts = eec["sentence"].tolist()
probs = predict_sentiment(all_texts)

eec["neg_prob"] = probs[:, 0].numpy()
eec["pos_prob"] = probs[:, 1].numpy()

In [22]:
eec.head(20)

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word,neg_prob,pos_prob
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,0.989238,0.010762
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,0.989794,0.010206
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,0.999221,0.000779
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,0.975443,0.024557
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,0.999301,0.000699
5,Alonzo feels sad.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,sad,0.998369,0.001631
6,Alonzo feels depressed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,depressed,0.859677,0.140323
7,Alonzo feels devastated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,devastated,0.880421,0.119579
8,Alonzo feels miserable.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,miserable,0.999744,0.000256
9,Alonzo feels disappointed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,disappointed,0.99999,1e-05


In [23]:
eec.to_csv("eec_predictions_bow_baseline.csv", index=False)