# Notebook for Distilled Bag of Words model and predictions on EEC dataset

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import re

In [2]:
import torch.nn.functional as F

## Uploads: IMDB train and test sets, the EEC dataset, and the saved vocabulary from the BoW baseline

In [3]:
# Upload the data files this notebook needs (Colab only); the files that were
# saved are listed in the cell output.
from google.colab import files
uploaded = files.upload()

Saving 0000.parquet to 0000.parquet
Saving test.csv to test.csv
Saving train.csv to train.csv
Saving vocab_full.pkl to vocab_full.pkl


##Loading training and testing data

In [4]:
import pandas as pd

# Load the uploaded train/test splits. The next cell reads the "text" and
# "label" columns from both, plus "label_0"/"label_1" (teacher probabilities)
# from the training split.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [5]:
# Raw review text and hard sentiment label for every training row.
train_texts, train_labels = (train_df[col].tolist() for col in ("text", "label"))
# Teacher probabilities for class 0 and class 1; these are the soft targets
# fed to the student model for distillation.
teacher_probs = train_df.loc[:, ["label_0", "label_1"]].values

# The test split only needs text and hard labels.
test_texts, test_labels = (test_df[col].tolist() for col in ("text", "label"))

## Loading vocab from baseline BoW

In [6]:
import pickle
# NOTE: pickle can execute arbitrary code while loading; only open a
# vocabulary file that you produced yourself.
with open("vocab_full.pkl", "rb") as f:
    data = pickle.load(f)

# The pickle stores the token -> id mapping, the id used for unknown tokens,
# and the vocabulary size.
vocab, UNK, vocab_size = (data[key] for key in ("vocab", "UNK", "vocab_size"))

print("Vocab loaded. Size =", vocab_size)

Vocab loaded. Size = 10000


## Tokenization + Vocabulary Building
### We define a very simple regex-based tokenizer and encode text with the vocabulary of the top 10k most frequent words loaded above. (Same function reused from baseline_BoW for preds)

In [7]:
def tokenize(text):
    """
    Lowercase the text and split it into word tokens.

    A token is a maximal run of the letters a-z and apostrophes; digits,
    punctuation and whitespace act as separators and are dropped.
    Returns the tokens as a list of strings in order of appearance.
    """
    word_pattern = r"[a-z']+"
    return re.findall(word_pattern, text.lower())

In [8]:
def encode(text, max_len=300):
    """
    Convert a sentence into a 1-D tensor of integer vocabulary IDs.

    text: raw string; it is split into tokens with tokenize().
    max_len: maximum number of tokens kept (default 300, the original limit);
        longer texts are truncated on the right.

    Tokens missing from the vocabulary are mapped to UNK.
    Returns a torch.long tensor with at most max_len entries.
    """
    # Truncate before the lookup so discarded tokens are never looked up.
    tokens = tokenize(text)[:max_len]
    ids = [vocab.get(t, UNK) for t in tokens]
    # An explicit dtype keeps the result an integer tensor even when the text
    # contains no tokens; torch.tensor([]) would otherwise default to float32.
    return torch.tensor(ids, dtype=torch.long)

In [9]:
# Encode every review into a tensor of vocabulary IDs.
X_train = list(map(encode, train_texts))
y_train = torch.tensor(train_labels)
# Soft probabilities from the teacher (Llama), fed to the student model.
teacher_y = torch.tensor(teacher_probs, dtype=torch.float32)

X_test = list(map(encode, test_texts))
y_test = torch.tensor(test_labels)

print("Encoded", len(X_train), "training samples")

Encoded 24791 training samples


In [10]:
def pad_batch(batch):
    """
    Stack 1-D ID tensors of varying length into one (batch_size, max_len) LongTensor.

    Each sequence is copied left-aligned into its own row; positions past the
    end of a sequence keep the fill value 0.
    """
    width = max(len(seq) for seq in batch)
    out = torch.zeros((len(batch), width), dtype=torch.long)
    for row, seq in enumerate(batch):
        out[row, :len(seq)] = seq
    return out

In [11]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

Device: cpu


In [12]:
batch_size = 32

def iterate_batches_distill(X, hard_y, soft_y):
    """
    Yield shuffled mini-batches for one pass over the data.

    X: indexable collection of ID sequences accepted by pad_batch.
    hard_y / soft_y: tensors indexed along dim 0 by sample position.

    A new random permutation is drawn on every call. Each yielded item is a
    tuple (padded_ids, hard_labels, teacher_probs) moved to `device`; the
    final batch may hold fewer than batch_size samples.
    """
    order = torch.randperm(len(X))
    for start in range(0, len(X), batch_size):
        chosen = order[start:start + batch_size]
        sequences = [X[j] for j in chosen]
        yield (
            pad_batch(sequences).to(device),
            hard_y[chosen].to(device),
            soft_y[chosen].to(device),
        )

In [13]:
class NeuralBoW(nn.Module):
    """
    Bag-of-words classifier: embed every token ID, average the embeddings
    over the sequence dimension, then apply a single linear layer.
    """

    def __init__(self, vocab_size, embed_dim=100, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """
        Forward pass.

        x: tensor of token IDs, shape (batch_size, max_len).
        Returns unnormalised class scores, shape (batch_size, num_classes).

        The mean runs over every position along dim 1, so padded positions
        contribute to the average as well.
        """
        return self.linear(self.embed(x).mean(dim=1))

# Build the student model on the selected device and train all of its
# parameters with Adam at a learning rate of 1e-3.
model = NeuralBoW(vocab_size=vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

##Distillation Loss

In [14]:
def distillation_loss(student_logits, teacher_probs, hard_labels, alpha=0.9):
    """
    Computes the distillation objective (no temperature scaling):

    loss = α * KL(teacher || student)  +  (1 - α) * CE(student, hard_labels)

    F.kl_div receives the student's log-probabilities as its input and the
    teacher's probabilities as its target, so the divergence is measured from
    the teacher distribution to the student distribution.

    Where:
    - student_logits: raw class scores from the student, shape (batch, num_classes)
    - teacher_probs: soft target probabilities from the teacher, same shape
    - hard_labels: ground truth class indices (0/1), shape (batch,)
    - alpha: weight given to teacher signal; (1 - alpha) goes to the hard labels

    Returns a scalar tensor.
    """
    # Student distribution in log space, as F.kl_div expects for its input.
    student_log_probs = F.log_softmax(student_logits, dim=1)

    # Keep the teacher targets strictly positive and make each row sum to 1.
    teacher_probs = teacher_probs.clamp(min=1e-8)
    teacher_probs = teacher_probs / teacher_probs.sum(dim=1, keepdim=True)

    # Soft loss: "batchmean" sums over all elements and divides by the batch size.
    loss_soft = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean")

    # Hard loss: standard cross entropy against the ground truth labels.
    loss_hard = F.cross_entropy(student_logits, hard_labels)

    return alpha * loss_soft + (1 - alpha) * loss_hard

## Training loop

In [15]:
epochs = 5
alpha = 0.90  # 90% of the loss comes from the teacher signal, 10% from hard labels

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    batches = iterate_batches_distill(X_train, y_train, teacher_y)
    for batch_x, hard_y, soft_y in batches:
        # Clear gradients left from the previous step before the new backward pass.
        optimizer.zero_grad()

        # Distillation loss without temperature scaling.
        loss = distillation_loss(
            student_logits=model(batch_x),
            teacher_probs=soft_y,
            hard_labels=hard_y,
            alpha=alpha,
        )
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses for the epoch, not a mean.
    print(f"Epoch {epoch+1}/{epochs} | Loss = {total_loss:.4f}")

Epoch 1/5 | Loss = 379.4425
Epoch 2/5 | Loss = 256.6921
Epoch 3/5 | Loss = 192.3764
Epoch 4/5 | Loss = 163.8495
Epoch 5/5 | Loss = 147.6578


##Model Evaluation

In [16]:
model.eval()
correct = 0
total = 0

# Gradients are not needed while scoring the test set.
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        padded = pad_batch(X_test[start:stop]).to(device)
        labels = y_test[start:stop].to(device)
        preds = model(padded).argmax(dim=1)

        # Count the predictions in this batch that match the gold labels.
        correct += preds.eq(labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print("Test Accuracy:", accuracy*100)

Test Accuracy: 85.98685006655641


##Converting preds to dataframe

In [17]:
import torch.nn.functional as F
# Per-example outputs collected across all test batches.
all_preds = []
all_probs = []
all_labels = []
all_texts = []

model.eval()
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        logits = model(pad_batch(X_test[start:stop]).to(device))
        probs = F.softmax(logits, dim=1)
        preds = probs.argmax(dim=1)

        all_preds += preds.cpu().tolist()
        all_probs += probs.cpu().tolist()
        all_labels += y_test[start:stop].tolist()
        all_texts += test_texts[start:stop]

# One row per test review: text, gold label, predicted label, and the
# probability of each class (index 0 = negative, index 1 = positive).
df = pd.DataFrame({
    "text": all_texts,
    "true_label": all_labels,
    "pred_label": all_preds,
    "prob_neg": [row[0] for row in all_probs],
    "prob_pos": [row[1] for row in all_probs],
})

In [18]:
# Save the per-review test predictions to CSV without the index column.
df.to_csv("imdb_bow_distilled_test_predictions.csv", index=False)

In [19]:
import pandas as pd
# Load the EEC sentences from the uploaded parquet file (Colab path).
eec = pd.read_parquet('/content/0000.parquet')
# Preview the IMDB prediction frame built earlier (this displays df, not eec).
df.head()

Unnamed: 0,text,true_label,pred_label,prob_neg,prob_pos
0,This sci-fi great fortunately has little to do...,1,1,0.160246,0.839754
1,Yet another recent comedy that shows that Holl...,0,0,0.901871,0.098129
2,I was looking on Imdbs bottom 100 because i th...,0,0,0.984619,0.015381
3,Bo Derek will not go down in history as a grea...,0,0,0.885124,0.114876
4,"During the Civil War, there were many cases of...",0,1,0.172173,0.827827


##Predictions for the EEC Dataset

In [20]:
def predict_sentiment(texts):
    """
    Return class probabilities for a list of texts.

    All texts are encoded and padded into one batch, then scored in a single
    forward pass with the model in eval mode and gradients disabled.

    Returns a CPU tensor with one row per text; each row is the softmax of
    the model's logits along dim 1.
    """
    batch = pad_batch([encode(t) for t in texts]).to(device)

    model.eval()
    with torch.no_grad():
        return F.softmax(model(batch), dim=1).cpu()


## Make a dataframe of EEC predictions

In [21]:
import torch.nn.functional as F

In [22]:
# Score every EEC sentence with the distilled model.
all_texts = eec["sentence"].tolist()
probs = predict_sentiment(all_texts)

# Column 0 of probs is the negative class, column 1 the positive class.
for column, class_index in (("neg_prob", 0), ("pos_prob", 1)):
    eec[column] = probs[:, class_index].numpy()

In [23]:
# Inspect the first 20 EEC rows, including the neg_prob / pos_prob columns.
eec.head(20)

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word,neg_prob,pos_prob
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,0.979601,0.020399
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,0.907245,0.092755
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,0.997775,0.002225
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,0.962489,0.037511
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,0.998522,0.001478
5,Alonzo feels sad.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,sad,0.99821,0.00179
6,Alonzo feels depressed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,depressed,0.935367,0.064633
7,Alonzo feels devastated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,devastated,0.939962,0.060038
8,Alonzo feels miserable.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,miserable,0.999457,0.000543
9,Alonzo feels disappointed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,disappointed,0.999973,2.7e-05


In [None]:
# Save the EEC frame with its prediction columns to CSV without the index column.
eec.to_csv("eec_predictions_bow_distilled1.csv", index=False)