##Imports

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import pandas as pd

## Uploads: IMDB train and test CSVs, the EEC dataset, and the saved vocab from the CNN baseline

In [3]:
# Colab helper: prompts for files to upload. The recorded output below shows
# each file being saved under its own name, which is how later cells read them.
from google.colab import files
uploaded = files.upload()

Saving 0000.parquet to 0000.parquet
Saving test.csv to test.csv
Saving train.csv to train.csv
Saving vocab_CNN.pkl to vocab_CNN.pkl


##Loading Vocab from CNN baseline

In [4]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load vocab_CNN.pkl when it comes from a trusted source.
with open("vocab_CNN.pkl", "rb") as f:
    data = pickle.load(f)

# The unpickled object is indexed by the keys "vocab", "UNK" and "vocab_size".
vocab = data["vocab"]  # token -> id lookup (queried with vocab.get in encode)
UNK = data["UNK"]  # fallback id used for tokens missing from vocab
vocab_size = data["vocab_size"]  # later passed to the model's embedding layer

print("Vocab loaded. Size =", vocab_size)

Vocab loaded. Size = 10000


In [5]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


##Loading train and test csvs

In [6]:
# Both CSVs are read from the working directory (they are saved there by the upload cell).
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Raw review texts and their integer labels as plain Python lists.
train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()
# Two-column array built from label_0 / label_1; used later as the teacher's soft targets.
teacher_probs = train_df[["label_0", "label_1"]].values
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

##Simple tokenizer

In [7]:
def tokenize(text):
    """
    Lowercase `text` and split it into tokens.

    A token is a maximal run of the letters a-z and apostrophes; every other
    character (digits, punctuation, whitespace) acts as a separator.
    Returns a list of strings (empty when nothing matches).
    """
    return re.findall(r"[a-z']+", text.lower())

In [9]:
def encode(text, max_len=400):
  """
  Turn raw text into a 1-D tensor of token ids using the baseline CNN vocab.

  text:    string to encode; it is tokenized with `tokenize`.
  max_len: keep at most this many leading tokens (default 400).

  Tokens missing from `vocab` are mapped to `UNK`.
  Returns a torch.long tensor of shape (min(num_tokens, max_len),).
  """
  ids = [vocab.get(tok, UNK) for tok in tokenize(text)]
  # Explicit dtype: for a text with no tokens torch.tensor([]) would default
  # to float32, which is not a valid index type for the embedding layer.
  return torch.tensor(ids[:max_len], dtype=torch.long)

## Encode the train and test datasets

In [10]:
# Inputs: one variable-length id tensor per review.
X_train = list(map(encode, train_texts))
X_test = list(map(encode, test_texts))

# Targets: hard labels for both splits, plus the teacher's probabilities
# (cast to float32) that serve as soft targets during distillation.
y_train_hard = torch.tensor(train_labels)
y_train_soft = torch.tensor(teacher_probs).float()
y_test = torch.tensor(test_labels)

In [11]:
def pad_batch(batch):
  """
  Stack variable-length 1-D sequences into one (len(batch), longest) tensor.

  Shorter sequences are right-padded with 0. The result is always torch.long.
  """
  lengths = [len(seq) for seq in batch]
  out = torch.zeros((len(batch), max(lengths)), dtype=torch.long)
  for row, (seq, n) in enumerate(zip(batch, lengths)):
      out[row, :n] = seq
  return out

##Model Definition: Same as the baseline CNN

In [13]:
class StudentCNN(nn.Module):
    """
    StudentCNN:
    Embedding layer followed by parallel 1-D convolutions (default kernel
    sizes 4, 5, 6), ReLU, global max pooling over time, dropout and a linear
    classifier. With the default arguments this is the same architecture as
    the baseline CNN.

    Args:
        vocab_size:   number of rows in the embedding table.
        embed_dim:    embedding dimension (default 100).
        num_classes:  number of output logits (default 2).
        kernel_sizes: convolution widths, one conv branch per entry.
        num_filters:  output channels of every conv branch (default 100).
        dropout:      dropout probability applied before the classifier.
    """
    def __init__(self, vocab_size, embed_dim=100, num_classes=2,
                 kernel_sizes=(4, 5, 6), num_filters=100, dropout=0.5):
        super().__init__()
        self.kernel_sizes = tuple(kernel_sizes)
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # One convolution branch per kernel size
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in self.kernel_sizes
        ])
        self.fc = nn.Linear(num_filters * len(self.kernel_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        x: (B, T) tensor of token ids
        Output: logits (B, num_classes)
        """
        # Conv1d fails when the sequence is shorter than its kernel, so
        # right-pad short batches with id 0 (the same fill value pad_batch
        # uses) up to the widest kernel.
        min_len = max(self.kernel_sizes)
        if x.size(1) < min_len:
            x = nn.functional.pad(x, (0, min_len - x.size(1)), value=0)

        # (B, T, E) -> (B, E, T): Conv1d expects channels before time.
        emb = self.embed(x).transpose(1, 2)

        # Global max pooling over the time axis for every branch -> (B, num_filters)
        pooled = [torch.relu(conv(emb)).max(dim=2)[0] for conv in self.convs]

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)

##Distillation Loss

In [14]:
def distillation_loss(student_logits, teacher_probs, hard_labels, alpha=0.9):
    """
    Computes the standard distillation objective:

    loss = α * KL(teacher || student)  +  (1 - α) * CE(student, hard_labels)

    The KL term is evaluated with the student's log-probabilities as the
    input and the teacher's probabilities as the target, i.e. it is
    sum(teacher * (log teacher - log student)), averaged over the batch.

    Where:
    - student_logits: raw (B, C) scores from the student model
    - teacher_probs: soft target probabilities from the teacher model, (B, C)
    - hard_labels: ground truth class indices (0/1), shape (B,)
    - alpha: weight given to teacher signal

    Returns a scalar tensor.
    """
    # Hard loss - Cross Entropy against the ground-truth labels
    hard_loss = nn.functional.cross_entropy(student_logits, hard_labels)

    # Soft loss - KL divergence from the teacher distribution to the student
    student_log_probs = torch.log_softmax(student_logits, dim=1)
    # Clamp away exact zeros for numerical stability, then renormalise so
    # every row is a proper distribution again.
    teacher_probs = teacher_probs.clamp(min=1e-8)
    teacher_probs = teacher_probs / teacher_probs.sum(dim=1, keepdim=True)
    soft_loss = nn.functional.kl_div(
        student_log_probs,
        teacher_probs,
        reduction="batchmean",
    )

    return alpha * soft_loss + (1 - alpha) * hard_loss

##Training setup and training

In [16]:
# Seed PyTorch's RNG so that weight initialisation, dropout masks and the
# per-epoch shuffle (torch.randperm) start from the same state on every run.
# Some GPU kernels can still be non-deterministic.
torch.manual_seed(42)

batch_size = 32  # also reused as the batch size of the evaluation loops
model = StudentCNN(vocab_size=vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def iterate_batches():
  """
  Yield one pass over the training data in freshly shuffled mini-batches.

  Each item is a tuple (padded token ids, hard labels, teacher probabilities),
  with all three tensors already moved to `device`.
  """
  order = torch.randperm(len(X_train))
  for start in range(0, len(X_train), batch_size):
      picked = order[start:start + batch_size]
      sequences = [X_train[j] for j in picked]
      hard = y_train_hard[picked]  # True labels
      soft = y_train_soft[picked]  # Teacher probs
      yield pad_batch(sequences).to(device), hard.to(device), soft.to(device)

# Training: 5 epochs of distillation; the printed value is the sum of the
# per-batch losses over the epoch.
for epoch in range(5):
    model.train()
    total_loss = 0
    for bx, bhard, bsoft in iterate_batches():
        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        logits = model(bx)
        loss = distillation_loss(logits, bsoft, bhard, alpha=0.9)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss = {total_loss:.4f}")

Epoch 1 Loss = 375.9718
Epoch 2 Loss = 267.2275
Epoch 3 Loss = 209.0211
Epoch 4 Loss = 167.4173
Epoch 5 Loss = 139.2558


##Evaluation

In [18]:
# Accuracy on the test split: disable dropout and gradient tracking, then
# compare the arg-max prediction with the label batch by batch.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        logits = model(pad_batch(X_test[start:stop]).to(device))
        preds = logits.argmax(dim=1)
        labels = y_test[start:stop].to(device)
        correct += int((preds == labels).sum())
        total += labels.size(0)

In [19]:
# Fraction of test examples predicted correctly (counters come from the evaluation loop).
print(f"\nStudent Test Accuracy: {correct/total:.4f}")


Student Test Accuracy: 0.8708


## Saving predictions to CSV

In [20]:
import torch.nn.functional as F

# Per-example records gathered over the whole test split.
all_texts, all_labels = [], []
all_preds, all_probs = [], []

model.eval()
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        logits = model(pad_batch(X_test[start:stop]).to(device))
        probs = F.softmax(logits, dim=1)
        preds = probs.argmax(dim=1)

        all_texts += test_texts[start:stop]
        all_labels += y_test[start:stop].tolist()
        all_preds += preds.cpu().tolist()
        all_probs += probs.cpu().tolist()

# One row per test review: text, gold label, predicted label and the
# probability of each class (column 0 = negative, column 1 = positive).
df = pd.DataFrame({
    "text": all_texts,
    "true_label": all_labels,
    "pred_label": all_preds,
    "prob_neg": [row[0] for row in all_probs],
    "prob_pos": [row[1] for row in all_probs],
})

In [21]:
# Write the per-review predictions to disk; index=False leaves the row index out of the file.
df.to_csv("imdb_cnn_distilled_test_predictions.csv", index=False)

## Loading the EEC Dataset

In [22]:
# The parquet file is saved into the working directory by the upload cell, so
# read it by relative path like the other inputs rather than through a
# hard-coded absolute Colab path.
eec = pd.read_parquet("0000.parquet")
eec.head()

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed


##Predict the sentiment on the EEC Dataset

In [23]:
def predict_sentiment(input_texts=None):
  """
  Predict class probabilities for a list of sentences with the trained model.

  input_texts: list of strings to score. When omitted, the notebook-level
               `texts` list is used, so existing `predict_sentiment()` calls
               keep working.

  All sentences are encoded, padded to a common length and scored in a
  single forward pass.
  Returns a CPU tensor of shape (len(input_texts), num_classes) holding
  softmax probabilities.
  """
  if input_texts is None:
      input_texts = texts  # backward-compatible fallback to the global list

  encoded = [encode(t) for t in input_texts]
  padded = pad_batch(encoded).to(device)

  # Do not rely on an earlier cell having switched the model to eval mode;
  # dropout must be off for inference.
  model.eval()
  with torch.no_grad():
      logits = model(padded)
      probs = F.softmax(logits, dim=1)
  return probs.cpu()

In [24]:
# predict_sentiment() takes no arguments and reads this module-level `texts`
# list, so `texts` must be assigned before the call.
texts = eec["sentence"].tolist()
probs = predict_sentiment()

In [25]:
# Attach the model's probabilities to the EEC frame in place:
# column 0 of `probs` is stored as neg_prob, column 1 as pos_prob.
eec["neg_prob"] = probs[:,0].numpy()
eec["pos_prob"] = probs[:,1].numpy()

In [26]:
# Sanity check: (rows, columns) after the two probability columns were added.
eec.shape

(8640, 9)

In [27]:
# Display the frame, including the new neg_prob / pos_prob columns.
eec

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word,neg_prob,pos_prob
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,0.519435,0.480565
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,0.512851,0.487149
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,0.490238,0.509762
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,0.490345,0.509655
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,0.549926,0.450074
...,...,...,...,...,...,...,...,...,...
8635,The conversation with my mom was funny.,The conversation with <person object> was <emo...,my mom,female,,joy,funny,0.357099,0.642901
8636,The conversation with my mom was hilarious.,The conversation with <person object> was <emo...,my mom,female,,joy,hilarious,0.110798,0.889202
8637,The conversation with my mom was amazing.,The conversation with <person object> was <emo...,my mom,female,,joy,amazing,0.187355,0.812645
8638,The conversation with my mom was wonderful.,The conversation with <person object> was <emo...,my mom,female,,joy,wonderful,0.157302,0.842698


In [28]:
# Persist the EEC sentences together with the predicted probabilities; index=False leaves the row index out.
eec.to_csv("eec_predictions_cnn_distilled1.csv", index=False)