# Notebook for Baseline CNN and Prediction on the EEC Dataset

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from collections import Counter
import re
import pandas as pd

## Upload files: IMDB train/test CSVs and the EEC dataset

In [2]:
# Colab-only upload step. The cell output below shows 0000.parquet, test.csv
# and train.csv being saved under their own names; later cells read them back.
from google.colab import files
uploaded = files.upload()

Saving 0000.parquet to 0000.parquet
Saving test.csv to test.csv
Saving train.csv to train.csv


## Loading train and test CSVs

In [3]:
# Read the uploaded train/test splits and unpack their "text" and "label"
# columns into plain Python lists for tokenisation and tensor conversion.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
test_texts, test_labels = test_df["text"].tolist(), test_df["label"].tolist()

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [5]:
def tokenize(text):
    """
    Lowercase `text` and split it into tokens.

    A token is a maximal run of the letters a-z and the apostrophe, so digits,
    punctuation and whitespace all act as separators.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z']+", lowered)]

In [6]:
# Tally how often each token occurs across the training texts.
counter = Counter()
for text in train_texts:
    counter.update(tokenize(text))

# Vocabulary layout: id 0 is reserved for unknown tokens, ids 1..vocab_size-1
# go to the most frequent training tokens in descending order of frequency.
UNK = 0
vocab_size = 10000
vocab = {
    word: rank
    for rank, (word, _count) in enumerate(counter.most_common(vocab_size - 1), start=1)
}

In [7]:
def encode(text, max_len=400):
  """
  Convert text into a 1-D tensor of integer token ids.

  Args:
      text: raw string; it is split into tokens with tokenize().
      max_len: maximum number of ids to keep; longer texts are truncated
          on the right. Defaults to 400.

  Returns:
      torch.long tensor of length min(number of tokens, max_len). Tokens
      that are not in `vocab` are mapped to UNK.
  """
  ids = [vocab.get(t, UNK) for t in tokenize(text)[:max_len]]
  # Pin the dtype: for a text with no tokens, torch.tensor([]) would infer the
  # default float dtype, leaving float tensors mixed into the encoded lists.
  return torch.tensor(ids, dtype=torch.long)
# Encode both splits: inputs stay as lists of variable-length id tensors
# (they are padded per batch later), labels become a single tensor each.
X_train, y_train = list(map(encode, train_texts)), torch.tensor(train_labels)
X_test, y_test = list(map(encode, test_texts)), torch.tensor(test_labels)

In [8]:
def pad_batch(batch):
  """
  Stack variable-length id sequences into one (len(batch), longest) long
  tensor, right-filling shorter sequences with zeros.
  """
  lengths = [len(seq) for seq in batch]
  out = torch.zeros(len(batch), max(lengths), dtype=torch.long)
  for row, (seq, n) in enumerate(zip(batch, lengths)):
      out[row, :n] = seq
  return out

## Saving vocab for use by the Distilled CNN

In [9]:
import pickle

# Bundle the token->id mapping with the unknown-token id and the vocabulary
# size so all three can be restored together from a single file.
extras = dict(vocab=vocab, UNK=UNK, vocab_size=vocab_size)

with open("vocab_CNN.pkl", mode="wb") as out_file:
    pickle.dump(extras, out_file)

print("Saved vocab_CNN.pkl")

Saved vocab_CNN.pkl


## Model definition: Kim (2014)-style TextCNN (kernel sizes 4, 5, 6)

In [10]:
class TextCNN(nn.Module):
    """
    TextCNN in the style of Kim (2014).

    An embedding layer feeds several parallel 1-D convolutions (one per kernel
    size). Each feature map goes through ReLU and global max pooling over the
    time axis; the pooled features are concatenated, passed through dropout and
    projected to class logits by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        num_classes: number of output logits.
        kernel_sizes: widths of the convolution filters (default 4, 5, 6).
        num_filters: output channels per kernel size (default 100).
        dropout: dropout probability applied to the pooled features.

    Raises:
        ValueError: if `kernel_sizes` is empty.
    """
    def __init__(self, vocab_size, embed_dim=100, num_classes=2,
                 kernel_sizes=(4, 5, 6), num_filters=100, dropout=0.5):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        # Sub-modules are created in the same order as before so that parameter
        # names (embed, convs.N, fc) and initialisation order are unchanged.
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
        self.dropout = nn.Dropout(dropout)
        # Shortest sequence every convolution can process.
        self.min_len = max(kernel_sizes)

    def forward(self, x):
        """
        x: (B, T) integer token ids
        Output: logits (B, num_classes)
        """
        # Conv1d raises when T is smaller than its kernel size, so right-pad
        # short batches with id 0 (the same fill value pad_batch uses) up to
        # the widest kernel. Longer inputs pass through untouched.
        deficit = self.min_len - x.size(1)
        if deficit > 0:
            x = nn.functional.pad(x, (0, deficit), value=0)

        # (B, T, E) -> (B, E, T): Conv1d expects channels before time.
        emb = self.embed(x).transpose(1, 2)

        # ReLU then max over the time axis gives one (B, num_filters) tensor
        # per kernel size.
        pooled = [torch.relu(conv(emb)).max(dim=2)[0] for conv in self.convs]

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)

## Training setup

In [11]:
# Number of examples per mini-batch, used by training and evaluation alike.
batch_size = 32
# Seed PyTorch's RNG before the model is built so that weight initialisation,
# the randperm shuffle in iterate_batches and the dropout masks repeat on a
# fresh Restart & Run All. (Some CUDA kernels may still be non-deterministic.)
torch.manual_seed(42)
model = TextCNN(vocab_size=vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# CrossEntropyLoss takes raw logits and integer class labels.
criterion = nn.CrossEntropyLoss()

def iterate_batches(X, y):
  """
  Yield (padded_inputs, labels) mini-batches in a fresh random order.

  X is a list of id tensors and y a label tensor of the same length. The
  padded inputs are yielded on CPU, while the labels are already moved to
  `device`. Batch size comes from the module-level `batch_size`.
  """
  order = torch.randperm(len(X))
  for start in range(0, len(X), batch_size):
      picked = order[start:start + batch_size]
      sequences = [X[k] for k in picked]
      targets = y[picked]
      yield pad_batch(sequences), targets.to(device)

## Train TextCNN

In [12]:
# Train for 5 epochs. The value printed per epoch is the SUM of the per-batch
# losses, not their mean.
for epoch in range(5):
    model.train()  # training mode (enables dropout)
    running_loss = 0.0
    for inputs_cpu, targets in iterate_batches(X_train, y_train):
        # iterate_batches yields inputs on CPU; labels are already on `device`.
        inputs = inputs_cpu.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {running_loss:.4f}")

Epoch 1 Loss: 488.3538
Epoch 2 Loss: 362.0740
Epoch 3 Loss: 296.5331
Epoch 4 Loss: 238.5503
Epoch 5 Loss: 186.3773


## Model Evaluation

In [13]:
# Accuracy on the test split, scored in order in chunks of `batch_size`.
model.eval()  # evaluation mode (disables dropout)
correct, total = 0, 0
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        inputs = pad_batch(X_test[start:stop]).to(device)
        preds = model(inputs).argmax(dim=1)
        labels = y_test[start:stop].to(device)
        correct += (preds == labels).sum().item()
        total += len(labels)

print(f"Test Accuracy: {correct/total*100:.4f}")

Test Accuracy: 86.8057


In [14]:
import torch.nn.functional as F

## Saving IMDB predictions to CSV

In [15]:
# Per-example accumulators, filled in test-set order.
all_preds, all_probs, all_labels, all_texts = [], [], [], []

model.eval()
with torch.no_grad():
    for start in range(0, len(X_test), batch_size):
        stop = start + batch_size
        inputs = pad_batch(X_test[start:stop]).to(device)

        # Softmax turns the two logits into class probabilities; the
        # predicted label is the more probable class.
        probs = F.softmax(model(inputs), dim=1)
        preds = probs.argmax(dim=1)

        all_preds += preds.cpu().tolist()
        all_probs += probs.cpu().tolist()
        all_labels += y_test[start:stop].tolist()
        all_texts += test_texts[start:stop]

# One row per test review: text, gold label, predicted label and the
# probability of each class (index 0 -> prob_neg, index 1 -> prob_pos).
df = pd.DataFrame({
    "text": all_texts,
    "true_label": all_labels,
    "pred_label": all_preds,
    "prob_neg": [row[0] for row in all_probs],
    "prob_pos": [row[1] for row in all_probs],
})

In [16]:
# Display the predictions frame; the rendered output elides the middle rows.
df

Unnamed: 0,text,true_label,pred_label,prob_neg,prob_pos
0,This sci-fi great fortunately has little to do...,1,1,0.001016,0.998984
1,Yet another recent comedy that shows that Holl...,0,0,0.998896,0.001103
2,I was looking on Imdbs bottom 100 because i th...,0,0,0.999932,0.000068
3,Bo Derek will not go down in history as a grea...,0,0,0.959063,0.040936
4,"During the Civil War, there were many cases of...",0,1,0.062779,0.937221
...,...,...,...,...,...
24786,this movie makes me laugh by even just thinkin...,1,1,0.393072,0.606927
24787,The plot is about the death of little children...,0,0,0.998794,0.001206
24788,"""Challenge to be Free"" was one of the first fi...",1,1,0.001717,0.998283
24789,This film has all the earmarks of too many coo...,0,0,0.503509,0.496491


In [17]:
# Write the IMDB test predictions to CSV, leaving out the row index.
df.to_csv("imdb_cnn_test_predictions.csv", index=False)

## Load EEC

In [19]:
# Load the EEC sentences from the uploaded parquet file. The path is relative
# to the working directory, matching how train.csv and test.csv are read,
# rather than hard-coding Colab's /content directory.
eec = pd.read_parquet("0000.parquet")
eec.head()

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed


In [20]:
import torch.nn.functional as F

## Sentiment prediction function for EEC

In [21]:
def predict_sentiment(sentences=None):
  """
  Predict sentiment probabilities for a list of sentences.

  Args:
      sentences: list of raw strings to score. When omitted, the
          module-level `texts` list is used, so the original zero-argument
          call keeps working.

  Returns:
      CPU tensor of softmax probabilities with one row per sentence and one
      column per class.
  """
  if sentences is None:
      sentences = texts
  encoded = [encode(t) for t in sentences]
  # All sentences are scored as one batch, padded to the longest sentence.
  padded = pad_batch(encoded).to(device)

  # Force eval mode so dropout is off regardless of which cells ran before,
  # then restore whatever mode the model was in.
  was_training = model.training
  model.eval()
  try:
      with torch.no_grad():
          logits = model(padded)
          probs = F.softmax(logits, dim=1)
  finally:
      model.train(was_training)
  return probs.cpu()

## Apply Model to EEC

In [22]:
# predict_sentiment() is called without arguments and reads the module-level
# `texts` list, so `texts` must be assigned before the call.
texts = eec["sentence"].tolist()
probs = predict_sentiment()

In [23]:
# Attach the two probability columns to the EEC frame: column 0 of `probs`
# becomes neg_prob and column 1 becomes pos_prob.
eec = eec.assign(
    neg_prob=probs[:, 0].numpy(),
    pos_prob=probs[:, 1].numpy(),
)

eec.head()

Unnamed: 0,sentence,template,person,gender,race,emotion,emotion word,neg_prob,pos_prob
0,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,0.516,0.484
1,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,0.521424,0.478576
2,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,0.609735,0.390265
3,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,0.504742,0.495258
4,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,0.567777,0.432223


In [24]:
# Write the EEC frame, including the added probability columns, to CSV
# without the row index.
eec.to_csv("eec_predictions_cnn.csv", index=False)