In [4]:
import pandas as pd

df = pd.read_excel("/content/Model Training Data.xlsx")

print("Columns:", df.columns.tolist())
print("Data sample:\n", df.head())
print("Missing values:\n", df.isnull().sum())


FileNotFoundError: [Errno 2] No such file or directory: '/content/Model Training Data.xlsx'

In [None]:
from sklearn.model_selection import train_test_split

text_col = "Headline"
emotion_cols = ["Anger", "Fear", "Joy", "Sadness", "Trust", "Surprise",
                "Disgust", "Anticipation", "Nostalgia", "Pride", "Shame", "Vindication"]

# Confirm no missing data for relevant columns
assert df[text_col].notnull().all(), "Missing headlines found"
assert df[emotion_cols].notnull().all().all(), "Missing labels found"

# Split train and validation (80/20)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[text_col].tolist(),
    df[emotion_cols].values.tolist(),
    test_size=0.2,
    random_state=42
)


In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=64, return_tensors="pt")

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

NameError: name 'train_texts' is not defined

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [None]:
from transformers import BertForSequenceClassification

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=12,
    problem_type="multi_label_classification"
)
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
from torch.optim import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())


Epoch 0: 100%|██████████| 51/51 [05:18<00:00,  6.25s/it, loss=0.57]
Epoch 1: 100%|██████████| 51/51 [05:01<00:00,  5.92s/it, loss=0.453]
Epoch 2: 100%|██████████| 51/51 [04:54<00:00,  5.78s/it, loss=0.375]


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

model.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].cpu().numpy()
        outputs = model(**inputs)
        logits = outputs.logits.cpu().numpy()

        # Sigmoid to get probabilities for multi-label
        probs = 1 / (1 + np.exp(-logits))

        # Threshold at 0.5 to get predicted labels
        preds = (probs >= 0.5).astype(int)

        all_labels.append(labels)
        all_preds.append(preds)

all_labels = np.vstack(all_labels)
all_preds = np.vstack(all_preds)

# Calculate and print metrics for each emotion
for i, emotion in enumerate(emotion_cols):
    f1 = f1_score(all_labels[:, i], all_preds[:, i], zero_division=0)
    prec = precision_score(all_labels[:, i], all_preds[:, i], zero_division=0)
    rec = recall_score(all_labels[:, i], all_preds[:, i], zero_division=0)
    print(f"{emotion} — F1: {f1:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}")


Anger — F1: 0.681, Precision: 0.706, Recall: 0.658
Fear — F1: 0.773, Precision: 0.746, Recall: 0.802
Joy — F1: 0.609, Precision: 0.661, Recall: 0.565
Sadness — F1: 0.667, Precision: 0.662, Recall: 0.671
Trust — F1: 0.765, Precision: 0.688, Recall: 0.863
Surprise — F1: 0.061, Precision: 0.667, Recall: 0.032
Disgust — F1: 0.308, Precision: 0.471, Recall: 0.229
Anticipation — F1: 0.000, Precision: 0.000, Recall: 0.000
Nostalgia — F1: 0.000, Precision: 0.000, Recall: 0.000
Pride — F1: 0.290, Precision: 0.455, Recall: 0.213
Shame — F1: 0.644, Precision: 0.636, Recall: 0.651
Vindication — F1: 0.590, Precision: 0.593, Recall: 0.587


In [None]:
all_logits = []
all_labels = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].cpu().numpy()
        outputs = model(**inputs)
        logits = outputs.logits.cpu().numpy()

        all_logits.append(logits)
        all_labels.append(labels)

all_logits = np.vstack(all_logits)
all_labels = np.vstack(all_labels)


In [None]:
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.1, 0.9, 0.05)
best_thresholds = {}

for i, emotion in enumerate(emotion_cols):
    best_f1 = 0
    best_thresh = 0.5  # default
    for thresh in thresholds:
        preds_thresh = ((1 / (1 + np.exp(-all_logits[:, i]))) >= thresh).astype(int)
        f1 = f1_score(all_labels[:, i], preds_thresh, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    best_thresholds[emotion] = best_thresh
    print(f"{emotion}: Best threshold = {best_thresh:.2f} with F1 = {best_f1:.3f}")


Anger: Best threshold = 0.50 with F1 = 0.681
Fear: Best threshold = 0.45 with F1 = 0.798
Joy: Best threshold = 0.25 with F1 = 0.667
Sadness: Best threshold = 0.50 with F1 = 0.667
Trust: Best threshold = 0.50 with F1 = 0.765
Surprise: Best threshold = 0.15 with F1 = 0.498
Disgust: Best threshold = 0.25 with F1 = 0.495
Anticipation: Best threshold = 0.10 with F1 = 0.227
Nostalgia: Best threshold = 0.10 with F1 = 0.385
Pride: Best threshold = 0.15 with F1 = 0.503
Shame: Best threshold = 0.40 with F1 = 0.663
Vindication: Best threshold = 0.40 with F1 = 0.676


In [None]:
best_thresholds = {
    'Anger': 0.50,
    'Fear': 0.45,
    'Joy': 0.25,
    'Sadness': 0.50,
    'Trust': 0.50,
    'Surprise': 0.15,
    'Disgust': 0.25,
    'Anticipation': 0.10,
    'Nostalgia': 0.10,
    'Pride': 0.15,
    'Shame': 0.40,
    'Vindication': 0.40
}


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

all_preds = np.zeros_like(all_labels)

for i, emotion in enumerate(emotion_cols):
    thresh = best_thresholds[emotion]
    probs = 1 / (1 + np.exp(-all_logits[:, i]))  # sigmoid to get probabilities
    preds = (probs >= thresh).astype(int)
    all_preds[:, i] = preds

    f1 = f1_score(all_labels[:, i], preds, zero_division=0)
    precision = precision_score(all_labels[:, i], preds, zero_division=0)
    recall = recall_score(all_labels[:, i], preds, zero_division=0)

    print(f"{emotion} — F1: {f1:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}")


Anger — F1: 0.681, Precision: 0.706, Recall: 0.658
Fear — F1: 0.798, Precision: 0.746, Recall: 0.858
Joy — F1: 0.667, Precision: 0.589, Recall: 0.768
Sadness — F1: 0.667, Precision: 0.662, Recall: 0.671
Trust — F1: 0.765, Precision: 0.688, Recall: 0.863
Surprise — F1: 0.498, Precision: 0.341, Recall: 0.921
Disgust — F1: 0.495, Precision: 0.365, Recall: 0.771
Anticipation — F1: 0.227, Precision: 0.130, Recall: 0.885
Nostalgia — F1: 0.385, Precision: 0.286, Recall: 0.588
Pride — F1: 0.503, Precision: 0.370, Recall: 0.787
Shame — F1: 0.663, Precision: 0.598, Recall: 0.744
Vindication — F1: 0.676, Precision: 0.583, Recall: 0.804


In [2]:
def predict_emotions(headline):
    model.eval()
    inputs = tokenizer(headline, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits.cpu().numpy()[0]
        probs = 1 / (1 + np.exp(-logits))  # sigmoid

    results = {}
    for i, emotion in enumerate(emotion_cols):
        if probs[i] >= best_thresholds[emotion]:
            results[emotion] = round(probs[i], 3)

    return results


In [None]:
predict_emotions("India attacks Pakistan's nuclear base")


{'Anger': np.float32(0.745),
 'Fear': np.float32(0.678),
 'Sadness': np.float32(0.697),
 'Surprise': np.float32(0.342),
 'Disgust': np.float32(0.285),
 'Anticipation': np.float32(0.147),
 'Pride': np.float32(0.267),
 'Shame': np.float32(0.572),
 'Vindication': np.float32(0.707)}

In [None]:
!pip install -q transformers

from transformers import pipeline

goemotion = pipeline("text-classification",
                     model="bhadresh-savani/distilbert-base-uncased-emotion",
                     return_all_scores=True)

def compare_with_goemotions(headline):
    result = goemotion(headline)[0]
    sorted_result = sorted(result, key=lambda x: x['score'], reverse=True)
    return [(r['label'], round(r['score'], 3)) for r in sorted_result if r['score'] > 0.2]


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
compare_with_goemotions("India attacks Pakistan's nuclear base")


[('fear', 0.613), ('anger', 0.35)]

In [3]:
model.save_pretrained("emotion_model")
tokenizer.save_pretrained("emotion_model")


NameError: name 'model' is not defined

In [None]:
!zip -r emotion_model.zip emotion_model


  adding: emotion_model/ (stored 0%)
  adding: emotion_model/tokenizer_config.json (deflated 75%)
  adding: emotion_model/model.safetensors (deflated 7%)
  adding: emotion_model/config.json (deflated 58%)
  adding: emotion_model/special_tokens_map.json (deflated 42%)
  adding: emotion_model/vocab.txt (deflated 53%)


In [None]:
from google.colab import files
files.download("emotion_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
# ---- 1. Upload the Excel file ----
from google.colab import files, drive
import pandas as pd, io, os

# if you prefer Drive, uncomment the next two lines and mount once
# drive.mount('/content/drive')
# path = '/content/drive/My Drive/Dissertation/Headlines 22 Apr to 16 May.xlsx'

uploaded = files.upload()  # pick “Headlines 22 Apr to 16 May.xlsx” in the dialog
fname = next(iter(uploaded))               # grab the uploaded filename

# ---- 2. Load headlines ----
df = pd.read_excel(io.BytesIO(uploaded[fname]))  # or pd.read_excel(path) if using Drive
if "Headline" not in df.columns:
    raise ValueError("Column called 'Headline' not found.")
headlines = df["Headline"].tolist()

# ---- 3. Run predictions ----
records = [predict_emotions(h) for h in headlines]
emotion_df = pd.DataFrame(records).fillna(0)
emotion_df.insert(0, "Headline", headlines)

# ---- 4. Save and offer download ----
out_name = "headlines_22Apr_16May_emotions.csv"
emotion_df.to_csv(out_name, index=False)
files.download(out_name)  # triggers browser download
print(f"{len(emotion_df)} headlines processed → {out_name}")


Saving Headlines 22 Apr to 16 May.xlsx to Headlines 22 Apr to 16 May.xlsx


NameError: name 'predict_emotions' is not defined