In [None]:
import pandas as pd, numpy as np, torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
!pip install safetensors



In [None]:
from google.colab import files; files.upload()

Saving emotion_predictions.csv to emotion_predictions.csv
Saving Headlines_22Apr_16May_withThemes.csv to Headlines_22Apr_16May_withThemes.csv
Saving Peacetime news.xlsx to Peacetime news.xlsx
Saving config.json to config.json
Saving model.safetensors to model.safetensors
Saving special_tokens_map.json to special_tokens_map.json
Saving tokenizer_config.json to tokenizer_config.json
Saving vocab.txt to vocab.txt


In [None]:
df = pd.read_excel("Model Training Data.xlsx", sheet_name="Themes")  # Sheet named 'Themes'
text_col = "Headline"

theme_cols = [
    "Nationalism", "Religion", "Violence", "Conspiracy",
    "Political figures", "Statistics", "History/Mythology",
    "Other countries/Global perception", "Hero worship", "Gender/Sex",
    "Humour", "Bullying/Targeted Harassment"
]

assert df[text_col].notnull().all()
assert df[theme_cols].notnull().all().all()

In [None]:
# ========= 1. SPLIT =========
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[text_col].tolist(),
    df[theme_cols].values.tolist(),
    test_size=0.2,
    random_state=42
)

# ========= 2. TOKENISE =========
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tok(x):
    return tokenizer(x, padding=True, truncation=True, max_length=64, return_tensors="pt")

train_enc = tok(train_texts)
val_enc = tok(val_texts)

class TDS(Dataset):
    def __init__(self, enc, lab):
        self.enc = enc
        self.lab = torch.tensor(lab, dtype=torch.float32)

    def __len__(self):
        return len(self.lab)

    def __getitem__(self, i):
        item = {k: v[i] for k, v in self.enc.items()}
        item["labels"] = self.lab[i]
        return item

train_loader = DataLoader(TDS(train_enc, train_labels), batch_size=16, shuffle=True)
val_loader = DataLoader(TDS(val_enc, val_labels), batch_size=16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(theme_cols),
    problem_type="multi_label_classification"
).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
opt = AdamW(model.parameters(), lr=5e-5)
model.train()
for epoch in range(3):
    loop = tqdm(train_loader)
    for batch in loop:
        opt.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        opt.step()
        loop.set_description(f"epoch {epoch}")
        loop.set_postfix(loss=loss.item())

epoch 0: 100%|██████████| 51/51 [04:00<00:00,  4.71s/it, loss=0.452]
epoch 1: 100%|██████████| 51/51 [03:58<00:00,  4.68s/it, loss=0.372]
epoch 2: 100%|██████████| 51/51 [03:54<00:00,  4.60s/it, loss=0.296]


In [None]:
model.eval()
all_logits = []
all_lab = []
with torch.no_grad():
    for batch in val_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        logits = model(**inputs).logits.cpu().numpy()
        labels = batch["labels"].cpu().numpy()
        all_logits.append(logits)
        all_lab.append(labels)
all_logits = np.vstack(all_logits)
all_lab = np.vstack(all_lab)

from sklearn.metrics import f1_score

best_thr = {}
for i, col in enumerate(theme_cols):
    best_f1 = 0
    best_threshold = 0.5
    for t in np.arange(0.1, 0.9, 0.05):
        preds = (1 / (1 + np.exp(-all_logits[:, i])) >= t).astype(int)
        f1 = f1_score(all_lab[:, i], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t
    best_thr[col] = best_threshold

print("Optimized thresholds:", best_thr)


Optimized thresholds: {'Nationalism': np.float64(0.25000000000000006), 'Religion': np.float64(0.45000000000000007), 'Violence': np.float64(0.5000000000000001), 'Conspiracy': np.float64(0.15000000000000002), 'Political figures': np.float64(0.15000000000000002), 'Statistics': np.float64(0.15000000000000002), 'History/Mythology': np.float64(0.25000000000000006), 'Other countries/Global perception': np.float64(0.3500000000000001), 'Hero worship': np.float64(0.1), 'Gender/Sex': np.float64(0.20000000000000004), 'Humour': np.float64(0.1), 'Bullying/Targeted Harassment': np.float64(0.1)}


In [None]:
model.save_pretrained("theme_model")
tokenizer.save_pretrained("theme_model")
np.save("theme_thresholds.npy", best_thr)

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

# Load model and tokenizer
model_path = "theme_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()

# Load thresholds
thresholds = np.load("theme_thresholds.npy", allow_pickle=True).item()

# Theme columns (must match training order)
theme_cols = [
    "Nationalism", "Religion", "Violence", "Conspiracy",
    "Political figures", "Statistics", "History/Mythology",
    "Other countries/Global perception", "Hero worship", "Gender/Sex",
    "Humour", "Bullying/Targeted Harassment"
]

# Prediction function
def predict_themes(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        logits = model(**inputs).logits[0].numpy()
    probs = 1 / (1 + np.exp(-logits))
    return {
        theme: int(prob >= thresholds[theme])
        for theme, prob in zip(theme_cols, probs)
    }


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np

# Load model, tokenizer, thresholds
model = BertForSequenceClassification.from_pretrained("theme_model")
tokenizer = BertTokenizer.from_pretrained("theme_model")
thresholds = np.load("theme_thresholds.npy", allow_pickle=True).item()
model.eval()

# Load headlines
df = pd.read_excel("Peacetime news.xlsx", header=None)
df.columns = ["Headline"]

# Tokenize in batches
batch_size = 32
all_preds = []

for i in range(0, len(df), batch_size):
    batch_texts = df["Headline"][i:i+batch_size].fillna("").astype(str).tolist()
    encodings = tokenizer(batch_texts, padding=True, truncation=True, max_length=64, return_tensors="pt")

    with torch.no_grad():
        logits = model(**encodings).logits.numpy()

    # Apply sigmoid and thresholds
    probs = 1 / (1 + np.exp(-logits))
    preds = {key: probs[:, idx]
         for idx, key in enumerate(thresholds.keys())}
    batch_df = pd.DataFrame(preds)
    all_preds.append(batch_df)

# Combine everything
theme_df = pd.concat(all_preds, ignore_index=True)
output_df = pd.concat([df, theme_df], axis=1)
output_df.to_excel("Peacetime_with_themes.xlsx", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: theme_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from google.colab import files
files.download("Peacetime_with_themes.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive
