**Import Libraries and Data**

In [None]:
!pip install transformers datasets scikit-learn
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Read the training and validation splits from CSV files in the working directory
src_llm_train, src_llm_validation = (
    pd.read_csv(csv_path)
    for csv_path in ('src_llm_train.csv', 'src_llm_validation.csv')
)


**Tokenise the Data**

In [None]:
# Fix the NumPy and PyTorch RNG seeds before anything random is created
seed_value = 12345
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# Tokenizer belonging to the `bert-base-uncased` checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Learn the category -> integer mapping from the training split only,
# then reuse that same mapping for the validation split.
# NOTE: the 'Category' columns are overwritten with the integer codes, so this
# cell must not be re-run without first reloading the CSV files.
label_encoder = LabelEncoder()
train_codes = label_encoder.fit_transform(src_llm_train['Category'])
src_llm_train['Category'] = train_codes
val_codes = label_encoder.transform(src_llm_validation['Category'])
src_llm_validation['Category'] = val_codes

# Define a function to tokenize and encode data
def preprocess_function(examples):
    """Tokenize a batch of articles and attach their encoded labels.

    Args:
        examples: batch mapping with an 'Article' entry (the texts) and a
            'Category' entry (the already-encoded labels).

    Returns:
        The tokenizer output for the articles, padded to the tokenizer's
        maximum length and truncated where longer, with an extra 'label'
        entry copied from ``examples['Category']``.
    """
    encoded_batch = tokenizer(
        examples['Article'],
        truncation=True,
        padding="max_length",
    )
    encoded_batch['label'] = examples['Category']
    return encoded_batch

# Wrap each DataFrame as a Hugging Face Dataset and tokenize it batch-wise
train_dataset = Dataset.from_pandas(src_llm_train).map(preprocess_function, batched=True)
val_dataset = Dataset.from_pandas(src_llm_validation).map(preprocess_function, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors
for _split in (train_dataset, val_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

In [None]:
# Define the model: one output logit per category known to the label encoder
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword is no longer accepted by recent ones. The pip
# install above is unpinned, so pick whichever name the installed version takes.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',             # Checkpoints are written here
    learning_rate=1e-5,                 # Peak learning rate
    lr_scheduler_type='cosine',         # Cosine decay after warmup
    num_train_epochs=20,                # Number of training epochs
    per_device_train_batch_size=4,      # Batch size for training
    per_device_eval_batch_size=4,       # Batch size for evaluation
    # NOTE(review): with 146 training examples, batch size 4 and 20 epochs a
    # single-device run has roughly 740 optimiser steps, so a 500-step warmup
    # covers most of training — confirm this is intended.
    warmup_steps=500,                   # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # Strength of weight decay
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=10,                   # Log the training loss every 10 steps
    save_strategy="epoch",              # Save a checkpoint after each epoch
    load_best_model_at_end=True,        # Restore the best checkpoint when training ends
    metric_for_best_model="accuracy",   # "Best" = highest validation accuracy
    **{eval_strategy_kwarg: "epoch"},   # Evaluate after each epoch
)

# Define accuracy metric
def compute_metrics(pred):
    """Compute validation metrics from a Trainer prediction object.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". The last three
        are support-weighted averages over the classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    # zero_division=0 is passed to all three metrics, not just precision: when
    # a class is never predicted (or never present) the score is reported as 0
    # either way, but without the flag recall/F1 emit UndefinedMetricWarning.
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Training the model and validation results**

In [None]:
# Wire the model, both data splits and the metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Score the resulting model on the validation split and show the metrics
eval_results = trainer.evaluate()
print("Validation results:", eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4188,1.390058,0.269663,0.073544,0.269663,0.11557
2,1.4286,1.384573,0.269663,0.072718,0.269663,0.114547
3,1.3185,1.376765,0.269663,0.072718,0.269663,0.114547
4,1.2791,1.364834,0.292135,0.265401,0.292135,0.156824
5,1.2328,1.323616,0.325843,0.169796,0.325843,0.195118
6,1.196,1.337197,0.314607,0.152432,0.314607,0.181475
7,1.1414,1.34462,0.325843,0.158985,0.325843,0.200032
8,1.1473,1.222067,0.41573,0.352663,0.41573,0.303496
9,0.9011,1.209097,0.41573,0.391058,0.41573,0.322051
10,0.7948,1.240914,0.460674,0.409982,0.460674,0.37526


Validation results: {'eval_loss': 1.226752519607544, 'eval_accuracy': 0.6404494382022472, 'eval_precision': 0.6624339299901097, 'eval_recall': 0.6404494382022472, 'eval_f1': 0.6344146117641642, 'eval_runtime': 2.6618, 'eval_samples_per_second': 33.436, 'eval_steps_per_second': 8.641, 'epoch': 20.0}


**Run predictions on the test data**

In [None]:
# Look up the checkpoint directory the Trainer recorded as best
# (judged by `metric_for_best_model` in the training arguments) and print it
best_checkpoint_path = trainer.state.best_model_checkpoint
print("Best checkpoint:", best_checkpoint_path)

Best checkpoint: ./results/checkpoint-629


In [None]:
# Reload the best checkpoint recorded by the Trainer rather than hard-coding
# its directory: the checkpoint number depends on the run, so a literal path
# goes stale as soon as training is repeated. The literal from the recorded
# run is kept only as a fallback when the Trainer has no best checkpoint.
model_path = trainer.state.best_model_checkpoint or './results/checkpoint-629'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes raw texts on access (inference only).

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    in this notebook the bare name ``Dataset`` refers to Hugging Face's
    ``datasets.Dataset`` (imported for ``Dataset.from_pandas``), which is not
    the base class a custom dataset fed to a torch ``DataLoader`` should use.

    Args:
        texts: the input texts; a pandas Series (or anything with
            ``.tolist()``) or any other iterable of strings.
        tokenizer: callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_len: length every example is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        # Materialise to a plain list so integer indexing is positional
        self.texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors.
        """
        text = self.texts[idx]
        # Calling the tokenizer directly replaces the deprecated `encode_plus`
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis; flatten it away so the
        # DataLoader can stack examples into (batch, max_len)
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [None]:
# Read the test articles from CSV
test_data = pd.read_csv('src_official_test.csv')
test_texts = test_data['Article']

# Wrap the articles so each one is tokenized to 512 positions when accessed
test_dataset = TestDataset(test_texts, tokenizer, 512)

In [None]:
# Create a DataLoader for the test data. Shuffling is left at its default
# (off) so the predictions stay in the same order as the rows of `test_data`.
test_loader = DataLoader(test_dataset, batch_size=4)

# Run inference on the GPU when one is available: a model loaded with
# `from_pretrained` is placed on the CPU by default, so both the model and
# each batch have to be moved explicitly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Switch to evaluation mode for inference
model.eval()

predictions = []
with torch.no_grad():  # no gradients needed for prediction
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit
        preds = torch.argmax(outputs.logits, dim=-1)
        predictions.extend(preds.cpu().numpy())

In [None]:
# Attach the predicted numeric class ids as a new column; they are in the
# same order as the rows because the test DataLoader does not shuffle
test_data['category_predictions'] = predictions

In [None]:
# Write the test rows plus predictions to CSV, omitting the DataFrame index
test_data.to_csv('test_predictions.csv', index=False)

In [None]:
# Build {numeric label: original category name} from the fitted encoder
label_mapping = {code: category for code, category in enumerate(label_encoder.classes_)}

# Show how each numeric label decodes
for num in label_mapping:
    label = label_mapping[num]
    print(f"Numeric Label: {num} -> Original Category: {label}")

Numeric Label: 0 -> Original Category: Aid
Numeric Label: 1 -> Original Category: Conditions
Numeric Label: 2 -> Original Category: Events
Numeric Label: 3 -> Original Category: Migration
Numeric Label: 4 -> Original Category: Policy
