**Import Libraries and Data**

In [None]:
!pip install transformers datasets scikit-learn
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the training and validation splits from CSV files in the working directory.
src_llm_train, src_llm_validation = (
    pd.read_csv(csv_name)
    for csv_name in ('src_llm_train.csv', 'src_llm_validation.csv')
)

**Tokenise the Data**

In [None]:
# Fix the NumPy and PyTorch random seeds up front, before any object that
# draws random numbers is created.
seed_value = 12345
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to tokenize and encode data
def preprocess_function(examples):
    """Tokenize a batch of articles and attach the classification target.

    Args:
        examples: A batch mapping with an 'Article' entry (the texts to
            tokenize) and a 'Solidarity' entry (copied through as the label).

    Returns:
        The tokenizer output for the batch, with an extra 'label' entry
        holding the 'Solidarity' values unchanged (no label encoding is
        applied here).
    """
    # Truncate long articles and pad short ones so every example has the
    # same length.
    encoded_batch = tokenizer(
        examples['Article'],
        truncation=True,
        padding="max_length",
    )
    encoded_batch['label'] = examples['Solidarity']
    return encoded_batch

# Columns exposed as PyTorch tensors; all other columns are hidden from the
# formatted view.
torch_columns = ('input_ids', 'attention_mask', 'label')

# Wrap each DataFrame in a Hugging Face Dataset and tokenize it in batches.
train_dataset = Dataset.from_pandas(src_llm_train).map(preprocess_function, batched=True)
val_dataset = Dataset.from_pandas(src_llm_validation).map(preprocess_function, batched=True)

# Switch both splits to the torch output format.
train_dataset.set_format(type='torch', columns=list(torch_columns))
val_dataset.set_format(type='torch', columns=list(torch_columns))

In [None]:
# BERT with a two-class classification head (binary task).
num_labels = 2
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',           # checkpoints
    logging_dir='./logs',             # training logs
    logging_steps=10,                 # log every 10 optimisation steps
    # --- optimisation ---
    num_train_epochs=30,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    lr_scheduler_type='cosine',
    warmup_steps=1000,                # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
    # --- evaluation / checkpointing ---
    evaluation_strategy="epoch",      # evaluate after each epoch
    save_strategy="epoch",            # checkpoint after each epoch
    load_best_model_at_end=True,      # restore the best checkpoint when training ends
    metric_for_best_model="accuracy"  # "best" is judged by validation accuracy
)

# Define accuracy metric for binary classification
def compute_metrics(pred):
    """Compute binary-classification metrics for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (the reference labels) and
            `predictions` (per-example scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Use the same zero_division policy for every ratio metric so that an
    # undefined score (e.g. the model predicts a single class) is reported
    # as 0 consistently instead of only for precision.
    binary_kwargs = {"average": "binary", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, **binary_kwargs),
        "recall": recall_score(labels, preds, **binary_kwargs),
        "f1": f1_score(labels, preds, **binary_kwargs),
    }

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Training the model and validation results**

In [None]:
# Bundle the model, its hyper-parameters, both data splits and the metric
# callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune on the training split.
trainer.train()

# Score the resulting model on the validation split and report the metrics.
eval_results = trainer.evaluate()
print("Validation results:", eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6323,0.693518,0.550562,0.545455,1.0,0.705882
2,0.7048,0.690844,0.550562,0.545455,1.0,0.705882
3,0.7132,0.686685,0.550562,0.545455,1.0,0.705882
4,0.7152,0.68725,0.550562,0.545455,1.0,0.705882
5,0.6731,0.684631,0.550562,0.545455,1.0,0.705882
6,0.6317,0.678788,0.550562,0.547619,0.958333,0.69697
7,0.6408,0.674332,0.58427,0.571429,0.916667,0.704
8,0.5845,0.664584,0.595506,0.642857,0.5625,0.6
9,0.532,0.655522,0.595506,0.636364,0.583333,0.608696
10,0.432,0.669546,0.617978,0.609375,0.8125,0.696429


Validation results: {'eval_loss': 1.8556827306747437, 'eval_accuracy': 0.6404494382022472, 'eval_precision': 0.7666666666666667, 'eval_recall': 0.4791666666666667, 'eval_f1': 0.5897435897435898, 'eval_runtime': 2.8068, 'eval_samples_per_second': 31.708, 'eval_steps_per_second': 8.194, 'epoch': 30.0}


**Run predictions on test data**

In [None]:
# Look up (not load) the checkpoint directory that the Trainer recorded as the
# best one during training, and print it for reference.
best_checkpoint_path = trainer.state.best_model_checkpoint
print("Best checkpoint:", best_checkpoint_path)

Best checkpoint: ./results/checkpoint-888


In [None]:
# Reload the checkpoint the Trainer recorded as best instead of a hard-coded
# step number: the step count changes with the data size, batch size and
# number of epochs, so a literal path silently goes stale on the next run.
model_path = trainer.state.best_model_checkpoint
if model_path is None:
    raise RuntimeError(
        "No best checkpoint was recorded by the Trainer; run training before loading the model."
    )
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

In [None]:
# Import the torch Dataset under an alias: a bare `Dataset` import would
# rebind the notebook-level `Dataset` name (Hugging Face `datasets.Dataset`)
# and break the earlier `Dataset.from_pandas(...)` cell when it is re-run.
from torch.utils.data import DataLoader, Dataset as TorchDataset

class TestDataset(TorchDataset):
    """Map-style dataset that tokenizes unlabeled texts on access.

    Args:
        texts: The texts to classify. Anything with a `.tolist()` method
            (e.g. a pandas Series or NumPy array) or any other iterable.
        tokenizer: Tokenizer exposing `encode_plus`.
        max_len: Length every example is padded / truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        # Materialise the texts as a plain list so integer indexing is
        # positional regardless of the input container's own index.
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position `idx`.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors.
        """
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened so the DataLoader can stack
        # examples into a (batch, max_len) tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [None]:
# Read the test split and pick out the column holding the article text
# (change 'Article' if your file uses a different column name).
test_data = pd.read_csv('src_official_test2.csv')
test_texts = test_data['Article']

# Wrap the texts so they are tokenized on access, padded / truncated to
# 512 tokens (adjust to your needs).
test_dataset = TestDataset(test_texts, tokenizer, 512)

In [None]:
# Batch the test examples; the DataLoader is not asked to shuffle, so the
# predictions come out in the same order as the rows of the test data.
test_loader = DataLoader(test_dataset, batch_size=4)

# Run inference on the GPU when one is available: the freshly reloaded model
# and the batches are otherwise left on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Switch to evaluation mode before predicting.
model.eval()

predictions = []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=-1)
        # Bring the results back to the CPU and store them as plain ints.
        predictions.extend(preds.cpu().tolist())

In [None]:
# Store the predicted class for each test row in a new column of the test frame.
test_data['solidarity_predictions'] = predictions

In [None]:
# Write the test data, including the prediction column, to CSV without the
# DataFrame index.
test_data.to_csv('src_llm_solidarity_predictions.csv', index=False)