In [3]:
import json
import os
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score
import torch
from transformers import EvalPrediction, pipeline

# Utils

Helper classes and functions for our task

In [6]:
# Set the WANDB_DISABLED environment variable so the Weights & Biases integration stays off.
os.environ.update({"WANDB_DISABLED": "true"})

In [4]:
class DisinformationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Attributes:
        encodings: mapping from field name to a per-example indexable sequence.
        labels: per-example indexable sequence of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Turn the idx-th entry of every encoding field into a tensor,
        # then attach the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def load_and_process_data(file_path: str, label_column: str = "label") -> pd.DataFrame:
    """
    Loads the data from a CSV file and converts the label column to binary labels.

    A label becomes 1 when its text contains "fake" (case-insensitive) and 0 otherwise.

    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
    Returns:
        pd.DataFrame: The loaded dataframe with `label_column` replaced by 0/1 integers.
    Raises:
        KeyError: If `label_column` is not a column of the CSV file.
        ValueError: If any value in `label_column` is not a string (e.g. a missing label).
    """
    data = pd.read_csv(file_path, encoding='utf-8')
    raw_labels = data[label_column]

    # Missing labels are read as NaN (a float) and would otherwise fail deep inside
    # `.lower()`; report them explicitly instead of guessing a class for them.
    is_text = raw_labels.map(lambda value: isinstance(value, str))
    if not is_text.all():
        bad_rows = [row for row, ok in zip(raw_labels.index, is_text) if not ok]
        raise ValueError(
            f"Column '{label_column}' in {file_path} has non-string labels at rows: {bad_rows}"
        )

    data[label_column] = raw_labels.map(lambda value: 1 if "fake" in value.lower() else 0)
    return data


def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file, creating missing parent directories first.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics. May be a bare
            filename, in which case the file is written to the current directory.
    """
    # os.path.dirname returns '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one to create.
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(metrics, output_file, indent=4)

In [None]:
def compute_metrics(pred=None, y_true=None, y_pred=None):
    """
    Computes the F1 score for either Trainer evaluation output or plain label sequences.

    If `pred` is provided, the labels come from `pred.label_ids` and the predicted classes
    from the argmax over the last axis of `pred.predictions`; `y_true` and `y_pred` are
    ignored in that case. Otherwise both `y_true` and `y_pred` must be provided.

    Parameters:
        - pred (EvalPrediction, optional): The evaluation prediction object for Trainer.
        - y_true (list, optional): The ground truth labels for the test data.
        - y_pred (list, optional): The predicted labels for the test data.

    Returns:
        - dict: A dictionary with a single key 'f1' holding the score returned by
          `f1_score` with its default settings.

    Raises:
        - ValueError: If neither `pred` nor both `y_true` and `y_pred` are provided.
    """
    if pred is not None:
        # When working with the Trainer, pred is an EvalPrediction object
        labels = pred.label_ids
        logits = pred.predictions
        # Some models hand back a tuple of outputs instead of a single array;
        # assumes the logits are the first element in that case — TODO confirm per model.
        if isinstance(logits, tuple):
            logits = logits[0]
        y_pred = logits.argmax(-1)
    elif y_true is not None and y_pred is not None:
        # If y_true and y_pred are provided, use them for test evaluation
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

    # Compute the F1 score
    f1 = f1_score(y_true=labels, y_pred=y_pred)

    return {
        'f1': f1
    }
    
def compute_metrics_for_trainer(pred: EvalPrediction):
    """Single-argument adapter around `compute_metrics`, used as the Trainer's metrics callback."""
    trainer_metrics = compute_metrics(pred=pred)
    return trainer_metrics

# Fine Tuning BERT for Disinformation Detection

The source for this lab session is the [Hugging Face documentation and learning materials](https://huggingface.co/learn/llm-course/chapter3/1?fw=pt).

The Hugging Face Transformers library provides a Trainer class to help you fine-tune any of its pretrained models on your own dataset. Only a few steps are needed: prepare the data, then define the Trainer. The hardest part is likely to be preparing the environment to run Trainer.train(), as it will run very slowly on a CPU. If you don’t have a GPU set up, you can get access to free GPUs or TPUs on Google Colab.


In [None]:
# URLs of the raw ECTF test, train and validation CSV files hosted on GitHub.

url_test = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/ECTF/test.csv'
url_train = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/ECTF/train.csv'
url_valid = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/ECTF/validation.csv'

# Download each file with the wget shell command (IPython `!` syntax; `{name}` interpolates
# the Python variable). `-O` sets the local filename, so the files land in the working
# directory as 'test.csv', 'train.csv' and 'validation.csv'.

!wget -O test.csv {url_test}
!wget -O train.csv {url_train}
!wget -O validation.csv {url_valid}

In [None]:
# Read the training and validation splits with 'load_and_process_data', which loads each
# CSV file, processes its labels and returns a dataframe.
train_data, validation_data = (
    load_and_process_data(csv_path) for csv_path in ('train.csv', 'validation.csv')
)

What is BERT?
BERT (Bidirectional Encoder Representations from Transformers) is a powerful pre-trained language model developed by Google. It understands the context of words in a sentence by considering both the left and right context (bidirectional), making it effective for a wide variety of NLP tasks.

Why load the model and tokenizer?

Model: The AutoModelForSequenceClassification class loads a pre-trained version of BERT and enables further fine-tuning for classification tasks. For example, in our case, the model can predict whether a text is "fake" or "real".

Tokenizer: The tokenizer converts raw text into tokens (smaller units of text) that the model can understand. It takes care of breaking down the text into the right format that matches the model's expectations (e.g., splitting the sentence into words or sub-words, and converting these into IDs).

What does "uncased" mean?
The "bert-base-uncased" model is a version of BERT that doesn't distinguish between uppercase and lowercase letters. It treats "Apple" and "apple" the same, which can be helpful for some NLP tasks.

In [None]:
# Class index -> class name, and the inverse mapping derived from it.
id2label = dict(enumerate(("Credible", "Fake")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Checkpoint shared by the model and its tokenizer: the base, uncased version of BERT,
# a transformer-based model pre-trained on a large corpus of text.
bert_checkpoint = 'google-bert/bert-base-uncased'

# AutoModelForSequenceClassification is a generic class; from_pretrained() instantiates the
# matching library model with a sequence classification head. Here it predicts one of
# two labels for a text.
model = AutoModelForSequenceClassification.from_pretrained(
    bert_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# The tokenizer converts raw text into the tokens the model can process.
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

In [None]:
# Tokenization turns the raw text of each split into the input format BERT can process.
# Both splits are encoded with the same settings, so those live in one place.
encoding_options = {'truncation': True, 'padding': True, 'max_length': 256}

# Training split
train_encodings = tokenizer(train_data['content'].tolist(), **encoding_options)

# Validation split
val_encodings = tokenizer(validation_data['content'].tolist(), **encoding_options)

Why do we use truncation?
Text in a dataset can vary in length, and BERT-based models have a maximum input length they can handle (often 512 tokens). In this case, we're setting max_length=256, meaning that if a piece of text is longer than 256 tokens, it will be truncated (cut off) at that length. This ensures the text fits within BERT's expected input size.

Why do we use padding?
Some sentences or documents in the dataset may be shorter than others. To ensure that all inputs to the model are the same length, we use padding. With padding=True, extra padding tokens (ID 0 for BERT) are added to the end of shorter sequences until they match the longest sequence in the tokenized batch, which is at most 256 tokens here because longer texts are truncated.

What is max_length?
The max_length=256 argument defines the maximum number of tokens for each input sequence. This ensures that all inputs are consistent in size, which is necessary for batch processing during model training or evaluation. If a sequence exceeds this length, it will be truncated; if it’s shorter, it will be padded.

In [None]:
# Wrap each split's tokenized text and its labels in a DisinformationDataset so the
# model can consume them during training and evaluation.

# Training split
train_labels = train_data['label'].tolist()
train_dataset = DisinformationDataset(encodings=train_encodings, labels=train_labels)

# Validation split
val_labels = validation_data['label'].tolist()
val_dataset = DisinformationDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# https://huggingface.co/docs/transformers/v4.51.3/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir='output/training/',
    eval_strategy='steps',
    learning_rate=0.00001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    warmup_ratio=0.06,
    weight_decay=0.1,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes the
    # notebook fail on CPU-only machines, so turn it on only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # 'f1' is the key returned by compute_metrics; higher is better.
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=2,
    greater_is_better=True,
    save_strategy='steps',
    # NOTE(review): save_steps is not set, so checkpoints follow the library default
    # interval rather than eval_steps; consider save_steps=eval_steps if every
    # evaluated step should be restorable as the best model — confirm against the docs.
    eval_steps=100,
    save_on_each_node=True,
    report_to=[]
)

trainer = Trainer(
        model=model,  # Pass the actual model instance
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer
    )

1. What are TrainingArguments?

- The TrainingArguments class is used to configure how the model should be trained. It controls aspects like learning rate, batch size, number of epochs, where to save the model, and how to handle evaluations and logging. You can think of it as the “training setup” that specifies how and where training will happen.

2. Key training arguments:

- output_dir: The output directory where the model predictions and checkpoints will be written.
- eval_strategy: Specifies how often the model will be evaluated during training. In this case, evaluation happens at specific intervals (steps).
- learning_rate: Controls how much the model adjusts its parameters with each update. A very small learning rate (0.00001) ensures more gradual changes to avoid overshooting optimal values. It is the initial learning rate for AdamW optimizer.
- per_device_train_batch_size and per_device_eval_batch_size: These set the number of samples in each batch during training and evaluation.
- num_train_epochs: Defines how many times the entire training dataset will be passed through the model. Typically, more epochs lead to better model performance, but too many can lead to overfitting.
- warmup_ratio:  Ratio of total training steps used for a linear warmup from 0 to learning_rate.
- weight_decay: Regularization technique to prevent the model from overfitting.
- fp16: Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training to speed up training on GPUs.
- metric_for_best_model: Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. Must be the name of a metric returned by the evaluation with or without the prefix "eval_".
- load_best_model_at_end: Ensures that the best model is loaded at the end of training for further use or evaluation.
- save_strategy: The checkpoint save strategy to adopt during training.

3. What is the Trainer?
Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for HuggingFace Transformers.

4. compute_metrics:
This is a custom function you define that computes evaluation metrics like accuracy or F1 score.

In [None]:
# Fine-tune the model with the Trainer built in the previous cell.
# Training follows the settings given in TrainingArguments: the model learns from
# train_dataset and is evaluated on val_dataset during the run.

trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer in the same directory so they can be
# reloaded later for prediction or further fine-tuning without retraining.
model_saved_path = 'output/final/'
trainer.save_model(output_dir=model_saved_path)
tokenizer.save_pretrained(save_directory=model_saved_path)

# Prediction on Test Dataset

In [None]:
# Read the held-out test split and process its labels the same way as the other splits.
test_data = load_and_process_data(file_path='test.csv')

In [None]:
# Pick the inference device: the first CUDA GPU when one is available, otherwise the CPU
# (-1). A hard-coded GPU index would make this cell fail on machines without CUDA.
inference_device = 0 if torch.cuda.is_available() else -1

# Load the saved model and tokenizer into a text-classification pipeline, using the same
# truncation / padding / max_length settings as during training.
classifier = pipeline(
    task="text-classification",
    model=model_saved_path,
    tokenizer=model_saved_path,
    device=inference_device,
    truncation=True,
    padding=True,
    max_length=256
)

# Run pipeline on all content (batched)
results = classifier(test_data["content"].tolist(), batch_size=32)

# Convert results to binary predictions: the "Fake" label maps to 1, anything else to 0
test_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]

In [None]:
# Score the test-set predictions against the ground-truth labels.
evaluation_results = compute_metrics(
    y_true=test_data["label"],
    y_pred=test_data["predictions"],
)

# Write the resulting metrics to a JSON file.
output_file_path = "metrics/results.json"
save_metrics_to_json(metrics=evaluation_results, output_file_path=output_file_path)