In [1]:
import json
import os
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score, precision_score, recall_score
import torch
from transformers import EvalPrediction, pipeline

## Utils

In [2]:
class DisinformationDataset(torch.utils.data.Dataset):
    """
    This class wraps our tokenized data and labels so PyTorch can easily loop through them during training. It converts each input into tensors and returns them with the label — all in the format the model expects.
    """
    # When we create an instance of dataset, we pass in encodings and labels
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    # This method tells PyTorch how to get one item (input + label).
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    # Returns how many examples are in the dataset (needed by DataLoader).
    def __len__(self):
        return len(self.labels)


def load_and_process_data(file_path: str, label_column: str = "label") -> pd.DataFrame:
    """
    Loads the data from a CSV file and processes the labels.
    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
        text_column (str): The column name containing the text content.
    Returns:
        pd.DataFrame: Processed dataframe with labels and text content.
    """
    data = pd.read_csv(file_path, encoding='utf-8')
    data[label_column] = data[label_column].apply(lambda x: 1 if "fake" in x.lower() else 0)
    return data

In [3]:
def compute_metrics(pred=None, y_true=None, y_pred=None):
    """
    Computes F1 scores (micro, macro, weighted) for both training and testing data.

    If `pred` is provided, it computes metrics for the trainer using `EvalPrediction`.
    If `y_true` and `y_pred` are provided, it computes metrics for test data predictions.

    Parameters:
        - pred (EvalPrediction, optional): The evaluation prediction object for Trainer.
        - y_true (list, optional): The ground truth labels for the test data.
        - y_pred (list, optional): The predicted labels for the test data.

    Returns:
        - dict: A dictionary containing F1 metrics.
    """
    if pred is not None:
        # When working with the Trainer, pred is an EvalPrediction object
        labels = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
    elif y_true is not None and y_pred is not None:
        # If y_true and y_pred are provided, use them for test evaluation
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

        # Compute F1 scores
    f1 = f1_score(y_true=labels, y_pred=y_pred)
    precision = precision_score(y_true=labels, y_pred=y_pred)
    recall = recall_score(y_true=labels, y_pred=y_pred)

    return {
        'f1': f1,
        'recall':recall,
        'precision':precision
    }

def compute_metrics_for_trainer(pred: EvalPrediction):
    return compute_metrics(pred=pred)

def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics.
    """
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w') as output_file:
        json.dump(metrics, output_file, indent=4)

# Assignment

# Fine-Tuning BERT Model to Fake News detection

### Import Train, Validation and Test data

Import all datasets and load and preprocess train and validation

Link to direcotry with data: https://github.com/ArkadiusDS/NLP-Labs/tree/master/data/CoAID/

In [4]:
# Define the URLs pointing to the raw CSV data files hosted on GitHub.

url_test = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv'
url_train = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv'
url_valid = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/validation.csv'

# Download the datasets from GitHub using the wget command-line tool.
# Each file is saved with a simple filename for ease of use.

!wget -O test.csv {url_test}
!wget -O train.csv {url_train}
!wget -O validation.csv {url_valid}

--2025-05-14 19:39:14--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221757 (217K) [text/plain]
Saving to: ‘test.csv’


2025-05-14 19:39:14 (59.7 MB/s) - ‘test.csv’ saved [221757/221757]

--2025-05-14 19:39:15--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1556530 (1.5M) [text/plain]
Saving to: ‘train.csv’


2025-05-14 19:39:15 (142 MB/s) - ‘train.csv’ saved [155

In [5]:
# Load and preprocess the datasets using the custom function 'load_and_process_data'
# This function will load the CSV data files, process the labels, and return the data in a usable dataframe format.

# Load and process the training data
train_data = load_and_process_data('train.csv')

# Load and process the validation data
validation_data = load_and_process_data('validation.csv')

### Load model and tokenizer

Firstly create two dicts id2label and label2id and then load model and tokenizer
Use well-known distilled version of BERT model for faster fine-tuning: 'distilbert/distilbert-base-uncased' or any other model you wish.

In [6]:
id2label = {0: "Credible", 1: "Fake"}
label2id = {"Credible": 0, "Fake": 1}

In [8]:
# Load the pre-trained BERT model and tokenizer
# BERT is a transformer-based model that has been pre-trained on a large corpus of text
# We'll use it for classification task, where the model predicts labels for text.

# Load the BERT model for classification (the base uncased version of BERT)
# This is a generic model class that will be instantiated as one of the model classes of the library (with a sequence classification head) when created with the from_pretrained()
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

# Load the corresponding tokenizer for BERT
# The tokenizer is responsible for converting the text into tokens that the model can process
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Tokenize datasets and prepare it for fine-tuning

You may use DisinformationDataset class for data preparation.

In [9]:
# Tokenize the datasets (training and validation) to prepare them for input into the BERT model.
# Tokenization converts the raw text data into a format the BERT model can process.

# Tokenizing the training dataset
train_encodings = tokenizer(
        train_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

# Tokenizing the validation dataset
val_encodings = tokenizer(
        validation_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

In [10]:
# Create custom datasets for training and validation using the DisinformationDataset class.
# These datasets will format the tokenized text data and corresponding labels into a format that can be used by the model during training and evaluation.

# Create the training dataset: it combines the tokenized training data and corresponding labels
train_dataset = DisinformationDataset(train_encodings, train_data['label'].tolist())

# Create the validation dataset: it combines the tokenized validation data and corresponding labels
val_dataset = DisinformationDataset(val_encodings, validation_data['label'].tolist())

In [15]:
hyperparameters = {
    # https://huggingface.co/docs/transformers/v4.51.3/en/main_classes/trainer#transformers.TrainingArguments
    "training_args0": TrainingArguments(
        output_dir='output/training0/',
        eval_strategy='steps',
        learning_rate=0.00001,
        num_train_epochs=3,
        warmup_ratio=0.05,
        weight_decay=0.15,
        fp16=True,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        save_total_limit=2,
        greater_is_better=True,
        save_strategy='steps',
        eval_steps=500,
        save_on_each_node=True,
        report_to=[]
    ),
    "training_args1": TrainingArguments(
        output_dir='output/training1/',
        eval_strategy='steps',
        learning_rate=0.00002,
        num_train_epochs=2,
        warmup_ratio=0.1,
        weight_decay=0.1,
        fp16=True,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        save_total_limit=2,
        greater_is_better=True,
        save_strategy='steps',
        eval_steps=500,
        save_on_each_node=True,
        report_to=[]
    ),
    "training_args2": TrainingArguments(
        output_dir='output/training2/',
        eval_strategy='steps',
        learning_rate=0.00003,
        num_train_epochs=5,
        warmup_ratio=0.06,
        weight_decay=0.2,
        fp16=True,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        save_total_limit=2,
        greater_is_better=True,
        save_strategy='steps',
        eval_steps=500,
        save_on_each_node=True,
        report_to=[]
    ),
}

### Fine-tune BERT model on at least 3 sets of hyperparameters

Check F1 score for each fine-tuned model and at the end choose set of hyperparameters that gives you best results.
For each set of hyperparameters write down (you may save it in the dictionary) the final F1 score.

In [16]:
for ind, args_set in enumerate(hyperparameters):

    trainer = Trainer(
            model=model,  # Pass the actual model instance
            args=hyperparameters[args_set],
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics_for_trainer
        )

    trainer.train()
    model_saved_path=f'output/final{ind}/'
    os.makedirs(os.path.dirname(model_saved_path), exist_ok=True)

    trainer.save_model(model_saved_path)
    tokenizer.save_pretrained(model_saved_path)

    # Load the pipeline with CUDA
    classifier = pipeline(
        task="text-classification",
        model=model_saved_path,
        tokenizer=model_saved_path,
        device=0,
        truncation=True,
        padding=True,
        max_length=256
    )

    # Run pipeline on all content (batched)
    results = classifier(validation_data["content"].tolist(), batch_size=32)

    # Convert results to binary predictions
    validation_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]
    # Compute evaluation metrics on the test data
    evaluation_results = compute_metrics(y_true=validation_data["label"], y_pred=validation_data["predictions"])


    # Save the evaluation metrics to a JSON file
    output_file_path = f"metrics/results{ind}.json"
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    save_metrics_to_json(evaluation_results, output_file_path)

Step,Training Loss,Validation Loss,F1,Recall,Precision
500,0.0,0.170974,0.965686,0.960976,0.970443
1000,0.0,0.174277,0.965854,0.965854,0.965854


Device set to use cuda:0


Step,Training Loss,Validation Loss,F1,Recall,Precision
500,0.0,0.283348,0.952381,0.926829,0.979381


Device set to use cuda:0


Step,Training Loss,Validation Loss,F1,Recall,Precision
500,0.0,0.305206,0.952381,0.926829,0.979381
1000,0.0,0.311758,0.952381,0.926829,0.979381
1500,0.0,0.314193,0.955,0.931707,0.979487
2000,0.0,0.31525,0.957606,0.936585,0.979592


Device set to use cuda:0


# Final prediction on test dataset

Take best model and hyperparameters on validation and predict on test dataset. Compute evaluation metrics f1, precision and recall.

In [18]:
# Load the test data and preprocess
test_data = load_and_process_data('test.csv')

In [19]:
final_model_saved_path='output/final0/'

In [20]:
# Load the pipeline with CUDA
classifier = pipeline(
    task="text-classification",
    model=final_model_saved_path,
    tokenizer=final_model_saved_path,
    device=0,
    truncation=True,
    padding=True,
    max_length=256
)

# Run pipeline on all content (batched)
results = classifier(test_data["content"].tolist(), batch_size=32)

# Convert results to binary predictions
test_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]

Device set to use cuda:0


In [22]:
# Compute evaluation metrics on the test data
evaluation_results = compute_metrics(y_true=test_data["label"], y_pred=test_data["predictions"])

# Save the evaluation metrics to a JSON file
output_file_path = "metrics/results_on_test.json"
save_metrics_to_json(evaluation_results, output_file_path)

## Final resulting file

In [None]:
data = {
    "experiment_0": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "This experiment fine-tuned the google-bert/bert-base-uncased model for binary classification using a learning rate of 1e-5 and a warmup ratio of 0.06. The model achieved an F1-score of 0.76, with a strong recall of 0.85, indicating high sensitivity to positive cases. Precision was moderate at 0.65, suggesting some trade-off in false positives. The setup demonstrates effective recall-oriented performance in identifying relevant instances."
    },
    "experiment_1": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description two of the approach - it has to be different for each experiment. Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "experiment_2": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "num_train_epochs": "int",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description three of the approach - it has to be different for each experiment. Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset. Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc."
    }
}

In [None]:
with open("experiments_Arkadiusz_Modzelewski_29580.json", "w") as f:
    json.dump(data, f, indent=4)