<a href="https://colab.research.google.com/github/Fcera10/Lab04/blob/main/assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

Here import all crucial packages etc.

In [8]:
# Code here
import json
import os
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from sklearn.metrics import f1_score
import torch
from transformers import EvalPrediction, pipeline

## Utils

Helper functions that you will use

In [9]:
#Code here
os.environ["WANDB_DISABLED"] = "true"

In [10]:
class DisinformationDataset(torch.utils.data.Dataset):
    """
    This class wraps our tokenized data and labels so PyTorch can easily loop through them during training. It converts each input into tensors and returns them with the label — all in the format the model expects.
    """
    # When we create an instance of dataset, we pass in encodings and labels
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    # This method tells PyTorch how to get one item (input + label).
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    # Returns how many examples are in the dataset (needed by DataLoader).
    def __len__(self):
        return len(self.labels)


def load_and_process_data(file_path: str, label_column: str = "label") -> pd.DataFrame:
    """
    Loads the data from a CSV file and processes the labels.
    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
        text_column (str): The column name containing the text content.
    Returns:
        pd.DataFrame: Processed dataframe with labels and text content.
    """
    data = pd.read_csv(file_path, encoding='utf-8')
    data[label_column] = data[label_column].apply(lambda x: 1 if "fake" in x.lower() else 0)
    return data


def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics.
    """
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w') as output_file:
        json.dump(metrics, output_file, indent=4)

def compute_metrics(pred=None, y_true=None, y_pred=None):
    """
    Computes F1 scores (micro, macro, weighted) for both training and testing data.

    If `pred` is provided, it computes metrics for the trainer using `EvalPrediction`.
    If `y_true` and `y_pred` are provided, it computes metrics for test data predictions.

    Parameters:
        - pred (EvalPrediction, optional): The evaluation prediction object for Trainer.
        - y_true (list, optional): The ground truth labels for the test data.
        - y_pred (list, optional): The predicted labels for the test data.

    Returns:
        - dict: A dictionary containing F1 metrics.
    """
    if pred is not None:
        # When working with the Trainer, pred is an EvalPrediction object
        labels = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
    elif y_true is not None and y_pred is not None:
        # If y_true and y_pred are provided, use them for test evaluation
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

        # Compute F1 scores
    f1 = f1_score(y_true=labels, y_pred=y_pred)

    return {
        'f1': f1
    }

def compute_metrics_for_trainer(pred: EvalPrediction):
    return compute_metrics(pred=pred)

# Assignment

# Fine-Tuning BERT Model to Fake News detection

## Import Train, Validation and Test data

Import all datasets and load and preprocess train and validation

Link to direcotry with data: https://github.com/ArkadiusDS/NLP-Labs/tree/master/data/CoAID/

In [11]:
# Define the URLs pointing to the raw CSV data files hosted on GitHub.

url_test = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv'
url_train = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv'
url_valid = 'https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/validation.csv'

# Download the datasets from GitHub using the wget command-line tool.
# Each file is saved with a simple filename for ease of use.

!wget -O test.csv {url_test}
!wget -O train.csv {url_train}
!wget -O validation.csv {url_valid}


--2025-05-17 15:24:26--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 221757 (217K) [text/plain]
Saving to: ‘test.csv’


2025-05-17 15:24:26 (10.7 MB/s) - ‘test.csv’ saved [221757/221757]

--2025-05-17 15:24:26--  https://raw.githubusercontent.com/ArkadiusDS/NLP-Labs/master/data/CoAID/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1556530 (1.5M) [text/plain]
Saving to: ‘train.csv’


2025-05-17 15:24:27 (27.7 MB/s) - ‘train.csv’ saved [15

In [12]:
# Load and preprocess the datasets using the custom function 'load_and_process_data'
# This function will load the CSV data files, process the labels, and return the data in a usable dataframe format.

# Load and process the training data
train_data = load_and_process_data('train.csv')

# Load and process the validation data
validation_data = load_and_process_data('validation.csv')

train_data.head()

Unnamed: 0,uuid,content,label,article_type,source_type,pre_post_GPT,dataset_source,text_length
0,04355d1f-02e4-4823-8e7d-9fce5db7c884,"""Contrary to claims in viral social media post...",1,twitter post,human,pre-GPT,CoAID,30
1,1c1dbbcc-e0e2-4da6-b3b5-60489fa04024,"""What is herd immunity? | @scoopit""",0,twitter post,human,pre-GPT,CoAID,10
2,cf880aed-6724-4e19-b601-1417bf9bf715,march 23 2020 -- the fda has approved a rapid ...,0,news article,human,pre-GPT,CoAID,94
3,7a4e0ab3-e150-409d-8e96-fac591cb07d1,people may be disinfecting their food to avoid...,0,news article,human,pre-GPT,CoAID,80
4,4af86d9d-4951-4124-bf30-7356bbcae1b4,"""Target calls (those affected by gambling) ans...",0,twitter post,human,pre-GPT,CoAID,43


## Load model and tokenizer

Firstly create two dicts id2label and label2id and then load model and tokenizer
Then use well-known distilled version of BERT model for faster fine-tuning: 'distilbert/distilbert-base-uncased' or any other model you wish.

In [13]:
id2label = {0: "Credible", 1: "Fake"}
label2id = {"Credible": 0, "Fake": 1}

# Load the pre-trained BERT model and tokenizer
# BERT is a transformer-based model that has been pre-trained on a large corpus of text
# We'll use it for classification task, where the model predicts labels for text.

# Load the BERT model for classification (the base uncased version of BERT)
# This is a generic model class that will be instantiated as one of the model classes of the library (with a sequence classification head) when created with the from_pretrained()
model = AutoModelForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=2, id2label=id2label, label2id=label2id)

# Load the corresponding tokenizer for BERT
# The tokenizer is responsible for converting the text into tokens that the model can process
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')


# Tokenize the datasets (training and validation) to prepare them for input into the BERT model.
# Tokenization converts the raw text data into a format the BERT model can process.

# Tokenizing the training dataset
train_encodings = tokenizer(
        train_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

# Tokenizing the validation dataset
val_encodings = tokenizer(
        validation_data['content'].tolist(),
        truncation=True,
        padding=True,
        max_length=256
    )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize datasets and prepare it for fine-tuning

You may use DisinformationDataset class for data preparation.

In [14]:
# Create custom datasets for training and validation using the DisinformationDataset class.
# These datasets will format the tokenized text data and corresponding labels into a format that can be used by the model during training and evaluation.

# Create the training dataset: it combines the tokenized training data and corresponding labels
train_dataset = DisinformationDataset(train_encodings, train_data['label'].tolist())

# Create the validation dataset: it combines the tokenized validation data and corresponding labels
val_dataset = DisinformationDataset(val_encodings, validation_data['label'].tolist())

## Fine-tune BERT model on at least 3 sets of hyperparameters

Check F1 score, precision and recall for each fine-tuned model and at the end choose set of hyperparameters that gives you best results. For each set of hyperparameters write down the final metrics. You need to acheive at least below result on validation dataset:

"f1": 0.91,
"recall": 0.91,
"precision": 0.91

Remember you need to achieve these minimum results on VALIDATION dataset and the best model on validation dataset will have to be used for predictions on test dataset.


In [15]:
# https://huggingface.co/docs/transformers/v4.51.3/en/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir='output/training/',
    eval_strategy='steps',
    learning_rate=0.00001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    warmup_ratio=0.06,
    weight_decay=0.1,
    fp16=True,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    save_total_limit=2,
    greater_is_better=True,
    save_strategy='steps',
    eval_steps=100,
    save_on_each_node=True,
    report_to=[]
)

trainer = Trainer(
        model=model,  # Pass the actual model instance
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer
    )
# Train the model using the Trainer class.
# This method will start the training process based on the configurations specified in the TrainingArguments.
# The model will learn from the training data and be evaluated on the validation data according to the provided settings.

trainer.train()

# Save the trained model to a specified directory after training is completed.
# This allows you to persist the model and use it for future predictions or fine-tuning without retraining.
model_saved_path='output/final/'
trainer.save_model(model_saved_path)
tokenizer.save_pretrained(model_saved_path)

Step,Training Loss,Validation Loss,F1
100,No log,0.292664,0.662338
200,No log,0.112512,0.914842
300,No log,0.133809,0.89008
400,No log,0.097599,0.931646
500,0.177900,0.094066,0.93401
600,0.177900,0.115299,0.928571
700,0.177900,0.090848,0.945
800,0.177900,0.09193,0.950249
900,0.177900,0.097417,0.945
1000,0.024100,0.096384,0.945545


('output/final/tokenizer_config.json',
 'output/final/special_tokens_map.json',
 'output/final/vocab.txt',
 'output/final/added_tokens.json',
 'output/final/tokenizer.json')

## Final prediction on test dataset

Take best model and hyperparameters on validation and predict on test dataset. Compute evaluation metrics f1, precision and recall.

In [16]:
# Load the test data and preprocess
test_data = load_and_process_data('test.csv')

# Load the pipeline with CUDA
classifier = pipeline(
    task="text-classification",
    model=model_saved_path,
    tokenizer=model_saved_path,
    device=0,
    truncation=True,
    padding=True,
    max_length=256
)

# Run pipeline on all content (batched)
results = classifier(test_data["content"].tolist(), batch_size=32)

# Convert results to binary predictions
test_data["predictions"] = [1 if r["label"] == "Fake" else 0 for r in results]


# Compute evaluation metrics on the test data
evaluation_results = compute_metrics(y_true=test_data["label"], y_pred=test_data["predictions"])

# Save the evaluation metrics to a JSON file
output_file_path = "metrics/results.json"
save_metrics_to_json(evaluation_results, output_file_path)

Device set to use cuda:0


# Final file with results and description

In [17]:
import json

All keys in your dictionary have to be the same as below. The only changes you should do in terms of keys is changing names of hyperparameters, e.g. instead of key "name_of_hyperparameter_0" if you used learning rate then write "learning_rate". Other important information in the dictionary below and comments. Each value says what is expected.

Example dictionary provided under the template.

Template for your structured resulting file

In [18]:
data = {
    # Everything in experiment_0 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_0": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float - You need to play with at least two different hyperparameters so at least name_of_hyperparameter_0 and name_of_hyperparameter_1",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description one of the approach - it has to be different for each experiment."
    },
    # Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_1": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description two of the approach - it has to be different for each experiment."
    },
    # Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc.
    "experiment_2": {
        "model": "model name",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description three of the approach - it has to be different for each experiment."
    },
    # Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc.
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0 or experiment_1 or experiment_2",
        "hyperparameters": {
            "name_of_hyperparameter_0": "value in str or float",
            "name_of_hyperparameter_1": "value in str or float"
        },
        "f1_score": "value in float",
        "precision": "value in float",
        "recall": "value in float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset."
    }
}


In [19]:
with open("experiments_name_surname_student_id.json", "w") as f:
    json.dump(data, f, indent=4)

## Example final file

In [20]:
data = {
    "experiment_0": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "This experiment fine-tuned the google-bert/bert-base-uncased model for binary classification using a learning rate of 1e-5 and a warmup ratio of 0.06. The model achieved an F1-score of 0.76, with a strong recall of 0.85, indicating high sensitivity to positive cases. Precision was moderate at 0.65, suggesting some trade-off in false positives. The setup demonstrates effective recall-oriented performance in identifying relevant instances."
    },
    "experiment_1": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description two of the approach - it has to be different for each experiment. Everything in experiment_1 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "experiment_2": {
        "model": "google-bert/bert-base-uncased",
        "hyperparameters": {
            "learning_rate": "float",
            "num_train_epochs": "int",
            "weight_decay": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description three of the approach - it has to be different for each experiment. Everything in experiment_2 is related to experiment on validation dataset, so metrics are computed on validation dataset etc."
    },
    "final_prediction": {
        "model": "google-bert/bert-base-uncased",
        "experiment_chosen": "experiment_0",
        "hyperparameters": {
            "learning_rate": "float",
            "warmap_ratio": "float"
        },
        "f1_score": "float",
        "precision": "float",
        "recall": "float",
        "description": "Unique description four of the final results and prediction - it has to be different and here you will describe results on test dataset. Everything in final_prediction is related to prediction on test dataset, so metrics are computed on test dataset etc."
    }
}

In [21]:
with open("experiments_Arkadiusz_Modzelewski_29580.json", "w") as f:
    json.dump(data, f, indent=4)