In [2]:
import json
import os
import yaml
import time
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, BatchEncoding
)
from datasets import load_dataset
from sklearn.metrics import f1_score
import torch
import numpy as np
import concurrent.futures
from tqdm import tqdm
from openai import OpenAI
from transformers import EvalPrediction, TrainerCallback
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")

## Utils

In [None]:
class PromptTemplate:
    def __init__(self, system_prompt, user_prompt_template):
        self.system_prompt = system_prompt
        self.user_prompt_template = user_prompt_template

    def format_user_prompt(self, text):
        return self.user_prompt_template.format(text=text)

In [None]:
def client_instance(model):
    if "llama" in model.lower():
        return OpenAI(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")
    else:
        return OpenAI(api_key=API_KEY)
    
def process_text_with_model(index, text, model, prompt_template):
    try:
        user_prompt = prompt_template.format_user_prompt(text)
        client = client_instance(model)
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": prompt_template.system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0
        )
        return {
            "index": index,
            "system_prompt": prompt_template.system_prompt,
            "user_prompt": user_prompt,
            "completion": completion.choices[0].message.content,
        }
    except Exception as e:
        print(f"Error processing row {index}: {e}")
        time.sleep(2)  # Avoid rapid retries
        return {"index": index, "system_prompt": None, "user_prompt": None, "completion": None}
    
    
def parallel_text_processing(dataframe, col_with_content, column, filename, model, prompt_template):
    dataframe["system_prompt"] = None
    dataframe["user_prompt"] = None
    dataframe[column] = None
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    results = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_text_with_model, index, text, model, prompt_template)
            for index, text in enumerate(dataframe[col_with_content])
        ]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                result = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"Thread failed with error: {e}")

    for result in results:
        dataframe.at[result["index"], "system_prompt"] = result["system_prompt"]
        dataframe.at[result["index"], "user_prompt"] = result["user_prompt"]
        dataframe.at[result["index"], column] = result["completion"]

    dataframe.to_csv(filename, index=False)
    return dataframe


def extract_numbers(dataset, column_name, new_column_name):
    """
    Extracts numbers from a JSON string in the specified column and creates a new column with those numbers.

    Parameters:
        dataset (pd.DataFrame): The input DataFrame.
        column_name (str): The name of the column containing JSON strings.
        new_column_name (str): The name of the new column to store extracted numbers.

    Returns:
        pd.DataFrame: The DataFrame with the new column added.
    """
    dataset = dataset.copy()

    def safe_extract(json_str):
        if pd.notna(json_str):
            try:
                # Ensure proper escaping
                json_str = json_str.replace("\\", "\\\\")
                return float(json.loads(json_str)['answer'])
            except (json.JSONDecodeError, KeyError, ValueError) as e:
                print(f"Error parsing JSON: {e} | Data: {json_str}")
                return None
        return None

    dataset[new_column_name] = dataset[column_name].apply(safe_extract)

    return dataset


def calculate_accuracy(dataset, true_col, pred_col):
    agreement = dataset[dataset[true_col] == dataset[pred_col]].shape[0]
    accuracy = agreement / dataset.shape[0]

    return accuracy

## Prompting

In [2]:
dataset = load_dataset("gsm8k", "main")

print(f'Dataset length : {len(dataset["test"])}')

test_samples = dataset["test"][:100]
questions = test_samples["question"]
long_answers = test_samples["answer"]
answers = [float(an.split("#### ")[-1]) for an in long_answers]

Dataset length : 1319


In [3]:
questions[0]

"Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"

In [4]:
long_answers[0]

'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'

In [5]:
answers[0]

18.0

In [6]:
df = pd.DataFrame({"questions":questions, "long_answers": long_answers, "answer": answers})

In [7]:
df.head()

Unnamed: 0,questions,long_answers,answer
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0


In [16]:
system_prompt = """You are a helpful AI assistant who knows math."""
user_prompt = """Below I will provide a question with a math problem. 
Please solve it and present final number which is an answer to the problem. 
Do not show any explanation and do not provide units.

Question: {text}
Give answer in this form: {{"answer": "answer with final number"}}"""

prompt_template = PromptTemplate(system_prompt, user_prompt)

df_zero = parallel_text_processing(
    dataframe=df.copy(),
    col_with_content="questions",
    column="pred_solution",
    filename="test/math_problem_zero_shot.csv",
    model="gpt-4o-mini",
    prompt_template=prompt_template
)

100%|██████████| 100/100 [00:12<00:00,  7.97it/s]


In [20]:
df_zero.head()

Unnamed: 0,questions,long_answers,answer,system_prompt,user_prompt,pred_solution,extracted_number
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""26""}",26.0
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""3""}",3.0
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""65000""}",65000.0
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""540""}",540.0
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""30""}",30.0


In [18]:
df_zero = extract_numbers(dataset=df_zero, column_name='pred_solution', new_column_name='extracted_number')

In [21]:
df_zero.head()

Unnamed: 0,questions,long_answers,answer,system_prompt,user_prompt,pred_solution,extracted_number
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""26""}",26.0
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""3""}",3.0
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""65000""}",65000.0
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""540""}",540.0
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""30""}",30.0


In [22]:
calculate_accuracy(df_zero, "answer", "extracted_number")

Accuracy achieved is equal to 0.37


In [23]:
system_prompt = """You are a helpful AI assistant who knows math."""
user_prompt = """Below I will provide a question with a math problem. 
Please solve it and present final number which is an answer to the problem.
Example problems:

Problem 1:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Reasoning with answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72 
Answer: {{"answer": "72"}}

Problem 2:
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Reasoning with answer: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10
Answer: {{"answer": "72"}}

Problem 3:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
Reasoning with answer: In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.\nBetty's grandparents gave her 15 * 2 = $<<15*2=30>>30.\nThis means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more.\n#### 5
Answer: {{"answer": "5"}}


Now your turn to solve a problem. Provide only answer. Do not show any reasoning and explanation and do not provide units.
Question: {text}
Give answer in this form: {{"answer": "answer with final number"}}"""

prompt_template = PromptTemplate(system_prompt, user_prompt)

df_few = parallel_text_processing(
    dataframe=df.copy(),
    col_with_content="questions",
    column="pred_solution",
    filename="test/math_problem_few_shot.csv",
    model="gpt-4o-mini",
    prompt_template=prompt_template
)

100%|██████████| 100/100 [00:06<00:00, 14.70it/s]


In [24]:
df_few.head()

Unnamed: 0,questions,long_answers,answer,system_prompt,user_prompt,pred_solution
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""18""}"
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""3""}"
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""65000""}"
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""540""}"
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""30""}"


In [25]:
df_few = extract_numbers(dataset=df_few, column_name='pred_solution', new_column_name='extracted_number')

In [26]:
df_few.head()

Unnamed: 0,questions,long_answers,answer,system_prompt,user_prompt,pred_solution,extracted_number
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""18""}",18.0
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""3""}",3.0
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""65000""}",65000.0
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""540""}",540.0
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""answer"": ""30""}",30.0


In [27]:
calculate_accuracy(df_few, "answer", "extracted_number")

Accuracy achieved is equal to 0.38


In [33]:
# Usage Example
system_prompt = "You are a helpful AI assistant who knows math."
user_prompt_template = """Below I will provide a question with a math problem. 
Please solve it and present the final number which is the answer to the problem. 
In the final answer do not provide units, give only the number.

Question: {text}
Give answer in this form: {{"reasoning": "Solve it step by step and provide reasoning and explanation", "answer": "final number"}}"""

prompt_template = PromptTemplate(system_prompt, user_prompt_template)

df_cot = parallel_text_processing(
    dataframe=df.copy(),
    col_with_content="questions",
    column="pred_solution",
    filename="test/math_problem_cot.csv",
    model="gpt-4o-mini",
    prompt_template=prompt_template
)

100%|██████████| 100/100 [00:38<00:00,  2.60it/s]


In [34]:
df_cot.head()

Unnamed: 0,questions,long_answers,answer,system_prompt,user_prompt,pred_solution
0,Janet’s ducks lay 16 eggs per day. She eats th...,Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eg...,18.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""reasoning"": ""First, we need to determine how..."
1,A robe takes 2 bolts of blue fiber and half th...,It takes 2/2=<<2/2=1>>1 bolt of white fiber\nS...,3.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""reasoning"": ""To find the total number of bol..."
2,Josh decides to try flipping a house. He buys...,The cost of the house and repairs came out to ...,70000.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""reasoning"": ""First, we calculate the total i..."
3,James decides to run 3 sprints 3 times a week....,He sprints 3*3=<<3*3=9>>9 times\nSo he runs 9*...,540.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""reasoning"": ""To find the total meters James ..."
4,"Every day, Wendi feeds each of her chickens th...","If each chicken eats 3 cups of feed per day, t...",20.0,You are a helpful AI assistant who knows math.,Below I will provide a question with a math pr...,"{""reasoning"": ""First, we need to determine the..."


In [35]:
df_cot.pred_solution.iloc[0]

'{"reasoning": "First, we need to determine how many eggs Janet has left after she eats and bakes with them. Janet\'s ducks lay 16 eggs per day. She eats 3 eggs for breakfast and uses 4 eggs for baking muffins. Therefore, the total number of eggs she consumes each day is 3 + 4 = 7 eggs. Next, we subtract the number of eggs she consumes from the total number of eggs laid: 16 - 7 = 9 eggs. These 9 eggs are the ones she sells at the farmers\' market. Since she sells each egg for $2, we calculate her daily earnings by multiplying the number of eggs sold by the price per egg: 9 eggs * $2/egg = $18. Thus, Janet makes $18 every day at the farmers\' market.", "answer": "18"}'

In [36]:
df_cot=extract_numbers(dataset=df_cot, column_name='pred_solution', new_column_name='extracted_number')

In [37]:
calculate_accuracy(df_cot, "answer", "extracted_number")

Accuracy achieved is equal to 0.94


# Disinformation detection with GPT

In [7]:
df = pd.read_csv("../data/ECTF/test.csv")

In [8]:
df.head()

Unnamed: 0,id,content,label
0,1255615613923450882,Luc Montagnier won the Nobel Prize for for his...,real
1,1236143519133077504,If you pumped soap from those public soap disp...,fake
2,1265282794831392768,🔴 LIVE at 11:30 am: @GovernorTomWolf + @Secret...,real
3,1234899473882664960,When’s this coronavirus vaccine coming out?,fake
4,1248951179393904640,MTA employees have contracted COVID-19 at thre...,real


In [10]:
# Usage Example
system_prompt = "You are a helpful AI assistant who detects disinformation"
user_prompt_template = """Below I will provide a short text. 
Please provide answer if it is fake or real information.

Text: {text}
Give answer in this form: {{"answer": "fake or real"}}"""

prompt_template = PromptTemplate(system_prompt, user_prompt_template)

In [None]:
df_dis = parallel_text_processing(
    dataframe=df.copy(),
    col_with_content="content",
    column="pred_solution",
    filename="test/disinformation_classification_zero_shot.csv",
    model="gpt-4o-mini",
    prompt_template=prompt_template
)

In [16]:
df_dis.head()

Unnamed: 0,id,content,label,system_prompt,user_prompt,pred_solution
0,1255615613923450882,Luc Montagnier won the Nobel Prize for for his...,real,You are a helpful AI assistant who detects dis...,Below I will provide a short text. \nPlease pr...,"{""answer"": ""real""}"
1,1236143519133077504,If you pumped soap from those public soap disp...,fake,You are a helpful AI assistant who detects dis...,Below I will provide a short text. \nPlease pr...,"{""answer"": ""fake""}"
2,1265282794831392768,🔴 LIVE at 11:30 am: @GovernorTomWolf + @Secret...,real,You are a helpful AI assistant who detects dis...,Below I will provide a short text. \nPlease pr...,"{""answer"": ""real""}"
3,1234899473882664960,When’s this coronavirus vaccine coming out?,fake,You are a helpful AI assistant who detects dis...,Below I will provide a short text. \nPlease pr...,"{""answer"": ""real""}"
4,1248951179393904640,MTA employees have contracted COVID-19 at thre...,real,You are a helpful AI assistant who detects dis...,Below I will provide a short text. \nPlease pr...,"{""answer"": ""real""}"


In [18]:
y_pred = df_dis.pred_solution.apply(lambda x: 1 if "fake" in x.lower() else 0)
y_true = df_dis.label.apply(lambda x: 1 if "fake" in x.lower() else 0)
f1_score(y_true, y_pred, average="micro")

In [11]:
df_dis_llama = parallel_text_processing(
    dataframe=df.copy(),
    col_with_content="content",
    column="pred_solution",
    filename="result/disinformation_classification_zero_shot.csv",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    prompt_template=prompt_template
)

100%|██████████| 400/400 [00:30<00:00, 13.05it/s]


In [12]:
y_pred = df_dis_llama.pred_solution.apply(lambda x: 1 if "fake" in x.lower() else 0)
y_true = df_dis_llama.label.apply(lambda x: 1 if "fake" in x.lower() else 0)
f1_score(y_true, y_pred, average="micro")

0.8225

# Fine Tuning BERT for Disinformation Detection

## Utils

In [None]:
class DisinformationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
class SaveMetricsCallback(TrainerCallback):

    def __init__(self, csv_file_name, hyperparameters):
        """
        Initializes the SaveMetricsCallback class.

        Parameters:
        - csv_file_name (str): The name of the CSV file to save metrics.
        - hyperparameters (dict): Dictionary containing hyperparameters.
        """
        super().__init__()
        self.df = pd.DataFrame()
        self.file_name = csv_file_name

        # Ensure the directory exists
        os.makedirs(os.path.dirname(self.file_name), exist_ok=True)

        self.df_hyperparameters = pd.DataFrame([hyperparameters])

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """
        Event called after an evaluation phase.
        Appends evaluated metrics to the DataFrame.
        """
        self.df = pd.concat([self.df, pd.DataFrame([metrics])])

    def on_train_end(self, args, state, control, **kwargs):
        """
        Event called at the end of training.
        Concatenates hyperparameters DataFrame with metrics DataFrame and saves to CSV file.
        """
        self.df = pd.concat([self.df, self.df_hyperparameters], axis=1)
        # Ensure the directory exists before saving
        os.makedirs(os.path.dirname(self.file_name), exist_ok=True)
        self.df.to_csv(self.file_name, index=False)


In [5]:
# Define your configuration as a Python dictionary
config = {
    'data': {
        'train': '../../data/ECTF/train.csv',
        'validation': '../../data/ECTF/validation.csv',
        'test': '../../data/ECTF/test.csv'
    },
    'tokenizer': {
        'truncation': True,
        'padding': True,
        'max_length': 256
    },
    'model': {
        'model_name': 'google-bert/bert-base-uncased',
        'output': 'output/training/dis_bert_base',
        'valid_metrics': 'metrics/disinformation/bert_base/valid/dis_bert_base',
        'path_to_save_model': 'output/final/dis_bert_base',
        'test_metrics': 'metrics/disinformation/bert_base/test/dis_bert_base',
        'hyperparameters': {
            'evaluation_strategy': 'steps',
            'per_device_train_batch_size': 16,
            'per_device_eval_batch_size': 16,
            'num_train_epochs': 5,
            'warmup_steps': 200,
            'learning_rate': 0.00001,
            'weight_decay': 0.1,
            'fp16': True,
            'metric_for_best_model': 'f1_micro',
            'load_best_model_at_end': True,
            'save_total_limit': 2,
            'greater_is_better': True,
            'save_strategy': 'steps',
            'eval_steps': 50
        }
    }
}

# Path to save the YAML file
yaml_file_path = 'config.yaml'

# Write the dictionary to a YAML file
with open(yaml_file_path, 'w') as yaml_file:
    yaml.dump(config, yaml_file, default_flow_style=False)

Config file created at: config_test.yaml


In [None]:
def compute_metrics(pred=None, y_true=None, y_pred=None):
    """
    Computes F1 scores (micro, macro, weighted) for both training and testing data.

    If `pred` is provided, it computes metrics for the trainer using `EvalPrediction`.
    If `y_true` and `y_pred` are provided, it computes metrics for test data predictions.

    Parameters:
        - pred (EvalPrediction, optional): The evaluation prediction object for Trainer.
        - y_true (list, optional): The ground truth labels for the test data.
        - y_pred (list, optional): The predicted labels for the test data.

    Returns:
        - dict: A dictionary containing F1 metrics.
    """
    if pred is not None:
        # When working with the Trainer, pred is an EvalPrediction object
        labels = pred.label_ids
        y_pred = pred.predictions.argmax(-1)
    elif y_true is not None and y_pred is not None:
        # If y_true and y_pred are provided, use them for test evaluation
        labels = y_true
    else:
        raise ValueError("Either `pred` or both `y_true` and `y_pred` must be provided.")

    # Compute F1 scores
    f1 = f1_score(y_true=labels, y_pred=y_pred)
    f1_micro = f1_score(y_true=labels, y_pred=y_pred, average='micro')
    f1_macro = f1_score(y_true=labels, y_pred=y_pred, average='macro')
    f1_macro_weighted = f1_score(y_true=labels, y_pred=y_pred, average='weighted')

    return {
        'f1': f1,
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_macro_weighted': f1_macro_weighted
    }


def compute_metrics_for_trainer(pred: EvalPrediction):
    return compute_metrics(pred=pred)


def predict_disinformation(text, tokenizer, model):
    """
    Function that predicts the label for input text using argmax
    """

    tokenized_text = tokenizer([text], truncation=True, padding=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokenized_text)

    logits = outputs.logits
    probabilities = torch.sigmoid(logits).squeeze().cpu().numpy()

    predicted_label = np.argmax(probabilities)
    return predicted_label

In [2]:
def load_config(file_path='config.yaml'):
    """Load configuration from a YAML file."""
    with open(file_path, 'r') as yaml_file:
        return yaml.safe_load(yaml_file)
    
    
def load_and_process_data(file_path: str, label_column: str = "label", text_column: str = "content") -> pd.DataFrame:
    """
    Loads the data from a CSV file and processes the labels.
    Args:
        file_path (str): Path to the CSV file.
        label_column (str): The column name containing the labels.
        text_column (str): The column name containing the text content.
    Returns:
        pd.DataFrame: Processed dataframe with labels and text content.
    """
    data = pd.read_csv(file_path, encoding='utf-8')
    data[label_column] = data[label_column].apply(lambda x: 1 if "fake" in x.lower() else 0)
    return data


def tokenize_data(tokenizer, data: pd.Series, config: dict) -> BatchEncoding:
    """
    Tokenizes the text data.
    Args:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use.
        data (pd.Series): The text data to tokenize.
        config (dict): Configuration dictionary with tokenizer settings.
    Returns:
        dict: The tokenized data.
    """
    return tokenizer(
        data.tolist(),
        truncation=config["tokenizer"]["truncation"],
        padding=config["tokenizer"]["padding"],
        max_length=config["tokenizer"]["max_length"]
    )


def setup_trainer(config: dict, train_dataset, val_dataset) -> Trainer:
    """
    Configures the Trainer for model training.
    Args:
        config (dict): Configuration dictionary.
        train_dataset (Dataset): The training dataset.
        val_dataset (Dataset): The validation dataset.
    Returns:
        Trainer: Configured Trainer instance.
    """
    # Load model before passing it to Trainer
    model = AutoModelForSequenceClassification.from_pretrained(config["model"]["model_name"])

    training_args = TrainingArguments(
        output_dir=config["model"]["output"],
        evaluation_strategy=config["model"]["hyperparameters"]["evaluation_strategy"],
        learning_rate=config["model"]["hyperparameters"]["learning_rate"],
        per_device_train_batch_size=config["model"]["hyperparameters"]["per_device_train_batch_size"],
        per_device_eval_batch_size=config["model"]["hyperparameters"]["per_device_eval_batch_size"],
        num_train_epochs=config["model"]["hyperparameters"]["num_train_epochs"],
        warmup_steps=config["model"]["hyperparameters"]["warmup_steps"],
        weight_decay=config["model"]["hyperparameters"]["weight_decay"],
        fp16=config["model"]["hyperparameters"]["fp16"],
        metric_for_best_model=config["model"]["hyperparameters"]["metric_for_best_model"],
        load_best_model_at_end=config["model"]["hyperparameters"]["load_best_model_at_end"],
        save_total_limit=config["model"]["hyperparameters"]["save_total_limit"],
        greater_is_better=config["model"]["hyperparameters"]["greater_is_better"],
        save_strategy=config["model"]["hyperparameters"]["save_strategy"],
        eval_steps=config["model"]["hyperparameters"]["eval_steps"],
        save_on_each_node=True
    )

    return Trainer(
        model=model,  # Pass the actual model instance
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer,
        callbacks=[SaveMetricsCallback(
            csv_file_name=f"{config['model']['valid_metrics']}.csv",
            hyperparameters=config["model"]["hyperparameters"]
        )]
    )


def save_metrics_to_json(metrics: dict, output_file_path: str):
    """
    Saves the metrics to a JSON file.
    Args:
        metrics (dict): The evaluation metrics.
        output_file_path (str): The file path to save the metrics.
    """
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w') as output_file:
        json.dump(metrics, output_file, indent=4)

In [None]:
config=load_config()
# Load and preprocess the datasets
train_data = load_and_process_data(config["data"]["train"])
validation_data = load_and_process_data(config["data"]["validation"])

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(config["model"]["model_name"])
tokenizer = AutoTokenizer.from_pretrained(config["model"]["model_name"])

In [2]:
# Tokenize datasets
train_encodings = tokenize_data(tokenizer, train_data['content'], config)
val_encodings = tokenize_data(tokenizer, validation_data['content'], config)

# Create custom datasets
train_dataset = DisinformationDataset(train_encodings, train_data['label'].tolist())
val_dataset = DisinformationDataset(val_encodings, validation_data['label'].tolist())

In [2]:
# Setup the Trainer
trainer = setup_trainer(config, train_dataset, val_dataset)

# Train the model
trainer.train()

In [2]:
# Save the trained model
model_saved_path = config["model"]["path_to_save_model"]
trainer.save_model(model_saved_path)

# Load the test data and preprocess
test_data = load_and_process_data(config["data"]["test"])

# Make predictions on the test data
model = AutoModelForSequenceClassification.from_pretrained(model_saved_path)
test_data["predictions"] = test_data["content"].apply(
    lambda x: predict_disinformation(x, tokenizer, model)
)

In [2]:
# Compute evaluation metrics on the test data
evaluation_results = compute_metrics(y_true=test_data["label"], y_pred=test_data["predictions"])

# Save the evaluation metrics to a JSON file
output_file_path = f"{config['model']['test_metrics']}.json"
save_metrics_to_json(evaluation_results, output_file_path)