# CheckThat Lab Task 2: Claims Extraction & Normalization (English)

In this task, you will be given a noisy, unstructured social media post. Your goal is to simplify it into a concise form and normalize it into a structured format.

Therefore, we aim to bridge this gap by decomposing social media posts into simpler, more comprehensible forms, which are referred to as normalized claims.

We will employ METEOR score for final evaluation.

For more information, please visit [CHECKTHAT! LAB TASK 2](https://checkthat.gitlab.io/clef2025/task2/)

# Task 2: Claim Normalization

Given a noisy, unstructured social media post, the task is to simplify it into a concise form.
This is a text generation task in which systems have to generate the normalized claims for the given social media posts.

# Steps to run the code

## 1. Start by processing the dataset to match the instruction finetuning format for the together.ai platform
```bash
python3 process_data.py
```
Run the above command on CLI and this will generate a JSONL file with reformatted data.

In [None]:
from typing import List, Dict, Any
import pandas as pd
import json
import argparse
import os

def read_prompt(path: Any) -> str:
    """
    Read the system prompt from a JSON file and return it as a string.

    The file is parsed as a single JSON document and the value stored under
    its ``"prompt"`` key is returned.

    Args:
        path (str | None): path to the file containing the system prompt.
            When ``None``, ``./prompt.jsonl`` is used.

    Returns:
        str: the value of the ``"prompt"`` key.

    Raises:
        SystemExit: with exit status 1 when the file does not exist, so a
            calling shell can tell the run failed.
    """
    import sys  # local import: the surrounding import cell does not provide sys

    file_path = "./prompt.jsonl" if path is None else path
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json_object = json.load(file)
    except FileNotFoundError:
        print("No file found for the prompt.\nPlease re-launch the program using python3 process_data.py -p path_to_prompt_file")
        # A bare exit() reports success (status 0); signal the failure instead.
        sys.exit(1)
    return json_object['prompt']
        
def read_data(path: Any) -> pd.DataFrame:
    """
    Read the data from a CSV file and return it as a pandas DataFrame.

    Args:
        path (str | None): path to the CSV file containing the data.
            When ``None``, ``./data/train.csv`` is used.

    Returns:
        pd.DataFrame: the result of ``pd.read_csv`` on the chosen path.

    Raises:
        SystemExit: with exit status 1 when reading fails for any reason
            (the underlying error is printed first).
    """
    import sys  # local import: the surrounding import cell does not provide sys

    fp = "./data/train.csv" if path is None else path
    try:
        return pd.read_csv(fp)
    except Exception as e:
        # Any read failure (missing file, parse error, ...) lands here; the
        # first line printed carries the actual cause.
        print(f"Error: {e}")
        print("No file found for the data.\nPlease re-launch the program using python3 process_data.py -d path_to_data_file")
        # A bare exit() reports success (status 0); signal the failure instead.
        sys.exit(1)
        


In [None]:
def process_data(data: pd.DataFrame, prompt: str) -> Any:
    """
    Re-format the data to match the instruction fine-tuning data style.

    Each row becomes one JSON line of the form
    ``{"messages": [system, user, assistant]}`` where the system message is
    ``prompt``, the user message is the row's ``"post"`` column and the
    assistant message is the row's ``"normalized claim"`` column.

    Args:
        data (pd.DataFrame): data to be processed; must contain the columns
            ``"post"`` and ``"normalized claim"``.
        prompt (str): system prompt for the fine-tuning data.

    Returns:
        str: path of the written file, ``./data/finetune_data.jsonl``.
    """
    output_file = "./data/finetune_data.jsonl"
    # Create ./data on demand so a fresh checkout does not fail on open().
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for post, claim in zip(data["post"], data["normalized claim"]):
            record = {
                "messages": [
                    {"content": prompt, "role": "system"},
                    {"content": post, "role": "user"},
                    {"content": claim, "role": "assistant"},
                ]
            }
            # json.dumps always emits well-formed JSON, so the record is
            # written directly (ASCII-escaped) with no intermediate buffer
            # file or re-parse step.
            outfile.write(json.dumps(record) + '\n')
    return output_file

In [None]:
# Driver cell: load the system prompt and the training set (None is passed
# as the path to both readers), then write the fine-tuning file.
file_path: str
PROMPT: str = read_prompt(None)
TRAIN_DATA: pd.DataFrame = read_data(None)

FINETUNE_DATA = process_data(TRAIN_DATA, PROMPT)
print(f"Fine-tuning data has been created and saved in the file: {FINETUNE_DATA}")

## 2. Upload the file to together.ai's fine-tuning queue.
    
First set your account's API key to an environment variable named TOGETHER_API_KEY:
    
```bash
export TOGETHER_API_KEY=xxxxx
```
    
Install together library 
    
```bash
pip install together --upgrade
```

In [None]:
import os
from together import Together
from typing import Any
import argparse

def uploadfile(filepath:Any):
    """
    Upload a fine-tuning data file to together.ai and return the response.

    Args:
        filepath (str | None): path of the file to upload. When ``None``,
            ``./data/finetune_data.jsonl`` is used.

    Returns:
        The value of ``model_dump()`` called on the upload response.

    Raises:
        SystemExit: with exit status 1, after printing the error, for any
            failure (missing file, client construction, upload).
    """
    import sys  # local import: this cell's imports do not provide sys

    file_name = "./data/finetune_data.jsonl" if filepath is None else filepath
    try:
        # Check the path ourselves so a missing file is always reported as
        # FileNotFoundError, whatever the client library would raise.
        if not os.path.isfile(file_name):
            raise FileNotFoundError(f"No such file: '{file_name}'")
        client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
        file_resp = client.files.upload(file=file_name, check=True)
        return file_resp.model_dump()

    except Exception as e:
        print(f"Error: {e}")
        # `e` is an exception instance, so an identity comparison against the
        # class never matched; isinstance is the correct check.
        if isinstance(e, FileNotFoundError):
            print("No file found for the data.\nPlease re-launch the program using python3 process_data.py -d path_to_data_file")
        # A bare exit() reports success (status 0); signal the failure instead.
        sys.exit(1)

In [None]:
# Upload with no explicit path (None) and show what the service returned.
upload_response: Any = uploadfile(None)
print(f"File upload response:\n{upload_response}")

## 3. Create a fine-tuning job on together.ai
Run the below command to create a fine-tuning job on together.ai

### Python
```bash
python3 together_finetune.py -m model_name -f file-id
```
### CLI
```bash
together fine-tuning create \
  --training-file "file-629e58b4-ff73-438c-b2cc-f69542b27980" \
  --model "meta-llama/Meta-Llama-3.1-8B-Instruct-Reference" \
  --lora
```
The response object will have all the details of your job, including its ID and a `status` key that starts out as "pending":
```bash
{
  id='ft-66592697-0a37-44d1-b6ea-9908d1c81fbd', 
  training_file='file-63b9f097-e582-4d2e-941e-4b541aa7e328', 
  validation_file='', 
  model='meta-llama/Meta-Llama-3.1-8B-Instruct-Reference', 
  output_name='zainhas/Meta-Llama-3.1-8B-Instruct-Reference-30b975fd', 
... 
  status=<FinetuneJobStatus.STATUS_PENDING: 'pending'>
}
```

## 4. Monitoring the fine-tuning status
Go to your Dashboard on together.ai and look under jobs to monitor the fine-tuning progress. Alternatively, you can use the command below to get the status of the job.
```bash
together fine-tuning retrieve "ft-66592697-0a37-44d1-b6ea-9908d1c81fbd"
```
Your fine-tuning job will go through several phases, including `Pending` , `Queued` , `Running` , `Uploading` , and `Completed` .
## 5. Download Checkpoints
Once the fine-tuning job is completed, download the adapter checkpoints to run locally with your base model.

## 6. Downloading model from hugging face
Download the base version of your chosen model from hugging face

Authenticate yourself first by logging into your Hugging Face account
```bash
huggingface-cli login
```
Run the below command to download the model
```bash
huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct --include "original/*" --local-dir meta-llama/Meta-Llama-3.1-8B-Instruct
```

## 7. Run local Inference 
To evaluate the performance on the validation set, run inference locally using the below command
```bash
python3 evaluate.py
```
Make sure you update the paths to the model and adapters in your evaluate.py file

In [None]:
import os
import argparse
from together import Together
from typing import Any
import nltk
nltk.download('wordnet', quiet=True)
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def get_claim(model: Any, tokenizer: Any, user_prompt: str) -> str:
    """
    Generate a normalized claim for a single post.

    The fixed system prompt and the post are concatenated into one string,
    tokenized, and passed to ``model.generate``. Only the newly generated
    tokens (everything after the prompt) are decoded and returned.

    Args:
        model: model object exposing ``generate``; its ``device`` attribute,
            when present, decides where the inputs are placed.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        user_prompt (str): the social media post to extract a claim from.

    Returns:
        str: the decoded continuation with special tokens skipped.
    """
    sys_prompt = """You are a helpful AI assistant that can generate a summary of claims made in a given text in the style of a news headline.
    Based on the input text, extract a claim that is being made or implied and return it as a json object."""
    # Put the inputs where the model lives instead of assuming a GPU; fall
    # back to 'cuda' for model objects that expose no `device` attribute.
    target_device = getattr(model, "device", "cuda")
    inputs = tokenizer(f"{sys_prompt}\ninput_text:{user_prompt}", return_tensors="pt").to(target_device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,  # sampling is on, so repeated calls may differ
        top_k=50,
        top_p=0.95,
        temperature=0.3
    )
    # Slice off the prompt tokens so only the generated part is decoded.
    prompt_length = inputs['input_ids'].shape[1]
    generated_claim = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    torch.cuda.empty_cache()
    return generated_claim

def evaluate_model(model: Any, tokenizer: Any, input_data: Any):
    """
    Generate a claim for every row and return the average METEOR score.

    For each row of ``input_data`` a claim is generated from the ``"post"``
    column and scored against the ``"normalized claim"`` column. All generated
    claims are then written, as one JSON array, to
    ``generated_claims_dev.jsonl`` in the current directory.

    Args:
        model: model passed through to ``get_claim``.
        tokenizer: tokenizer passed through to ``get_claim``.
        input_data: DataFrame-like object providing ``iterrows()`` and
            ``len()``, with ``"post"`` and ``"normalized claim"`` columns.

    Returns:
        The ``np.mean`` of the per-row METEOR scores.
    """
    scores = []
    responses = []
    for _, item in tqdm(input_data.iterrows(), total=len(input_data)):
        response = get_claim(model, tokenizer, item['post'])
        hypothesis_tokens = word_tokenize(response)
        reference_tokens = word_tokenize(item['normalized claim'])
        # NLTK's signature is meteor_score(references, hypothesis): the gold
        # claim is the (single) reference and the generated text is the
        # hypothesis. METEOR is not symmetric, so the order matters.
        scores.append(meteor_score([reference_tokens], hypothesis_tokens))
        responses.append(response)
    # ensure_ascii=False writes raw non-ASCII characters, so pin the encoding
    # rather than depending on the platform default.
    with open("generated_claims_dev.jsonl", "w", encoding="utf-8") as f:
        json.dump(responses, f, ensure_ascii=False)

    return np.mean(scores)

In [None]:
# Evaluation driver: load the dev split, the local base model and the LoRA
# adapters, then report the average METEOR score.
file_path: str
DEV_DATA: pd.DataFrame

file_path = "./data/dev.csv"
DEV_DATA = pd.read_csv(file_path)

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("Device name:", torch.cuda.get_device_name(0))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model_path = "./meta-llama/Meta-Llama-3.1-8B-Instruct"  # Local directory of the base model
lora_adapter_path = "./lora-adapters"  # Path to your LoRA adapters

tokenizer = AutoTokenizer.from_pretrained(base_model_path, local_files_only=True)

# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token


model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    local_files_only=True,
    torch_dtype=torch.float16,
    device_map="auto"
)

model.config.pad_token_id = tokenizer.pad_token_id
model = PeftModel.from_pretrained(model, lora_adapter_path, is_trainable=False)
# No model.to(device) here: device_map="auto" has already placed the weights,
# and moving a dispatched model afterwards is redundant at best and can fail
# when the weights are spread over several devices.

METEROR_SCORE = evaluate_model(model, tokenizer, DEV_DATA)

print(f"Average METEOR Score: {METEROR_SCORE}")

In [None]:
# Credentials are read from the environment; a value is None when unset.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
client = Together(api_key=TOGETHER_API_KEY)

## Pipeline flow of the task2

The pipeline flow for the task 2 is as follows:
1. Data Preprocessing: Clean and preprocess the data by reformatting to the finetuning format for the model of your choice.
2. Fine-tuning: Upload the dataset file and create a new fine-tuning job 
3. Evaluation: Use the model to extract claims from the social media posts and calculate the average METEOR score on the development set.
4. Training: Train the fine-tuned model using the training loop if the performance is not satisfactory.
5. Inference: Use the model to extract claims from the social media posts from the test set.

# Fine-tuning Techniques

In [None]:
# Credentials are read from the environment; a value is None when unset.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
client = Together(api_key=TOGETHER_API_KEY)

## Using LoRA on Together.ai

In [None]:
import os
from together import Together

client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

# Create a LoRA fine-tuning job on together.ai for an already uploaded file.
response = client.fine_tuning.create(
    training_file='file-5e32a8e6-72b3-485d-ab76-71a73d9e1f5b',
    model='meta-llama/Meta-Llama-3-8B',
    lora=True,
    n_epochs=3,
    n_checkpoints=1,
    batch_size="max",
    learning_rate=1e-5,
    suffix='my-demo-finetune',
    # Weights & Biases key for monitoring the run. It is read from the
    # environment, like the Together key, instead of being pasted into the
    # notebook; it is None when WANDB_API_KEY is unset.
    wandb_api_key=os.environ.get("WANDB_API_KEY"),
    lora_r=16,
    lora_dropout=0.5,
)

print(response)

## LoRA with Hugging Face libraries

### Loading and Preparing the Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "meta-llama/Llama-2-7b-hf"

# Fetch the tokenizer and the causal-LM weights for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Run PEFT's k-bit training preparation on the loaded model.
# NOTE(review): the model above is loaded without any quantization settings;
# confirm that k-bit preparation is what is wanted here.
model = prepare_model_for_kbit_training(model)


### LoRA Configuration

In [None]:

# LoRA adapter settings for a causal language model: rank 16, alpha 32,
# 5% dropout, bias setting "none".
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

### Applying LoRA to the Model

In [None]:
# Wrap the model with the LoRA configuration; `model` is rebound to the
# object returned by get_peft_model.
model = get_peft_model(model, lora_config)

# Load the training data.
# NOTE(review): "your_dataset" looks like a placeholder; replace it with the
# real dataset name or path before running.
dataset = load_dataset("your_dataset")

### Training Configuration

In [None]:
# Trainer hyperparameters: 3 epochs at batch size 4 per device, learning rate
# 2e-5, with a checkpoint saved every 100 steps and a save limit of 2.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=100,
    save_total_limit=2,
)

### SFT Trainer Initialization

In [None]:
# Build the supervised fine-tuning trainer from the model, the tokenizer,
# the "train" split of the dataset and the training arguments.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    args=training_args,
)

### Training and saving the model

In [None]:
# Run the fine-tuning loop, then write the resulting model to
# ./fine_tuned_model.
trainer.train()
trainer.save_model(output_dir="./fine_tuned_model")

In [None]:
from openai import OpenAI
import os

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Report only whether the key is present; printing the key itself would store
# the secret in the notebook output.
print("OPENAI_API_KEY is set" if OPENAI_API_KEY else "OPENAI_API_KEY is not set")
client = OpenAI(api_key=OPENAI_API_KEY)

# Minimal chat-completion request to check that the client works.
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": "Write a one-sentence bedtime story about a unicorn."
        }
    ]
)

print(completion.choices[0].message.content)