### Install and Import Libraries

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate evaluate rouge_score bert_score

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m129.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.2 MB/s[0m eta [36m0:00

In [1]:
import pandas as pd
import torch
import os
import sys
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft import PeftModel, PeftConfig
from huggingface_hub import notebook_login
import warnings
warnings.filterwarnings("ignore")
import evaluate
from tqdm import tqdm

### Hugging Face login

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Model Configurations

In [3]:
class ModelConfig:
    """
    Configuration class for the CausalLM fine-tuning script.
    """
    # --- Model Selection ---
    MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

    # --- Hugging Face Hub Integration ---
    PUSH_TO_HUB = True
    # IMPORTANT: Hugging Face repository ID (username/model-name)
    HUB_MODEL_ID = "TripleH/Meta-Llama-3-8B-Instruct-qlora-modular"

    # --- Data and Paths ---
    # The input file path
    INPUT_FILE_PATH = "/content/drive/MyDrive/Colab/football_dataset_contrarian_claims_with_reasoning.csv"
    OUTPUT_DIR = "/content/drive/MyDrive/football_model_llama/results"
    LOGGING_DIR = "/content/drive/MyDrive/football_model_llama/logs"

    # --- Preprocessing ---
    INPUT_COLUMN = "claim"
    TARGET_FOOTBALL_COLUMN = "Football_Term"
    TARGET_REASONING_COLUMN = "Reasoning"

    # --- Tokenization ---
    MAX_SEQ_LENGTH = 384

    # --- Training ---
    NUM_TRAIN_EPOCHS = 6
    PER_DEVICE_TRAIN_BATCH_SIZE = 1
    PER_DEVICE_EVAL_BATCH_SIZE = 1
    GRADIENT_ACCUMULATION_STEPS = 4
    WARMUP_STEPS = 100
    WEIGHT_DECAY = 0.01
    LEARNING_RATE = 2e-4

    # Early stopping will halt training if validation loss doesn't improve for 3 evaluations
    EARLY_STOPPING_PATIENCE = 3


### Format Prompt and Data Transformation for Llama-3-8B

In [4]:
import pandas as pd
import sys
from datasets import Dataset


def create_chat_prompt(example, config: ModelConfig):
    """
    Creates a chat-formatted prompt with a more specific system message
    to encourage the desired "commentator" style output.
    """
    system_prompt = "You are a witty football commentator who refutes claims with a clever, direct analogy. You MUST first provide a short, punchy 'Football Term' you would hear on a broadcast, followed by a 'Reasoning' that explains the connection. Your response must strictly follow this two-part format."
    user_prompt = f"Refute this claim: '{example[config.INPUT_COLUMN]}'"
    model_answer = f"**Football Term:** {example[config.TARGET_FOOTBALL_COLUMN]}\n**Reasoning:** {example[config.TARGET_REASONING_COLUMN]}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": model_answer}
    ]

    return {"messages": messages}


def load_and_preprocess_data(config: ModelConfig, file_path: str) -> Dataset:
    """
    Loads and preprocesses the data for CausalLM fine-tuning.
    Accepts a file_path argument passed from the main script.
    """
    print(f"Loading data from {config.INPUT_FILE_PATH}...")
    try:
        df = pd.read_csv(file_path, encoding='latin1')
    except FileNotFoundError:
        print(f"Error: Dataset not found at '{config.INPUT_FILE_PATH}'", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error loading CSV: {e}", file=sys.stderr)
        return None

    df.columns = df.columns.str.strip()
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(lambda x: create_chat_prompt(x, config), remove_columns=dataset.column_names)
    print("Data loading and formatting complete.")
    return dataset


### LoRA Configuration and Training Arguments

In [5]:
import os
import warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
warnings.filterwarnings("ignore", category=FutureWarning)
# --- End Suppress Warnings ---

import torch
import sys
from getpass import getpass
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def run_training(config: ModelConfig, file_path: str):
    """Main function to orchestrate the fine-tuning process."""
    # --- Device and Auth ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device} ---")

    # --- Tokenizer and Data Loading ---
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        formatted_text = tokenizer.apply_chat_template(examples['messages'], tokenize=False, add_generation_prompt=False)
        return tokenizer(formatted_text, truncation=True, max_length=config.MAX_SEQ_LENGTH, padding="max_length")

    full_dataset = load_and_preprocess_data(config, file_path)
    if full_dataset is None:
        return

    tokenized_dataset = full_dataset.map(tokenize_function, remove_columns=["messages"])

    train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']

    # --- QLoRA Configuration ---
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # --- Model Loading and Preparation ---
    model = AutoModelForCausalLM.from_pretrained(config.MODEL_NAME, quantization_config=bnb_config, device_map="auto")
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    model.config.use_cache = False

    # --- Model Training ---
    training_args = TrainingArguments(
        output_dir=config.OUTPUT_DIR,
        num_train_epochs=config.NUM_TRAIN_EPOCHS,
        learning_rate=config.LEARNING_RATE,
        per_device_train_batch_size=config.PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.PER_DEVICE_EVAL_BATCH_SIZE,
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=config.WARMUP_STEPS,
        weight_decay=config.WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy = "epoch",
        load_best_model_at_end=True, # This is crucial for early stopping
        metric_for_best_model="loss", # Monitor validation loss
        push_to_hub=config.PUSH_TO_HUB,
        hub_model_id=config.HUB_MODEL_ID,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)]
    )

    print("--- Starting QLoRA Fine-Tuning ---")
    trainer.train()
    print("--- Fine-Tuning Complete ---")

    if config.PUSH_TO_HUB:
        print(f"Uploading LoRA adapters to Hugging Face Hub: {config.HUB_MODEL_ID}")
        trainer.push_to_hub()
        print("Adapters uploaded successfully!")


### Final Model Training

In [None]:
def main():

    # Initialize the configuration
    config = ModelConfig()

    # Pass the configuration and the filename to the training utility
    run_training(config, config.INPUT_FILE_PATH)

if __name__ == "__main__":
    main()


--- Using device: cuda ---
Loading data from /content/drive/MyDrive/Colab/football_dataset_contrarian_claims_with_reasoning.csv...


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Data loading and formatting complete.


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


--- Starting QLoRA Fine-Tuning ---


Epoch,Training Loss,Validation Loss
1,4.6094,3.766745
2,2.7371,1.532767
3,1.1169,1.022627
4,0.7548,0.95375
5,0.4412,0.913543
6,0.2324,0.86589


--- Fine-Tuning Complete ---
Uploading LoRA adapters to Hugging Face Hub: TripleH/Meta-Llama-3-8B-Instruct-qlora-modular
Adapters uploaded successfully!


### Inference Configurations

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import sys

# --- CONFIGURATION ---
class BatchInferenceConfig:
    """
    Configuration class for the QLoRA inference script.
    """
    # --- Model Selection ---
    # The Hugging Face Hub repository ID of the fine-tuned LoRA adapters.
    ADAPTER_MODEL_PATH = "TripleH/Meta-Llama-3-8B-Instruct-qlora-modular"

    # --- Data Paths ---
    # The path to the original dataset file.
    INPUT_DATA_PATH = "/content/drive/MyDrive/Colab/football_dataset_contrarian_claims_with_reasoning.csv"
    # The name of the file where the full predictions and ground truth will be saved.
    OUTPUT_DATA_PATH = "/content/drive/MyDrive/Colab/predictions_Meta-Llama-3-8B-Instruct.csv"
    # The name for a cleaner CSV with just the claims and predictions.
    CLEAN_OUTPUT_DATA_PATH = "/content/drive/MyDrive/Colab/predictions_clean_Meta-Llama-3-8B-Instruct.csv"

    # --- Generation Parameters ---
    # Switching to sampling with temperature for more creative outputs.
    MAX_NEW_TOKENS = 128
    DO_SAMPLE = True
    TEMPERATURE = 0.1  # Controls randomness. Higher is more creative.
    TOP_P = 0.9        # Nucleus sampling. Helps prevent the model from going off-topic.

# --- END CONFIGURATION ---

### Loading Fine Tuned Model and Model Merging

In [7]:
def load_model_and_tokenizer(adapter_path: str):
    """
    Loads the base model in 4-bit, applies the LoRA adapters,
    and prepares it for inference.
    """
    print(f"Loading fine-tuned model from Hub: {adapter_path}...")
    try:
        # First, get the base model name from the adapter's config using PeftConfig
        peft_config = PeftConfig.from_pretrained(adapter_path)
        base_model_name = peft_config.base_model_name_or_path

        print(f"Base model identified as: {base_model_name}")

        # Configure 4-bit quantization to load the base model efficiently
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        # Load the base model with quantization
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=bnb_config,
            device_map="auto", # Automatically map layers to GPU/CPU
            trust_remote_code=True
        )

        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the LoRA adapter and merge it into the base model
        print("Loading LoRA adapters and merging with base model...")
        model = PeftModel.from_pretrained(base_model, adapter_path)
        model = model.merge_and_unload() # Merge adapters for faster inference

        model.eval() # Set the model to evaluation mode
        print("Model and tokenizer loaded successfully.")
        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {e}", file=sys.stderr)
        return None, None


### Generating Predictions

In [8]:
def generate_prediction(model, tokenizer, claim_text: str, config: BatchInferenceConfig):
    """Generates a prediction for a single claim text using the chat template."""

    # Define the system and user prompts, exactly matching the training format.
    system_prompt = "You are an expert sports analyst who refutes claims with a football analogy. You MUST first provide a concise, metaphorical 'Football Term', followed by a 'Reasoning' that explains the connection. Your response must strictly follow this two-part format."
    user_prompt = f"Refute this claim: '{claim_text}'"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Apply the template and get the input IDs.
    # add_generation_prompt=True adds the special tokens to signal the model to start generating.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Generate output using the fine-tuned model
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs,
            max_new_tokens=config.MAX_NEW_TOKENS,
            do_sample=config.DO_SAMPLE,
            temperature=config.TEMPERATURE,
            top_p=config.TOP_P,
            pad_token_id=tokenizer.eos_token_id,
            use_cache = False
        )

    # Decode the generated tokens back to a string
    # We only want the newly generated part, so we slice the output
    generated_text = tokenizer.decode(output_sequences[0][inputs.shape[1]:], skip_special_tokens=True)
    return generated_text.strip()


### Model Inference

In [9]:
def main():
    """
    Main function to run batch inference on the entire dataset.
    """
    config = BatchInferenceConfig()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device} ---")

    model, tokenizer = load_model_and_tokenizer(config.ADAPTER_MODEL_PATH)
    if model is None:
        return

    try:
        df = pd.read_csv(config.INPUT_DATA_PATH, encoding='latin1')
        df.columns = df.columns.str.strip()
    except FileNotFoundError:
        print(f"Error: Input data file not found at '{config.INPUT_DATA_PATH}'.", file=sys.stderr)
        return

    predictions = []
    ground_truths = []

    print(f"\n--- Generating predictions for {len(df)} claims ---")
    # Use tqdm for a progress bar
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing claims"):
        claim = row['claim']

        # Generate the model's prediction
        prediction = generate_prediction(model, tokenizer, claim, config)
        predictions.append(prediction)

        # Construct the ground truth for comparison
        ground_truth = f"**Football Term:** {row['Football_Term']}\n**Reasoning:** {row['Reasoning']}"
        ground_truths.append(ground_truth)

    # Add the new columns to the DataFrame
    df['ground_truth'] = ground_truths
    df['model_prediction'] = predictions

    # Save the full results to the first CSV file
    df.to_csv(config.OUTPUT_DATA_PATH, index=False)

    # Create and save the second, cleaner CSV file
    clean_df = df[['claim', 'model_prediction']]
    clean_df.to_csv(config.CLEAN_OUTPUT_DATA_PATH, index=False)

    print("\n--- Batch Inference Complete ---")
    print(f"Full results with ground truth saved to '{config.OUTPUT_DATA_PATH}'")
    print(f"Clean predictions saved to '{config.CLEAN_OUTPUT_DATA_PATH}'")


if __name__ == "__main__":
    main()


--- Using device: cuda ---
Loading fine-tuned model from Hub: TripleH/Meta-Llama-3-8B-Instruct-qlora-modular...


adapter_config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

Base model identified as: meta-llama/Meta-Llama-3-8B-Instruct


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Loading LoRA adapters and merging with base model...


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Model and tokenizer loaded successfully.

--- Generating predictions for 50 claims ---


Processing claims:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing claims: 100%|██████████| 50/50 [10:19<00:00, 12.39s/it]


--- Batch Inference Complete ---
Full results with ground truth saved to '/content/drive/MyDrive/Colab/predictions_Meta-Llama-3-8B-Instruct.csv'
Clean predictions saved to '/content/drive/MyDrive/Colab/predictions_clean_Meta-Llama-3-8B-Instruct.csv'





### Evaluation Configurations

In [9]:
# --- CONFIGURATION ---
class EvaluationConfig:
    """
    Configuration class for the model evaluation script.
    """
    # --- Model Selection ---
    # IMPORTANT: Update this to the Hugging Face Hub repository ID of the model you want to evaluate.
    # Example: "TripleH/football-rebuttal-model-phi3-final"
    ADAPTER_MODEL_PATH = "TripleH/Meta-Llama-3-8B-Instruct-qlora-modular"

    # --- Data Paths ---
    # The path to your original dataset file.
    INPUT_DATA_PATH = "/content/drive/MyDrive/Colab/predictions_Meta-Llama-3-8B-Instruct.csv"
    # The name of the file where the evaluation results will be saved.
    # IMPORTANT: Change this for each model to avoid overwriting results.
    OUTPUT_FILE_PATH = "evaluation_results_llama-8b.csv"

    # --- Generation Parameters ---
    MAX_NEW_TOKENS = 128
    TEMPERATURE = 0.1
    TOP_P = 0.9
    DO_SAMPLE = True

# --- END CONFIGURATION ---


### Model Evaluation

In [10]:
def main():
    """
    Main function to run batch inference and evaluation on the entire dataset.
    """
    config = BatchInferenceConfig()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device} ---")

    model, tokenizer = load_model_and_tokenizer(config.ADAPTER_MODEL_PATH)
    if model is None:
        return

    try:
        df = pd.read_csv(config.INPUT_DATA_PATH, encoding='latin1')
        df.columns = df.columns.str.strip()
    except FileNotFoundError:
        print(f"Error: Input data file not found at '{config.INPUT_DATA_PATH}'.", file=sys.stderr)
        return

    # Load metrics
    rouge_metric = evaluate.load("rouge")
    bleu_metric = evaluate.load("bleu")
    bertscore_metric = evaluate.load("bertscore")

    results = []

    print(f"\n--- Generating predictions and evaluating for {len(df)} claims ---")
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing claims"):
        claim = row['claim']

        # Generate the model's prediction
        prediction = generate_prediction(model, tokenizer, claim, config)

        # Construct the ground truth for comparison
        ground_truth = f"**Football Term:** {row['Football_Term']}\n**Reasoning:** {row['Reasoning']}"

        # Calculate metrics for this single example
        rouge_scores = rouge_metric.compute(predictions=[prediction], references=[ground_truth])
        bleu_scores = bleu_metric.compute(predictions=[prediction], references=[[ground_truth]])
        bert_scores = bertscore_metric.compute(predictions=[prediction], references=[ground_truth], lang="en")

        results.append({
            "claim": claim,
            "ground_truth": ground_truth,
            "model_prediction": prediction,
            "rouge1": rouge_scores['rouge1'],
            "rouge2": rouge_scores['rouge2'],
            "rougeL": rouge_scores['rougeL'],
            "bleu": bleu_scores['bleu'],
            "bertscore_precision": bert_scores['precision'][0],
            "bertscore_recall": bert_scores['recall'][0],
            "bertscore_f1": bert_scores['f1'][0]
        })

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # --- Calculate and Print Average Metrics ---
    print("\n--- Average Evaluation Metrics ---")
    avg_metrics = {
        "rouge1": results_df['rouge1'].mean(),
        "rouge2": results_df['rouge2'].mean(),
        "rougeL": results_df['rougeL'].mean(),
        "bleu": results_df['bleu'].mean(),
        "bertscore_precision": results_df['bertscore_precision'].mean(),
        "bertscore_recall": results_df['bertscore_recall'].mean(),
        "bertscore_f1": results_df['bertscore_f1'].mean()
    }

    header = f"| {'Metric':<25} | {'Average Score':<20} |"
    separator = f"|{'-'*27}|{'-'*22}|"
    print(separator)
    print(header)
    print(separator)
    for key, value in avg_metrics.items():
        print(f"| {key:<25} | {value:<20.4f} |")
    print(separator)

    # Save the results to a new CSV file
    results_df.to_csv(config.OUTPUT_DATA_PATH, index=False)

    print("\n--- Batch Evaluation Complete ---")
    print(f"✅ Evaluation results saved to '{config.OUTPUT_DATA_PATH}'")

if __name__ == "__main__":
    main()

--- Using device: cuda ---
Loading fine-tuned model from Hub: TripleH/Meta-Llama-3-8B-Instruct-qlora-modular...


adapter_config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

Base model identified as: meta-llama/Meta-Llama-3-8B-Instruct


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Loading LoRA adapters and merging with base model...


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


--- Generating predictions and evaluating for 50 claims ---


Processing claims:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing claims: 100%|██████████| 50/50 [10:26<00:00, 12.54s/it]



--- Average Evaluation Metrics ---
|---------------------------|----------------------|
| Metric                    | Average Score        |
|---------------------------|----------------------|
| rouge1                    | 0.1441               |
| rouge2                    | 0.0278               |
| rougeL                    | 0.1231               |
| bleu                      | 0.0922               |
| bertscore_precision       | 0.8352               |
| bertscore_recall          | 0.8762               |
| bertscore_f1              | 0.8552               |
|---------------------------|----------------------|

--- Batch Evaluation Complete ---
✅ Evaluation results saved to '/content/drive/MyDrive/Colab/predictions_Meta-Llama-3-8B-Instruct.csv'
