### Evaluation Notebook
- Evaluates perplexity and hallucination (automated evaluation metrics)
- Perplexity is calculated using Hugging Face Transformers
- Hallucination is calculated using SelfCheckGPT scores

##### Installation / Google Drive Setup

In [None]:
! pip install selfcheckgpt
! pip install spacy

from google.colab import drive
drive.mount('/content/drive')

##### Loading LoRA Model

In [None]:
# Load the fine-tuned LoRA model and its tokenizer from Google Drive.
from unsloth import FastLanguageModel

# Loader settings (kept as module-level names; other cells may read them).
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    # YOUR MODEL YOU USED FOR TRAINING
    model_name = "/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/lora",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

##### Perplexity

In [None]:
# Imports
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

import pandas as pd
from datasets import Dataset

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Rebind `model` to the copy placed on `device`.
# NOTE(review): the model was loaded with load_in_4bit=True; some transformers
# versions refuse `.to()` on quantized models - confirm this call is still needed.
model = model.to(device)

# Load the CSV file
df = pd.read_csv("/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/cycle2_test_reformatted.csv")

# Alpaca-style prompt template; the named fields are filled in once per row.
PERPLEXITY_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# Combine the columns into a single text input for the model, ROW BY ROW.
# Interpolating df["instruction"] etc. directly into one f-string would embed
# the printed repr of the whole Series and give every row the same text.
# Missing cells are rendered as empty strings instead of the literal "nan".
prompt_fields = df[["instruction", "input", "output"]].fillna("").astype(str)
df["text"] = [
    PERPLEXITY_PROMPT_TEMPLATE.format(
        instruction=instruction_text,
        input=input_text,
        output=output_text,
    )
    for instruction_text, input_text, output_text
    in prompt_fields.itertuples(index=False, name=None)
]

# Create a HuggingFace dataset
dataset = Dataset.from_pandas(df[["text"]])

# Tokenize the prompt text so the dataset matches the model's input format.
def preprocess_function(examples):
    """Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Args:
        examples: a batch from `dataset.map(..., batched=True)`; its "text"
            entry is passed straight to the tokenizer.

    Returns:
        Whatever the tokenizer returns for that batch (truncation enabled).
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Replace the raw "text" column with the tokenizer's output columns.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
)

# Evaluate Perplexity using Torch
def compute_perplexity(model, dataset, tokenizer, batch_size=8):
    """Compute token-level perplexity of `model` over `dataset`.

    Args:
        model: causal language model that returns `.loss` when called with
            `labels`.
        dataset: HuggingFace dataset. Either already tokenized (has an
            "input_ids" column) or raw with a "text" column, in which case it
            is tokenized here with `tokenizer`.
        tokenizer: tokenizer used for padding (and for tokenizing raw text).
        batch_size: number of examples per evaluation batch.

    Returns:
        float: exp(mean negative log-likelihood per scored token).

    Raises:
        ValueError: if the dataset yields no tokens to score.
    """
    # Use the dataset that was passed in rather than a notebook global.
    # A raw "text" dataset is still accepted so existing calls keep working.
    if "input_ids" not in dataset.column_names:
        dataset = dataset.map(
            lambda examples: tokenizer(examples["text"], truncation=True),
            batched=True,
            remove_columns=dataset.column_names,
        )

    # With mlm=False the collator pads each batch and builds `labels` from
    # `input_ids`, replacing pad positions with -100 so they are not scored.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    dataloader = DataLoader(dataset, collate_fn=data_collator, batch_size=batch_size)

    # Send inputs to wherever the model's weights already live.
    model_device = next(model.parameters()).device
    model.eval()
    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)
            labels = batch["labels"].to(model_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )

            # The causal-LM loss is a mean over the shifted, non-ignored label
            # positions; weight it by that count (not by the padded sequence
            # length) so batches of different sizes are combined correctly.
            num_scored = int((labels[:, 1:] != -100).sum().item())
            if num_scored == 0:
                continue
            total_nll += outputs.loss.item() * num_scored
            total_tokens += num_scored

    if total_tokens == 0:
        raise ValueError("Cannot compute perplexity: the dataset contains no tokens to score.")

    avg_loss = total_nll / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# Pass the tokenized dataset so the function does not have to re-tokenize.
perplexity = compute_perplexity(model, tokenized_datasets, tokenizer)
print(f"Perplexity: {perplexity}")

##### Hallucination

In [None]:
import torch
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt
# Run the SelfCheckGPT evaluator on the GPU when available, else on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hub id of the LLM that SelfCheckGPT prompts to judge each sentence.
llm_model = "unsloth/Meta-Llama-3.1-8B"
selfcheck_prompt = SelfCheckLLMPrompt(model=llm_model, device=device)

In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Example to evaluate.
example_instruction = "Generate a test case for the following UI Element:"
example_input = "Paragraph Element: 'February 6: Sámi National Day (1917); Waitangi Day in New Zealand (1840)' With Color rgb(32, 33, 34) With Background Color rgba(0, 0, 0, 0) from the website: https://en.wikipedia.org/wiki/Main_Page"

NUM_SAMPLES = 3        # stochastic samples the main response is checked against
MAX_NEW_TOKENS = 256   # generation budget per response


def generate_response(instruction, input_text, do_sample=False):
    """Generate one response from the fine-tuned model for an Alpaca prompt.

    Args:
        instruction: text for the "### Instruction" section.
        input_text: text for the "### Input" section.
        do_sample: False for greedy decoding (the response being checked),
            True for a stochastic sample.

    Returns:
        str: the newly generated text only (prompt removed, whitespace stripped).
    """
    # Leave the response slot blank so the model writes it.
    prompt = alpaca_prompt.format(instruction, input_text, "")
    encoded = tokenizer([prompt], return_tensors="pt").to(device)

    generation_kwargs = {"max_new_tokens": MAX_NEW_TOKENS, "do_sample": do_sample}
    if do_sample:
        generation_kwargs["temperature"] = 1.0

    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = encoded["input_ids"].shape[1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True).strip()


torch.manual_seed(0)  # make the sampled passages reproducible

# Response 1: the passage whose sentences are scored.
outputs = generate_response(example_instruction, example_input, do_sample=False)

# Response 2 / Samples: independent stochastic generations for the same prompt.
repeated_outputs = [
    generate_response(example_instruction, example_input, do_sample=True)
    for _ in range(NUM_SAMPLES)
]

# SelfCheckGPT expects a list of sentence strings and a list of passage
# strings (not tokenizer encodings). Split with spaCy's rule-based
# sentencizer, which needs no downloaded language model.
sentence_splitter = spacy.blank("en")
sentence_splitter.add_pipe("sentencizer")
sentences = [
    sent.text.strip()
    for sent in sentence_splitter(outputs).sents
    if sent.text.strip()
]
samples = repeated_outputs

if not sentences:
    print("The model produced an empty response; there is nothing to score.")
else:
    sent_scores_prompt = selfcheck_prompt.predict(
        sentences = sentences,
        sampled_passages = samples,
        verbose = True,
    )
    print(sent_scores_prompt)