### Evaluation Notebook
- Evaluates perplexity and hallucination (automated evaluation metrics)
- Perplexity is calculated using Hugging Face Transformers
- Hallucination is calculated using SelfCheckGPT scores

##### Installation / Google Drive Setup

In [None]:
# Install SelfCheckGPT (imported in the hallucination section) and spaCy
# (used later in this notebook for sentence splitting).
! pip install selfcheckgpt
! pip install spacy

# Mount Google Drive: the LoRA checkpoint and the test CSV used below are read
# from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

##### Loading LORA Model

In [None]:
# Load the fine-tuned LoRA checkpoint from Google Drive through Unsloth.
from unsloth import FastLanguageModel

# Loading options, passed unchanged to `from_pretrained` below.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/lora",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model into Unsloth's inference mode (enables native 2x faster inference).
FastLanguageModel.for_inference(model)

##### Perplexity

In [None]:
# Imports
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

import pandas as pd
from datasets import Dataset

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model loaded in the previous cell onto the selected device.
model = model.to(device)

# Load the CSV file
df = pd.read_csv("/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/cycle2_test_reformatted.csv")

# Prompt layout used for every evaluation example. The placeholders are filled
# in per row below.
EVAL_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

# Combine the columns into a single text input for the model, one prompt per row.
# Formatting an f-string with `df["instruction"]` directly would interpolate the
# repr of the whole Series and assign that same string to every row, so the
# template is applied record by record instead.
# Empty CSV cells are read as NaN; render them as empty strings rather than "nan".
prompt_fields = df[["instruction", "input", "output"]].fillna("").astype(str)
df["text"] = [
    EVAL_PROMPT_TEMPLATE.format(**fields)
    for fields in prompt_fields.to_dict("records")
]

# Create a HuggingFace dataset
dataset = Dataset.from_pandas(df[["text"]])

# Tokenize the prompts so they match the model's input format.
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples, with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Apply the tokenizer in batches and drop the raw "text" column afterwards.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
)

# Evaluate Perplexity using Torch
def compute_perplexity(model, dataset, tokenizer):
    """Compute corpus-level perplexity of `model` over `dataset`.

    Args:
        model: Causal language model. It is called with `input_ids`,
            `attention_mask` and `labels`, and must return an object whose
            `.loss` is the mean loss over the scored (non -100) target tokens.
        dataset: Tokenized examples (each providing `input_ids`). A dataset
            exposing `column_names` without an `input_ids` column is tokenized
            here from its "text" column first.
        tokenizer: Tokenizer used for padding by the data collator (and for
            tokenizing a raw dataset).

    Returns:
        float: exp(total negative log-likelihood / number of scored tokens).

    Raises:
        ZeroDivisionError: if no target token was scored (e.g. empty dataset).
    """
    # Accept a raw text dataset as well, so the function really depends on its
    # `dataset` argument instead of a notebook-level global.
    column_names = getattr(dataset, "column_names", None)
    if column_names is not None and "input_ids" not in column_names:
        dataset = dataset.map(
            lambda examples: tokenizer(examples["text"], truncation=True),
            batched=True,
            remove_columns=column_names,
        )

    # With mlm=False the collator pads each batch and builds `labels` from the
    # input ids; padding positions are expected to be marked with -100 so they
    # are ignored by the loss (see the collator's documentation).
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    dataloader = DataLoader(dataset, collate_fn=data_collator, batch_size=8)
    model.eval()
    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # A causal LM predicts token t+1 from tokens <= t, so the first
            # position of every sequence is never a target.
            num_tokens = int((labels[:, 1:] != -100).sum())
            if num_tokens == 0:
                # Nothing to score in this batch; its mean loss would be undefined.
                continue

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # `outputs.loss` is a per-token mean, so weight it by the number of
            # scored tokens (not by the padded sequence length) to recover the
            # batch's summed negative log-likelihood.
            total_nll += outputs.loss.item() * num_tokens
            total_tokens += num_tokens

    avg_loss = total_nll / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# Score the tokenized examples (the raw `dataset` only holds the prompt text).
perplexity = compute_perplexity(model, tokenized_datasets, tokenizer)
print(f"Perplexity: {perplexity}")

##### Hallucination

In [None]:
# Dependencies for the hallucination section. selfcheckgpt and spacy are also
# installed in the setup cell at the top of the notebook.
!pip install numpy
!pip install selfcheckgpt
!pip install spacy

In [None]:
import numpy as np
import spacy
import torch
from transformers import pipeline

from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt

In [None]:
# Reuse the fine-tuned model and tokenizer as the judge model for the
# consistency checks further down.
main_model = model
main_tokenizer = tokenizer

# Seed torch's RNG before any sampling is done below.
torch.manual_seed(28)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Text-generation pipeline used to produce the main response and the samples.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

# Prompt for a single UI element, laid out in the instruction/input/response format.
prompt = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a test case for the following UI Element: Link Element 'Euskara' With URL https://eu.wikipedia.org/wiki/ from the website: https://en.wikipedia.org/wiki/Main_Page

### Input:


### Response:

"""

# As per the original paper the response is generated with greedy decoding
greedy_outputs = pipe(
    prompt,
    do_sample=False,
    max_new_tokens=128,
    return_full_text=False,
)

# The samples are generated for the same prompt with temperature as 1.
N = 20
sampled_outputs = pipe(
    [prompt] * N,
    temperature=1.0,
    do_sample=True,
    max_new_tokens=128,
    return_full_text=False,
)
print(sampled_outputs[0])

# Keep only the generated text: one string for the main response and one
# string per sampled continuation.
Response = greedy_outputs[0]["generated_text"]
Samples = [outputs[0]["generated_text"] for outputs in sampled_outputs]

from typing import Dict, List, Set, Tuple, Union

# Put the judge model in evaluation mode on the device selected earlier in
# this notebook (`device` is always set there, so no None fallback is needed).
main_model.eval()
main_model.to(device)

# Yes/No question asked to the judge model for every (sample, sentence) pair.
prompt_template = "Context: {context}\n\nSentence: {sentence}\n\nIs the sentence supported by the context above? Answer Yes or No.\n\nAnswer: "
# Score per normalized answer: supported -> 0.0, unsupported -> 1.0, unclear -> 0.5.
text_mapping = {'yes': 0.0, 'no': 1.0, 'n/a': 0.5}
# Collects answers that were neither "yes" nor "no", for later inspection.
not_defined_text = set()
# Report the model's class name; formatting the model object itself would print
# its entire module tree.
print(f"SelfCheck-LLMPrompt ({type(main_model).__name__}) initialized to device {device}")

from tqdm import tqdm
# spaCy pipeline used for sentence tokenization of the main response.
nlp = spacy.load("en_core_web_sm")
response_doc = nlp(Response)
# One stripped string per detected sentence.
sentences = [span.text.strip() for span in response_doc.sents]

def pred(sentences, sampled_passages, verbose=True):
    """Score each sentence against every sampled passage with the judge model.

    For every (sentence, sample) pair the judge model is asked whether the
    sentence is supported by the sample; the answer is mapped to a number by
    `text_postprocessing`.

    Args:
        sentences: list of sentences from the main response.
        sampled_passages: list of sampled passages used as context.
        verbose: whether to show a progress bar over the sentences.

    Returns:
        numpy array with one value per sentence: the mean score over all samples.
    """
    num_sentences = len(sentences)
    num_samples = len(sampled_passages)
    scores = np.zeros((num_sentences, num_samples))
    disable = not verbose
    for sent_i in tqdm(range(num_sentences), disable=disable):
        sentence = sentences[sent_i]
        for sample_i, sample in enumerate(sampled_passages):

            # this seems to improve performance when using the simple prompt template
            sample = sample.replace("\n", " ")

            # Use the shared notebook-level template rather than a second copy
            # of the same literal.
            prompt = prompt_template.format(context=sample, sentence=sentence)
            inputs = main_tokenizer(prompt, return_tensors="pt").to(device)
            generate_ids = main_model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=5,
                do_sample=False,  # hf's default for Llama2 is True
            )
            # The generated sequence starts with the prompt tokens. Slice them
            # off by length instead of removing the prompt by string replace,
            # which silently fails (and scores 0.5) whenever decoding does not
            # reproduce the prompt text exactly.
            prompt_length = inputs.input_ids.shape[-1]
            new_token_ids = generate_ids[:, prompt_length:]
            generate_text = main_tokenizer.batch_decode(
                new_token_ids, skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
            scores[sent_i, sample_i] = text_postprocessing(generate_text)
    scores_per_sentence = scores.mean(axis=-1)
    return scores_per_sentence

def text_postprocessing(text):
    """
    Convert the judge model's generated answer into a numeric score.

    The answer is lower-cased and stripped, then matched by prefix:
    starts with "yes" -> 0.0
    starts with "no"  -> 1.0
    anything else     -> 0.5 (the normalized answer is also recorded in
                              `not_defined_text`)
    """
    # tested on Llama-2-chat (7B, 13B) --- this code has 100% coverage on wikibio gpt3 generated data
    # however it may not work with other datasets, or LLMs
    normalized = text.lower().strip()
    if normalized.startswith('yes'):
        return text_mapping['yes']
    if normalized.startswith('no'):
        return text_mapping['no']
    # Adding to a set is idempotent, so no membership check is needed first.
    not_defined_text.add(normalized)
    return text_mapping['n/a']

# Score every sentence of the main response against all sampled passages,
# showing a progress bar while doing so.
sent_scores_prompt = pred(sentences=sentences, sampled_passages=Samples, verbose=True)

# Per-sentence scores first, then their mean as the overall score.
print(sent_scores_prompt)
hallucination_score = np.mean(sent_scores_prompt)
print("Hallucination Score:", hallucination_score)