### Evaluation Notebook
- Evaluates perplexity and hallucination using automated evaluation metrics
- Perplexity is calculated with Hugging Face Transformers
- Hallucination is measured with SelfCheckGPT scores

##### Installation / Google Drive Setup

In [None]:
! pip install selfcheckgpt
! pip install spacy

from google.colab import drive
drive.mount('/content/drive')

##### Perplexity

In [None]:
# Imports
from transformers import LlamaTokenizer, LlamaForCausalLM
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

# Evaluate Perplexity using Torch
def compute_perplexity(model, dataset, tokenizer):
    """Compute token-level perplexity of a causal language model on a dataset.

    Perplexity is exp(mean negative log-likelihood per predicted token), where
    the mean is taken over every scored token in the whole dataset.

    Args:
        model: Causal LM whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``.loss`` attribute (mean cross-entropy over non-ignored labels).
        dataset: Tokenized examples that ``DataCollatorForLanguageModeling``
            can batch (each example must provide ``input_ids``).
        tokenizer: Tokenizer handed to the collator for padding.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If the dataset yields no token that can be scored.
    """
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    dataloader = DataLoader(dataset, collate_fn=data_collator, batch_size=8)
    model.eval()
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device
    total_nll = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            # With mlm=False the collator copies input_ids into labels and
            # sets padded positions to -100 so they are ignored by the loss.
            labels = batch["labels"].to(device)
            attention_mask = batch.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            # The model shifts labels by one position internally, so the first
            # token of each sequence is never a prediction target.
            num_tokens = int((labels[:, 1:] != -100).sum().item())
            if num_tokens == 0:
                # Nothing to score in this batch (e.g. single-token sequences).
                continue

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # outputs.loss is a per-token mean; turn it back into a sum so
            # batches with more tokens carry proportionally more weight.
            total_nll += outputs.loss.item() * num_tokens
            total_tokens += num_tokens

    if total_tokens == 0:
        raise ValueError("Cannot compute perplexity: no tokens were scored.")

    mean_nll = total_nll / total_tokens
    # torch.exp only accepts tensors, so wrap the Python float first.
    return torch.exp(torch.tensor(mean_nll, dtype=torch.float64)).item()

# Load Model
model_location = ""
# Directory of pytorch .bin files
tokenizer = LlamaTokenizer.from_pretrained(model_location)
model = LlamaForCausalLM.from_pretrained(model_location)

# The collator pads batches, which requires a pad token; Llama tokenizers
# commonly ship without one, so fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Perplexity Evaluation Dataset
dataset_location = ""
eval_split = "test"   # split to evaluate when the dataset has several splits
text_column = "text"  # column holding raw text -- adjust to match the dataset
raw_dataset = load_dataset(dataset_location)

# Without a `split` argument load_dataset returns a dict of splits, which a
# DataLoader cannot index by integer; pick a single split to evaluate.
if isinstance(raw_dataset, dict):
    if eval_split in raw_dataset:
        raw_dataset = raw_dataset[eval_split]
    else:
        raw_dataset = next(iter(raw_dataset.values()))

model_inputs = ("input_ids", "attention_mask")
if "input_ids" in raw_dataset.column_names:
    # Already tokenized: drop every other column so the collator only sees
    # fields it can turn into tensors.
    extra_columns = [c for c in raw_dataset.column_names if c not in model_inputs]
    eval_dataset = raw_dataset.remove_columns(extra_columns)
else:
    def tokenize_batch(examples):
        """Tokenize raw text, truncating to the model's maximum context length."""
        return tokenizer(
            examples[text_column],
            truncation=True,
            max_length=model.config.max_position_embeddings,
        )

    eval_dataset = raw_dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=raw_dataset.column_names,
    )

perplexity = compute_perplexity(model, eval_dataset, tokenizer)
print(f"Perplexity: {perplexity}")

##### Hallucination

In [None]:
## Call Model To Create Outputs for sentence and sample parameters

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

instruction = "Generate a test case for the following UI Element:"
ui_elements = [
    "Link Element : ",
    "Button Element : ",
    "Header Element : ",
    "Input Element : ",
]

# Each UI element is prompted twice, back to back, so the cell yields two
# generations per element. The response slot is left blank for generation.
# NOTE(review): the two generations of a pair only differ if sampling is
# enabled (e.g. through the model's generation config) -- confirm.
prompts = [
    alpaca_prompt.format(instruction, element, "")
    for element in ui_elements
    for _ in range(2)
]

# Prompts differ in length, so they must be padded to form one tensor batch.
# Decoder-only generation needs the padding on the left so that new tokens
# directly follow the prompt.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Put the inputs on the model's own device instead of assuming "cuda".
inputs = tokenizer(prompts, return_tensors = "pt", padding = True).to(model.device)

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Skip special tokens so the padding added above does not clutter the text.
tokenizer.batch_decode(outputs, skip_special_tokens = True)

In [None]:
import torch
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckLLMPrompt
# Scoring model used by the SelfCheckGPT prompt-based checker.
llm_model = "mistralai/Mistral-7B-Instruct-v0.2"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
selfcheck_prompt = SelfCheckLLMPrompt(llm_model, device)

## Responses are pasted in by hand
# Sentences of the main response to be checked
sentences = []
# Additional responses to the same prompt, used as comparison samples
samples = []

# Score the sentences against the sampled passages and show the result.
predict_kwargs = {
    "sentences": sentences,
    "sampled_passages": samples,
    "verbose": True,
}
sent_scores_prompt = selfcheck_prompt.predict(**predict_kwargs)
print(sent_scores_prompt)