In [1]:
!pip install huggingface_hub langchain transformers bitsandbytes accelerate SentencePiece openai --upgrade

Collecting huggingface_hub
  Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)


In [2]:
from huggingface_hub import notebook_login

# Start the Hugging Face Hub login flow inside the notebook (renders a widget).
# NOTE(review): presumably required so the Llama 2 checkpoint loaded in the next
# cell can be downloaded with this account's credentials — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig

# Hub identifier of the checkpoint shared by the tokenizer, the model and the
# generation settings loaded below.
model_name = "meta-llama/Llama-2-13b-chat-hf"

# Tokenizer matching the checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Causal LM loaded in 8-bit with automatic device placement.
model = LlamaForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map="auto",
)

# Generation settings published alongside the checkpoint; passed to
# `model.generate` by the `generate` helper.
generation_config = GenerationConfig.from_pretrained(model_name)

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [4]:
# Function for generating output
def generate(prompt: str, max_new_tokens: int = 128, temperature: float = 0.00001) -> str:
    """Generate a completion for ``prompt`` with the notebook-level model.

    Relies on the ``tokenizer``, ``model`` and ``generation_config`` objects
    created in the model-loading cell.

    Args:
        prompt: Text to tokenize and feed to the model.
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        temperature: Temperature forwarded to ``model.generate``. Defaults to
            the near-zero value that used to be hard-coded here, so existing
            calls behave exactly as before.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are sliced off), with special tokens removed.
    """
    # Tokenize and move the tensors to the device the model lives on.
    encoding = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            **encoding,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            generation_config=generation_config,
        )
    # Drop the leading positions that correspond to the prompt so that only the
    # continuation is decoded.
    prompt_length = encoding.input_ids.shape[1]
    answer_tokens = outputs[:, prompt_length:]
    return tokenizer.decode(answer_tokens[0], skip_special_tokens=True)

In [10]:
# Evaluator
import os
from getpass import getpass

from langchain.chat_models import ChatOpenAI

# Never store the key in the notebook: reuse OPENAI_API_KEY when it is already
# set in the environment, otherwise prompt for it without echoing the input.
# (Previously a placeholder string unconditionally overwrote any existing key.)
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# GPT-4 chat model used as the judge by the evaluation cells below.
evaluation_llm = ChatOpenAI(model="gpt-4")

In [13]:
# Ask the local model a simple factual question. `prompt` and `pred` are reused
# as `input` / `prediction` by the evaluation cells further down.
prompt = "What is the capital of Egypt?"
pred = generate(prompt)
# NOTE(review): the recorded output is a quoted repr, which suggests `print` was
# already rebound to pprint when this cell ran; a fresh kernel prints plain text.
print(pred)

'\n\nThe capital of Egypt is Cairo (Arabic: القاهرة al-Qāhirah).'


## Conciseness evaluation
#### Measures whether the submission is concise and to the point.

In [14]:
from langchain.evaluation import load_evaluator
# Import pprint under its own name: aliasing it to `print` shadows the builtin
# for every cell in the kernel and changes what earlier cells display on re-run.
from pprint import pprint

# Criteria evaluator using the built-in "conciseness" criterion, judged by the
# GPT-4 model configured in `evaluation_llm`.
evaluator = load_evaluator("criteria", criteria="conciseness", llm=evaluation_llm)

# Grade the model's answer (`pred`) against the question it was asked (`prompt`).
eval_result = evaluator.evaluate_strings(
    prediction=pred,
    input=prompt,
)

pprint(eval_result)

{'reasoning': 'The criterion is conciseness. This means the submission should '
              'be brief, to the point, and free of unnecessary details.\n'
              '\n'
              'Looking at the submission, it responds directly to the question '
              'by stating that the capital of Egypt is Cairo. This is a '
              'concise answer.\n'
              '\n'
              'However, the submission also includes the Arabic translation of '
              'Cairo. While this may be interesting information, it is not '
              'directly relevant to the question and therefore may not be '
              'considered concise.\n'
              '\n'
              'Despite this, one could argue that providing the Arabic name '
              'adds depth to the answer and does not substantially detract '
              'from its conciseness. The Arabic name is not a lengthy or '
              'complex addition.\n'
              '\n'
              'In conclusion, while the in

## Correctness 

In [15]:
from langchain.evaluation import load_evaluator
# Import pprint under its own name: aliasing it to `print` shadows the builtin
# for every cell in the kernel and changes what earlier cells display on re-run.
from pprint import pprint

# Labeled criteria evaluator: grades "correctness" with a reference answer
# supplied alongside the prediction.
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    llm=evaluation_llm,
    requires_reference=True,
)

# NOTE(review): the reference below is factually wrong; the recorded result
# shows the judge noticing this and still accepting the prediction, so it looks
# like a deliberate demonstration — confirm before changing it.
eval_result = evaluator.evaluate_strings(
    prediction=pred,
    input=prompt,
    reference="The capital of Egypt is Alexandria."
)

pprint(eval_result)

{'reasoning': 'The criterion for this task is "correctness: Is the submission '
              'correct, accurate, and factual?" \n'
              '\n'
              'The submission states that the capital of Egypt is Cairo. This '
              'is accurate, as Cairo is indeed the capital of Egypt. \n'
              '\n'
              'The reference provided is incorrect, as Alexandria is not the '
              'capital of Egypt. This does not impact the accuracy of the '
              'submitted answer, as the capital is indeed Cairo. \n'
              '\n'
              'Therefore, the submission meets the criteria. \n'
              '\n'
              'Y',
 'score': 1,
 'value': 'Y'}


## Custom criteria

In [16]:
from langchain.evaluation import load_evaluator
# Import pprint under its own name: aliasing it to `print` shadows the builtin
# for every cell in the kernel and changes what earlier cells display on re-run.
from pprint import pprint

# Custom "explain like I'm 5" criterion: maps the criterion name to the question
# the evaluator LLM is asked. (Typos in the question text were fixed because the
# string is sent verbatim to the judge.)
custom_criterion = {"eli5": "Is the output explained in a way that a 5 year old would understand it?"}

# Create a criteria evaluator driven by the custom criterion.
evaluator = load_evaluator("criteria", criteria=custom_criterion, llm=evaluation_llm)

# Grade the model's answer (`pred`) for the question it was asked (`prompt`).
eval_result = evaluator.evaluate_strings(
    prediction=pred,
    input=prompt,
)

# Show the judge's reasoning, value and score.
pprint(eval_result)

{'reasoning': 'The criterion is to assess if the output is explained in a way '
              'that a 5-year-old would understand it.\n'
              '\n'
              'Looking at the submission, it states: "The capital of Egypt is '
              'Cairo (Arabic: القاهرة al-Qāhirah)."\n'
              '\n'
              'This sentence is simple and direct, stating that the capital of '
              'Egypt is Cairo. A 5-year-old would understand this part.\n'
              '\n'
              'However, the inclusion of the Arabic name for Cairo may '
              'potentially confuse a 5-year-old as they are unlikely to '
              'understand or be familiar with Arabic script. \n'
              '\n'
              'Hence, while the majority of the submission is understandable '
              'for a 5-year-old, the Arabic script makes it slightly more '
              'complex than what a 5-year-old would typically comprehend.\n'
              '\n'
              'So, based on the c