# Libraries for LLM evaluation

## 0. Setup

In [None]:
%pip install -q huggingface-hub==0.23.2
%pip install -q transformers==4.47.0
%pip install -q datasets==2.19.1
%pip install -q deepeval==2.1.1
%pip install -q lm-format-enforcer==0.10.9
%pip install -q pydantic==2.10.4
%pip install -q bitsandbytes==0.45.0

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.14.0 requires huggingface-hub>=0.25.0, but you have huggingface-hub 0.23.2 which is incompatible.
transformers 4.47.1 requires huggingface-hub<1.0,>=0.24.0, but you have huggingface-hub 0.23.2 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m37.2 MB/s[0m e

In [None]:
import os
import yaml
from huggingface_hub import login
from google.colab import drive
from getpass import getpass
from IPython.display import clear_output

drive.mount('/content/drive')

KeyboardInterrupt: 

# 1. HF (loading models) + Deepeval

In [9]:
import json
import torch
import transformers
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

In [10]:
class HuggingFaceModel(DeepEvalBaseLLM):
    def __init__(self, model_name: str):
        self.model_name = model_name
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        model_4bit = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            #load_in_8bit=True
            quantization_config=quantization_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_name
        )

        self.model = model_4bit
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        model = self.load_model()

        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            use_cache=True,
            device_map="auto",
            max_length=2500,
            do_sample=True,
            top_k=5,
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Create parser required for JSON confinement using lmformatenforcer
        #parser = JsonSchemaParser(schema.schema())
        parser = JsonSchemaParser(schema.model_json_schema())
        prefix_function = build_transformers_prefix_allowed_tokens_fn(
            pipeline.tokenizer, parser
        )

        # Output and load valid JSON
        output_dict = pipeline(prompt, prefix_allowed_tokens_fn=prefix_function)
        output = output_dict[0]["generated_text"][len(prompt) :]
        json_result = json.loads(output)

        # Return valid JSON object according to the schema DeepEval supplied
        return schema(**json_result)

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        return self.generate(prompt, schema)

    def get_model_name(self):
        return self.model_name

In [11]:
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "unsloth/Llama-3.2-3B-Instruct"
#model_name = "JunxiongWang/Llama3.2-Mamba-3B-distill"
#model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
#model_name = "HuggingFaceTB/SmolLM-360M"
model_name = "microsoft/Phi-3.5-mini-instruct"
#model_name = "microsoft/phi-4"

hf_model = HuggingFaceModel(model_name = model_name)

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [12]:
# Define a schema for the expected JSON output
class RefSchema(BaseModel):
    joke: str

print(hf_model.generate("Write me a joke", schema=RefSchema))

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


joke="Why don't scientists trust atoms? Because they make up everything, even their stories!"


In [13]:
RefSchema.model_json_schema()

{'properties': {'joke': {'title': 'Joke', 'type': 'string'}},
 'required': ['joke'],
 'title': 'RefSchema',
 'type': 'object'}

In [14]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt, schema=RefSchema)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Device set to use cuda:0


Output()

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


0.0
The toxicity score of 0.00 is justified because the actual output did not contain any offensive, aggressive, discriminatory, or inflammatory language, which aligns with the non-toxic nature implied by a zero score.


In [None]:
from deepeval.metrics import AnswerRelevancyMetric

metric = AnswerRelevancyMetric(model=hf_model, include_reason=True)

# Replace this with the actual output from your LLM application
actual_output = hf_model.generate("Write me a joke", schema=RefSchema)
actual_output = actual_output.joke
test_case = LLMTestCase(
    input="Why did the crab cross the road? It didn’t—it used the sidewalk.",
    actual_output=actual_output
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Device set to use cuda:0


Output()

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


0.0
The score is 0.00 because the actual output includes the irrelevant statement 'Why don't skeletons fight each other on the internet? Because they don't have the bone to pick!' which is unrelated to the original question about why a crab would not cross the road, thus not contributing to a better understanding of the input.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:24, 24.12s/test case]



Metrics Summary

  - ❌ Answer Relevancy (score: 0.0, threshold: 0.5, strict: False, evaluation model: microsoft/Phi-3.5-mini-instruct, reason: The score is 0.00 because the irrelevant statements 'Why don't skeletons fight each other on the internet' and 'Because they don't have the bone to pick' in the actual output do not contribute to explaining why a crab would choose to use the sidewalk over the road., error: None)

For test case:

  - input: Why did the crab cross the road? It didn’t—it used the sidewalk.
  - actual output: Why don't skeletons fight each other on the internet? Because they don't have the bone to pick!
  - expected output: None
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 0.00% pass rate







EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=False, score=0.0, reason="The score is 0.00 because the irrelevant statements 'Why don't skeletons fight each other on the internet' and 'Because they don't have the bone to pick' in the actual output do not contribute to explaining why a crab would choose to use the sidewalk over the road.", strict_mode=False, evaluation_model='microsoft/Phi-3.5-mini-instruct', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "Why don\'t skeletons fight each other on the internet",\n    "Because they don\'t have the bone to pick"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "The \'Why don\'t skeletons fight each other on the internet\' statement is irrelevant to the input which is about the reason behind a crab\'s action, and why it used the sidewalk instead of crossing the road."\n    },\n    {\n        "ve

"I'm a person who likes a lot of people. I like to be around them, but I like to be with them in the privacy of my own home. What do you call a person who likes to be around people but not with them?  A...?  A..."

### RAGAS + Deepeval

In [15]:
%pip install -q ragas==0.2.11

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.9/176.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
from deepeval import evaluate
from deepeval.metrics.ragas import RagasMetric
from deepeval.test_case import LLMTestCase

In [18]:
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra cost."

# Replace this with the expected output from your RAG generator
expected_output = "You are eligible for a 30 day full refund at no extra cost."

# Replace this with the actual retrieved context from your RAG pipeline
retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]

#metric = RagasMetric(threshold=0.5, model="gpt-3.5-turbo")
metric = RagasMetric(model=hf_model, threshold=0.5)

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output,
    expected_output=expected_output,
    retrieval_context=retrieval_context
)

metric.measure(test_case)
print(metric.score)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

None


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]


AttributeError: 'HuggingFaceModel' object has no attribute 'set_run_config'

### G-Eval

In [19]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase

In [20]:
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model=hf_model
)

In [22]:
test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat."
)

correctness_metric.measure(test_case)
print(correctness_metric.score)
print(correctness_metric.reason)

Output()

Device set to use cuda:0


0.5
The actual output acknowledges the ambiguity present in the question but fails to directly state the 'expected output' as the cat being pursued by the dog. This results in a lack of specificity, which is a key criterion in the evaluation steps.


### Summarization

In [23]:
from deepeval import evaluate
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase

In [24]:
# This is the original text to be summarized
input = """
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary, signifying that the summary effectively
encapsulates the crucial points and details from the original content.
"""

# This is the summary, replace this with the actual output from your LLM application
actual_output="""
The coverage score quantifies how well a summary captures and
accurately represents key information from the original text,
with a higher score indicating greater comprehensiveness.
"""

In [25]:
test_case = LLMTestCase(input=input, actual_output=actual_output)
metric = SummarizationMetric(
    threshold=0.5,
    model=hf_model,
    assessment_questions=[
        "Is the coverage score based on a percentage of 'yes' answers?",
        "Does the score ensure the summary's accuracy with the source?",
        "Does a higher score mean a more comprehensive summary?"
    ]
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Output()

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


0.6666666666666666
The summarization score of 0.67 reflects a partial but incomplete representation of the original text, as indicated by the existence of unanswered questions. The summary likely misses critical details necessary to fully grasp the context of the coverage score. A more comprehensive summary that addresses the 'yes' answer question related to the coverage score' explanation would improve the understanding and thus the summary quality.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:39, 39.42s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.6666666666666666, threshold: 0.5, strict: False, evaluation model: microsoft/Phi-3.5-mini-instruct, reason: The summarization score of 0.67 indicates that while the summary attempts to convey the essence of the original text, key information regarding the coverage score being based on a percentage of 'yes' answers is missing, which is crucial for fully understanding the context. The absence of contradicting information and extra information suggests that the summary may be generally accurate without introducing inaccuracies or unrelated data., error: None)

For test case:

  - input: 
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary,




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Summarization', threshold=0.5, success=True, score=0.6666666666666666, reason="The summarization score of 0.67 indicates that while the summary attempts to convey the essence of the original text, key information regarding the coverage score being based on a percentage of 'yes' answers is missing, which is crucial for fully understanding the context. The absence of contradicting information and extra information suggests that the summary may be generally accurate without introducing inaccuracies or unrelated data.", strict_mode=False, evaluation_model='microsoft/Phi-3.5-mini-instruct', error=None, evaluation_cost=None, verbose_logs='Truths (limit=None):\n[\n    "The coverage score is calculated as the percentage of assessment questions for which both the summary and the original document provide a \'yes\' answer",\n    "This method ensures that the summary includes key information

### Hallucination

In [26]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

In [28]:
# Replace this with the actual documents that you are passing as input to your LLM.
context=["A man with blond-hair, and a brown shirt drinking out of a public water fountain."]

# Replace this with the actual output from your LLM application
actual_output="A blond drinking water in public."

test_case = LLMTestCase(
    input="What was the blond doing?",
    actual_output=actual_output,
    context=context
)
metric = HallucinationMetric(model=hf_model, threshold=0.5)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Output()

Device set to use cuda:0
Device set to use cuda:0


1.0
The hallucination score is 1.00 because the actual output fails to fully align with the context, omitting key details such as the man's specific hair color (blond) and the setting (public water fountain).


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]Device set to use cuda:0
Device set to use cuda:0
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:13, 13.38s/test case]



Metrics Summary

  - ❌ Hallucination (score: 1.0, threshold: 0.5, strict: False, evaluation model: microsoft/Phi-3.5-mini-instruct, reason: The hallucination score is 1.00 because the actual output omits the crucial detail of the shirt'aine color being brown as per the context, leading to a complete misalignment with the expected factual consistency., error: None)

For test case:

  - input: What was the blond doing?
  - actual output: A blond drinking water in public.
  - expected output: None
  - context: ['A man with blond-hair, and a brown shirt drinking out of a public water fountain.']
  - retrieval context: None


Overall Metric Pass Rates

Hallucination: 0.00% pass rate







EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Hallucination', threshold=0.5, success=False, score=1.0, reason="The hallucination score is 1.00 because the actual output omits the crucial detail of the shirt'aine color being brown as per the context, leading to a complete misalignment with the expected factual consistency.", strict_mode=False, evaluation_model='microsoft/Phi-3.5-mini-instruct', error=None, evaluation_cost=None, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "The actual output does not mention about the color of the shirt which was brown according to the context. Therefore, it\'s a contradiction."\n    }\n]')], conversational=False, multimodal=False, input='What was the blond doing?', actual_output='A blond drinking water in public.', expected_output=None, context=['A man with blond-hair, and a brown shirt drinking out of a public water fountain.'], retrieval_context=None)], con

# Inference API - WIP

## 1. Common functions

In [None]:
import os
import re
from huggingface_hub import InferenceClient
from deepeval.models.base_model import DeepEvalBaseLLM



In [None]:
def llm_call(
    prompt: str,
    system_prompt: str = "",
    model="microsoft/Phi-3.5-mini-instruct"
    ) -> str:
    """
    Calls the model with the given prompt and returns the response.

    NOTE: Uses HF Inference API

    Args:
        prompt (str): The user prompt to send to the model.
        system_prompt (str, optional): The system prompt to send to the model. Defaults to "".
        model (str, optional): The model to use for the call. Defaults to "claude-3-5-sonnet-20241022".

    Returns:
        str: The response from the language model.
    """
    dct_params = {'max_new_tokens': 1000, 'temperature': 0.1, 'return_full_text': False}
    client = InferenceClient()
    input_prompt = system_prompt + '\n\n' + prompt
    response = client.text_generation(
        input_prompt,
        model=model,
        **dct_params
        )
    return response

In [None]:
import json
import torch
import transformers
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

In [None]:
import json
from pydantic import BaseModel

class HuggingFaceModel(DeepEvalBaseLLM):
    def __init__(self, model_name: str):
        """
        Initializes the HuggingFaceModel with the given model name.
        """
        self.model_name = model_name

    def load_model(self):
        """
        Returns the model name being used.
        """
        return self.model_name

    def generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """
        Generates a response from the model using llm_call and validates it against the provided schema.
        """
        # Call the model using llm_call
        output = llm_call(prompt=prompt, system_prompt="", model=self.model_name)
        json_result = {'output': output}
        print("Raw output:", output)

        '''
        # Parse the output into JSON
        try:
            json_result = json.loads(output)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON output from model: {e}")
        '''

        # Validate and return the JSON result as a schema instance
        #return schema(**json_result)
        return json_result

    async def a_generate(self, prompt: str, schema: BaseModel) -> BaseModel:
        """
        Asynchronous version of the generate method.
        """
        return self.generate(prompt, schema)

    def get_model_name(self):
        """
        Returns the name of the model being used.
        """
        return self.model_name


In [None]:
#model_name = "mistralai/Mistral-7B-v0.1"
#model_name = "unsloth/Llama-3.2-3B-Instruct"
#model_name = "microsoft/Phi-3.5-mini-instruct"
#model_name = "JunxiongWang/Llama3.2-Mamba-3B-distill"
#model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
#model_name = "HuggingFaceTB/SmolLM-360M"
model_name = "microsoft/phi-4"

hf_model = HuggingFaceModel(model_name = model_name)

In [None]:
# Define a schema for the expected JSON output
class RefSchema(BaseModel):
    joke: str

print(hf_model.generate("Write me a joke", schema=RefSchema))

HfHubHTTPError: (Request ID: cS0HpdEa2RjSishSeYe_J)

403 Forbidden: None.
Cannot access content at: https://api-inference.huggingface.co/models/microsoft/phi-4.
Make sure your token has the correct permissions.
The model microsoft/phi-4 is too large to be loaded automatically (29GB > 10GB). Please use Spaces (https://huggingface.co/spaces) or Inference Endpoints (https://huggingface.co/inference-endpoints).

In [None]:
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase

metric = ToxicityMetric(model=hf_model, threshold=0.5)

input_prompt = "Write me a joke"
output_prompt = hf_model.generate(input_prompt, schema=RefSchema)

test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = output_prompt
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

Output()

Raw output (I):  that involves a cat, a computer, and a misunderstanding.

Certainly! Here's a light-hearted joke involving a cat, a computer, and a misunderstanding:

Why did the cat sit next to the computer during the meeting?

Because it heard the mouse was going to be the "keyboard" to the company's success, but it wanted to make sure it was "purr-fectly" in line with the "cat-astrophic" plan!

(Note: The joke plays on the words "keyboard" and "catastrophic," as well as the idea that a cat might be concerned about its role in a plan, humorously suggesting it's worried about being part of a disastrous scheme.)


RecursionError: maximum recursion depth exceeded while calling a Python object

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="microsoft/phi-4", trust_remote_code=True)
pipe(messages)

config.json:   0%|          | 0.00/820 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
messages = [
	{
		"role": "user",
		"content": "What is the capital of France?"
	}
]

completion = client.chat.completions.create(
    model="microsoft/phi-4",
	messages=messages,
	max_tokens=500
)

print(completion.choices[0].message)