# LangSmith - Example of Creating an Evaluation 

In [None]:
from langsmith import Client

# Create the LangSmith client used by the cells below.
# NOTE(review): no credentials are passed here, so they presumably come from the
# environment; load_dotenv() is only called in a later cell -- confirm the
# LangSmith settings are already available when this cell runs.
client = Client()


## Create the Dataset

In [None]:
# Create the dataset that will hold the question/answer examples.

dataset_name = "QA Example Dataset 1"
dataset = client.create_dataset(dataset_name)
# If the dataset already exists, retrieve it instead of creating it, e.g.:
# dataset = next(client.list_datasets(dataset_name=dataset_name))

## Populate the Dataset with Examples

In [4]:

# (question, reference answer) pairs used to populate the dataset.
qa_pairs = [
    ("What is LangChain?", "A framework for building LLM applications"),
    ("What is LangSmith?", "A platform for observing and evaluating LLM applications"),
    ("What is OpenAI?", "A company that creates Large Language Models"),
    ("What is Google?", "A technology company known for search"),
    ("What is Mistral?", "A company that creates Large Language Models"),
]

# Each example stores the question under inputs["question"] and the
# reference answer under outputs["answer"].
client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {"inputs": {"question": question}, "outputs": {"answer": answer}}
        for question, answer in qa_pairs
    ],
)

{'example_ids': ['d47d460e-fcd0-43ee-93a6-68be98fb0908',
  '674f72c8-2c83-4ad0-ae57-070fa53a0ed1',
  '841d296d-99c3-4b25-8831-774305a66889',
  'e8ef35a2-4e1d-4b8e-b09a-e8575c417aff',
  'd34b95ec-93f4-4f8f-bef8-acf7b19abfe3'],
 'count': 5}

In [None]:
# Assign the last two examples of the dataset to a named split.
# Examples are listed through the client (the object returned by create_dataset
# is not used to list them), and the split is assigned to them with
# update_dataset_splits.
examples = [example.id for example in client.list_examples(dataset_id=dataset.id)]
client.update_dataset_splits(
    dataset_id=dataset.id,
    split_name="evaluation_split",
    example_ids=examples[-2:],  # use last two examples for evaluation
)

## Create the Evaluators

In [None]:
# imports
import openai
from langsmith import wrappers
import os
from dotenv import load_dotenv


In [None]:
# Load environment variables from the .env file; the Gemini settings read with
# os.getenv() further down (GEMINI_API_KEY, GEMINI_API_BASE) can be supplied this way.
load_dotenv()

True

### Create the Concision Evaluator
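This simply compares the number of words in the response with the number of words in the reference output, and returns the difference as a score (reference length minus response length, so a higher score means a more concise response).
This could also be a simple boolean test that checks whether the word count is below a certain level.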

In [50]:
def concision(outputs: dict, reference_outputs: dict) -> int:
    """Score how concise the response is relative to the reference answer.

    Words are counted by splitting each text on whitespace. The score is the
    reference word count minus the response word count: positive when the
    response is shorter than the reference, zero when both have the same
    number of words, and negative when the response is longer.
    """
    response_len = len(outputs['response'].split())
    reference_len = len(reference_outputs['answer'].split())

    # Echo both counts so they show up next to the progress output.
    print(f"Response length: {response_len}, Reference length: {reference_len}")
    return reference_len - response_len

### Correctness Evaluator
This uses an LLM to assess whether the output is correct. It compares the output to the reference answer in order to grade how correct it is.

In [51]:
# Gemini is reached through the OpenAI SDK client, configured from the
# GEMINI_API_KEY and GEMINI_API_BASE environment variables.
model = openai.Client(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url=os.getenv("GEMINI_API_BASE"),
)
# Wrap the client with the LangSmith wrapper; `llm` is what the cells below call.
llm = wrappers.wrap_openai(model)

In [None]:
# Prompt to evaluate correctness.
# The {question}, {ref_answer} and {answer} placeholders are filled in with
# str.format() when the grading request is built.
user_prompt = """
You are grading the following question:
{question}
Here is the correct answer:
{ref_answer}
You are grading the following predicted answer:
{answer}
Provide a score from 1 to 5, where 5 is completely correct and 1 is completely incorrect:
Score:
Provide a brief explanation of the score:
Comment:

"""

# System message that sets the grader persona.
eval_prompt = " You are an expert professor in grading student answers to questions."

In [46]:
# Use a structured output to ensure LLM returns a score.  Also returns a comment to provide explanation of score.
from pydantic import BaseModel, Field
class CorrectnessEvalSchema(BaseModel):
    """Define the schema for correctness evaluation."""

    # Numeric grade given to the predicted answer.
    score: int = Field(
        description="An integer score from 1 to 5 indicating the correctness of the answer",
    )
    # Short justification accompanying the grade.
    comment: str = Field(
        description="A brief explanation of the score",
    )

In [54]:


def q_correctness(inputs: dict,
                  outputs: dict,
                  reference_outputs: dict) -> dict:
    """Evaluate the correctness of the response using Gemini LLM with a structured output.

    Args:
        inputs: Example inputs; the 'question' key is read.
        outputs: Target outputs; the 'response' key is read.
        reference_outputs: Reference outputs; the 'answer' key is read.

    Returns:
        A dict with the keys 'score' and 'comment', taken from the parsed
        CorrectnessEvalSchema result.

    Raises:
        ValueError: If the reply carries no parsed CorrectnessEvalSchema object.
    """
    # Fill the grading prompt with the question, the predicted answer and the reference.
    grading_request = user_prompt.format(
        question=inputs['question'],
        answer=outputs['response'],
        ref_answer=reference_outputs['answer'],
    )
    # Call Gemini LLM with the evaluation prompt, requesting a structured reply.
    response = llm.chat.completions.parse(
        model="gemini-2.5-flash-lite",
        messages=[
            {"role": "system", "content": eval_prompt},
            {"role": "user", "content": grading_request},
        ],
        temperature=0,
        response_format=CorrectnessEvalSchema,
    )
    # Extract score and comment from the response.
    result = response.choices[0].message.parsed
    if result is None:
        # Fail with a clear message instead of an AttributeError on result.score.
        raise ValueError("The LLM reply did not contain a parsed correctness evaluation")
    print(f"Score: {result.score}, Comment: {result.comment}")
    # Return score and comment as a dict. LangSmith expects a dict return type with these keys.
    # You can also return just an integer score or boolean if you prefer.
    return {"score": result.score, "comment": result.comment}



### Create the Example Application
This is a simple call to an LLM to get the answer to the question.

In [25]:
def my_app(question: str):
    """Answer the question with a single LLM call and return the reply text."""
    # NOTE(review): "Response to the question" presumably means "Respond to the
    # question"; the string is kept unchanged here because it is sent to the model.
    system_message = {
        "role": "system",
        "content": "Response to the question in a very brief, concise manner (one short sentence)",
    }
    user_message = {"role": "user", "content": question}

    completion = llm.chat.completions.create(
        model="gemini-2.5-flash-lite",
        messages=[system_message, user_message],
        temperature=0.0,
    )

    # The content of the first choice is the application's answer.
    return completion.choices[0].message.content


### Wrap the Application
This function calls the application and returns the response as a dictionary. This response is used as the output. You can include multiple keys in the output to use in your evaluators, if required. You can wrap any function here; for instance, if you are evaluating a graph, you can invoke the graph inside this function.

In [36]:
def ls_target(inputs: dict) -> dict:
    """Run the application on one example and wrap its answer in a dict.

    The 'question' entry of ``inputs`` is passed to ``my_app``; the result is
    returned under the 'response' key.
    """
    answer = my_app(inputs['question'])
    return {"response": answer}

### Execute the Evaluation
Finally, execute the evaluation. Pass the function that calls the target application, the dataset, and the evaluators, and optionally provide an experiment prefix to make the results easier to identify.

In [None]:
# Evaluate the examples of the whole dataset (no split filter is applied here)
# using the two evaluators defined above.
client.evaluate(
    ls_target,
    data=dataset_name,
    evaluators=[concision, q_correctness],
    experiment_prefix="exp",
)


View the evaluation results for experiment: 'exp-1869a2a9' at:
https://eu.smith.langchain.com/o/3a7d9ffc-8b3e-42a7-9129-2ca245324f40/datasets/3ff3f321-1ea3-462a-8bd0-1461ecc7d5da/compare?selectedSessions=330b328d-118b-4a46-972a-b1fba9cf9326




0it [00:00, ?it/s]

Response length: 11, Reference length: 8


1it [00:21, 21.49s/it]

Score: 4, Comment: The answer is very close to the correct answer. It correctly identifies LangSmith as a platform for LLM applications and includes key functionalities like developing, testing, and monitoring, which are all part of observing and evaluating. The slight difference in wording ('observing and evaluating' vs. 'developing, testing, and monitoring') prevents a perfect score, but it's a strong answer.
Response length: 7, Reference length: 7


2it [00:22,  9.66s/it]

Score: 4, Comment: The answer is mostly correct, as OpenAI is indeed an AI research laboratory. However, it could be more specific by mentioning their work with Large Language Models, which is a key aspect of their identity and output.
Response length: 10, Reference length: 7


3it [00:24,  5.93s/it]

Score: 4, Comment: The answer correctly identifies Mistral as a large language model and mentions the developing company. However, the question asks 'What is Mistral?', implying the entity itself, not just its product. The correct answer focuses on Mistral being a company that creates LLMs, which is a more direct answer to the question.
Response length: 11, Reference length: 6


4it [00:25,  3.93s/it]

Score: 5, Comment: The predicted answer is a perfect match for the correct answer, accurately defining LangChain as a framework for developing applications powered by language models.
Response length: 12, Reference length: 6


5it [00:26,  2.90s/it]

Score: 5, Comment: The predicted answer is comprehensive and accurate, correctly identifying Google as a multinational technology company specializing in internet-related services and products. This aligns perfectly with the expected answer.


5it [00:26,  5.31s/it]


Unnamed: 0,inputs.question,outputs.response,error,reference.answer,feedback.concision,feedback.q_correctness,execution_time,example_id,id
0,What is LangSmith?,"LangSmith is a platform for developing, testin...",,A platform for observing and evaluating LLM ap...,-3,4,20.535326,674f72c8-2c83-4ad0-ae57-070fa53a0ed1,df97ac9f-6120-4073-852f-04dc5d484148
1,What is OpenAI?,OpenAI is an artificial intelligence research ...,,A company that creates Large Language Models,0,4,0.598477,841d296d-99c3-4b25-8831-774305a66889,e3f2967c-6e3b-4ddf-9a5b-2c64de886ef6
2,What is Mistral?,Mistral is a large language model developed by...,,A company that creates Large Language Models,-3,4,0.564737,d34b95ec-93f4-4f8f-bef8-acf7b19abfe3,1a64c81b-f5b3-4445-9dc9-de91320c20fd
3,What is LangChain?,LangChain is a framework for developing applic...,,A framework for building LLM applications,-5,5,0.307963,d47d460e-fcd0-43ee-93a6-68be98fb0908,9e174744-48ae-411d-b24c-4b519384a8af
4,What is Google?,Google is a multinational technology company s...,,A technology company known for search,-6,5,0.368799,e8ef35a2-4e1d-4b8e-b09a-e8575c417aff,4dd41f7f-3e9e-48ae-9c63-85a110b6069b


In [None]:
# If you created a split, you can evaluate just that split like this.
# The split name must match a split that already exists on the dataset; here it is
# "evaluation_split", the name used when the split was created earlier in this notebook.
# Replace it with your own split name if you used a different one.
results = client.evaluate(
    ls_target,
    data=client.list_examples(dataset_name=dataset_name, splits=["evaluation_split"]),
    evaluators=[concision, q_correctness],
    experiment_prefix="exp"
)
