# LangSmith - Example of Creating an Evaluation 

In [1]:
from langsmith import Client

# create LangSmith client
client = Client()


## Create the Dataset

In [22]:
# create a dataset

dataset_name = "QA Example Dataset 1"
dataset = client.create_dataset(dataset_name)
# if the dataset already exists, you can retrieve it like this:
# dataset = client.list_datasets(dataset_name=dataset_name).__next__()

LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

## Populate the Dataset with Examples

In [3]:

client.create_examples(
    dataset_id=dataset.id,
    examples=[
        {
            "inputs": {"question": "What is LangChain?"},
            "outputs": {"answer": "A framework for building LLM applications"},
        },
        {
            "inputs": {"question": "What is LangSmith?"},
            "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        },
        {
            "inputs": {"question": "What is OpenAI?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        },
        {
            "inputs": {"question": "What is Google?"},
            "outputs": {"answer": "A technology company known for search"},
        },
        {
            "inputs": {"question": "What is Mistral?"},
            "outputs": {"answer": "A company that creates Large Language Models"},
        }
    ]
)

{'example_ids': ['65f271ca-333d-420d-8e93-01d1e0e4da75',
  'd1566b24-8470-42f2-a5a8-3db096ca702a',
  'e076a36d-ca17-49e8-b75c-7dadb711bd75',
  '4dadfed7-129d-4f50-9ddd-00a4b57d8b95',
  'db23c766-5f72-4907-ae14-9a33b1319619'],
 'count': 5}

In [17]:
# create a split from the dataset
examples = [example.id for example in client.list_examples(dataset_id=dataset.id)]
split = client.create_split(
    dataset_id=dataset.id,
    name="evaluation_split",
    example_ids=examples[3:]  # use last two examples for evaluation
)

AttributeError: 'Client' object has no attribute 'create_split'

## Create the Evaluators

In [5]:
# imports
import openai
from langsmith import wrappers
import os
from dotenv import load_dotenv


In [6]:
# load environment variables from .env file
load_dotenv()

True

### Create the Concision Evaluator
This simply compares the number of words produced w.r.t to the reference output, and returns as a score (less is better).
This could also be a simple boolean test to check word count is below a certain level

In [7]:
def concision(outputs: dict, reference_outputs: dict) -> int:
    """Evaluate the conciseness of the response compared to the reference answer.
    Returns a positive score if the response is shorter than the reference answer,"""
    # count words in outputs['response'] and reference_outputs['answer']
    resp_wc = outputs['response'].split()
    ref_wc = reference_outputs['answer'].split()

    print(f"Response length: {len(resp_wc)}, Reference length: {len(ref_wc)}")
    return len(ref_wc) - len(resp_wc)

### Correctness Evaluator
This used an LLM to assess whether the output is correct.  It compares it to the reference answer in order to grade how correct the output is.

In [8]:
# use SDK to wrap Gemini LLM
model = openai.Client( api_key=os.getenv("GEMINI_API_KEY"),
                        base_url=os.getenv("GEMINI_API_BASE"))
llm = wrappers.wrap_openai(model)

In [9]:
# Prompt to evaluate correctness
user_prompt = """"
You are grading the following question:
{question}
Here is the correct answer:
{ref_answer}
You are grading the following predicted answer:
{answer}
Provide a score from 1 to 5, where 5 is completely correct and 1 is completely incorrect:
Score:
Provide a brief explanation of the score:
Comment:

"""

eval_prompt = " You are an expert professor in grading student answers to questions."

In [10]:
# Use a structured output to ensure LLM returns a score.  Also returns a comment to provide explanation of score.
from pydantic import BaseModel, Field
class CorrectnessEvalSchema(BaseModel):
    """CLass to define the schema for correctness evaluation."""
    score: int = Field(description="An integer score from 1 to 5 indicating the correctness of the answer")
    comment: str = Field(description="A brief explanation of the score")

In [11]:


def q_correctness(inputs: dict,
                outputs: dict,
                reference_outputs: dict) -> int:
    """ Evaluate the correctness of the response using Gemini LLM with a structured output."""
    # extract response from outputs
    resp =  outputs['response']
    # call Gemini LLM with evaluation prompt
    response = llm.chat.completions.parse(
        model="gemini-2.5-flash-lite",
        messages = [{"role": "system", "content": eval_prompt},
                    {"role": "user", 
                     "content": user_prompt.format(question=inputs['question'],
                                                   answer=resp,
                                                   ref_answer=reference_outputs['answer'])}],
                temperature = 0,
                response_format=CorrectnessEvalSchema,
    )
    # extract score and comment from response
    result = response.choices[0].message.parsed
    print(f"Score: {result.score}, Comment: {result.comment}")
    # return score and comment as a dict. LangSmith expects a dict return type with these keys.
    # You can also return just an integer score or boolean if you prefer.
    return {"score": result.score, "comment": result.comment}



### Create the Example Application
This is a simple call to an LLM to get the answer to the question.

In [12]:
def my_app(question: str ):
    instruction = "Response to the question in a very brief, concise manner (one short sentence)"
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": question}
    ]
    resp = llm.chat.completions.create(
            model = "gemini-2.5-flash-lite",
            messages=messages,
            temperature=0.0,)
            
    return resp.choices[0].message.content


### Wrap the Application
This function calls the application and returns the response as a dictionary.  This response is used in the output.  You can include multiple keys in the output to use in your evaluators, if required.  You can wrap any function here, for instance if you evaluating a graph, you can invoke the graph inside this function.

In [13]:
def ls_target(inputs: dict) -> dict:
    return {"response": my_app(inputs['question'])}

### Execute the Evaluation
Finally execute the evaluation.  Pass the function to call the target application, the dataset , the evaluators, and optionally provide an experiment prefix, to make the results easier to identify.

In [14]:
# This call will evaluate the examples in the split using the two evaluators defined above.
# It will use the whole dataset.
client.evaluate(
    ls_target,
    dataset_name,
    evaluators=[concision, q_correctness],
    experiment_prefix="exp"
)


  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'exp-802170a7' at:
https://smith.langchain.com/o/964e7404-67a2-4b32-87f1-9ca489d8bbd5/datasets/f2b609c2-4f5c-4b98-ad25-7223e7770299/compare?selectedSessions=3e0b7ecd-04c4-4012-801d-789a9244bc6b




0it [00:00, ?it/s]

Response length: 11, Reference length: 6


1it [00:01,  1.56s/it]

Score: 5, Comment: The predicted answer is a perfect match for the correct answer, accurately defining LangChain as a framework for developing applications powered by language models.
Response length: 7, Reference length: 7


2it [00:03,  1.52s/it]

Score: 4, Comment: The answer is mostly correct. OpenAI is indeed an AI research laboratory, and a significant part of its work involves creating Large Language Models. However, it could be more specific by mentioning the creation of LLMs as stated in the correct answer.
Response length: 12, Reference length: 6


3it [00:04,  1.40s/it]

Score: 5, Comment: The predicted answer is a comprehensive and accurate description of Google, aligning perfectly with the expected answer.
Response length: 11, Reference length: 8


4it [00:05,  1.43s/it]

Score: 4, Comment: The answer is very close to the correct answer. It correctly identifies LangSmith as a platform for LLM applications and includes key functionalities like developing, testing, and monitoring, which are all part of observing and evaluating. The slight difference in wording ('observing and evaluating' vs. 'developing, testing, and monitoring') prevents a perfect score, but it's a strong answer.
Response length: 10, Reference length: 7


5it [00:07,  1.49s/it]

Score: 4, Comment: The answer correctly identifies Mistral as a large language model and mentions the developing company. However, the question asks 'What is Mistral?', implying the entity itself, not just its product. While closely related, the ideal answer would focus on Mistral AI as a company.


5it [00:08,  1.60s/it]


Unnamed: 0,inputs.question,outputs.response,error,reference.answer,feedback.concision,feedback.q_correctness,execution_time,example_id,id
0,What is LangChain?,LangChain is a framework for developing applic...,,A framework for building LLM applications,-5,5,0.714097,4dadfed7-129d-4f50-9ddd-00a4b57d8b95,890d95c2-83ee-47f1-ba95-b7c89ac69699
1,What is OpenAI?,OpenAI is an artificial intelligence research ...,,A company that creates Large Language Models,0,4,0.660563,65f271ca-333d-420d-8e93-01d1e0e4da75,e8733198-c355-46e6-b9a8-d3c803d02d75
2,What is Google?,Google is a multinational technology company s...,,A technology company known for search,-6,5,0.628674,d1566b24-8470-42f2-a5a8-3db096ca702a,480d06be-68ae-49df-97ab-dbdb7bac4e38
3,What is LangSmith?,"LangSmith is a platform for developing, testin...",,A platform for observing and evaluating LLM ap...,-3,4,0.609095,db23c766-5f72-4907-ae14-9a33b1319619,fba5e928-a64c-4c19-bd78-b54ebba91223
4,What is Mistral?,Mistral is a large language model developed by...,,A company that creates Large Language Models,-3,4,0.588265,e076a36d-ca17-49e8-b75c-7dadb711bd75,59e082fe-11ea-4c05-9816-6f714c944ff3


In [41]:
# IF you create a split, you can evaluate just that split like this.eval
# In this case the split is called "test1", but you can replace with your split name.
# You need to create the split first in LangSmith.
results = client.evaluate(
    ls_target,
    data=client.list_examples(dataset_name=dataset_name, splits=["test1"]),
    evaluators=[concision, q_correctness],
    experiment_prefix="exp"
)


View the evaluation results for experiment: 'exp-b0ffc5dc' at:
https://smith.langchain.com/o/964e7404-67a2-4b32-87f1-9ca489d8bbd5/datasets/f2b609c2-4f5c-4b98-ad25-7223e7770299/compare?selectedSessions=0c4bc322-b0f5-4f9a-9512-e22948bebdfe




0it [00:00, ?it/s]

Response length: 11, Reference length: 8


1it [00:02,  2.22s/it]

Score: 4, Comment: The answer is very close to the correct answer. It correctly identifies LangSmith as a platform for LLM applications and includes key functionalities like developing, testing, and monitoring, which are all part of observing and evaluating. The slight difference in wording ('observing and evaluating' vs. 'developing, testing, and monitoring') prevents a perfect score, but it's a strong answer.
Response length: 10, Reference length: 7


2it [00:03,  1.81s/it]

Score: 4, Comment: The answer correctly identifies Mistral as a large language model and mentions the developing company. However, the question asks 'What is Mistral?', implying the entity itself, not just its product. While closely related, the ideal answer would focus on Mistral AI as a company.


2it [00:04,  2.18s/it]


In [18]:
print("Available methods:", [method for method in dir(client) if not method.startswith('_')])

Available methods: ['add_runs_to_annotation_queue', 'aevaluate', 'aevaluate_run', 'api_key', 'api_url', 'arun_on_dataset', 'batch_ingest_runs', 'cleanup', 'clone_public_dataset', 'compressed_traces', 'create_annotation_queue', 'create_chat_example', 'create_commit', 'create_comparative_experiment', 'create_dataset', 'create_example', 'create_example_from_run', 'create_examples', 'create_feedback', 'create_feedback_from_token', 'create_llm_example', 'create_presigned_feedback_token', 'create_presigned_feedback_tokens', 'create_project', 'create_prompt', 'create_run', 'delete_annotation_queue', 'delete_dataset', 'delete_example', 'delete_examples', 'delete_feedback', 'delete_project', 'delete_prompt', 'delete_run_from_annotation_queue', 'diff_dataset_versions', 'evaluate', 'evaluate_run', 'flush', 'flush_compressed_traces', 'get_experiment_results', 'get_prompt', 'get_run_from_annotation_queue', 'get_run_stats', 'get_run_url', 'get_test_results', 'has_dataset', 'has_project', 'index_data

In [19]:
# Create dataset
dataset = client.create_dataset("QA Example Dataset 2")

# Create examples with split tags
examples_with_splits = [
    {
        "inputs": {"question": "What is LangChain?"},
        "outputs": {"answer": "A framework for building LLM applications"},
        "split": "train"
    },
    {
        "inputs": {"question": "What is LangSmith?"},
        "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
        "split": "train"
    },
    {
        "inputs": {"question": "What is OpenAI?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
        "split": "train"
    },
    {
        "inputs": {"question": "What is Google?"},
        "outputs": {"answer": "A technology company known for search"},
        "split": "eval"
    },
    {
        "inputs": {"question": "What is Mistral?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
        "split": "eval"
    }
]

client.create_examples(
    dataset_id=dataset.id,
    examples=examples_with_splits
)

{'example_ids': ['0cf0576f-efaa-4d47-8ffc-d28cd86d0148',
  '043a29e4-2a44-427c-a384-35073a2e8d89',
  '89c6c017-bec4-4521-b72d-49a9d2c85688',
  '4a8db125-c32c-4431-81d3-a28a48fec388',
  'fb0c0aeb-6304-4e4e-b46b-b8ebaabd8b0a'],
 'count': 5}

In [21]:
dataset

Dataset(name='QA Example Dataset 2', description=None, data_type=<DataType.kv: 'kv'>, id=UUID('02f20b15-23a0-4544-ac0c-ce49517545c7'), created_at=datetime.datetime(2025, 11, 13, 10, 0, 26, 74235, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2025, 11, 13, 10, 0, 26, 74235, tzinfo=datetime.timezone.utc), example_count=0, session_count=0, last_session_start_time=None, inputs_schema=None, outputs_schema=None, transformations=None, metadata={'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.4.35', 'library': 'langsmith', 'platform': 'Linux-6.14.0-34-generic-x86_64-with-glibc2.39', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.12.3', 'langchain_version': '0.3.27', 'langchain_core_version': '0.3.79'}})

In [28]:
dataset = client.read_dataset(dataset_id='f2b609c2-4f5c-4b98-ad25-7223e7770299')

In [29]:
dataset

Dataset(name='QA Example Dataset 1', description=None, data_type=<DataType.kv: 'kv'>, id=UUID('f2b609c2-4f5c-4b98-ad25-7223e7770299'), created_at=datetime.datetime(2025, 11, 13, 9, 15, 50, 894871, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2025, 11, 13, 9, 15, 50, 894871, tzinfo=datetime.timezone.utc), example_count=5, session_count=1, last_session_start_time=datetime.datetime(2025, 11, 13, 9, 28, 44, 853255), inputs_schema=None, outputs_schema=None, transformations=None, metadata={'runtime': {'sdk': 'langsmith-py', 'library': 'langsmith', 'runtime': 'python', 'platform': 'Linux-6.14.0-34-generic-x86_64-with-glibc2.39', 'sdk_version': '0.4.35', 'runtime_version': '3.12.3', 'langchain_version': '0.3.27', 'py_implementation': 'CPython', 'langchain_core_version': '0.3.79'}})

In [34]:
examples = client.list_examples(dataset_id='f2b609c2-4f5c-4b98-ad25-7223e7770299')

In [35]:
examples

<generator object Client.list_examples at 0x76c17339fa10>

In [36]:
my_examples = [example.id for example in client.list_examples(dataset_id=dataset.id)]

In [37]:
my_examples

[UUID('4dadfed7-129d-4f50-9ddd-00a4b57d8b95'),
 UUID('65f271ca-333d-420d-8e93-01d1e0e4da75'),
 UUID('d1566b24-8470-42f2-a5a8-3db096ca702a'),
 UUID('db23c766-5f72-4907-ae14-9a33b1319619'),
 UUID('e076a36d-ca17-49e8-b75c-7dadb711bd75')]

In [38]:
my_examples[3:]

[UUID('db23c766-5f72-4907-ae14-9a33b1319619'),
 UUID('e076a36d-ca17-49e8-b75c-7dadb711bd75')]

In [40]:
client.update_examples(
  example_ids=my_examples[3:],
  metadata=[{"foo": "baz"}, {"foo": "qux"}],
  splits=["test1", "test1"] # Splits can be arrays or standalone strings
)

{'message': '2 examples updated',
 'example_ids': ['db23c766-5f72-4907-ae14-9a33b1319619',
  'e076a36d-ca17-49e8-b75c-7dadb711bd75'],
 'count': 2}