# Back Testing

---
# 1.Setup


### 1a. configure the environment

In [1]:
!pip install -U langsmith langchain langchainhub emoji langgraph langchain-community langchain-google-genai

Collecting langsmith
  Downloading langsmith-0.4.7-py3-none-any.whl.metadata (15 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting langgraph
  Downloading langgraph-0.5.3-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.8-py3-none-any.whl.metadata (7.0 kB)
Collecting packaging>=23.2 (from langsmith)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.4.20250611-py3-none-any.whl.metadata (2.1 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-2.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<0.6.0,>=0.5.0

In [1]:
from google.colab import userdata
import os

# Set the project name to whichever project you'd like to be testing against
project_name = "Tweet Writing Task"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_PROJECT"] = project_name
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = userdata.get('Smith2')

# Optional. You can swap OpenAI for any other tool-calling chat model.
#os.environ["GEMINI_API_KEY"]= userdata.get('gemini')
GEMINI_API_KEY= userdata.get('gemini')
# Optional. You can swap Tavily for the free DuckDuckGo search tool if preferred.
# Get Tavily API key: https://tavily.com
os.environ["TAVILY_API_KEY"] = userdata.get('tavily')

### 1b. Define the Application


In [2]:
from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent
from langchain_community.tools import DuckDuckGoSearchRun, TavilySearchResults
from langchain_core.rate_limiters import InMemoryRateLimiter

# We will use Gemini 2.0 Flash-Lite as the baseline and compare against Gemini 2.5 Pro
gem_2FL = init_chat_model("gemini-2.0-flash", model_provider="google_genai",google_api_key=GEMINI_API_KEY)

# The instrucitons are passed as a system message to the agent
instructions = """You are a tweet writing assistant. Given a topic, do some research and write a relevant and engaging tweet about it.
- Use at least 3 emojis in each tweet
- The tweet should be no longer than 280 characters
- Always use the search tool to gather recent information on the tweet topic
- Write the tweet only based on the search content. Do not rely on your internal knowledge
- When relevant, link to your sources
- Make your tweet as engaging as possible"""

# Define the tools our agent can use

# If you have a higher tiered Tavily API plan you can increase this
rate_limiter = InMemoryRateLimiter(requests_per_second=0.08)

# Use DuckDuckGo if you don't have a Tavily API key:
# tools = [DuckDuckGoSearchRun(rate_limiter=rate_limiter)]
tools = [TavilySearchResults(max_results=5, rate_limiter=rate_limiter)]

agent = create_react_agent(gem_2FL, tools=tools, prompt=instructions)

  tools = [TavilySearchResults(max_results=5, rate_limiter=rate_limiter)]


### 1c. Simulate Production Data

In [None]:
# @title
# IGNORE for next cell. This was a trial run with small batch to ensure langsmith traces

import time

fake_production_inputs = [
    "Alan turing's early childhood",
    "Economic impacts of the European Union",
    "Underrated philosophers",
]
agent.batch(
    [{"messages": [{"role": "user", "content": content}]} for content in fake_production_inputs],
)

[{'messages': [HumanMessage(content="Alan turing's early childhood", additional_kwargs={}, response_metadata={}, id='15c992f9-0b30-41b1-bcf7-98349ccd3b16'),
   AIMessage(content='', additional_kwargs={'function_call': {'name': 'tavily_search_results_json', 'arguments': '{"query": "Alan Turing early childhood"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--54d54fb0-c6da-448d-835e-62384dafca53-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'Alan Turing early childhood'}, 'id': '464daf32-030a-4a36-91c2-21cff1fa1b0b', 'type': 'tool_call'}], usage_metadata={'input_tokens': 157, 'output_tokens': 13, 'total_tokens': 170, 'input_token_details': {'cache_read': 0}}),
   AIMessage(content='Born in London 🇬🇧, Alan Turing spent his early years in foster care while his parents were in India. Even as a child, his intelligence shone through! ✨📚 He 

In [None]:
# 1 min wait batch processing_WayAround

import time

fake_production_inputs = [
    "Alan turing's early childhood",
    "Economic impacts of the European Union",
    "Underrated philosophers",
    "History of the Roxie theater in San Francisco",
    "ELI5: gravitational waves",
    "The arguments for and against a parliamentary system",
    "Pivotal moments in music history",
    "Big ideas in programming languages",
    "Big questions in biology",
    "The relationship between math and reality",
    "What makes someone funny",
]

batch_size = len(fake_production_inputs) // 3
if len(fake_production_inputs) % 3 != 0:
    batch_size += 1 # Ensure all inputs are included

for i in range(0, len(fake_production_inputs), batch_size):
    batch_inputs = fake_production_inputs[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1}...")
    output = agent.batch(
        [{"messages": [{"role": "user", "content": content}]} for content in batch_inputs],
    )
    print(output) # Print the output of agent.batch
    if i + batch_size < len(fake_production_inputs):
        print("Waiting for 1 minute before processing the next batch...")
        time.sleep(60)

print("All batches processed.")

Processing batch 1...
Waiting for 1 minute before processing the next batch...
Processing batch 2...
Waiting for 1 minute before processing the next batch...
Processing batch 3...
[{'messages': [HumanMessage(content='Big questions in biology', additional_kwargs={}, response_metadata={}, id='07365f9f-5e01-4770-8c16-05b7df59ecfd'), AIMessage(content='', additional_kwargs={'function_call': {'name': 'tavily_search_results_json', 'arguments': '{"query": "big questions in biology"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--ba9b1dec-4b80-4c22-8133-d248a89c9099-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'big questions in biology'}, 'id': '1e005463-9c19-4b1c-8105-7529054c8cb4', 'type': 'tool_call'}], usage_metadata={'input_tokens': 154, 'output_tokens': 13, 'total_tokens': 167, 'input_token_details': {'cache_read': 0}}), ToolMessage(

---
# 2.Convert Production Traces to experiment


### 2a. Select Runs to **backtest** on

In [None]:
from datetime import datetime, timedelta, timezone
from uuid import uuid4
from langsmith import Client
from langsmith.beta import convert_runs_to_test

# Fetch the runs we want to convert to a dataset/experiment
client = Client()

# How we are sampling runs to include in our dataset
end_time = datetime.now(tz=timezone.utc)
start_time = end_time - timedelta(days=1)
run_filter = f'and(gt(start_time, "{start_time.isoformat()}"), lt(end_time, "{end_time.isoformat()}"))'
prod_runs = list(client.list_runs(
        project_name=project_name,
        is_root=True,
        filter=run_filter,
    )
)

### 2b. Convert Runs to Experiment

In [None]:
# Name of the dataset we want to create
dataset_name = f'{project_name}-backtesting {start_time.strftime("%Y-%m-%d")}-{end_time.strftime("%Y-%m-%d")}'
# Name of the experiment we want to create from the historical runs
# baseline_experiment_name = f"prod-baseline-gemini-2.0-flash-{str(uuid4())[:4]}" # Removed this line

# This converts the runs to a dataset + experiment
baseline_experiment_run = convert_runs_to_test(
    prod_runs,
    # Name of the resulting dataset
    dataset_name=dataset_name,
    # Whether to include the run outputs as reference/ground truth
    include_outputs=False,
    # Whether to include the full traces in the resulting experiment
    # (default is to just include the root run)
    load_child_runs=True,
    # Name of the experiment so we can apply evalautors to it after
    # test_project_name=baseline_experiment_name # Removed this line
    test_project_name=f"prod-baseline-gemini-2.0-flash-{str(uuid4())[:4]}" # Generate the name here
)

# Store the actual experiment name
baseline_experiment_name = baseline_experiment_run.name

---
# 3.Benchmark against new system


## 3a. Define Evaluators

In [3]:
!pip install --upgrade langchain-google-genai



In [4]:
import emoji
from pydantic import BaseModel, Field
from langchain_core.messages import convert_to_openai_messages
from langchain_google_genai import ChatGoogleGenerativeAI

class Grader(BaseModel):
    """Grade whether a response is supported by some context."""
    grounded: bool = Field(..., description="Is the majority of the response supported by the retrieved context?")

grounded_instructions = f"""You have given somebody some contextual information and asked them to write a statement grounded in that context.

Grade whether their response is fully supported by the context you have provided. \
If any meaningful part of their statement is not backed up directly by the context you provided, then their response is not grounded. \
Otherwise it is grounded."""

grounded_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0, google_api_key=GEMINI_API_KEY).with_structured_output(Grader, method="json_mode",strict=True)
#Gemini 2.5 Flash-Lite Preview 06-17

def lt_280_chars(outputs: dict) -> bool:
    messages = convert_to_openai_messages(outputs["messages"])
    return len(messages[-1]['content']) <= 280

def gte_3_emojis(outputs: dict) -> bool:
    messages = convert_to_openai_messages(outputs["messages"])
    return len(emoji.emoji_list(messages[-1]['content'])) >= 3

async def is_grounded(outputs: dict) -> bool:
    context = ""
    messages = convert_to_openai_messages(outputs["messages"])
    for message in messages:
        if message["role"] == "tool":
            # Tool message outputs are the results returned from the Tavily/DuckDuckGo tool
            context += "\n\n" + message["content"]
    tweet = messages[-1]["content"]
    user = f"""CONTEXT PROVIDED:
    {context}

    RESPONSE GIVEN:
    {tweet}"""

    grade: Grader = await grounded_model.ainvoke([
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": user}
    ])
    return {"score": grade.grounded}

In [5]:
#@title genai_call example
from google import genai
from pydantic import BaseModel
from google.colab import userdata # Import userdata

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

# Retrieve the API key within the cell
GOOGLE_API_KEY= userdata.get('gemini')

# Pass the API key to the client
client = genai.Client(api_key=GOOGLE_API_KEY)
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
    # Remove the redundant API key argument here as it's passed to the client
    # google_api_key="GEMINI_API_KEY"
)
# Use the response as a JSON string.
print(response.text)

# Use instantiated objects.
my_recipes: list[Recipe] = response.parsed

[{"recipe_name":"Classic Chocolate Chip Cookies","ingredients":["1 cup (2 sticks) unsalted butter, softened","3/4 cup granulated sugar","3/4 cup packed light brown sugar","2 large eggs","1 teaspoon vanilla extract","2 1/4 cups all-purpose flour","1 teaspoon baking soda","1/2 teaspoon salt","1 cup (6 oz) semi-sweet chocolate chips"]},{"recipe_name":"Oatmeal Raisin Cookies","ingredients":["1 cup (2 sticks) unsalted butter, softened","1 1/2 cups packed light brown sugar","2 large eggs","1 teaspoon vanilla extract","1 1/2 cups all-purpose flour","1 teaspoon baking soda","1/2 teaspoon ground cinnamon","1/4 teaspoon salt","3 cups old-fashioned oats","1 cup raisins"]},{"recipe_name":"Simple Sugar Cookies","ingredients":["1 cup (2 sticks) unsalted butter, softened","1 1/2 cups granulated sugar","1 large egg","2 teaspoons vanilla extract","3 cups all-purpose flour","2 teaspoons baking powder","1/4 teaspoon salt"]}]


## 3b. Evaluate Baseline

In [None]:
from langsmith import Client
from uuid import uuid4
client = Client()
baseline_results = await client.aevaluate(
    "prod-baseline-gemini-2.0-flash-e561", # Use the existing baseline_experiment_name
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
)
# If you have pandas installed can easily explore results as df:
# baseline_results.to_pandas()

View the evaluation results for experiment: 'prod-baseline-gemini-2.0-flash-e561' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=5859ffe9-59cf-4d9d-b9c5-0bd6948ef623




0it [00:00, ?it/s]

## 3c. Define & Evaluate new System

In [12]:
# @title cand_gem 2 pro
from langsmith import Client
from uuid import uuid4
client = Client()
dataset_name="Tweet Writing Task-backtesting 2025-07-13-2025-07-14"
candidate_results = await client.aevaluate(
    agent.with_config(model="gemini-2.5-pro"),
    data=dataset_name,
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
    experiment_prefix="candidate-gpt-4o",
)
# If you have pandas installed can easily explore results as df:
# candidate_results.to_pandas()

View the evaluation results for experiment: 'candidate-gpt-4o-b18cf4cc' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=325fbe09-f583-4969-9f0c-011835526831




0it [00:00, ?it/s]

  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 9
}
].


In [15]:
# @title cand_gem 1.5f
from langsmith import Client
from uuid import uuid4
client = Client()
dataset_name="Tweet Writing Task-backtesting 2025-07-13-2025-07-14"
candidate_results = await client.aevaluate(
    agent.with_config(model="gemini-1.5-flash"),
    data=dataset_name,
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
    experiment_prefix="candidate-gem1.5f",
)

View the evaluation results for experiment: 'candidate-gem1.5f-072a1216' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=3ea3a94b-d31d-45d3-83c8-c486c2313c62




0it [00:00, ?it/s]

  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 57
}
].
ERROR:langsmith.evaluation._arunner:Error running target function: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    

In [16]:
# @title cand_gem 1.5f 8b
from langsmith import Client
from uuid import uuid4
client = Client()
dataset_name="Tweet Writing Task-backtesting 2025-07-13-2025-07-14"
candidate_results = await client.aevaluate(
    agent.with_config(model="gemini-1.5-flash-8b"),
    data=dataset_name,
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
    experiment_prefix="candidate-gem1.5f8b",
)

View the evaluation results for experiment: 'candidate-gem1.5f8b-7bc6fae4' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=331cad0e-a675-4a8b-a4f8-5aee69c7ac4b




0it [00:00, ?it/s]

  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 54
}
].
ERROR:langsmith.evaluation._arunner:Error running target function: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    

In [14]:
#to pandas
candidate_results.to_pandas()

Unnamed: 0,inputs.messages,outputs.messages,error,feedback.lt_280_chars,feedback.gte_3_emojis,feedback.is_grounded,execution_time,example_id,id
0,"[{'role': 'user', 'content': 'History of the R...",[content='History of the Roxie theater in San ...,,False,True,True,5.966543,104d796c-eb14-4cf1-b516-4f66f6009dfb,b5ab823a-a5a9-4995-ad96-4ab714a1cf84
1,"[{'role': 'user', 'content': 'Underrated philo...",[content='Underrated philosophers' additional_...,,True,True,True,4.108118,24879533-b9e4-43de-9d3f-a0801f488d82,f4d76f5d-9a05-4e88-8762-8f5a1ebe114e
2,"[{'role': 'user', 'content': 'What makes someo...",[content='What makes someone funny' additional...,,True,True,True,5.212969,2becd6f3-e3de-450c-b208-2acb4d00146e,68707a8c-c0e6-4967-b4a8-e5ed21138a3d
3,"[{'role': 'user', 'content': 'The relationship...",[content='The relationship between math and re...,,True,True,True,4.438169,2c008f5a-5b87-4ffe-bc8b-3ce129d1b0aa,029ced54-707e-4745-9e52-f726add7a3f7
4,"[{'role': 'user', 'content': 'Pivotal moments ...",[content='Pivotal moments in music history' ad...,,True,True,True,3.384445,2d5d4522-6b19-4534-a052-cf922f85c703,ca659a34-f3ce-411f-8be1-261bd82b2761
5,"[{'role': 'user', 'content': 'Economic impacts...",[content='Economic impacts of the European Uni...,,True,True,True,6.247763,5d0eb5b0-8826-477b-9bfa-3bf2c13e5c7f,92fe8bc6-794b-4dc0-9dad-7120a041b5ff
6,"[{'role': 'user', 'content': 'Big ideas in pro...",[content='Big ideas in programming languages' ...,,True,True,True,6.108578,749266c0-d288-4782-99e8-99dc2fc1a513,cb504b0a-8d23-4315-946e-c7db937c7e19
7,"[{'role': 'user', 'content': 'The arguments fo...",[content='The arguments for and against a parl...,,True,True,True,5.390452,991b267d-881f-4b18-b795-f8f96de2aa8c,102a7170-087c-4843-b220-fa75d8c1106e
8,"[{'role': 'user', 'content': 'ELI5: gravitatio...",[content='ELI5: gravitational waves' additiona...,,False,True,True,7.086553,a27d1556-b285-4738-a71e-2f0a70aa3a14,5f15001f-82c5-47f4-86cc-d0581e8ae82c
9,"[{'role': 'user', 'content': 'Alan turing's ea...","[content=""Alan turing's early childhood"" addit...",,True,True,True,4.585607,c70db2ac-2b20-4c30-a16c-6d2b704264b2,f73c762f-dc90-4326-8e34-972f2c40a4de





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.



---
# More..

In [None]:
# @title Evaluator (from RAG eval sheet)
from typing_extensions import Annotated
from pydantic import BaseModel, Field

# Grade output schema
class CorrectnessGrade(BaseModel):
    # Note that the order in the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: str = Field(..., description="Explain your reasoning for the score")
    correct: bool = Field(..., description="True if the answer is correct, False otherwise.")

# Grade prompt
correctness_instructions = """You are a teacher grading a quiz.

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.

Avoid simply stating the correct answer at the outset."""

# Grader LLM
grader_llm_correctness = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0, google_api_key=GEMINI_API_KEY).with_structured_output(CorrectnessGrade, method="json_mode", strict=True)


def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """An evaluator for RAG answer accuracy"""
    answers = f"""\
QUESTION: {inputs['question']}
GROUND TRUTH ANSWER: {reference_outputs['answer']}
STUDENT ANSWER: {outputs['answer']}"""

    # Run evaluator
    grade: CorrectnessGrade = grader_llm_correctness.invoke([
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": answers}
    ])
    return {"score": grade.correct}