---
# Setup


### Configure the environment

In [1]:
!pip install -U langsmith langchain langchainhub emoji langgraph langchain-community langchain-google-genai

Collecting langsmith
  Downloading langsmith-0.4.6-py3-none-any.whl.metadata (15 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting langgraph
  Downloading langgraph-0.5.3-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.8-py3-none-any.whl.metadata (7.0 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.4.20250611-py3-none-any.whl.metadata (2.1 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-2.1.0-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<0.6.0,>=0.5.0 (from langgraph)
  Downloading langgraph_prebuilt-0.5.2-py3-none-any.whl.metadata (4.5 kB)
Collecting langg

In [1]:
from google.colab import userdata
import os

# LangSmith project that runs are logged to.
# Set the project name to whichever project you'd like to be testing against.
project_name = "Tweet Writing Task"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_PROJECT"] = project_name
os.environ["LANGSMITH_TRACING"] = "true"
# API keys are looked up through google.colab's `userdata` rather than being hard-coded here.
os.environ["LANGSMITH_API_KEY"] = userdata.get('Smith2')

# Gemini API key. It is kept in a Python variable (not an environment variable) and is
# passed explicitly as `google_api_key` wherever a Gemini model is constructed.
# You can swap Gemini for any other tool-calling chat model.
GEMINI_API_KEY= userdata.get('gemini')
# Optional. You can swap Tavily for the free DuckDuckGo search tool if preferred.
# Get Tavily API key: https://tavily.com
os.environ["TAVILY_API_KEY"] = userdata.get('tavily')

### Define the Application


In [None]:
from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent
from langchain_community.tools import DuckDuckGoSearchRun, TavilySearchResults
from langchain_core.rate_limiters import InMemoryRateLimiter

# Baseline model is Gemini 2.0 Flash (model id "gemini-2.0-flash"); the comparison target is Gemini 2.5 Pro.
# NOTE(review): the variable name `gem_2FL` hints at Flash-Lite — confirm which model is intended.
gem_2FL = init_chat_model("gemini-2.0-flash", model_provider="google_genai",google_api_key=GEMINI_API_KEY)

# The instructions are passed as a system message to the agent
instructions = """You are a tweet writing assistant. Given a topic, do some research and write a relevant and engaging tweet about it.
- Use at least 3 emojis in each tweet
- The tweet should be no longer than 280 characters
- Always use the search tool to gather recent information on the tweet topic
- Write the tweet only based on the search content. Do not rely on your internal knowledge
- When relevant, link to your sources
- Make your tweet as engaging as possible"""

# Define the tools our agent can use

# 0.08 requests per second is one request every 12.5 seconds.
# If you have a higher tiered Tavily API plan you can increase this
rate_limiter = InMemoryRateLimiter(requests_per_second=0.08)

# Use DuckDuckGo if you don't have a Tavily API key:
# tools = [DuckDuckGoSearchRun(rate_limiter=rate_limiter)]
tools = [TavilySearchResults(max_results=5, rate_limiter=rate_limiter)]

# Agent built from the baseline model, the search tool and the instructions above as its prompt
agent = create_react_agent(gem_2FL, tools=tools, prompt=instructions)

  tools = [TavilySearchResults(max_results=5, rate_limiter=rate_limiter)]


### Simulate Production Data

In [None]:
# @title
# Trial run with a small batch to confirm that LangSmith traces are recorded. Superseded by the next cell, so it can be skipped.

import time

# Three sample topics, used only for this small trial run
fake_production_inputs = [
    "Alan turing's early childhood",
    "Economic impacts of the European Union",
    "Underrated philosophers",
]

# Wrap each topic in a single user message and send them all to the agent in one batch call.
# As the last expression of the cell, the returned list is what the notebook displays.
agent.batch([
    {"messages": [{"role": "user", "content": topic}]}
    for topic in fake_production_inputs
])

[{'messages': [HumanMessage(content="Alan turing's early childhood", additional_kwargs={}, response_metadata={}, id='15c992f9-0b30-41b1-bcf7-98349ccd3b16'),
   AIMessage(content='', additional_kwargs={'function_call': {'name': 'tavily_search_results_json', 'arguments': '{"query": "Alan Turing early childhood"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--54d54fb0-c6da-448d-835e-62384dafca53-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'Alan Turing early childhood'}, 'id': '464daf32-030a-4a36-91c2-21cff1fa1b0b', 'type': 'tool_call'}], usage_metadata={'input_tokens': 157, 'output_tokens': 13, 'total_tokens': 170, 'input_token_details': {'cache_read': 0}}),
   AIMessage(content='Born in London 🇬🇧, Alan Turing spent his early years in foster care while his parents were in India. Even as a child, his intelligence shone through! ✨📚 He 

In [None]:
# Workaround for rate limits: process the inputs in batches, waiting 1 minute between batches

import time

# Topics used to simulate production traffic
fake_production_inputs = [
    "Alan turing's early childhood",
    "Economic impacts of the European Union",
    "Underrated philosophers",
    "History of the Roxie theater in San Francisco",
    "ELI5: gravitational waves",
    "The arguments for and against a parliamentary system",
    "Pivotal moments in music history",
    "Big ideas in programming languages",
    "Big questions in biology",
    "The relationship between math and reality",
    "What makes someone funny",
]

# The inputs are spread over at most NUM_BATCHES calls to agent.batch so that a pause
# can be inserted between calls.
NUM_BATCHES = 3

# Ceiling division, so a remainder still gets a slot in the final batch.
# The max(1, ...) guard matters when the input list is empty: a batch size of 0 would
# otherwise be used as the step of range() below, which raises ValueError.
batch_size = max(1, -(-len(fake_production_inputs) // NUM_BATCHES))

for i in range(0, len(fake_production_inputs), batch_size):
    batch_inputs = fake_production_inputs[i:i + batch_size]
    print(f"Processing batch {i // batch_size + 1}...")
    output = agent.batch(
        [{"messages": [{"role": "user", "content": content}]} for content in batch_inputs],
    )
    print(output) # Print the output of agent.batch
    # Pause only between batches, never after the last one
    if i + batch_size < len(fake_production_inputs):
        print("Waiting for 1 minute before processing the next batch...")
        time.sleep(60)

print("All batches processed.")

Processing batch 1...
Waiting for 1 minute before processing the next batch...
Processing batch 2...
Waiting for 1 minute before processing the next batch...
Processing batch 3...
[{'messages': [HumanMessage(content='Big questions in biology', additional_kwargs={}, response_metadata={}, id='07365f9f-5e01-4770-8c16-05b7df59ecfd'), AIMessage(content='', additional_kwargs={'function_call': {'name': 'tavily_search_results_json', 'arguments': '{"query": "big questions in biology"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--ba9b1dec-4b80-4c22-8133-d248a89c9099-0', tool_calls=[{'name': 'tavily_search_results_json', 'args': {'query': 'big questions in biology'}, 'id': '1e005463-9c19-4b1c-8105-7529054c8cb4', 'type': 'tool_call'}], usage_metadata={'input_tokens': 154, 'output_tokens': 13, 'total_tokens': 167, 'input_token_details': {'cache_read': 0}}), ToolMessage(

---
# Convert Production Traces to an Experiment


### Select Runs to **backtest** on

In [None]:
from datetime import datetime, timedelta, timezone
from uuid import uuid4
from langsmith import Client
from langsmith.beta import convert_runs_to_test

# Fetch the runs we want to convert to a dataset/experiment
client = Client()

# How we are sampling runs to include in our dataset: a window covering the
# last day, measured back from the current UTC time.
end_time = datetime.now(tz=timezone.utc)
start_time = end_time - timedelta(days=1)
# Filter expression: start_time greater than the window start AND end_time less than the window end
run_filter = f'and(gt(start_time, "{start_time.isoformat()}"), lt(end_time, "{end_time.isoformat()}"))'
# Materialise the result of list_runs into a list so it can be reused by later cells
prod_runs = list(client.list_runs(
        project_name=project_name,
        is_root=True,
        filter=run_filter,
    )
)

### Convert Runs to Experiment

In [None]:
# Name of the dataset we want to create (project name plus the sampled date range)
dataset_name = f'{project_name}-backtesting {start_time.strftime("%Y-%m-%d")}-{end_time.strftime("%Y-%m-%d")}'

# This converts the runs to a dataset + experiment
baseline_experiment_run = convert_runs_to_test(
    prod_runs,
    # Name of the resulting dataset
    dataset_name=dataset_name,
    # Whether to include the run outputs as reference/ground truth
    include_outputs=False,
    # Whether to include the full traces in the resulting experiment
    # (default is to just include the root run)
    load_child_runs=True,
    # Name of the experiment so we can apply evaluators to it after.
    # The first 4 characters of a fresh UUID are appended as a suffix.
    test_project_name=f"prod-baseline-gemini-2.0-flash-{str(uuid4())[:4]}"
)

# Store the actual experiment name, as reported by the returned object,
# for use by the evaluation cells
baseline_experiment_name = baseline_experiment_run.name

---
# Benchmark against new system


## Define Evaluators

In [2]:
import emoji
from pydantic import BaseModel, Field
from langchain_core.messages import convert_to_openai_messages
from langchain_google_genai import ChatGoogleGenerativeAI

# Structured-output schema for the groundedness grader: a single boolean verdict.
# NOTE(review): the field description says "majority", while grounded_instructions asks for
# the response to be "fully supported" — confirm which bar is intended.
class Grader(BaseModel):
    """Grade whether a response is supported by some context."""
    grounded: bool = Field(..., description="Is the majority of the response supported by the retrieved context?")

# System prompt for the groundedness grader (a plain string: it has no placeholders to format).
grounded_instructions = """You have given somebody some contextual information and asked them to write a statement grounded in that context.

Grade whether their response is fully supported by the context you have provided. \
If any meaningful part of their statement is not backed up directly by the context you provided, then their response is not grounded. \
Otherwise it is grounded."""

# Grader LLM with structured output parsed into `Grader`.
# `model` must be the API model id (lower-case, hyphenated), not the human-readable
# display name "Gemini 2.5 Flash-Lite Preview 06-17", which the API does not resolve.
grounded_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite-preview-06-17",
    temperature=0,
    google_api_key=GEMINI_API_KEY,
).with_structured_output(Grader, method="json_mode", strict=True)


def lt_280_chars(outputs: dict) -> bool:
    """Check that the final message fits within the tweet length limit.

    Args:
        outputs: Run outputs; ``outputs["messages"]`` holds the conversation.

    Returns:
        True when the content of the last message is at most 280 characters long.
    """
    final_message = convert_to_openai_messages(outputs["messages"])[-1]
    tweet = final_message['content']
    return len(tweet) <= 280

def gte_3_emojis(outputs: dict) -> bool:
    """Check that the final message contains enough emojis.

    Args:
        outputs: Run outputs; ``outputs["messages"]`` holds the conversation.

    Returns:
        True when ``emoji.emoji_list`` finds at least 3 entries in the content of
        the last message.
    """
    final_message = convert_to_openai_messages(outputs["messages"])[-1]
    emojis_found = emoji.emoji_list(final_message['content'])
    return len(emojis_found) >= 3

async def is_grounded(outputs: dict) -> dict:
    """Ask the grader model whether the final tweet is supported by the tool results.

    Args:
        outputs: Run outputs; ``outputs["messages"]`` holds the conversation, whose
            tool messages supply the context and whose last message is the tweet.

    Returns:
        ``{"score": <bool>}`` where the boolean is the grader's ``grounded`` verdict.
    """
    messages = convert_to_openai_messages(outputs["messages"])
    # Tool message outputs are the results returned from the Tavily/DuckDuckGo tool.
    # Each one is prefixed with a blank line and they are concatenated in message order.
    context = "".join(
        "\n\n" + message["content"] for message in messages if message["role"] == "tool"
    )
    tweet = messages[-1]["content"]
    user = f"""CONTEXT PROVIDED:
    {context}

    RESPONSE GIVEN:
    {tweet}"""

    grade: Grader = await grounded_model.ainvoke([
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": user}
    ])
    return {"score": grade.grounded}

## Evaluate Baseline

In [None]:
# @title ###### Break into batches (quota problem)
from langsmith import Client
import time
from uuid import uuid4
import asyncio # Import asyncio

# LangSmith client used to query and evaluate the baseline experiment
client = Client()

# Root runs of the baseline experiment. They are listed only to work out how many
# batches there would be; the ids are not passed to the evaluation call.
runs_to_evaluate = [*client.list_runs(project_name=baseline_experiment_name, is_root=True)]
run_ids_to_evaluate = [baseline_run.id for baseline_run in runs_to_evaluate]

# Batch size is the run count divided by 3, rounded up, so that every run is included
batch_size = -(-len(run_ids_to_evaluate) // 3)

# Process in batches (simulated by calling aevaluate several times with a delay in between).
# NOTE: evaluating an existing experiment evaluates ALL runs in that experiment, so each
# call below processes the whole experiment rather than one slice of it; the loop only
# spaces the calls out in time.
# For actual rate limiting, use the `max_concurrency` parameter of evaluate/aevaluate instead.
async def evaluate_batches():
    """Evaluate the baseline experiment once per simulated batch, pausing between calls.

    Each call to ``client.aevaluate`` is given the whole experiment, not a slice of its
    runs; the batches only determine how many calls are made and where the one-minute
    pauses fall.

    Reads the module-level ``client``, ``baseline_experiment_name``,
    ``run_ids_to_evaluate`` and ``batch_size``. Stores the result of the last evaluation
    call in the module-level ``baseline_results``; that value is ``None`` when there were
    no runs and therefore no evaluation call.
    """
    # Make baseline_results accessible outside the function
    global baseline_results

    total_runs = len(run_ids_to_evaluate)
    # With no runs batch_size is 0, which is not a valid step for range()
    step = max(batch_size, 1)
    # Stays None when the loop body never executes
    batch_results = None

    for i in range(0, total_runs, step):
        batch_end_index = i + step
        print(f"Simulating processing batch {i // step + 1} (runs {i} to {min(batch_end_index, total_runs) - 1})...")

        # Evaluate the entire experiment asynchronously
        batch_results = await client.aevaluate(
            baseline_experiment_name,
            evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
        )
        print(f"Simulated batch {i // step + 1} evaluation complete.")

        # Pause only between batches, never after the last one
        if batch_end_index < total_runs:
            print("Waiting for 1 minute before simulating processing the next batch...")
            # asyncio.sleep yields to the event loop; time.sleep would block it for the whole minute
            await asyncio.sleep(60)

    print("All simulated evaluation batches processed.")
    # Only the results of the last evaluation call are kept.
    # Collect the result of each call instead if the full set is needed.
    baseline_results = batch_results

# Run the async evaluation function; the cell awaits the coroutine directly at top level
await evaluate_batches()

In [None]:
from langsmith import Client
from uuid import uuid4
client = Client()
# Apply the three evaluators to an existing experiment, referenced by its literal name.
# NOTE(review): this hard-coded name has the same shape as the names generated for
# `baseline_experiment_name` in the conversion cell (prefix + 4-character suffix).
# Presumably it was copied from an earlier session — update it, or pass
# `baseline_experiment_name`, when re-running.
baseline_results = await client.aevaluate(
    "prod-baseline-gemini-2.0-flash-e561",
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
)
# If you have pandas installed you can easily explore the results as a DataFrame:
# baseline_results.to_pandas()

View the evaluation results for experiment: 'prod-baseline-gemini-2.0-flash-e561' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=5859ffe9-59cf-4d9d-b9c5-0bd6948ef623




0it [00:00, ?it/s]

In [3]:
from langsmith import Client
from uuid import uuid4
client = Client()
# Apply the three evaluators to an existing experiment, referenced by its literal name.
# NOTE(review): this hard-coded name has the same shape as the names generated for
# `baseline_experiment_name` in the conversion cell (prefix + 4-character suffix).
# Presumably it was copied from an earlier session — update it, or pass
# `baseline_experiment_name`, when re-running.
baseline_results = await client.aevaluate(
    "prod-baseline-gemini-2.0-flash-e561",
    evaluators=[lt_280_chars, gte_3_emojis, is_grounded],
)
# If you have pandas installed you can easily explore the results as a DataFrame:
# baseline_results.to_pandas()

View the evaluation results for experiment: 'prod-baseline-gemini-2.0-flash-e561' at:
https://smith.langchain.com/o/ef9e87ab-1348-4c3e-9139-19b869acd75b/datasets/7f9df51a-56ac-4bbd-a5e1-edf2b1566101/compare?selectedSessions=5859ffe9-59cf-4d9d-b9c5-0bd6948ef623




0it [00:00, ?it/s]

---
# Extra


## configure

## environment

---
# More..

In [None]:
# @title Evaluator (from RAG eval sheet)
from typing_extensions import Annotated
from pydantic import BaseModel, Field

# Grade output schema
class CorrectnessGrade(BaseModel):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: str = Field(..., description="Explain your reasoning for the score")
    # Final verdict; this is the value the `correctness` evaluator reports as its score
    correct: bool = Field(..., description="True if the answer is correct, False otherwise.")

# Grade prompt: sent as the system message to the correctness grader
correctness_instructions = """You are a teacher grading a quiz.

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.

Avoid simply stating the correct answer at the outset."""

# Grader LLM: a Gemini chat model at temperature 0 whose output is structured as a CorrectnessGrade
grader_llm_correctness = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0, google_api_key=GEMINI_API_KEY).with_structured_output(CorrectnessGrade, method="json_mode", strict=True)


def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Grade a RAG answer against the reference answer with the grader LLM.

    Args:
        inputs: Example inputs; the question is read from ``inputs['question']``.
        outputs: Application outputs; the answer under test is ``outputs['answer']``.
        reference_outputs: Reference outputs; the ground truth is
            ``reference_outputs['answer']``.

    Returns:
        ``{"score": <bool>}`` where the boolean is the grader's ``correct`` verdict.
    """
    grading_prompt = f"""\
QUESTION: {inputs['question']}
GROUND TRUTH ANSWER: {reference_outputs['answer']}
STUDENT ANSWER: {outputs['answer']}"""

    # The rubric goes in as the system message, the material to grade as the user message
    conversation = [
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": grading_prompt},
    ]
    verdict = grader_llm_correctness.invoke(conversation)
    return {"score": verdict.correct}

In [None]:
import emoji
from pydantic import BaseModel, Field
from langchain_core.messages import convert_to_openai_messages
from langchain_google_genai import ChatGoogleGenerativeAI

# Structured-output schema for the groundedness grader: a single boolean verdict.
# NOTE(review): the field description says "majority", while grounded_instructions asks for
# the response to be "fully supported" — confirm which bar is intended.
class Grade(BaseModel):
    """Grade whether a response is supported by some context."""
    grounded: bool = Field(..., description="Is the majority of the response supported by the retrieved context?")

# System prompt for the groundedness grader
grounded_instructions = f"""You have given somebody some contextual information and asked them to write a statement grounded in that context.

Grade whether their response is fully supported by the context you have provided. \
If any meaningful part of their statement is not backed up directly by the context you provided, then their response is not grounded. \
Otherwise it is grounded."""
# Grader LLM: a Gemini chat model at temperature 0 whose output is structured as a Grade.
# Alternative using an OpenAI model instead:
# grounded_model = init_chat_model(model="gpt-4o").with_structured_output(Grade)
grounded_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0, google_api_key=GEMINI_API_KEY).with_structured_output(Grade)


def lt_280_chars(outputs: dict) -> bool:
    """Report whether the last message is short enough to be a tweet.

    Args:
        outputs: Run outputs; the conversation is read from ``outputs["messages"]``.

    Returns:
        True if the last message's content has 280 characters or fewer, else False.
    """
    converted = convert_to_openai_messages(outputs["messages"])
    tweet_text = converted[-1]['content']
    return len(tweet_text) <= 280

def gte_3_emojis(outputs: dict) -> bool:
    """Report whether the last message uses at least three emojis.

    Args:
        outputs: Run outputs; the conversation is read from ``outputs["messages"]``.

    Returns:
        True if ``emoji.emoji_list`` returns 3 or more entries for the last
        message's content, else False.
    """
    converted = convert_to_openai_messages(outputs["messages"])
    tweet_text = converted[-1]['content']
    emoji_count = len(emoji.emoji_list(tweet_text))
    return emoji_count >= 3

async def is_grounded(outputs: dict) -> dict:
    """Have the grader model judge whether the tweet is backed by the tool results.

    Args:
        outputs: Run outputs; ``outputs["messages"]`` holds the conversation, whose
            tool messages supply the context and whose last message is the tweet.

    Returns:
        ``{"score": <bool>}`` carrying the grader's ``grounded`` verdict.
    """
    messages = convert_to_openai_messages(outputs["messages"])
    # Tool message outputs are the results returned from the Tavily/DuckDuckGo tool
    tool_outputs = [entry["content"] for entry in messages if entry["role"] == "tool"]
    context = "".join("\n\n" + tool_output for tool_output in tool_outputs)
    tweet = messages[-1]["content"]
    grader_input = f"""CONTEXT PROVIDED:
    {context}

    RESPONSE GIVEN:
    {tweet}"""
    conversation = [
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": grader_input},
    ]
    grade = await grounded_model.ainvoke(conversation)
    return {"score": grade.grounded}