# GSM8K with Langfun's Evaluation Framework

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/langfun/blob/main/docs/notebooks/langfun_eval.ipynb)

This notebook demonstrates how to build LLM benchmarks using Langfun's evaluation framework. By the end of this notebook, you will learn:

- How to create a new benchmark.
- How to execute a benchmark.
- How to view benchmark details.

We recommend opening this notebook in Colab, as GitHub's file preview does not render generated HTML views correctly.

In [None]:
!pip install langfun --pre

In [None]:
from typing import Any

import langfun as lf
import pandas as pd
import pyglove as pg

# Evaluation Input Functor
# NOTE: The `pg.functor` decorator generates a `pg.Object` subclass with a
# `__call__` method. It is commonly used for early or partial binding.
@pg.functor()
def gsm8k_inputs(num_examples=10):
  """Loads the first `num_examples` problems of the GSM8K test split.

  Args:
    num_examples: Maximum number of examples to return. Values larger than the
      split size return the whole split; values <= 0 return an empty list.

  Returns:
    A list of `pg.Dict`, each with a `question` (str) and the reference
    `answer` (int) parsed from the text following the `#### ` marker.
  """
  test_split = 'main/test-00000-of-00001.parquet'
  df = pd.read_parquet('hf://datasets/openai/gsm8k/' + test_split)
  # `head` clamps to the number of available rows, so an oversized
  # `num_examples` no longer fails with a KeyError on a missing row label.
  rows = df.head(max(num_examples, 0))
  return [
      pg.Dict(
          question=question,
          # Reference answers may contain thousands separators ('1,000'),
          # which `int` rejects, so drop the commas before converting.
          answer=int(solution.split('#### ')[1].replace(',', '')),
      )
      for question, solution in zip(rows['question'], rows['answer'])
  ]


#
# Output schema for lf.query.
#
class Answer(pg.Object):
  # Structured output requested from the model via `lf.query`. The field names
  # and types below are shown to the model as the OUTPUT_TYPE schema (see the
  # rendered request in the example view), so keep them descriptive.

  # Intermediate reasoning steps, one string per step.
  step_by_step_thoughts: list[str]
  # Final numeric result; `Gsm8k.Match` compares it with the reference answer.
  final_answer: int


#
# Example of an evaluation class.
#
class Gsm8k(lf.eval.v2.Evaluation):
  """GSM8K evaluation: asks `lm` for a structured `Answer` to each question."""

  # `inputs` is a field declared in `lf.eval.v2.Evaluation`. Here it is bound
  # to the input functor with only 2 examples (presumably to keep the demo
  # quick; raise `num_examples` for a real run).
  inputs = gsm8k_inputs(num_examples=2)

  # User argument: the language model under evaluation, supplied when the
  # benchmark is constructed.
  lm: lf.LanguageModel

  #
  # Definition of the metric class: an example counts as a match when the
  # reference answer equals the model's final answer.
  #
  class Match(lf.eval.v2.metrics.Match):

    def match(self, example_input: pg.Dict, output: Answer) -> bool:
      """Returns True if `output.final_answer` equals the reference answer."""
      return example_input.answer == output.final_answer

  # Specification for which metrics to use.
  metrics = [Match()]

  # The processing logic, which maps an item from the input functor to an
  # output that will be used for metric computation.
  def process(
      self,
      example: lf.eval.v2.Example
  ) -> tuple[
      Answer,              # Output for metric to process.
      dict[str, Any]       # Metadata for display.
  ]:
    """Queries `self.lm` with the example's question.

    Args:
      example: The example to process; `example.input.question` is sent to the
        model.

    Returns:
      A tuple of the `Answer` returned by `lf.query` and a metadata dict whose
      `queries` entry holds the queries recorded by `lf.track_queries`.
    """
    # Record every LM query issued inside this block so that it can be
    # returned as metadata alongside the answer.
    with lf.track_queries() as queries:
      answer = lf.query(
          example.input.question,
          Answer,
          lm=self.lm
      )
    return answer, dict(queries=queries)


In [None]:
import os

# API key for the OpenAI models used below (presumably read from the
# environment by langfun; replace the placeholder before running).
os.environ['OPENAI_API_KEY'] = '<Update this field with your OpenAI key>'

# Directory under which the run's output files are written.
root_dir = '/tmp/test_run'

# Passing the candidate models through `pg.oneof` enables benchmark sweeping:
# multiple evaluations, one per model, run in parallel.
benchmark = Gsm8k(
    lm=pg.oneof([lf.llms.Gpt4oMini(), lf.llms.GptO1Mini()]),
)
run = benchmark.run(root_dir, 'new')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Study the evaluation details of a dataset

In [None]:
# Show the details of the first leaf evaluation of the sweep for the run
# started above, as a static (non-interactive), non-card view.
pg.view(
    benchmark.leaf_nodes[0],
    extra_flags={
        'current_run': run,
        'interactive': False,
        'card_view': False,
    },
)

0
DefinitionMetric: matchLogs
"Gsm8k(  plugins = [],  inputs = gsm8k_inputs(  num_examples = 2  ),  metrics = [  0 : Match(  name = 'match',  oop_errors = 0.0%,  non_oop_errors = 0.0%,  matches = 100.0%,  mismatches = 0.0%  )  ],  max_workers = 32,  lm = Gpt4oMini(  sampling_options = LMSamplingOptions(  temperature = None,  max_tokens = None,  n = 1,  top_k = 40,  top_p = None,  stop = None,  random_seed = None,  logprobs = False,  top_logprobs = None  ),  cache = None,  max_concurrency = None,  timeout = 120.0,  max_attempts = 5,  retry_interval = (5, 60),  exponential_backoff = True,  max_retry_interval = 300,  debug = False,  api_endpoint = 'https://api.openai.com/v1/chat/completions',  model = 'gpt-4o-mini',  headers = None,  multimodal = True,  api_key = None,  organization = None,  project = None  ) )M__main__.Gsm8k.Match100.0%matches: 100.0% (2/2)0.0%mismatches: 0.0% (0/2)0.0%oop_errors: 0.0% (0/2)0.0%non_oop_errors: 0.0% (0/2)matchesmismatchesoop_errorsnon_oop_errors2118:18:23 INFO - Updated '/tmp/test_run/run_20250122_6/Gsm8k/40316ddb/index.html' in 0.26 seconds. 18:18:24 INFO - Found 0 checkpoint files to load. 18:18:24 INFO - No examples are loaded from checkpoint files. 2 examples will be processed from scratch. Example IDs: {1, 2}. 18:18:24 INFO - Starting evaluation 'Gsm8k@40316ddb' with 2 examples to evaluate. 18:18:28 INFO - Example 2 checkpointed to /tmp/test_run/run_20250122_6/Gsm8k/40316ddb/checkpoint.bagz. 18:18:29 INFO - Example 1 checkpointed to /tmp/test_run/run_20250122_6/Gsm8k/40316ddb/checkpoint.bagz. 18:18:29 INFO - 2 examples are checkpointed to /tmp/test_run/run_20250122_6/Gsm8k/40316ddb/checkpoint.bagz. 18:18:29 INFO - Gsm8k@40316ddb completed with 2 examples evaluated (0 from checkpoint, 2 newly processed). 18:18:29 INFO - Updated '/tmp/test_run/run_20250122_6/Gsm8k/40316ddb/index.html' in 0.26 seconds. 18:18:29 INFO - '2.html' generated in 1.27 seconds. 18:18:30 INFO - '1.html' generated in 1.28 seconds."

0
matchesmismatchesoop_errorsnon_oop_errors
21


# View a detailed example

In [None]:
# ID of the evaluated example to inspect. The run log for this notebook lists
# example IDs {1, 2}, so valid values here are presumably 1..num_examples.
example_id = 1  # @param int
# Look the example up in the first leaf evaluation's state and render its
# input, output and metadata.
pg.view(benchmark.leaf_nodes[0].state.evaluated_examples[example_id])

0
InputOutputOutput Metadata
"Dict(...){  'question': ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day..."",  'answer': 18 }questioninput.questionstr""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day...""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?answerinput.answerint1818Answer(...)Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.'  ],  final_answer=18 )step_by_step_thoughtsoutput.step_by_step_thoughtsList(...)[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.' 
]0output.step_by_step_thoughts[0]""Janet's ducks lay 16 eggs per day.""1output.step_by_step_thoughts[1]'She eats 3 eggs for breakfast.'2output.step_by_step_thoughts[2]'She uses 4 eggs for baking muffins.'3output.step_by_step_thoughts[3]'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'4output.step_by_step_thoughts[4]'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'5output.step_by_step_thoughts[5]'She sells each egg for $2.'6output.step_by_step_thoughts[6]'Total money made = 9 eggs * $2/egg = $18.'final_answeroutput.final_answerint1818dict(...){  'queries': [  QueryInvocation(  input=Ref(  value = Template(  template_str = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"",  clean = True  )  ),  schema=Schema(  spec=Object(  Answer  )  ),  lm_response=AIMessage(  text='```python\nAnswer(\n step_by_step_thoughts=[\n ""Janet\'s ducks lay 16 eggs per day."",\n ""She eats 3 eggs for breakfast."",\n ""She uses 4 eggs for baking muffins."",\n ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",\n ...',  sender='AI',  metadata={},  tags=[]  ),  lm=Ref(  value = Gpt4oMini(  sampling_options = LMSamplingOptions(  temperature = None,  max_tokens = None,  n = 1,  top_k = 40,  top_p = None,  stop = None,  random_seed = None,  logprobs = False,  top_logprobs = None  ),  cache = ContextualAttribute(  type = None,  default = None  ),  max_concurrency = None,  timeout = 120.0,  max_attempts = 5,  retry_interval = (5, 60),  exponential_backoff = True,  max_retry_interval = 300,  debug = False,  api_endpoint = 'https://api.openai.com/v1/chat/completions',  model = 'gpt-4o-mini',  headers = None,  multimodal = True,  api_key = None,  organization = None,  project = None  )  ),  examples=[],  usage_summary=UsageSummary(  
cached=AggregatedUsage(  total=LMSamplingUsage(  prompt_tokens=0,  completion_tokens=0,  total_tokens=0,  num_requests=0,  estimated_cost=0.0,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=0.0,  errors={}  )  ),  breakdown={}  ),  uncached=AggregatedUsage(  total=LMSamplingUsage(  prompt_tokens=180,  completion_tokens=136,  total_tokens=316,  num_requests=1,  estimated_cost=0.00010859999999999998,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=2.882235288619995,  errors={}  )  ),  breakdown={  'OpenAI(gpt-4o-mini)': LMSamplingUsage(  prompt_tokens=180,  completion_tokens=136,  total_tokens=316,  num_requests=1,  estimated_cost=0.00010859999999999998,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=2.882235288619995,  errors={}  )  )  }  )  ),  start_time=1737569905.6691015,  end_time=1737569908.6945176  )  ] }queriesmetadata.querieslist(...)[  QueryInvocation(  input=Ref(  value = Template(  template_str = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?"",  clean = True  )  ),  schema=Schema(  spec=Object(  Answer  )  ),  lm_response=AIMessage(  text='```python\nAnswer(\n step_by_step_thoughts=[\n ""Janet\'s ducks lay 16 eggs per day."",\n ""She eats 3 eggs for breakfast."",\n ""She uses 4 eggs for baking muffins."",\n ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",\n ...',  sender='AI',  metadata={},  tags=[]  ),  lm=Ref(  value = Gpt4oMini(  sampling_options = LMSamplingOptions(  temperature = None,  max_tokens = None,  n = 1,  top_k = 40,  top_p = None,  stop = None,  random_seed = None,  logprobs = False,  top_logprobs = None  ),  cache = ContextualAttribute(  type = None,  default = None  ),  max_concurrency = None,  timeout = 120.0,  max_attempts = 5,  retry_interval = (5, 60),  exponential_backoff = True,  max_retry_interval = 300,  debug = False,  api_endpoint = 'https://api.openai.com/v1/chat/completions',  model = 'gpt-4o-mini',  headers = None,  multimodal = True,  api_key = None,  organization = None,  project = None  )  ),  examples=[],  usage_summary=UsageSummary(  cached=AggregatedUsage(  total=LMSamplingUsage(  prompt_tokens=0,  completion_tokens=0,  total_tokens=0,  num_requests=0,  estimated_cost=0.0,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=0.0,  errors={}  )  ),  breakdown={}  ),  uncached=AggregatedUsage(  total=LMSamplingUsage(  prompt_tokens=180,  completion_tokens=136,  total_tokens=316,  num_requests=1,  estimated_cost=0.00010859999999999998,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=2.882235288619995,  errors={}  )  ),  breakdown={  'OpenAI(gpt-4o-mini)': LMSamplingUsage(  prompt_tokens=180,  completion_tokens=136,  total_tokens=316,  num_requests=1,  estimated_cost=0.00010859999999999998,  retry_stats=RetryStats(  num_occurences=0,  total_wait_interval=0.0,  total_call_interval=2.882235288619995,  
errors={}  )  )  }  )  ),  start_time=1737569905.6691015,  end_time=1737569908.6945176  ) ]0metadata.queries[0]lf.querylm=OpenAI(gpt-4o-mini)Gpt4oMini()3 seconds0.000UsageSummary(  cached = AggregatedUsage(  total = LMSamplingUsage(  prompt_tokens = 0,  completion_tokens = 0,  total_tokens = 0,  num_requests = 0,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 0.000,  errors = {}  )  ),  breakdown = {}  ),  uncached = AggregatedUsage(  total = LMSamplingUsage(  prompt_tokens = 180,  completion_tokens = 136,  total_tokens = 316,  num_requests = 1,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 2.882,  errors = {}  )  ),  breakdown = {  OpenAI(gpt-4o-mini) = LMSamplingUsage(  prompt_tokens = 180,  completion_tokens = 136,  total_tokens = 316,  num_requests = 1,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 2.882,  errors = {}  )  )  }  ) )inputoutputschemalm_requestlm_responseTemplate(...)Template(  template_str=""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day..."",  clean=True )template_strvariablesJanet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?Answer(...)Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.'  ],  final_answer=18 )step_by_step_thoughtsstep_by_step_thoughtsList(...)[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.' ]0step_by_step_thoughts[0]""Janet's ducks lay 16 eggs per day.""1step_by_step_thoughts[1]'She eats 3 eggs for breakfast.'2step_by_step_thoughts[2]'She uses 4 eggs for baking muffins.'3step_by_step_thoughts[3]'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'4step_by_step_thoughts[4]'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'5step_by_step_thoughts[5]'She sells each egg for $2.'6step_by_step_thoughts[6]'Total money made = 9 eggs * $2/egg = $18.'final_answerfinal_answerint1818Schema(...)Schema(  spec=Object(  Answer  ) )Answer ```python class Answer:  step_by_step_thoughts: list[str]  final_answer: int ```UserMessage(...)UserMessage(  text='Please respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE.\n\nINPUT_OBJECT:\n 1 + 1 =\n\nOUTPUT_TYPE:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\nOUTPUT_OBJECT:\n ```python\n Answer(\n final_answer=2\n )\n ```...',  sender='User',  metadata={  'mapping_template': Ref(  value = Template(  template_str = '{%- if example.context -%}\n{{ context_title}}:\n{{ example.context | indent(2, True)}}\n\n{% endif -%}\n\n{{ input_title }}:\n{{ example.input_repr(protocol, 
compact=False) | indent(2, True) }}\n\n{% if example.schema -%}\n{{ schema_title }}:\n{{ example.schema_repr(protocol) | indent(2, True) }}\n\n{% endif -%}\n\n{{ output_title }}:\n{%- if example.has_output %}\n{{ example.output_repr(protocol, compact=False) | indent(2, True) }}\n{% endif -%}',  clean = True,  context_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  protocol = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  example = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'examples': None,  'preamble': Ref(  value = LangFunc(  template_str = 'Please respond to the last {{ input_title }} with {{ output_title }} according to {{ schema_title }}.\n\n{{ input_title }}:\n 1 + 1 =\n\n{{ schema_title }}:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\n{{ output_title }}:\n ```python\n Answer(\n final_answer=2\n )\n ```',  clean = True,  lm = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'mapping_request': Ref(  value = MappingExample(  input = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?"",  output = MISSING(Any),  schema = Ref(  value = Schema(  spec = Object(  Answer  )  )  ),  context = None,  metadata = {}  )  ),  'input': Ref(  value = UserMessage(  text = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"",  sender = 'User',  metadata = {},  tags = [  0 : 'rendered'  ]  )  ),  'schema': Ref(  value = Schema(  spec = Object(  Answer  )  )  )  },  tags=[  'rendered'  ] )renderedPlease respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE. INPUT_OBJECT:  1 + 1 = OUTPUT_TYPE:  Answer  ```python  class Answer:  final_answer: int  ``` OUTPUT_OBJECT:  ```python  Answer(  final_answer=2  )  ``` INPUT_OBJECT:  Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? 
OUTPUT_TYPE:  Answer  ```python  class Answer:  step_by_step_thoughts: list[str]  final_answer: int  ``` OUTPUT_OBJECT:AIMessage(...)AIMessage(  text='```python\nAnswer(\n step_by_step_thoughts=[\n ""Janet\'s ducks lay 16 eggs per day."",\n ""She eats 3 eggs for breakfast."",\n ""She uses 4 eggs for baking muffins."",\n ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",\n ...',  sender='AI',  metadata={},  tags=[] )```python Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  ""She eats 3 eggs for breakfast."",  ""She uses 4 eggs for baking muffins."",  ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",  ""Eggs left to sell = 16 (total) - 7 (used) = 9 eggs."",  ""She sells each egg for $2."",  ""Total money made = 9 eggs * $2/egg = $18.""  ],  final_answer=18 ) ```"

0,1
0output.step_by_step_thoughts[0],"""Janet's ducks lay 16 eggs per day."""
1output.step_by_step_thoughts[1],'She eats 3 eggs for breakfast.'
2output.step_by_step_thoughts[2],'She uses 4 eggs for baking muffins.'
3output.step_by_step_thoughts[3],'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'
4output.step_by_step_thoughts[4],'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'
5output.step_by_step_thoughts[5],'She sells each egg for $2.'
6output.step_by_step_thoughts[6],'Total money made = 9 eggs * $2/egg = $18.'

0,1
0metadata.queries[0],"lf.querylm=OpenAI(gpt-4o-mini)Gpt4oMini()3 seconds0.000UsageSummary(  cached = AggregatedUsage(  total = LMSamplingUsage(  prompt_tokens = 0,  completion_tokens = 0,  total_tokens = 0,  num_requests = 0,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 0.000,  errors = {}  )  ),  breakdown = {}  ),  uncached = AggregatedUsage(  total = LMSamplingUsage(  prompt_tokens = 180,  completion_tokens = 136,  total_tokens = 316,  num_requests = 1,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 2.882,  errors = {}  )  ),  breakdown = {  OpenAI(gpt-4o-mini) = LMSamplingUsage(  prompt_tokens = 180,  completion_tokens = 136,  total_tokens = 316,  num_requests = 1,  estimated_cost = 0.000,  retry_stats = RetryStats(  num_occurences = 0,  total_wait_interval = 0.000,  total_call_interval = 2.882,  errors = {}  )  )  }  ) )inputoutputschemalm_requestlm_responseTemplate(...)Template(  template_str=""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day..."",  clean=True )template_strvariablesJanet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?Answer(...)Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.'  ],  final_answer=18 )step_by_step_thoughtsstep_by_step_thoughtsList(...)[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.' ]0step_by_step_thoughts[0]""Janet's ducks lay 16 eggs per day.""1step_by_step_thoughts[1]'She eats 3 eggs for breakfast.'2step_by_step_thoughts[2]'She uses 4 eggs for baking muffins.'3step_by_step_thoughts[3]'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'4step_by_step_thoughts[4]'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'5step_by_step_thoughts[5]'She sells each egg for $2.'6step_by_step_thoughts[6]'Total money made = 9 eggs * $2/egg = $18.'final_answerfinal_answerint1818Schema(...)Schema(  spec=Object(  Answer  ) )Answer ```python class Answer:  step_by_step_thoughts: list[str]  final_answer: int ```UserMessage(...)UserMessage(  text='Please respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE.\n\nINPUT_OBJECT:\n 1 + 1 =\n\nOUTPUT_TYPE:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\nOUTPUT_OBJECT:\n ```python\n Answer(\n final_answer=2\n )\n ```...',  sender='User',  metadata={  'mapping_template': Ref(  value = Template(  template_str = '{%- if example.context -%}\n{{ context_title}}:\n{{ example.context | indent(2, True)}}\n\n{% endif -%}\n\n{{ input_title }}:\n{{ example.input_repr(protocol, 
compact=False) | indent(2, True) }}\n\n{% if example.schema -%}\n{{ schema_title }}:\n{{ example.schema_repr(protocol) | indent(2, True) }}\n\n{% endif -%}\n\n{{ output_title }}:\n{%- if example.has_output %}\n{{ example.output_repr(protocol, compact=False) | indent(2, True) }}\n{% endif -%}',  clean = True,  context_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  protocol = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  example = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'examples': None,  'preamble': Ref(  value = LangFunc(  template_str = 'Please respond to the last {{ input_title }} with {{ output_title }} according to {{ schema_title }}.\n\n{{ input_title }}:\n 1 + 1 =\n\n{{ schema_title }}:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\n{{ output_title }}:\n ```python\n Answer(\n final_answer=2\n )\n ```',  clean = True,  lm = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'mapping_request': Ref(  value = MappingExample(  input = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?"",  output = MISSING(Any),  schema = Ref(  value = Schema(  spec = Object(  Answer  )  )  ),  context = None,  metadata = {}  )  ),  'input': Ref(  value = UserMessage(  text = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"",  sender = 'User',  metadata = {},  tags = [  0 : 'rendered'  ]  )  ),  'schema': Ref(  value = Schema(  spec = Object(  Answer  )  )  )  },  tags=[  'rendered'  ] )renderedPlease respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE. INPUT_OBJECT:  1 + 1 = OUTPUT_TYPE:  Answer  ```python  class Answer:  final_answer: int  ``` OUTPUT_OBJECT:  ```python  Answer(  final_answer=2  )  ``` INPUT_OBJECT:  Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? 
OUTPUT_TYPE:  Answer  ```python  class Answer:  step_by_step_thoughts: list[str]  final_answer: int  ``` OUTPUT_OBJECT:AIMessage(...)AIMessage(  text='```python\nAnswer(\n step_by_step_thoughts=[\n ""Janet\'s ducks lay 16 eggs per day."",\n ""She eats 3 eggs for breakfast."",\n ""She uses 4 eggs for baking muffins."",\n ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",\n ...',  sender='AI',  metadata={},  tags=[] )```python Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  ""She eats 3 eggs for breakfast."",  ""She uses 4 eggs for baking muffins."",  ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",  ""Eggs left to sell = 16 (total) - 7 (used) = 9 eggs."",  ""She sells each egg for $2."",  ""Total money made = 9 eggs * $2/egg = $18.""  ],  final_answer=18 ) ```"

0
inputoutputschemalm_requestlm_response
"Template(...)Template(  template_str=""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day..."",  clean=True )template_strvariablesJanet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?Answer(...)Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.'  ],  final_answer=18 )step_by_step_thoughtsstep_by_step_thoughtsList(...)[  ""Janet's ducks lay 16 eggs per day."",  'She eats 3 eggs for breakfast.',  'She uses 4 eggs for baking muffins.',  'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.',  'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.',  'She sells each egg for $2.',  'Total money made = 9 eggs * $2/egg = $18.' 
]0step_by_step_thoughts[0]""Janet's ducks lay 16 eggs per day.""1step_by_step_thoughts[1]'She eats 3 eggs for breakfast.'2step_by_step_thoughts[2]'She uses 4 eggs for baking muffins.'3step_by_step_thoughts[3]'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'4step_by_step_thoughts[4]'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'5step_by_step_thoughts[5]'She sells each egg for $2.'6step_by_step_thoughts[6]'Total money made = 9 eggs * $2/egg = $18.'final_answerfinal_answerint1818Schema(...)Schema(  spec=Object(  Answer  ) )Answer ```python class Answer:  step_by_step_thoughts: list[str]  final_answer: int ```UserMessage(...)UserMessage(  text='Please respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE.\n\nINPUT_OBJECT:\n 1 + 1 =\n\nOUTPUT_TYPE:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\nOUTPUT_OBJECT:\n ```python\n Answer(\n final_answer=2\n )\n ```...',  sender='User',  metadata={  'mapping_template': Ref(  value = Template(  template_str = '{%- if example.context -%}\n{{ context_title}}:\n{{ example.context | indent(2, True)}}\n\n{% endif -%}\n\n{{ input_title }}:\n{{ example.input_repr(protocol, compact=False) | indent(2, True) }}\n\n{% if example.schema -%}\n{{ schema_title }}:\n{{ example.schema_repr(protocol) | indent(2, True) }}\n\n{% endif -%}\n\n{{ output_title }}:\n{%- if example.has_output %}\n{{ example.output_repr(protocol, compact=False) | indent(2, True) }}\n{% endif -%}',  clean = True,  context_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  protocol = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  example = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'examples': 
None,  'preamble': Ref(  value = LangFunc(  template_str = 'Please respond to the last {{ input_title }} with {{ output_title }} according to {{ schema_title }}.\n\n{{ input_title }}:\n 1 + 1 =\n\n{{ schema_title }}:\n Answer\n\n ```python\n class Answer:\n final_answer: int\n ```\n\n{{ output_title }}:\n ```python\n Answer(\n final_answer=2\n )\n ```',  clean = True,  lm = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  input_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  output_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  ),  schema_title = ContextualAttribute(  type = None,  default = (MISSING_VALUE)  )  )  ),  'mapping_request': Ref(  value = MappingExample(  input = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"",  output = MISSING(Any),  schema = Ref(  value = Schema(  spec = Object(  Answer  )  )  ),  context = None,  metadata = {}  )  ),  'input': Ref(  value = UserMessage(  text = ""Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"",  sender = 'User',  metadata = {},  tags = [  0 : 'rendered'  ]  )  ),  'schema': Ref(  value = Schema(  spec = Object(  Answer  )  )  )  },  tags=[  'rendered'  ] )renderedPlease respond to the last INPUT_OBJECT with OUTPUT_OBJECT according to OUTPUT_TYPE. INPUT_OBJECT:  1 + 1 = OUTPUT_TYPE:  Answer  ```python  class Answer:  final_answer: int  ``` OUTPUT_OBJECT:  ```python  Answer(  final_answer=2  )  ``` INPUT_OBJECT:  Janet’s ducks lay 16 eggs per day. 
She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? OUTPUT_TYPE:  Answer  ```python  class Answer:  step_by_step_thoughts: list[str]  final_answer: int  ``` OUTPUT_OBJECT:AIMessage(...)AIMessage(  text='```python\nAnswer(\n step_by_step_thoughts=[\n ""Janet\'s ducks lay 16 eggs per day."",\n ""She eats 3 eggs for breakfast."",\n ""She uses 4 eggs for baking muffins."",\n ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",\n ...',  sender='AI',  metadata={},  tags=[] )```python Answer(  step_by_step_thoughts=[  ""Janet's ducks lay 16 eggs per day."",  ""She eats 3 eggs for breakfast."",  ""She uses 4 eggs for baking muffins."",  ""Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs."",  ""Eggs left to sell = 16 (total) - 7 (used) = 9 eggs."",  ""She sells each egg for $2."",  ""Total money made = 9 eggs * $2/egg = $18.""  ],  final_answer=18 ) ```"

0
template_strvariables
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

0,1
0step_by_step_thoughts[0],"""Janet's ducks lay 16 eggs per day."""
1step_by_step_thoughts[1],'She eats 3 eggs for breakfast.'
2step_by_step_thoughts[2],'She uses 4 eggs for baking muffins.'
3step_by_step_thoughts[3],'Total eggs used per day = 3 (breakfast) + 4 (baking) = 7 eggs.'
4step_by_step_thoughts[4],'Eggs left to sell = 16 (total) - 7 (used) = 9 eggs.'
5step_by_step_thoughts[5],'She sells each egg for $2.'
6step_by_step_thoughts[6],'Total money made = 9 eggs * $2/egg = $18.'
