# Building a Report Picker eval

Based on the [examples in openai/evals](https://github.com/openai/evals/tree/26a5191230e91953f4f2fdd74a82051fda2aba23/examples)

In [60]:
%load_ext autoreload
%autoreload 2

import datetime
import json
import os
import shlex

import pandas as pd
import requests
import yaml

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
# Construct the system prompt, similar to ../lib/openai.ts `createSystemPrompt`

# Load the prompt template and pull out its "prompt" entry, which becomes the
# opening instruction text of the system message. The encoding is pinned so
# the read does not depend on the platform's default locale.
with open("../lib/prompt_template.json", encoding="utf-8") as f:
    prompt_template_data = json.load(f)
prompt_template_prompt = prompt_template_data['prompt']

# Load the report catalogue. json.load returns plain Python objects (the
# rendered prompt shows a list of dicts with key/description/filters), not a
# DataFrame.
with open("../lib/available_reports/square.json", encoding="utf-8") as f:
    available_reports = json.load(f)

# Date injected into the prompt as "today" (presumably pinned so that eval
# runs are reproducible -- confirm).
sample_date = "2023-08-21"

# NOTE(review): interpolating `available_reports` directly embeds its Python
# repr (single-quoted strings), not JSON. Confirm this matches what
# `createSystemPrompt` sends in production; if it should be JSON, use
# json.dumps(available_reports) here instead.
system_prompt = f"""
{prompt_template_prompt}

Today's date: {sample_date}

Available reports: 
{available_reports}
"""

system_prompt

'\nYou\'re an expert at finding the best report(s) to answer a user\'s question about their business. If the question isn\'t answerable by these reports, return "NA" as the "error".\n\nToday\'s date: 2023-08-21\n\nAvailable reports: \n[{\'key\': \'SalesSummary\', \'description\': \'Sales Summary: A general overview of sales from a given time period. The report includes itemization metrics like gross sales, refunds, net sales, discounts, tips, and taxes. It also includes payment metrics like total collected (revenue by tender type) and fees.\', \'filters\': [\'startDate\', \'endDate\']}, {\'key\': \'SalesTrends\', \'description\': \'Sales Trends: Compares your daily, weekly, and yearly gross sales.\', \'filters\': [\'startDate\', \'endDate\']}, {\'key\': \'PaymentMethods\', \'description\': \'Payment Methods: A summary of the total collected and any associated fees from credit, debit and gift cards, and any other tender types. The reports include payments broken down by debit vs credit,

In [62]:
# Name of the eval; also used as the data sub-directory and YAML file name.
eval_key = "report_picker"

# Local registry lives next to the notebook (under the current working dir).
registry_path = os.path.join(os.getcwd(), ".local_registry")

# Create the two registry sub-trees: evals/ and data/<eval_key>/.
# exist_ok makes this cell safe to re-run.
for registry_subdir in (("evals",), ("data", eval_key)):
    os.makedirs(os.path.join(registry_path, *registry_subdir), exist_ok=True)

def create_chat_prompt(user_question, system_content=None):
    """Build the chat-format input for a single eval sample.

    Args:
        user_question: Text placed in the "user" message.
        system_content: Text placed in the "system" message. When omitted
            (None), the notebook-level `system_prompt` is used, which is the
            original behaviour.

    Returns:
        A two-element list of message dicts, each with "role" and "content"
        keys: the system message first, then the user message.
    """
    if system_content is None:
        # Fall back to the notebook global so existing one-argument calls
        # keep working unchanged.
        system_content = system_prompt
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_question},
    ]

# Load the question bank. The comprehension below relies on it being a mapping
# of ideal report key -> list of question strings.
# NOTE(review): yaml.load with FullLoader is more permissive than a plain data
# file needs; consider yaml.safe_load if the file only holds strings and lists.
yaml_file_path = "report_questions.yaml"
with open(yaml_file_path, 'r', encoding="utf-8") as f:
    yaml_data = yaml.load(f, Loader=yaml.FullLoader)

# Flatten to one sample per question: the chat prompt is the input and the
# report key the question was listed under is the expected ("ideal") answer.
data = [
    {"input": create_chat_prompt(question), "ideal": key}
    for key, questions in yaml_data.items()
    for question in questions
]

# Convert to a DataFrame so the samples can be written as JSON Lines.
df = pd.DataFrame(data)

# Write one JSON record per line into the local registry's data directory.
json_output_path = os.path.join(registry_path, "data", eval_key, "samples.jsonl")
df.to_json(json_output_path, orient="records", lines=True)

# Preview the first few samples.
display(df.head())

Unnamed: 0,input,ideal
0,"[{'role': 'system', 'content': ' You're an exp...",SalesSummary
1,"[{'role': 'system', 'content': ' You're an exp...",SalesSummary
2,"[{'role': 'system', 'content': ' You're an exp...",SalesSummary
3,"[{'role': 'system', 'content': ' You're an exp...",SalesTrends
4,"[{'role': 'system', 'content': ' You're an exp...",SalesTrends


In [63]:
# Location of the samples file inside the local registry.
samples_path = os.path.join(registry_path, "data", eval_key, "samples.jsonl")

# Registry entry for the eval: the top-level key points at a versioned id, and
# the versioned id names the eval class plus its arguments.
eval_spec = {
    eval_key: {
        "id": f"{eval_key}.test.v1",
        "metrics": ["accuracy"],
    },
    f"{eval_key}.test.v1": {
        "class": "evals.elsuite.basic.match:Match",
        "args": {"samples_jsonl": samples_path},
    },
}

# Serialise with the YAML library rather than string formatting so that a
# samples path containing YAML-significant characters (e.g. " #" or ": ") is
# quoted correctly. sort_keys=False keeps the entries in the order above.
eval_yaml = yaml.safe_dump(eval_spec, sort_keys=False)

with open(os.path.join(registry_path, "evals", f"{eval_key}.yaml"), "w", encoding="utf-8") as f:
    f.write(eval_yaml)

## Run the `oaieval` CLI tool

In [65]:
# Absolute path to the local registry, passed to the CLI below.
abs_registry_path = os.path.abspath(registry_path)

# Results go under .results/ in the working directory, one timestamped
# .jsonl file per run so earlier runs are not overwritten.
results_dir = os.path.join(os.getcwd(), ".results")
os.makedirs(results_dir, exist_ok=True)
ts = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
results_file_name = f"{eval_key}-{ts}.jsonl"
abs_record_path = os.path.join(results_dir, results_file_name)

# Run the eval with paths passed in.
# Other flags to use: --debug, --dry-run, --max_samples 5, --registry_path, --record_path
# Every interpolated value is shell-quoted so a working directory containing
# spaces or shell metacharacters does not break (or alter) the command, and
# the eval name comes from `eval_key` instead of being repeated as a literal.
oaieval_command = (
    f"oaieval gpt-3.5-turbo {shlex.quote(eval_key)} --max_samples 5 "
    f"--registry_path {shlex.quote(abs_registry_path)} "
    f"--record_path {shlex.quote(abs_record_path)}"
)
get_ipython().system(oaieval_command)

[2023-08-21 16:42:44,617] [registry.py:249] Loading registry from /Users/briansigafoos/Development/briansigafoos/report-picker-gpt/evals/venv/lib/python3.11/site-packages/evals/registry/evals
[2023-08-21 16:42:44,658] [registry.py:249] Loading registry from /Users/briansigafoos/.evals/evals
[2023-08-21 16:42:44,658] [registry.py:249] Loading registry from /Users/briansigafoos/Development/briansigafoos/report-picker-gpt/evals/.local_registry/evals
[2023-08-21 16:42:44,659] [registry.py:135] Looking for report_picker
[2023-08-21 16:42:44,659] [oaieval.py:110] Run started: 230821204244MA36LDYO
[2023-08-21 16:42:44,660] [data.py:75] Fetching /Users/briansigafoos/Development/briansigafoos/report-picker-gpt/evals/.local_registry/data/report_picker/samples.jsonl
[2023-08-21 16:42:44,660] [eval.py:34] Evaluating 5 samples
[2023-08-21 16:42:44,666] [eval.py:153] Running in threaded mode with 10 threads!
  0%|                                                     | 0/5 [00:00<?, ?it/s][