### Minibatching

When annotating data, sending one example per LLM call is wasteful. Minibatching packs several examples into a single request instead.


In [52]:
from sammo.runners import OpenAIChat
from sammo.data import DataTable
from sammo.base import EvaluationScore
import os
import pandas as pd

# Chat runner passed to every Output(...).run(runner, ...) call in this notebook.
runner = OpenAIChat(
    model_id="gpt-4o-mini",
    # Indexing os.environ raises KeyError right here if OPENAI_API_KEY is unset.
    api_config={"api_key": os.environ["OPENAI_API_KEY"]},
    # Cache location comes from the CACHE_FILE env var, falling back to "cache.tsv".
    cache=os.getenv("CACHE_FILE", "cache.tsv"),
    timeout=30,  # presumably seconds per request -- TODO confirm against the runner docs
)

def load_data(path="transaction_data_with_classifications.csv"):
    """Read the labeled transactions CSV and wrap it in a ``DataTable``.

    Args:
        path: CSV file to read. Defaults to the file this notebook has always
            used, so existing ``load_data()`` calls behave as before. The file
            must contain ``description`` and ``classification`` columns.

    Returns:
        A ``DataTable`` built with ``description`` as the input field,
        ``classification`` as the output field, and the task instructions
        stored under the ``instructions`` constant.
    """
    df = pd.read_csv(path)
    return DataTable.from_pandas(
        df,
        input_fields="description",
        output_fields="classification",
        constants={"instructions": "Determine how to classify these transactions."},
    )

def accuracy(y_true: DataTable, y_pred: DataTable) -> EvaluationScore:
    """Fraction of predicted labels that exactly equal the gold labels.

    Args:
        y_true: Table whose ``outputs.values`` holds the gold labels.
        y_pred: Table whose ``outputs.values`` holds the predicted labels.

    Returns:
        ``EvaluationScore`` wrapping ``n_correct / len(gold labels)``, or
        ``EvaluationScore(0.0)`` when there are no gold labels.

    Labels are paired positionally with ``zip``, so pairing stops at the
    shorter sequence. The denominator is always the number of gold labels.
    """
    gold_labels = y_true.outputs.values
    predicted_labels = y_pred.outputs.values

    n_total = len(gold_labels)
    if n_total == 0:
        # Nothing to score; dividing by zero here would raise ZeroDivisionError.
        return EvaluationScore(0.0)

    n_correct = sum(pred == gold for pred, gold in zip(predicted_labels, gold_labels))
    return EvaluationScore(n_correct / n_total)

In [53]:
# Load the labeled transactions; the bare name on the last line displays the table.
mydata = load_data()
mydata

+------------------------------------+---------------+
| input                              | output        |
| cash deposit at local branch       | Other         |
+------------------------------------+---------------+
| cash deposit at local branch       | Other         |
+------------------------------------+---------------+
| withdrew money for rent payment    | Rent          |
+------------------------------------+---------------+
| withdrew cash for weekend expenses | Other         |
+------------------------------------+---------------+
| purchased books from the bookstore | Other         |
+------------------------------------+---------------+
| tax refund deposit                 | Other         |
+------------------------------------+---------------+
| refund from clothing store         | Other         |
+------------------------------------+---------------+
| withdrew money for rent payment    | Rent          |
+------------------------------------+---------------+
| insuranc

In [54]:
from sammo.components import Output, GenerateText
from sammo.extractors import ExtractRegex
from sammo.base import Template

# Prompt template: the instructions constant, the allowed labels, then one
# "Input: ..." entry per item in `inputs`, followed by a single "Output:" cue.
# NOTE(review): the {{#each}} body has no trailing newline, so consecutive
# inputs are emitted back to back -- confirm this is intended.
labeling_prompt = GenerateText(
    Template(
        "Instructions:{{constants.instructions}}\nOutput labels: Rent, Other, Food, Entertainment, Utilities\n"
        "{{#each inputs}}Input: {{this}}{{/each}}\nOutput:"
    )
)

# NOTE(review): this re-reads the CSV that an earlier cell already loaded into `mydata`.
mydata = load_data()
# Fixed seed so the same 10 rows are drawn on every run.
sample = mydata.sample(10, seed=42)

# The regex matches any of the five label names; (?i) makes the match case-insensitive.
# minibatch_size=10 equals the sample size, so the run below uses a single
# minibatch (the progress bar reports 1/1).
labeling_outputter = Output(
    ExtractRegex(labeling_prompt, "(?i)Rent|Other|Food|Entertainment|Utilities"), minibatch_size=10
)
result = labeling_outputter.run(runner, sample)
result

minibatches[################################################################################]1/1[00:00<00:00, 1033.90it/s]


+------------------------------------+----------+
| input                              | output   |
| cash withdrawal for holiday        | Other    |
+------------------------------------+----------+
| mortgage payment                   | Rent     |
+------------------------------------+----------+
| ATM withdrawal downtown            | Other    |
+------------------------------------+----------+
| cash withdrawal for holiday        | Other    |
+------------------------------------+----------+
| refund for cancelled subscription  | Other    |
+------------------------------------+----------+
| withdrew cash for weekend expenses | Other    |
+------------------------------------+----------+
| withdrew cash for weekend expenses | Other    |
+------------------------------------+----------+
| cash deposit at local branch       | Other    |
+------------------------------------+----------+
| refund for returned electronics    | Other    |
+------------------------------------+----------+


In [55]:
# accuracy() is declared as (y_true, y_pred): gold labels first, predictions second.
accuracy(sample, result)

name    value
------  -------
score   1.0

In [56]:
# Plot the call trace attached to the first output of the minibatched run above.
result.outputs[0].plot_call_trace()

### Meta prompting

You can use meta prompts to structure the sections of a prompt.

In [57]:
from sammo.instructions import MetaPrompt, Section, Paragraph, InputData, FewshotExamples
from sammo.dataformatters import (
    QuestionAnswerFormatter,
    JSONDataFormatter
)

# Structured prompt: an "Instructions" section, an "Examples" section with
# three few-shot rows, the list of allowed labels, and finally the input data.
# NOTE(review): the few-shot rows are drawn from `mydata` with seed=43 while the
# evaluated `sample` uses seed=42; nothing here guarantees the two sets are
# disjoint -- confirm that overlap is acceptable.
mprompt = MetaPrompt(
    [
        Section("Instructions", mydata.constants["instructions"]),
        Section("Examples", FewshotExamples(mydata.sample(3, seed=43))),
        Paragraph("\nOutput labels: Rent, Other, Food, Entertainment, Utilities"),
        Paragraph(InputData()),
    ],
    # With "markdown", section titles show up as "# Instructions" / "# Examples"
    # in the rendered request printed further down.
    render_as="markdown",
    data_formatter=QuestionAnswerFormatter(["Rent", "Other", "Food", "Entertainment", "Utilities"]),
)
# with_extractor() wraps the prompt with the parser component that matches its
# data formatter. "empty_result" presumably selects what happens when parsing
# fails -- TODO confirm against the MetaPrompt docs.
mprompt_parsed = mprompt.with_extractor("empty_result")

In [58]:
# Plot the program for mprompt_parsed, i.e. the meta prompt plus the extractor added by with_extractor().
mprompt_parsed.plot_program()

In [59]:
# Run the parsed meta prompt over the 10-row sample in minibatches of 5
# (the progress bar reports 2/2), then display the first five rows.
result = Output(mprompt_parsed, minibatch_size=5, on_error="empty_result").run(
    runner, sample
)
result[:5]

minibatches[#################################################################################]2/2[00:00<00:00, 731.35it/s]


+-----------------------------------+----------+
| input                             | output   |
| cash withdrawal for holiday       | Other    |
+-----------------------------------+----------+
| mortgage payment                  | Rent     |
+-----------------------------------+----------+
| ATM withdrawal downtown           | Other    |
+-----------------------------------+----------+
| cash withdrawal for holiday       | Other    |
+-----------------------------------+----------+
| refund for cancelled subscription | Other    |
+-----------------------------------+----------+
Constants: {'instructions': 'Determine how to classify these transac...

In [60]:
# Print the first recorded LLM request to inspect the prompt as rendered by QuestionAnswerFormatter.
print(result.outputs.llm_requests[0][0])

# Instructions
Determine how to classify these transactions.

# Examples
Q[0]: credit card bill payment
A[0]: Other

Q[1]: refund from clothing store
A[1]: Other

Q[2]: paid subscription service fee
A[2]: Entertainment



Output labels: Rent, Other, Food, Entertainment, Utilities


Q[0]: cash withdrawal for holiday

Q[1]: mortgage payment

Q[2]: ATM withdrawal downtown

Q[3]: cash withdrawal for holiday

Q[4]: refund for cancelled subscription


In [40]:
# Same meta prompt, but with the data formatter swapped for JSONDataFormatter.
# clone() is called first, presumably so `mprompt` itself is left unchanged -- TODO confirm.
modified_mprompt = mprompt.clone().rebind({r"data_formatter": JSONDataFormatter()})

# Re-run on the same sample with the same minibatch and error settings as the
# QuestionAnswerFormatter run, then display the first five rows.
result = Output(
    modified_mprompt.with_extractor("empty_result"), minibatch_size=5, on_error="empty_result"
).run(runner, sample)
result[:5]

minibatches[#################################################################################]2/2[00:00<00:00, 894.79it/s]


+-----------------------------------+----------+
| input                             | output   |
| cash withdrawal for holiday       | Other    |
+-----------------------------------+----------+
| mortgage payment                  | Rent     |
+-----------------------------------+----------+
| ATM withdrawal downtown           | Other    |
+-----------------------------------+----------+
| cash withdrawal for holiday       | Other    |
+-----------------------------------+----------+
| refund for cancelled subscription | Other    |
+-----------------------------------+----------+
Constants: {'instructions': 'Determine how to classify these transac...

In [41]:
# Print the first recorded LLM request to inspect the prompt as rendered by JSONDataFormatter.
print(result.outputs.llm_requests[0][0])

# Instructions
Determine how to classify these transactions.

# Examples
[{"id": 0, "input": "credit card bill payment", "output": "Other"}, {"id": 1, "input": "refund from clothing store", "output": "Other"}, {"id": 2, "input": "paid subscription service fee", "output": "Entertainment"}]


Output labels: Rent, Other, Food, Entertainment, Utilities


[{"id": 0, "input": "cash withdrawal for holiday"}, {"id": 1, "input": "mortgage payment"}, {"id": 2, "input": "ATM withdrawal downtown"}, {"id": 3, "input": "cash withdrawal for holiday"}, {"id": 4, "input": "refund for cancelled subscription"}]


### RAG

SAMMO has built-in support for retrieval-augmented generation (RAG), which can be used to select few-shot examples by embedding similarity.

In [61]:
from sammo.runners import OpenAIEmbedding

# Embedding runner used to retrieve few-shot examples for the RAG prompt.
embedder = OpenAIEmbedding(
    model_id="text-embedding-3-small",
    # Index os.environ (as the chat runner does) so a missing key raises KeyError
    # here instead of passing api_key=None on to the client.
    api_config={"api_key": os.environ["OPENAI_API_KEY"]},
    rate_limit=10,  # presumably a cap on request rate -- TODO confirm units against the runner docs
    # Cache location comes from the EMBEDDING_FILE env var, falling back to "embeddings.tsv".
    cache=os.getenv("EMBEDDING_FILE", "embeddings.tsv"),
)

In [62]:
# Split the data roughly 90% / 10% with a fixed seed.
# NOTE(review): both sizes are truncated with int(), so they may not add up to
# total_len when it is not a multiple of 10 -- verify how random_split treats
# any remaining rows.
# NOTE(review): despite its name, d_train is the split that gets labeled further
# down; d_fewshot is the pool handed to EmbeddingFewshotExamples.
total_len = len(mydata)
d_fewshot, d_train = mydata.random_split(int(total_len * 0.9), int(total_len * 0.1), seed=42)
d_train

+----------------------------------------+---------------+
| input                                  | output        |
| insurance claim refund                 | Other         |
+----------------------------------------+---------------+
| bought coffee and snacks from the cafe | Food          |
+----------------------------------------+---------------+
| paid for streaming service             | Entertainment |
+----------------------------------------+---------------+
| cash deposit at local branch           | Other         |
+----------------------------------------+---------------+
| purchased books from the bookstore     | Entertainment |
+----------------------------------------+---------------+
| school fee payment                     | Other         |
+----------------------------------------+---------------+
| paid phone bill online                 | Utilities     |
+----------------------------------------+---------------+
| bought gardening supplies              | Other        

In [63]:
from sammo.instructions import EmbeddingFewshotExamples

# Same layout as the earlier meta prompt, except the "Examples" section is
# filled by EmbeddingFewshotExamples from the d_fewshot pool instead of a
# fixed random sample.
mprompt_rag = MetaPrompt(
    [
        Section("Instructions", mydata.constants["instructions"]),
        Section("Examples", EmbeddingFewshotExamples(
            embedder,
            d_fewshot,
            n_examples=3,
            # NOTE(review): meaning of budget="relative" is not visible here -- confirm in the SAMMO docs.
            budget="relative",
        )),
        Paragraph("\nOutput labels: Rent, Other, Food, Entertainment, Utilities"),
        Paragraph(InputData()),
    ],
    render_as="markdown",
    data_formatter=QuestionAnswerFormatter(["Rent", "Other", "Food", "Entertainment", "Utilities"]),
)

# with_extractor() wraps the prompt with the parser component that matches its data formatter.
mprompt_rag_parsed = mprompt_rag.with_extractor("empty_result")

In [64]:
# Reuse the parsed RAG prompt built in the previous cell instead of wrapping
# mprompt_rag a second time, then display the first five rows.
result = Output(mprompt_rag_parsed, minibatch_size=5, on_error="empty_result").run(
    runner, d_train
)
result[:5]

minibatches[############################################################################]20/20[00:00<00:00, 160476.20it/s]


+----------------------------------------+---------------+
| input                                  | output        |
| insurance claim refund                 | Other         |
+----------------------------------------+---------------+
| bought coffee and snacks from the cafe | Food          |
+----------------------------------------+---------------+
| paid for streaming service             | Entertainment |
+----------------------------------------+---------------+
| cash deposit at local branch           | Other         |
+----------------------------------------+---------------+
| purchased books from the bookstore     | Entertainment |
+----------------------------------------+---------------+
Constants: {'instructions': 'Determine how to classify these transac...

In [65]:
# Plot the call trace attached to the first output of the RAG run above.
result.outputs[0].plot_call_trace()