In [20]:
import dspy
import os

# Default language model for every DSPy call in this notebook: Llama 3.1 8B Instruct
# served through OpenRouter. The API key is read from the OPENROUTER_API_KEY
# environment variable, and the LM is created with cache=False.
openrouter = dspy.LM(
    "openrouter/meta-llama/llama-3.1-8b-instruct",
    api_key=os.getenv("OPENROUTER_API_KEY"),
    cache=False,
)
dspy.configure(lm=openrouter)

In [2]:
from phoenix.otel import register

# Register a Phoenix tracer provider for the "dspy-evals" project, exporting traces to
# the local collector endpoint with auto-instrumentation switched on.
register(
    project_name="dspy-evals",
    endpoint="http://localhost:6006/v1/traces",
    auto_instrument=True,
    verbose=False,
)


<phoenix.otel.otel.TracerProvider at 0x7f92849e12b0>

# Evaluation and Optimization

DSPy can run evaluations out of the box, which lets us quantitatively measure how well our AI programs perform. It also provides optimizers that use those evaluations to automatically improve a program's performance.

In [3]:
import pandas as pd

# Load the customer-service dataset, discard the "tags" column, and give the text and
# label columns the names used by the DSPy signature fields.
df = (
    pd.read_csv("Bitext_Sample_Customer_Service_Training_Dataset.csv")
    .drop(columns=["tags"])
    .rename(
        columns={
            "utterance": "customer_utterance",
            "category": "ticket_category",
        }
    )
)

df


Unnamed: 0,customer_utterance,intent,ticket_category
0,would it be possible to cancel the order I made?,cancel_order,ORDER
1,cancelling order,cancel_order,ORDER
2,I need assistance canceling the last order I h...,cancel_order,ORDER
3,problem with canceling the order I made,cancel_order,ORDER
4,I don't know how to cancel the order I made,cancel_order,ORDER
...,...,...,...
6534,I do not know what I have to do to track the r...,track_refund,REFUND
6535,check refund status,track_refund,REFUND
6536,help me check the refund status,track_refund,REFUND
6537,how can I check if there is any updates on my ...,track_refund,REFUND


In [4]:
# List the distinct ticket categories in sorted order; these are the labels the
# classifier has to choose between.
sorted(df["ticket_category"].unique())

['ACCOUNT',
 'CANCELLATION_FEE',
 'CONTACT',
 'DELIVERY',
 'FEEDBACK',
 'INVOICE',
 'NEWSLETTER',
 'ORDER',
 'PAYMENT',
 'REFUND',
 'SHIPPING_ADDRESS']

In [18]:
import dspy
from typing import Literal

# Closed set of labels the router may emit, in the same order as the sorted unique
# values of df["ticket_category"].
TicketCategory = Literal[
    "ACCOUNT", "CANCELLATION_FEE", "CONTACT", "DELIVERY",
    "FEEDBACK", "INVOICE", "NEWSLETTER", "ORDER",
    "PAYMENT", "REFUND", "SHIPPING_ADDRESS",
]

# DSPy signature: classify one customer utterance into a ticket category.
# NOTE: documented with comments rather than a docstring on purpose. DSPy uses a
# Signature's docstring as the prompt instructions, so adding one would change
# what is sent to the model.
class SupportCaseRouter(dspy.Signature):
    # Input: the raw customer message.
    customer_utterance: str = dspy.InputField()
    # Output: one of the TicketCategory literals.
    ticket_category: TicketCategory = dspy.OutputField()

# Predictor built directly from the SupportCaseRouter signature; it is evaluated as-is
# below and later handed to the optimizer.
predictor = dspy.Predict(SupportCaseRouter)

In [6]:
# Number of rows drawn from each (ticket_category, intent) group for the train and
# test splits respectively (passed as `n` to sampled_categories).
TRAIN_SIZE = 10
TEST_SIZE = 5

def sampled_categories(df, n, random_state=42):
    """Draw a stratified sample and return it together with the unsampled remainder.

    Rows are grouped by ("ticket_category", "intent") and ``n`` rows are sampled
    without replacement from every group.

    Args:
        df: DataFrame containing at least "ticket_category" and "intent" columns.
        n: Number of rows to draw from each group.
        random_state: Seed (or random-state object) forwarded to pandas sampling so
            the split is reproducible across kernel restarts. Pass ``None`` to get
            a fresh random draw on every call.

    Returns:
        A ``(data, remaining)`` tuple. ``data`` is a list of row dicts for the
        sampled rows (ordered by group); ``remaining`` is a new DataFrame holding
        the rows that were not sampled. The input frame is not modified.

    Raises:
        ValueError: Raised by pandas when a group has fewer than ``n`` rows.
    """
    # One groupby-level sample call shares a single random stream across all groups,
    # so a fixed seed pins the whole split without giving every group the same draw.
    sampled = df.groupby(["ticket_category", "intent"]).sample(
        n=n, random_state=random_state
    )

    data = sampled.to_dict(orient="records")
    # Use a new name instead of rebinding the `df` argument, keeping input and
    # output clearly distinct.
    remaining = df.drop(sampled.index)
    return data, remaining

# Take the training split first, then the test split from whatever is left, so the two
# splits cannot share rows. `df` is rebound to the remaining rows after each call.
train, df = sampled_categories(df, TRAIN_SIZE)
test, df = sampled_categories(df, TEST_SIZE)

# Wrap every record in a dspy.Example and mark "customer_utterance" as its input field.
train_examples = [
    dspy.Example(**record).with_inputs("customer_utterance")
    for record in train
]
test_examples = [
    dspy.Example(**record).with_inputs("customer_utterance")
    for record in test
]

In [21]:
def evaluate_accuracy(example, predicted, trace=None):
    """Exact-match metric on the ticket category.

    Args:
        example: Object exposing the reference label as ``ticket_category``.
        predicted: Object exposing the model's label as ``ticket_category``.
        trace: Accepted for metric-signature compatibility; not used here.

    Returns:
        The result of comparing the reference label with the predicted label
        for equality.
    """
    expected_label = example.ticket_category
    predicted_label = predicted.ticket_category
    return expected_label == predicted_label

# Reusable evaluator bound to the test split and the exact-match metric, showing a
# progress bar and a results table.
evaluate = dspy.Evaluate(
    devset=test_examples,
    metric=evaluate_accuracy,
    display_progress=True,
    display_table=True,
)

# Score the predictor before any optimization, keeping the per-example outputs.
ans = evaluate(predictor, return_outputs=True)


Average Metric: 117.00 / 135 (86.7%): 100%|██████████| 135/135 [00:18<00:00,  7.18it/s]

2025/08/06 22:03:43 INFO dspy.evaluate.evaluate: Average Metric: 117 / 135 (86.7%)





Unnamed: 0,customer_utterance,intent,example_ticket_category,pred_ticket_category,evaluate_accuracy
0,what do i need to do to create an account,create_account,ACCOUNT,ACCOUNT,✔️ [True]
1,can you help me open a new online account for my husband?,create_account,ACCOUNT,ACCOUNT,✔️ [True]
2,i dont have a bloody account i want assistance tp sign up,create_account,ACCOUNT,ACCOUNT,✔️ [True]
3,help me to create an account,create_account,ACCOUNT,ACCOUNT,✔️ [True]
4,could I create an account?,create_account,ACCOUNT,ACCOUNT,✔️ [True]
...,...,...,...,...,...
130,can you help me set another shipping address up?,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
131,I try to set up a different shipping address,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
132,set new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
133,I have to set up a new shipping address,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]


In [23]:
# Optimize the predictor with SIMBA against the exact-match metric. max_steps=4
# corresponds to the four batches reported in the log output below.
simba = dspy.SIMBA(
    max_steps=4,
    metric=evaluate_accuracy,
)

# Compilation uses only the training split; the test split is kept for evaluation.
compiled_simba = simba.compile(
    predictor,
    trainset=train_examples,
)

2025/08/06 22:09:12 INFO dspy.teleprompt.simba: Starting batch 1 of 4.
2025/08/06 22:09:22 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 192 / 192 examples: 100%|██████████| 192/192 [00:17<00:00, 11.09it/s]

2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.71875

2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_demo_
2025/08/06 22:09:39 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/08/06 22:09:39 INFO dspy.teleprompt.simba: 

2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_demo_
2025/08/06 22:09:39 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/08/06 22:09:39 INFO dspy.teleprompt.simba: 

2025/08/06 22:09:39 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #3, with max score 1.0, max-to




2025/08/06 22:09:43 INFO dspy.teleprompt.simba_utils: Advice for self: When processing the 'customer_utterance' input, consider using more robust natural language processing techniques to handle typos and unclear language. This may involve using techniques such as spell correction, sentiment analysis, or intent detection to better understand the customer's intent. Additionally, consider using more specific and targeted keywords to improve the accuracy of the ticket category classification.
2025/08/06 22:09:43 INFO dspy.teleprompt.simba: 

2025/08/06 22:09:43 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #4, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.33333333333333337.
2025/08/06 22:09:43 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2025/08/06 22:09:46 INFO dspy.teleprompt.simba_utils: Advice for self: When processing customer utterances, pay closer attention to language related to account creation, such as 'create account', 'user acco

Processed 224 / 224 examples: 100%|██████████| 224/224 [00:20<00:00, 10.68it/s]

2025/08/06 22:10:11 INFO dspy.teleprompt.simba: Scores after 1 batches: [0.75, 0.84375, 0.75, 0.84375, 0.71875, 0.84375, 0.78125], Best: 0.84375

2025/08/06 22:10:11 INFO dspy.teleprompt.simba: Starting batch 2 of 4.





2025/08/06 22:10:21 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 187 / 192 examples:  97%|█████████▋| 186/192 [00:17<00:00, 10.42it/s]



Processed 192 / 192 examples: 100%|██████████| 192/192 [00:19<00:00,  9.96it/s]


2025/08/06 22:10:41 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.875

2025/08/06 22:10:41 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/08/06 22:10:41 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_rule
2025/08/06 22:10:45 INFO dspy.teleprompt.simba_utils: Advice for self: When processing the input 'customer_utterance', the module should be more accurate in identifying the intent of the customer's utterance. It should be able to recognize that the customer is asking for help with recovering their password and map this intent to the correct ticket category, which is 'ACCOUNT'. To achieve this, the module can be fine-tuned on a larger dataset of customer utterances and their corresponding intents and ticket categories.
2025/08/06 22:10:45 INFO dspy.teleprompt.simba: 

2025/08/06 22:10:45 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #2, with max

Processed 224 / 224 examples: 100%|██████████| 224/224 [00:23<00:00,  9.50it/s]

2025/08/06 22:11:25 INFO dspy.teleprompt.simba: Scores after 2 batches: [0.9375, 0.90625, 0.9375, 0.9375, 0.90625, 0.875, 0.90625], Best: 0.9375

2025/08/06 22:11:25 INFO dspy.teleprompt.simba: Starting batch 3 of 4.





2025/08/06 22:11:37 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 192 / 192 examples: 100%|██████████| 192/192 [00:19<00:00,  9.63it/s]

2025/08/06 22:11:58 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.8697916666666666






2025/08/06 22:11:58 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/08/06 22:11:58 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_demo_
2025/08/06 22:11:58 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/08/06 22:11:58 INFO dspy.teleprompt.simba: 

2025/08/06 22:11:58 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2025/08/06 22:11:58 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule
2025/08/06 22:12:03 INFO dspy.teleprompt.simba_utils: Advice for self: When processing the input 'customer_utterance', the module should be more accurate in identifying the intent of the customer's utterance. It should be able to recognize that the customer is asking for help with recovering their password and map this intent to the correct ticket c

Processed 224 / 224 examples: 100%|██████████| 224/224 [00:21<00:00, 10.42it/s]

2025/08/06 22:12:39 INFO dspy.teleprompt.simba: Scores after 3 batches: [0.9375, 0.84375, 0.96875, 0.84375, 0.90625, 0.875, 0.875], Best: 0.96875

2025/08/06 22:12:39 INFO dspy.teleprompt.simba: Starting batch 4 of 4.





2025/08/06 22:12:51 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 192 / 192 examples: 100%|██████████| 192/192 [00:20<00:00,  9.48it/s]


2025/08/06 22:13:11 INFO dspy.teleprompt.simba: Batch 4: Baseline mini-batch score: 0.9375

2025/08/06 22:13:11 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/08/06 22:13:11 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_demo_
2025/08/06 22:13:11 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/08/06 22:13:11 INFO dspy.teleprompt.simba: 

2025/08/06 22:13:11 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2025/08/06 22:13:12 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_rule, having dropped 1 demos per predictor
2025/08/06 22:13:15 INFO dspy.teleprompt.simba_utils: Advice for self: If the module receives a customer utterance that mentions 'new profile', it should consider the context of the conversation and the customer's intent.

Processed 224 / 224 examples: 100%|██████████| 224/224 [00:23<00:00,  9.34it/s]

2025/08/06 22:13:53 INFO dspy.teleprompt.simba: Scores after 4 batches: [1.0, 0.96875, 0.96875, 0.96875, 0.90625, 0.96875, 0.96875], Best: 1.0






2025/08/06 22:13:54 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 5 programs on the full trainset.


Processed 1350 / 1350 examples: 100%|██████████| 1350/1350 [02:05<00:00, 10.71it/s]

2025/08/06 22:16:00 INFO dspy.teleprompt.simba: Final trainset scores: [0.8444444444444444, 0.8740740740740741, 0.9259259259259259, 0.9703703703703703, 0.9481481481481482], Best: 0.9703703703703703 (at index 3)








In [24]:
# Re-run the same evaluator (same test split and metric) on the SIMBA-compiled program
# so its score can be compared with the earlier run on `predictor`.
evaluate(compiled_simba)

Average Metric: 130.00 / 135 (96.3%): 100%|██████████| 135/135 [00:12<00:00, 11.04it/s]

2025/08/06 22:16:19 INFO dspy.evaluate.evaluate: Average Metric: 130 / 135 (96.3%)





Unnamed: 0,customer_utterance,intent,example_ticket_category,pred_ticket_category,evaluate_accuracy
0,what do i need to do to create an account,create_account,ACCOUNT,ACCOUNT,✔️ [True]
1,can you help me open a new online account for my husband?,create_account,ACCOUNT,ACCOUNT,✔️ [True]
2,i dont have a bloody account i want assistance tp sign up,create_account,ACCOUNT,ACCOUNT,✔️ [True]
3,help me to create an account,create_account,ACCOUNT,ACCOUNT,✔️ [True]
4,could I create an account?,create_account,ACCOUNT,ACCOUNT,✔️ [True]
...,...,...,...,...,...
130,can you help me set another shipping address up?,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
131,I try to set up a different shipping address,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
132,set new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]
133,I have to set up a new shipping address,set_up_shipping_address,SHIPPING_ADDRESS,SHIPPING_ADDRESS,✔️ [True]


96.3

In [26]:
# Persist the SIMBA-compiled program.
compiled_simba.save("simba_model.json")
# NOTE(review): `predictor` is the object that was passed INTO simba.compile(), not the
# value it returned, yet it is written to "simba.json". Presumably this is the
# un-optimized baseline — confirm the filename is intended (e.g. a "baseline" name).
predictor.save("simba.json")