In [9]:
import dspy

# Language model used by every DSPy module in this notebook.
# cache=False is passed so repeated runs do not reuse cached completions.
openai = dspy.LM(
    model="openai/gpt-4.1-nano",
    cache=False,
)

# Register the model as DSPy's default LM.
dspy.configure(lm=openai)

In [8]:
from phoenix.otel import register
from openinference.instrumentation.dspy import DSPyInstrumentor

# Point the Phoenix OpenTelemetry exporter at the collector on localhost:6006
# and file the traces under the "dspy-demo" project.
register(
    project_name="dspy-demo",
    endpoint="http://localhost:6006/v1/traces",
    verbose=False,
)

# Instrument DSPy so its calls show up as traces; the dependency check is skipped.
DSPyInstrumentor().instrument(skip_dep_check=True)

# Evaluation and Optimization

DSPy can run evaluations out of the box, which lets us quantitatively measure how well our AI programs perform. DSPy also provides optimizers that can be used alongside these evaluations to automatically improve the performance of our AI programs.

In [4]:
from csv import DictReader

# Every CSV row as a dict keyed by the header names.
data = []

# Distinct labels seen while reading: the categories on their own, and each
# "category-intent" pair.
categories = set()
mix_intent = set()

# newline="" lets the csv module do its own newline handling, as its docs
# require for file objects (matters for quoted fields with embedded newlines).
with open("Bitext_Sample_Customer_Service_Training_Dataset.csv", "r", newline="") as f:
    for row in DictReader(f):
        data.append(row)
        categories.add(row["category"])
        # Single quotes inside the replacement fields: reusing the outer quote
        # character in an f-string is a SyntaxError before Python 3.12.
        mix_intent.add(f"{row['category']}-{row['intent']}")

# Display the category set; the recorded output below this cell shows it.
categories



{'ACCOUNT',
 'CANCELLATION_FEE',
 'CONTACT',
 'DELIVERY',
 'FEEDBACK',
 'INVOICE',
 'NEWSLETTER',
 'ORDER',
 'PAYMENT',
 'REFUND',
 'SHIPPING_ADDRESS'}

In [40]:
import dspy
from typing import Literal

# Closed set of routing labels; these are the categories found in the dataset.
TicketCategory = Literal[
    "ACCOUNT",
    "CANCELLATION_FEE",
    "CONTACT",
    "DELIVERY",
    "FEEDBACK",
    "INVOICE",
    "NEWSLETTER",
    "ORDER",
    "PAYMENT",
    "REFUND",
    "SHIPPING_ADDRESS",
]

# DSPy signature for routing a support request: one free-text input, one
# output constrained to the TicketCategory literal.
# NOTE(review): DSPy presumably uses the class docstring below as the task
# instructions sent to the LM, so treat its wording as behavior - confirm
# against the DSPy Signature docs before editing it.
class SupportCaseRouter(dspy.Signature):
	"""
	Determine which category best fits the customer's request given their stated issue in order to connect them with the appropriate support team.
	"""
	# Raw text of the customer's request.
	customer_utterance: str = dspy.InputField()
	# Predicted label; the annotation restricts it to the TicketCategory values.
	ticket_category: TicketCategory = dspy.OutputField()

# Baseline program: a plain dspy.Predict module over the signature.
predictor = dspy.Predict(SupportCaseRouter)

In [47]:
import pandas as pd

# Rebuild the frame from the raw rows on every run so this cell can be
# re-executed safely; rename the columns to the signature's field names and
# drop the unused "tags" column.
df = (
    pd.DataFrame(data)
    .rename(columns={
        "utterance": "customer_utterance",
        "category": "ticket_category",
        "intent": "ticket_intent",
    })
    .drop(columns=["tags"])
)

# NOTE(review): DEV_SIZE / TRAIN_SIZE are not referenced by any visible cell;
# they are kept only in case a cell outside this view reads them.
DEV_SIZE = 10
TRAIN_SIZE = 25

# Rows drawn from every (category, intent) group for each split.
TRAIN_PER_GROUP = 10
TEST_PER_GROUP = 5
# Fixed seed so Restart & Run All reproduces the same split.
SPLIT_SEED = 42
GROUP_KEYS = ["ticket_category", "ticket_intent"]


def _sample_group_indices(frame, n_per_group, seed=SPLIT_SEED):
    """Return the index labels of a per-group sample of ``frame``.

    Args:
        frame: DataFrame containing the ``GROUP_KEYS`` columns.
        n_per_group: Rows to draw from each (category, intent) group. Groups
            with fewer rows contribute all of their rows instead of raising.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A list of index labels, ordered group by group.
    """
    picked = []
    for _, group in frame.groupby(GROUP_KEYS):
        take = min(n_per_group, len(group))
        picked.extend(group.sample(n=take, random_state=seed).index)
    return picked


def _to_examples(frame):
    """Turn each row of ``frame`` into a dspy.Example whose only input is ``customer_utterance``."""
    return [
        dspy.Example(**record).with_inputs("customer_utterance")
        for record in frame.to_dict(orient="records")
    ]


train_indices = _sample_group_indices(df, TRAIN_PER_GROUP)
train = _to_examples(df.loc[train_indices])

# Remove the training rows before drawing the test set so the splits are disjoint.
df = df.drop(train_indices)

test_indices = _sample_group_indices(df, TEST_PER_GROUP)
test = _to_examples(df.loc[test_indices])

In [41]:
def evaluate_accuracy(example, predicted, trace=None):
    """Exact-match metric on the ticket category.

    Args:
        example: Object whose ``ticket_category`` is the expected label.
        predicted: Object whose ``ticket_category`` is the predicted label.
        trace: Accepted but not used by this metric.

    Returns:
        The result of comparing the two labels with ``==``.
    """
    expected_label = example.ticket_category
    predicted_label = predicted.ticket_category
    return expected_label == predicted_label

# Reusable evaluator: scores a program on the held-out test examples with the
# exact-match metric, showing a progress bar and a results table.
evaluate = dspy.Evaluate(
    devset=test,
    metric=evaluate_accuracy,
    display_progress=True,
    display_table=True,
)

# Baseline: the un-optimized Predict module.
ans = evaluate(predictor)


Average Metric: 126.00 / 135 (93.3%): 100%|██████████| 135/135 [00:08<00:00, 15.63it/s]

2025/07/15 22:41:26 INFO dspy.evaluate.evaluate: Average Metric: 126 / 135 (93.3%)





Unnamed: 0,customer_utterance,ticket_intent,example_ticket_category,tags,pred_ticket_category,evaluate_accuracy
0,ohw can i sign up,create_account,ACCOUNT,BILQZ,ACCOUNT,✔️ [True]
1,I have got to create a different user account for my bride,create_account,ACCOUNT,BL,ACCOUNT,✔️ [True]
2,I want help to create an account,create_account,ACCOUNT,B,ACCOUNT,✔️ [True]
3,I do not know how I could create an account,create_account,ACCOUNT,BEP,ACCOUNT,✔️ [True]
4,what do I need to do to create another online account?,create_account,ACCOUNT,BIL,ACCOUNT,✔️ [True]
...,...,...,...,...,...,...
130,I would like to set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BP,SHIPPING_ADDRESS,✔️ [True]
131,help setting up a different shipping address,set_up_shipping_address,SHIPPING_ADDRESS,B,SHIPPING_ADDRESS,✔️ [True]
132,I do not know how I can set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BE,SHIPPING_ADDRESS,✔️ [True]
133,how could I set another shipping address up?,set_up_shipping_address,SHIPPING_ADDRESS,BIP,SHIPPING_ADDRESS,✔️ [True]


In [42]:
# Expected categories of the test examples the evaluated program got wrong.
# Each result is an (example, prediction, score) tuple; a falsy score is a miss.
[example.ticket_category for example, _prediction, score in ans.results if not score]

['ACCOUNT',
 'ACCOUNT',
 'FEEDBACK',
 'FEEDBACK',
 'FEEDBACK',
 'FEEDBACK',
 'FEEDBACK',
 'ORDER',
 'ORDER']

In [43]:
# SIMBA optimizer driven by the same exact-match metric: 4 optimization steps
# with mini-batches of 16 training examples.
simba = dspy.SIMBA(
    metric=evaluate_accuracy,
    bsize=16,
    max_steps=4,
)

# Optimize the baseline predictor on the training split only.
compiled_simba = simba.compile(predictor, trainset=train)

# Score the optimized program on the held-out test split.
evaluate(compiled_simba)

2025/07/15 22:41:39 INFO dspy.teleprompt.simba: Starting batch 1 of 4.
2025/07/15 22:41:44 INFO dspy.teleprompt.simba: Sampling program trajectories on 16 examples x 6 samples.


Processed 96 / 96 examples: 100%|██████████| 96/96 [00:10<00:00,  9.38it/s]

2025/07/15 22:41:54 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.875

2025/07/15 22:41:54 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2025/07/15 22:41:54 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule





2025/07/15 22:41:57 INFO dspy.teleprompt.simba_utils: Advice for self: If the customer_utterance contains keywords or phrases related to registration, account setup, or login issues (e.g., 'register', 'sign up', 'login problem', 'report a registration issue'), then the module should classify the ticket_category as 'ACCOUNT'. Avoid relying solely on words like 'report' or 'how' that might suggest a support contact rather than a specific issue. Incorporate intent recognition or keyword matching focused on account-related problems to improve accuracy.
2025/07/15 22:41:57 INFO dspy.teleprompt.simba: 

2025/07/15 22:41:57 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.33333333333333337.
2025/07/15 22:41:57 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2025/07/15 22:41:58 INFO dspy.teleprompt.simba_utils: Advice for self: If the customer_utterance contains keywords like 'signup', 'registration', 

Processed 112 / 112 examples: 100%|██████████| 112/112 [00:07<00:00, 15.14it/s]

2025/07/15 22:42:12 INFO dspy.teleprompt.simba: Scores after 1 batches: [1.0, 1.0, 0.8125, 0.8125, 0.8125, 0.875, 0.75], Best: 1.0

2025/07/15 22:42:12 INFO dspy.teleprompt.simba: Starting batch 2 of 4.





2025/07/15 22:42:16 INFO dspy.teleprompt.simba: Sampling program trajectories on 16 examples x 6 samples.


Processed 96 / 96 examples: 100%|██████████| 96/96 [00:06<00:00, 15.57it/s]


2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.9895833333333334

2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.16666666666666663.
2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_demo_
2025/07/15 22:42:23 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/15 22:42:23 INFO dspy.teleprompt.simba: 

2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #2, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_demo_
2025/07/15 22:42:23 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/15 22:42:23 INFO dspy.teleprompt.simba: 

2025/07/15 22:42:23 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #3, with max score 1.0, max-to-mi

Processed 112 / 112 examples: 100%|██████████| 112/112 [00:09<00:00, 12.13it/s]

2025/07/15 22:42:43 INFO dspy.teleprompt.simba: Scores after 2 batches: [1.0, 1.0, 0.9375, 0.9375, 0.9375, 0.9375, 1.0], Best: 1.0

2025/07/15 22:42:43 INFO dspy.teleprompt.simba: Starting batch 3 of 4.





2025/07/15 22:42:48 INFO dspy.teleprompt.simba: Sampling program trajectories on 16 examples x 6 samples.


Processed 96 / 96 examples: 100%|██████████| 96/96 [00:06<00:00, 15.98it/s]

2025/07/15 22:42:54 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.875

2025/07/15 22:42:54 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/15 22:42:54 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_demo_
2025/07/15 22:42:54 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/15 22:42:54 INFO dspy.teleprompt.simba: 

2025/07/15 22:42:54 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.6666666666666667.
2025/07/15 22:42:54 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule, having dropped 1 demos per predictor





2025/07/15 22:42:57 INFO dspy.teleprompt.simba_utils: Advice for self: If the customer_utterance contains words like 'cancel', 'cancel order', or similar phrases, then the module should consider the overall context and intent to determine if the request pertains to an order rather than defaulting to a specific subcategory like 'CANCELLATION_FEE'. To improve accuracy, incorporate intent recognition or keyword weighting that emphasizes the overall goal of the request, rather than focusing solely on specific keywords that might lead to misclassification.
2025/07/15 22:42:57 INFO dspy.teleprompt.simba: 

2025/07/15 22:42:57 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #3, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.5.
2025/07/15 22:42:57 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_demo_
2025/07/15 22:42:57 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/15 22:42:57 INFO dspy.teleprompt.simba: 

2025/0

Processed 112 / 112 examples: 100%|██████████| 112/112 [00:07<00:00, 14.16it/s]

2025/07/15 22:43:11 INFO dspy.teleprompt.simba: Scores after 3 batches: [0.875, 0.9375, 0.9375, 0.875, 0.75, 0.9375, 0.875], Best: 0.9375

2025/07/15 22:43:11 INFO dspy.teleprompt.simba: Starting batch 4 of 4.





2025/07/15 22:43:16 INFO dspy.teleprompt.simba: Sampling program trajectories on 16 examples x 6 samples.


Processed 96 / 96 examples: 100%|██████████| 96/96 [00:07<00:00, 12.27it/s]

2025/07/15 22:43:24 INFO dspy.teleprompt.simba: Batch 4: Baseline mini-batch score: 0.90625

2025/07/15 22:43:24 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.5.
2025/07/15 22:43:24 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_rule





2025/07/15 22:43:26 INFO dspy.teleprompt.simba_utils: Advice for self: If the customer_utterance contains words like 'track' or 'tracking' combined with 'order', then the module should classify the ticket_category as 'ORDER' rather than 'DELIVERY'. To improve accuracy, incorporate intent recognition that prioritizes 'order' related actions when 'track' is mentioned in the context of an order, and avoid defaulting to delivery categories based solely on the presence of 'track'. Consider adding a keyword or phrase matching pattern that explicitly links 'track' with 'order' issues, especially when the context involves order management rather than delivery specifics.
2025/07/15 22:43:26 INFO dspy.teleprompt.simba: 

2025/07/15 22:43:26 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #2, with max score 1.0, max-to-min gap 0.0, and max-to-avg gap 0.0.
2025/07/15 22:43:26 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_demo_
2025/07/15 22:43:26 INFO dspy.teleprompt.simb

Processed 112 / 112 examples: 100%|██████████| 112/112 [00:07<00:00, 15.46it/s]

2025/07/15 22:43:38 INFO dspy.teleprompt.simba: Scores after 4 batches: [0.9375, 0.9375, 0.9375, 0.9375, 0.9375, 0.9375, 0.9375], Best: 0.9375






2025/07/15 22:43:38 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 5 programs on the full trainset.


Processed 1293 / 1350 examples:  96%|█████████▌| 1292/1350 [02:28<00:11,  4.89it/s]



Processed 1350 / 1350 examples: 100%|██████████| 1350/1350 [03:04<00:00,  7.32it/s]

2025/07/15 22:46:43 INFO dspy.teleprompt.simba: Final trainset scores: [0.9592592592592593, 0.9555555555555556, 0.9518518518518518, 0.9555555555555556, 0.9740740740740741], Best: 0.9740740740740741 (at index 4)






Average Metric: 128.00 / 135 (94.8%): 100%|██████████| 135/135 [00:09<00:00, 13.61it/s]

2025/07/15 22:46:53 INFO dspy.evaluate.evaluate: Average Metric: 128 / 135 (94.8%)





Unnamed: 0,customer_utterance,ticket_intent,example_ticket_category,tags,pred_ticket_category,evaluate_accuracy
0,ohw can i sign up,create_account,ACCOUNT,BILQZ,ACCOUNT,✔️ [True]
1,I have got to create a different user account for my bride,create_account,ACCOUNT,BL,ACCOUNT,✔️ [True]
2,I want help to create an account,create_account,ACCOUNT,B,ACCOUNT,✔️ [True]
3,I do not know how I could create an account,create_account,ACCOUNT,BEP,ACCOUNT,✔️ [True]
4,what do I need to do to create another online account?,create_account,ACCOUNT,BIL,ACCOUNT,✔️ [True]
...,...,...,...,...,...,...
130,I would like to set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BP,SHIPPING_ADDRESS,✔️ [True]
131,help setting up a different shipping address,set_up_shipping_address,SHIPPING_ADDRESS,B,SHIPPING_ADDRESS,✔️ [True]
132,I do not know how I can set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BE,SHIPPING_ADDRESS,✔️ [True]
133,how could I set another shipping address up?,set_up_shipping_address,SHIPPING_ADDRESS,BIP,SHIPPING_ADDRESS,✔️ [True]


EvaluationResult(score=94.81, results=<list of 135 results>)

In [None]:
# Same signature wrapped in ChainOfThought, scored with the same evaluator.
# Note: this rebinds `ans`, replacing the baseline results stored earlier.
chain = dspy.ChainOfThought(SupportCaseRouter)
ans = evaluate(chain)

Average Metric: 128.00 / 135 (94.8%): 100%|██████████| 135/135 [00:16<00:00,  8.13it/s]

2025/07/15 22:49:16 INFO dspy.evaluate.evaluate: Average Metric: 128 / 135 (94.8%)





Unnamed: 0,customer_utterance,ticket_intent,example_ticket_category,tags,reasoning,pred_ticket_category,evaluate_accuracy
0,ohw can i sign up,create_account,ACCOUNT,BILQZ,"The customer is asking about how to sign up, which relates to crea...",ACCOUNT,✔️ [True]
1,I have got to create a different user account for my bride,create_account,ACCOUNT,BL,The customer is requesting to create a new user account for someon...,ACCOUNT,✔️ [True]
2,I want help to create an account,create_account,ACCOUNT,B,"The customer is requesting assistance with creating an account, wh...",ACCOUNT,✔️ [True]
3,I do not know how I could create an account,create_account,ACCOUNT,BEP,"The customer is expressing difficulty with creating an account, wh...",ACCOUNT,✔️ [True]
4,what do I need to do to create another online account?,create_account,ACCOUNT,BIL,The customer is asking about the process to create a new online ac...,ACCOUNT,✔️ [True]
...,...,...,...,...,...,...,...
130,I would like to set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BP,"The customer wants to set up a new shipping address, which relates...",SHIPPING_ADDRESS,✔️ [True]
131,help setting up a different shipping address,set_up_shipping_address,SHIPPING_ADDRESS,B,The customer is requesting assistance with changing or setting up ...,SHIPPING_ADDRESS,✔️ [True]
132,I do not know how I can set a new shipping address up,set_up_shipping_address,SHIPPING_ADDRESS,BE,"The customer is asking about how to set up a new shipping address,...",SHIPPING_ADDRESS,✔️ [True]
133,how could I set another shipping address up?,set_up_shipping_address,SHIPPING_ADDRESS,BIP,"The customer is asking about setting up a new shipping address, wh...",SHIPPING_ADDRESS,✔️ [True]
