In [None]:
import dspy
import os

# Language model used by every DSPy call in this notebook: Llama 3.1 8B Instruct
# reached through OpenRouter. The API key is read from the OPENROUTER_API_KEY
# environment variable (None if it is unset), and the LM is created with
# cache=False.
openrouter = dspy.LM(
    "openrouter/meta-llama/llama-3.1-8b-instruct",
    cache=False,
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)

# Register it as the default LM for DSPy modules.
dspy.configure(lm=openrouter)

In [None]:
from phoenix.otel import register

# Register Phoenix/OpenTelemetry tracing for this notebook under the
# "dspy-evals" project, exporting to a collector at localhost:6006.
# NOTE(review): auto_instrument=True presumably hooks any installed
# OpenInference instrumentors (including DSPy) — confirm against Phoenix docs.
register(
    project_name="dspy-evals",
    endpoint="http://localhost:6006/v1/traces",
    verbose=False,
    auto_instrument=True,
)


# Evaluation and Optimization

DSPy can run evaluations out of the box, which lets us quantitatively measure how well our AI programs perform. DSPy also provides optimizers that use those evaluations to automatically improve the performance of our AI programs.

In [None]:
import pandas as pd

# Read the Bitext customer-service sample, discard the "tags" column, and
# rename the text and label columns to `customer_utterance` / `ticket_category`.
df = (
    pd.read_csv("Bitext_Sample_Customer_Service_Training_Dataset.csv")
    .drop(columns=["tags"])
    .rename(
        columns={
            "utterance": "customer_utterance",
            "category": "ticket_category",
        }
    )
)

df


In [None]:
import dspy
from typing import Literal

# Closed set of labels a ticket can be routed to.
# NOTE(review): presumably mirrors the distinct values of the dataset's
# category column — verify against the CSV.
TicketCategory = Literal[
    "ACCOUNT", "CANCELLATION_FEE", "CONTACT", "DELIVERY",
    "FEEDBACK", "INVOICE", "NEWSLETTER", "ORDER",
    "PAYMENT", "REFUND", "SHIPPING_ADDRESS",
]

# DSPy signature for the routing task: one text input (the customer's message)
# and one output restricted to the TicketCategory labels.
# NOTE(review): DSPy presumably uses the class docstring below as the task
# instructions sent to the LM, so treat it as prompt text, not as a comment.
class SupportCaseRouter(dspy.Signature):
	"""
	Determine which category best fits the customer's request given their stated issue in order to connect them with the appropriate support team.
	"""
	# Input field: the customer's message as a string.
	customer_utterance: str = dspy.InputField()
	# Output field: typed as TicketCategory, i.e. one of the literal labels.
	ticket_category: TicketCategory = dspy.OutputField()

# Un-optimized predictor built directly from the signature.
predictor = dspy.Predict(SupportCaseRouter)

In [None]:
# Number of rows drawn from each (ticket_category, intent) group.
TRAIN_SIZE = 10
TEST_SIZE = 5

def sampled_categories(df, n, random_state=None):
    """Draw up to ``n`` random rows from every (ticket_category, intent) group.

    Args:
        df: DataFrame with ``ticket_category`` and ``intent`` columns. It is
            not modified. The index labels are assumed to be unique (as with
            the default index produced by ``pd.read_csv``).
        n: Number of rows to draw per group. Groups holding fewer than ``n``
            rows contribute all of their rows instead of raising.
        random_state: Optional integer seed. When given, the same seed always
            selects the same rows; when ``None`` (the default) the draw is
            left to pandas' default random source, as before.

    Returns:
        A ``(data, remaining)`` tuple: ``data`` is a list of row dicts for the
        sampled rows (grouped in group-key order) and ``remaining`` is a new
        DataFrame with those rows removed.
    """
    # A single shared generator gives every group a different draw from the
    # one seed; re-seeding per group would pick the same positions each time.
    rng = None
    if random_state is not None:
        import numpy as np

        rng = np.random.RandomState(random_state)

    indices = []
    for _, group in df.groupby(["ticket_category", "intent"]):
        # Cap at the group size so small (or already depleted) groups do not
        # make DataFrame.sample raise.
        take = min(n, len(group))
        indices.extend(group.sample(n=take, random_state=rng).index)

    data = df.loc[indices].to_dict(orient="records")
    remaining = df.drop(index=indices)
    return data, remaining

# Split without rebinding `df`: the original frame stays intact, so re-running
# this cell starts from the full data each time instead of shrinking it further.
# The test draw comes from what is left after the train draw, so the two sets
# never share a row.
train, df_remaining = sampled_categories(df, TRAIN_SIZE)
test, df_remaining = sampled_categories(df_remaining, TEST_SIZE)

# Wrap each sampled row as a DSPy example; only `customer_utterance` is marked
# as an input.
train_examples = [dspy.Example(**x).with_inputs("customer_utterance") for x in train]
test_examples = [dspy.Example(**x).with_inputs("customer_utterance") for x in test]

In [None]:
def evaluate_accuracy(example, predicted, trace=None):
    """Exact-match metric on the ticket category.

    Args:
        example: Object whose ``ticket_category`` attribute is the reference label.
        predicted: Object whose ``ticket_category`` attribute is the predicted label.
        trace: Accepted but not used by this metric.

    Returns:
        The result of comparing the reference label with the predicted label
        for equality.
    """
    expected_category = example.ticket_category
    predicted_category = predicted.ticket_category
    return expected_category == predicted_category

# Reusable evaluator: scores a program on the test examples with the
# exact-match metric, with the results table and progress display enabled.
evaluate = dspy.Evaluate(
    devset=test_examples,
    metric=evaluate_accuracy,
    display_table=True,
    display_progress=True,
)

# Baseline run for the un-optimized predictor.
ans = evaluate(predictor, return_outputs=True)


In [None]:
# Optimize the predictor with SIMBA, using the same exact-match metric that the
# evaluation uses.
simba = dspy.SIMBA(
    metric=evaluate_accuracy,
    max_steps=4,
    bsize=16,
)

# Compile against the training examples only.
compiled_simba = simba.compile(
    predictor,
    trainset=train_examples,
)

In [None]:
# Score the SIMBA-compiled program with the same evaluator (same test examples
# and metric) that was used for the baseline predictor.
evaluate(compiled_simba)