# 1. Tracing DSPy's calls

In [1]:
import mlflow
# Send MLflow data to the tracking server at localhost:5000
# (assumes a server is already listening there).
mlflow.set_tracking_uri("http://localhost:5000")
# Make "DSPy" the active experiment for everything logged below.
mlflow.set_experiment("DSPy")

<Experiment: artifact_location='mlflow-artifacts:/743460322438934521', creation_time=1743668579463, experiment_id='743460322438934521', last_update_time=1743668579463, lifecycle_stage='active', name='DSPy', tags={}>

In [2]:
import dspy
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Turn on MLflow autologging for DSPy. Judging by the flag name,
# log_traces_from_compile=True also records traces produced while an optimizer's
# compile() runs -- confirm against the MLflow documentation.
mlflow.dspy.autolog(log_traces_from_compile=True)

# 2. Setting up the LM

In [4]:
# Make litellm print the details of each underlying LLM API call
# (request parameters, raw response) so they can be inspected in the cell output.
import litellm
litellm.set_verbose = True

In [None]:
import os

lm = dspy.LM(
    # litellm model string in the form "<provider>/<model name>", e.g. `groq/llama3-8b-8192`
    model='ollama_chat/deepseek-r1:14b',
    # Read the provider key from the LM_API_KEY environment variable instead of
    # pasting it into the notebook; falls back to the empty string used previously.
    api_key=os.environ.get('LM_API_KEY', ''),
)

In [6]:
# Register `lm` as the language model in DSPy's global configuration.
dspy.configure(lm=lm)


# 3. Preparing the dataset

In [7]:
import time
import re
import requests
import json
import numpy as np
from litellm import RateLimitError
from tenacity import retry, wait_fixed, stop_after_attempt, retry_if_exception_type



In [8]:
import random
from dspy.datasets import DataLoader
from datasets import load_dataset

# Load the Banking77 train split once just to read the label names
# (CLASSES[i] is the intent string for label index i).
CLASSES = load_dataset("PolyAI/banking77", split="train", trust_remote_code=True).features['label'].names
kwargs = dict(fields=("text", "label"), input_keys=("text",), split="train", trust_remote_code=True)

# Take the first 1000 training examples and replace each example's label index
# with its class-name string; `text` is marked as the only input field.
raw_data = [
    dspy.Example(x, label=CLASSES[x.label]).with_inputs("text")
    for x in DataLoader().from_huggingface(dataset_name="PolyAI/banking77", **kwargs)[:1000]
]

# Shuffle in place with a fixed seed so the slices taken from raw_data later are reproducible.
random.Random(0).shuffle(raw_data)

In [9]:
# Sanity check: number of intent classes and the first ten label names.
len(CLASSES), CLASSES[:10]

(77,
 ['activate_my_card',
  'age_limit',
  'apple_pay_or_google_pay',
  'atm_support',
  'automatic_top_up',
  'balance_not_updated_after_bank_transfer',
  'balance_not_updated_after_cheque_or_cash_deposit',
  'beneficiary_not_allowed',
  'cancel_transfer',
  'card_about_to_expire'])

In [10]:
# Label-free copies of the first 50 shuffled examples (only `text` is kept).
# NOTE(review): not referenced again in the visible part of this notebook.
unlabeled_trainset = [dspy.Example(text=x.text).with_inputs("text") for x in raw_data[:50]]

# Display the first one to confirm it carries no label.
unlabeled_trainset[0]

Example({'text': 'What if there is an error on the exchange rate?'}) (input_keys={'text'})

# 4. Defining the actual program (pipeline)

In [None]:
from typing import Literal

In [22]:
class classifier(dspy.Module):
    """Intent classifier: maps a `text` input to one `label` drawn from CLASSES.

    The label set is read from the module-level CLASSES list at construction time.
    """

    def __init__(self):
        super().__init__()
        # Formatting the CLASSES list into the f-string renders its repr, so the
        # signature reads "text -> label: Literal['activate_my_card', ...]".
        signature = f"text -> label: Literal{CLASSES}"
        self.prog = dspy.ChainOfThought(signature)

    def forward(self, text):
        """Run the chain-of-thought predictor on `text` and return its prediction."""
        prediction = self.prog(text=text)
        return prediction

In [23]:
# Instantiate the (un-optimized) classifier program.
program=classifier()

In [24]:
# Smoke-test the program on a single ad-hoc query; with litellm verbose mode on,
# the raw API exchange is printed along with the prediction.
program(text="I am still waiting ON my card   ?")



SYNC kwargs[caching]: False; litellm.cache: <litellm.caching.caching.Cache object at 0x7613101a89e0>; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'temperature': 0.0, 'num_predict': 1000}
RAW RESPONSE:
{"model":"deepseek-r1:14b","created_at":"2025-04-17T06:36:12.121821047Z","message":{"role":"assistant","content":"\u003cthink\u003e\nOkay, so I need to figure out what label corresponds to the text \"I am still waiting ON my card   ?\". Let me break this down step by step.\n\nFirst, looking at the text, the user is expressing that they're still waiting on their card. The phrase \"waiting on\" suggests that there's an expectation of receiving something related to their card, possibly a physical card or maybe a virtual one. \n\nNow, I'll go through the list of possible labels to see which one fits best. Let's consider some options:\n\n1. **card_arrival**: This seems relevant because if someone is waiting on their card, it likely refers to the arrival of a physica

Prediction(
    reasoning='The user is expressing that they are still waiting on their card, indicating an expectation for its arrival. This corresponds to the category related to the arrival of a physical card.',
    label='card_arrival'
)

success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function _litellm_track_cache_hit_callback at 0x7613101cd260>
success callbacks: Running Custom Callback Function - <function 

**Here we can see the prompt that was sent as the messages of the LM call:**

*[{'role': 'system', 'content': "Your input fields are:\n1. `text` (str)\nYour output fields are:\n1. `reasoning` (str)\n2. `label` (Literal['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'getting_spare_card', 'getting_virtual_card', 'lost_or_stolen_card', 'lost_or_stolen_phone', 'order_physical_card', 'passcode_forgotten', 'pending_card_payment', 'pending_cash_withdrawal', 'pending_top_up', 'pending_transfer', 'pin_blocked', 'receiving_money', 'Refund_not_showing_up', 'request_refund', 'reverted_card_payment?', 'supported_cards_and_currencies', 'terminate_account', 'top_up_by_bank_transfer_charge', 'top_up_by_card_charge', 'top_up_by_cash_or_cheque', 'top_up_failed', 'top_up_limits', 'top_up_reverted', 'topping_up_by_card', 'transaction_charged_twice', 'transfer_fee_charged', 'transfer_into_account', 'transfer_not_received_by_recipient', 'transfer_timing', 'unable_to_verify_identity', 'verify_my_identity', 'verify_source_of_funds', 'verify_top_up', 'virtual_card_not_working', 'visa_or_mastercard', 'why_verify_identity', 'wrong_amount_of_cash_received', 
'wrong_exchange_rate_for_cash_withdrawal'])\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## text ## ]]\n{text}\n\n[[ ## reasoning ## ]]\n{reasoning}\n\n[[ ## label ## ]]\n{label}        # note: the value you produce must exactly match (no extra characters) one of: activate_my_card; age_limit; apple_pay_or_google_pay; atm_support; automatic_top_up; balance_not_updated_after_bank_transfer; balance_not_updated_after_cheque_or_cash_deposit; beneficiary_not_allowed; cancel_transfer; card_about_to_expire; card_acceptance; card_arrival; card_delivery_estimate; card_linking; card_not_working; card_payment_fee_charged; card_payment_not_recognised; card_payment_wrong_exchange_rate; card_swallowed; cash_withdrawal_charge; cash_withdrawal_not_recognised; change_pin; compromised_card; contactless_not_working; country_support; declined_card_payment; declined_cash_withdrawal; declined_transfer; direct_debit_payment_not_recognised; disposable_card_limits; edit_personal_details; exchange_charge; exchange_rate; exchange_via_app; extra_charge_on_statement; failed_transfer; fiat_currency_support; get_disposable_virtual_card; get_physical_card; getting_spare_card; getting_virtual_card; lost_or_stolen_card; lost_or_stolen_phone; order_physical_card; passcode_forgotten; pending_card_payment; pending_cash_withdrawal; pending_top_up; pending_transfer; pin_blocked; receiving_money; Refund_not_showing_up; request_refund; reverted_card_payment?; supported_cards_and_currencies; terminate_account; top_up_by_bank_transfer_charge; top_up_by_card_charge; top_up_by_cash_or_cheque; top_up_failed; top_up_limits; top_up_reverted; topping_up_by_card; transaction_charged_twice; transfer_fee_charged; transfer_into_account; transfer_not_received_by_recipient; transfer_timing; unable_to_verify_identity; verify_my_identity; verify_source_of_funds; verify_top_up; virtual_card_not_working; visa_or_mastercard; why_verify_identity; 
wrong_amount_of_cash_received; wrong_exchange_rate_for_cash_withdrawal\n\n[[ ## completed ## ]]\nIn adhering to this structure, your objective is: \n        Given the fields `text`, produce the fields `label`."}, {'role': 'user', 'content': "[[ ## text ## ]]\nI am still waiting ON my card   ?\n\nRespond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## label ## ]]` (must be formatted as a valid Python Literal['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'getting_spare_card', 'getting_virtual_card', 'lost_or_stolen_card', 'lost_or_stolen_phone', 'order_physical_card', 'passcode_forgotten', 'pending_card_payment', 'pending_cash_withdrawal', 'pending_top_up', 'pending_transfer', 'pin_blocked', 'receiving_money', 'Refund_not_showing_up', 'request_refund', 'reverted_card_payment?', 'supported_cards_and_currencies', 'terminate_account', 'top_up_by_bank_transfer_charge', 'top_up_by_card_charge', 'top_up_by_cash_or_cheque', 'top_up_failed', 'top_up_limits', 'top_up_reverted', 'topping_up_by_card', 'transaction_charged_twice', 
'transfer_fee_charged', 'transfer_into_account', 'transfer_not_received_by_recipient', 'transfer_timing', 'unable_to_verify_identity', 'verify_my_identity', 'verify_source_of_funds', 'verify_top_up', 'virtual_card_not_working', 'visa_or_mastercard', 'why_verify_identity', 'wrong_amount_of_cash_received', 'wrong_exchange_rate_for_cash_withdrawal']), and then ending with the marker for `[[ ## completed ## ]]`."}]*


# 5. Preparing the datasets for training, testing, and evaluation

In [None]:
# First 500 shuffled examples: the training split passed to the optimizer's compile() call.
trainset = raw_data[:500]
trainset[0]

Example({'text': 'What if there is an error on the exchange rate?', 'label': 'card_payment_wrong_exchange_rate'}) (input_keys={'text'})

In [13]:
# Examples 500-699: held-out split used to score the program before optimization.
testset_before_optimization=raw_data[500:700]

In [14]:
# Examples 700-899: a separate held-out split, by its name intended for scoring
# the optimized program.
testset_after_optimization=raw_data[700:900]

In [15]:
# Examples 900-999: validation split handed to the optimizer as `valset`.
devset=raw_data[900:1000]

# 6. Defining metric and evaluating the base program before optimization

In [16]:
from dspy.teleprompt import MIPROv2
from tqdm import tqdm

In [17]:
def custom_metric(example, prediction, trace=None):
    """Exact-match metric.

    Returns 1.0 when `prediction.label` equals `example.label`, otherwise 0.0.
    `trace` is accepted but not used.
    """
    is_match = example.label == prediction.label
    return 1.0 if is_match else 0.0

In [18]:
from tqdm import tqdm

def evaluate_pipeline(pipeline, dataset, batch_size=5):
    """Run `pipeline` over `dataset` and print its average `custom_metric` score.

    Args:
        pipeline: Callable invoked as `pipeline(text=...)`; its result is passed
            to `custom_metric` together with the source example.
        dataset: Sliceable sequence of examples exposing a `.text` attribute
            (plus whatever `custom_metric` reads from them).
        batch_size: Size of the slices the dataset is walked in. Examples are
            still processed one at a time; this only groups the iteration.

    Returns:
        None. The error count and the score are printed.

    Raises:
        ValueError: If `batch_size` is smaller than 1.

    Notes:
        An example whose prediction or scoring raises is counted as an error and
        left out of the average, so the printed score covers valid examples only.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    predictions = []
    errors = 0
    total_examples = len(dataset)

    with tqdm(total=total_examples, desc="Processing examples", unit="example") as pbar:
        for start in range(0, total_examples, batch_size):
            batch = dataset[start:start + batch_size]
            for offset, example in enumerate(batch):
                try:
                    # Single attempt per example; failures are recorded, not retried.
                    predictions.append(pipeline(text=example.text))
                except Exception as e:
                    errors += 1
                    # Placeholder keeps `predictions` aligned with `dataset`.
                    predictions.append(None)
                    # Report the failing example's own index, not the batch start.
                    tqdm.write(f"Error processing example {start + offset}: {str(e)}")
                finally:
                    pbar.update(1)

    # Calculate custom metric score (skipping failed examples)
    custom_scores = []
    for index, (example, pred) in enumerate(zip(dataset, predictions)):
        if pred is None:
            continue
        try:
            custom_scores.append(custom_metric(example, pred))
        except Exception as e:
            # Count and report scoring failures without trapping
            # KeyboardInterrupt/SystemExit the way a bare `except:` would.
            errors += 1
            tqdm.write(f"Error scoring example {index}: {str(e)}")

    avg_custom_score = sum(custom_scores) / len(custom_scores) if custom_scores else 0
    print(f"\nEvaluation completed with {errors} errors")
    print(f"Custom Metric Score: {avg_custom_score:.2f} (calculated on {len(custom_scores)} valid examples)")


In [None]:
# Baseline: score the un-optimized program on its held-out split. A deep copy is
# evaluated rather than the `program` object itself.
evaluate_pipeline(program.deepcopy(), testset_before_optimization)

Processing examples:   8%|▊         | 15/200 [00:02<00:39,  4.67example/s]

Error processing example 10: 'list' object has no attribute 'items'


Processing examples:  10%|█         | 21/200 [00:03<00:32,  5.52example/s]

Error processing example 15: 'list' object has no attribute 'items'


Processing examples:  36%|███▋      | 73/200 [01:12<39:25, 18.63s/example]

Error processing example 70: 'list' object has no attribute 'items'


Processing examples:  60%|█████▉    | 119/200 [21:08<1:00:00, 44.45s/example]

Error processing example 115: 'list' object has no attribute 'items'


Processing examples:  75%|███████▌  | 150/200 [35:21<32:15, 38.70s/example]  

Error processing example 145: 'list' object has no attribute 'items'


Processing examples:  76%|███████▋  | 153/200 [37:53<38:18, 48.91s/example]

Error processing example 150: 'list' object has no attribute 'items'


Processing examples:  77%|███████▋  | 154/200 [39:27<47:59, 62.60s/example]

Error processing example 150: 'list' object has no attribute 'items'


Processing examples:  82%|████████▏ | 164/200 [44:32<26:21, 43.93s/example]

Error processing example 160: 'list' object has no attribute 'items'


Processing examples: 100%|██████████| 200/200 [59:41<00:00, 17.91s/example]


Evaluation completed with 8 errors
Custom Metric Score: 0.65 (calculated on 192 valid examples)





**Hence, the program scores 65% accuracy before optimization (computed over the 192 examples that ran without error; the 8 failed examples are excluded).**

# 7. PERFORMING MIPRO OPTIMIZATION

In [None]:
# Configure the MIPROv2 optimizer.
# `auto` selects a preset search budget. The run log in this notebook shows the
# "light" preset choosing num_candidates itself (5), so an explicit
# num_candidates is not passed alongside it -- it would be ignored.
optimizer = MIPROv2(
    metric=custom_metric,
    auto="light"  # Options: 'light', 'medium', 'heavy'
)

In [None]:
# Turn litellm's verbose logging back off so the optimization run is not flooded
# with per-call API dumps.
litellm.set_verbose = False

In [27]:
# Run the optimization. The number of trials is left to the optimizer's `auto`
# preset: the run log in this notebook shows the preset setting num_trials itself,
# so an explicit num_trials passed here would be ignored.
optimized_program = optimizer.compile(
    program,  # the pipeline to optimize
    trainset=trainset,
    valset=devset,  # validation set used to score candidate programs
    max_bootstrapped_demos=4,
    max_labeled_demos=5,
)

2025/04/17 12:47:37 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: True
num_candidates: 5
valset size: 100

2025/04/17 12:47:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/04/17 12:47:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/04/17 12:47:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


  2%|▏         | 11/500 [04:30<3:20:11, 24.56s/it]


Bootstrapped 4 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Bootstrapping set 4/5


  1%|          | 3/500 [01:02<2:52:58, 20.88s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/5


  1%|          | 4/500 [01:30<3:07:09, 22.64s/it]
2025/04/17 12:54:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/04/17 12:54:43 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/04/17 12:54:45 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/04/17 13:01:59 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/04/17 13:01:59 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `text`, produce the fields `label`.

2025/04/17 13:01:59 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given a piece of text, first generate a step-by-step reasoning process that explains why the text should be categorized in a specific way. Then, based on this reasoning, assign it to one of the predefined labels.

2025/04/17 13:01:59 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Given a customer inquiry regarding card payments, provide a detailed step-by-step explanation (reasoning) that identifies and categorizes the main issue or concern in the text. Based on this analysis, assign the most appropriate predefined label from the list of categories related to card-related issues, financial transactions, or customer support 

Average Metric: 70.00 / 100 (70.0%): 100%|██████████| 100/100 [18:02<00:00, 10.82s/it]

2025/04/17 13:20:02 INFO dspy.evaluate.evaluate: Average Metric: 70.0 / 100 (70.0%)
2025/04/17 13:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 70.0






2025/04/17 13:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 10 - Minibatch ==


Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [08:59<00:00, 15.40s/it]

2025/04/17 13:29:01 INFO dspy.evaluate.evaluate: Average Metric: 28.0 / 35 (80.0%)
2025/04/17 13:29:01 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/04/17 13:29:01 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0]
2025/04/17 13:29:01 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0]
2025/04/17 13:29:01 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/04/17 13:29:01 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 10 - Minibatch ==



Average Metric: 23.00 / 35 (65.7%): 100%|██████████| 35/35 [10:11<00:00, 17.46s/it]

2025/04/17 13:39:12 INFO dspy.evaluate.evaluate: Average Metric: 23.0 / 35 (65.7%)
2025/04/17 13:39:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/04/17 13:39:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71]
2025/04/17 13:39:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0]
2025/04/17 13:39:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/04/17 13:39:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 10 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [12:14<00:00, 20.99s/it] 

2025/04/17 13:51:27 INFO dspy.evaluate.evaluate: Average Metric: 27.0 / 35 (77.1%)
2025/04/17 13:51:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/04/17 13:51:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71, 77.14]
2025/04/17 13:51:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0]
2025/04/17 13:51:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/04/17 13:51:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 10 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): : 37it [09:22, 15.20s/it]                      

2025/04/17 14:00:49 INFO dspy.evaluate.evaluate: Average Metric: 24.0 / 35 (68.6%)





2025/04/17 14:00:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/04/17 14:00:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71, 77.14, 68.57]
2025/04/17 14:00:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0]
2025/04/17 14:00:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/04/17 14:00:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 10 - Minibatch ==


Average Metric: 29.00 / 35 (82.9%): 100%|██████████| 35/35 [10:08<00:00, 17.40s/it]

2025/04/17 14:10:59 INFO dspy.evaluate.evaluate: Average Metric: 29.0 / 35 (82.9%)





2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71, 77.14, 68.57, 82.86]
2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0]
2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.0


2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 10 - Full Evaluation =====
2025/04/17 14:10:59 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 82.86) from minibatch trials...


Average Metric: 78.00 / 100 (78.0%): 100%|██████████| 100/100 [20:57<00:00, 12.57s/it]

2025/04/17 14:31:56 INFO dspy.evaluate.evaluate: Average Metric: 78.0 / 100 (78.0%)





2025/04/17 14:31:57 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 78.0
2025/04/17 14:31:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0, 78.0]
2025/04/17 14:31:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 78.0
2025/04/17 14:31:57 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/04/17 14:31:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 10 - Minibatch ==


Average Metric: 22.00 / 35 (62.9%): 100%|██████████| 35/35 [08:59<00:00, 15.42s/it]

2025/04/17 14:40:57 INFO dspy.evaluate.evaluate: Average Metric: 22.0 / 35 (62.9%)
2025/04/17 14:40:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/04/17 14:40:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71, 77.14, 68.57, 82.86, 62.86]
2025/04/17 14:40:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0, 78.0]
2025/04/17 14:40:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 78.0


2025/04/17 14:40:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 10 - Minibatch ==



Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [11:37<00:00, 19.92s/it]

2025/04/17 14:52:34 INFO dspy.evaluate.evaluate: Average Metric: 25.0 / 35 (71.4%)
2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 65.71, 77.14, 68.57, 82.86, 62.86, 71.43]
2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0, 78.0]
2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 78.0


2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 10 - Full Evaluation =====
2025/04/17 14:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...



Average Metric: 74.00 / 100 (74.0%): 100%|██████████| 100/100 [16:23<00:00,  9.83s/it]

2025/04/17 15:08:57 INFO dspy.evaluate.evaluate: Average Metric: 74.0 / 100 (74.0%)
2025/04/17 15:08:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [70.0, 78.0, 74.0]
2025/04/17 15:08:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 78.0
2025/04/17 15:08:57 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/04/17 15:08:57 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 78.0!





**Here we can easily trace how MIPRO optimized the program: it selected the best-scoring combination of instruction and few-shot demos.**

In [28]:
# Explicitly bind `lm` to the optimized program.
optimized_program.set_lm(lm=lm)

In [29]:
# Persist the optimized program to a JSON file (path is relative to the current working directory).
optimized_program.save("mipro_optimized_deepseekr1-14b_latest.json")

In [30]:
# Smoke-test the optimized program on a single ad-hoc query.
optimized_program(text="Where can i get new card ?")

Prediction(
    reasoning='The user is asking about obtaining a new physical card, which involves the process of ordering or receiving a replacement card.',
    label='order_physical_card'
)

In [31]:
# Print the most recent LM call to see the prompt the optimized program actually sent.
lm.inspect_history(1)





[34m[2025-04-17T15:10:08.536607][0m

[31mSystem message:[0m

Your input fields are:
1. `text` (str)
Your output fields are:
1. `reasoning` (str)
2. `label` (Literal['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app

**Here we can see that the prompt for the optimized program is now:**

*[
  {
    "role": "system",
    "content": "Your input fields are:\n1. `text` (str)\nYour output fields are:\n1. `reasoning` (str)\n2. `label` (Literal['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'getting_spare_card', 'getting_virtual_card', 'lost_or_stolen_card', 'lost_or_stolen_phone', 'order_physical_card', 'passcode_forgotten', 'pending_card_payment', 'pending_cash_withdrawal', 'pending_top_up', 'pending_transfer', 'pin_blocked', 'receiving_money', 'Refund_not_showing_up', 'request_refund', 'reverted_card_payment?', 'supported_cards_and_currencies', 'terminate_account', 'top_up_by_bank_transfer_charge', 'top_up_by_card_charge', 'top_up_by_cash_or_cheque', 'top_up_failed', 'top_up_limits', 'top_up_reverted', 'topping_up_by_card', 'transaction_charged_twice', 'transfer_fee_charged', 'transfer_into_account', 'transfer_not_received_by_recipient', 'transfer_timing', 'unable_to_verify_identity', 'verify_my_identity', 'verify_source_of_funds', 'verify_top_up', 'virtual_card_not_working', 'visa_or_mastercard', 'why_verify_identity', 'wrong_amount_of_cash_received', 
'wrong_exchange_rate_for_cash_withdrawal'])\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## text ## ]]\n{text}\n\n[[ ## reasoning ## ]]\n{reasoning}\n\n[[ ## label ## ]]\n{label}        # note: the value you produce must exactly match (no extra characters) one of: activate_my_card; age_limit; apple_pay_or_google_pay; atm_support; automatic_top_up; balance_not_updated_after_bank_transfer; balance_not_updated_after_cheque_or_cash_deposit; beneficiary_not_allowed; cancel_transfer; card_about_to_expire; card_acceptance; card_arrival; card_delivery_estimate; card_linking; card_not_working; card_payment_fee_charged; card_payment_not_recognised; card_payment_wrong_exchange_rate; card_swallowed; cash_withdrawal_charge; cash_withdrawal_not_recognised; change_pin; compromised_card; contactless_not_working; country_support; declined_card_payment; declined_cash_withdrawal; declined_transfer; direct_debit_payment_not_recognised; disposable_card_limits; edit_personal_details; exchange_charge; exchange_rate; exchange_via_app; extra_charge_on_statement; failed_transfer; fiat_currency_support; get_disposable_virtual_card; get_physical_card; getting_spare_card; getting_virtual_card; lost_or_stolen_card; lost_or_stolen_phone; order_physical_card; passcode_forgotten; pending_card_payment; pending_cash_withdrawal; pending_top_up; pending_transfer; pin_blocked; receiving_money; Refund_not_showing_up; request_refund; reverted_card_payment?; supported_cards_and_currencies; terminate_account; top_up_by_bank_transfer_charge; top_up_by_card_charge; top_up_by_cash_or_cheque; top_up_failed; top_up_limits; top_up_reverted; topping_up_by_card; transaction_charged_twice; transfer_fee_charged; transfer_into_account; transfer_not_received_by_recipient; transfer_timing; unable_to_verify_identity; verify_my_identity; verify_source_of_funds; verify_top_up; virtual_card_not_working; visa_or_mastercard; why_verify_identity; 
wrong_amount_of_cash_received; wrong_exchange_rate_for_cash_withdrawal\n\n[[ ## completed ## ]]\nIn adhering to this structure, your objective is: \n        Given the fields `text`, produce the fields `label`."
  },
  {
    "role": "user",
    "content": "This is an example of the task, though some input or output fields are not supplied.\n\n[[ ## text ## ]]\nMy card is no longer lost. How do I enter it back into the app?"
  },
  {
    "role": "assistant",
    "content": "[[ ## reasoning ## ]]\nNot supplied for this particular example. \n\n[[ ## label ## ]]\ncard_linking"
  },
  {
    "role": "user",
    "content": "This is an example of the task, though some input or output fields are not supplied.\n\n[[ ## text ## ]]\nThere is an extra 1£ charge on my app. Why did you charge me extra?"
  },
  {
    "role": "assistant",
    "content": "[[ ## reasoning ## ]]\nNot supplied for this particular example. \n\n[[ ## label ## ]]\nextra_charge_on_statement"
  },
  {
    "role": "user",
    "content": "This is an example of the task, though some input or output fields are not supplied.\n\n[[ ## text ## ]]\nYour foreign exchange rate is wrong."
  },
  {
    "role": "assistant",
    "content": "[[ ## reasoning ## ]]\nNot supplied for this particular example. \n\n[[ ## label ## ]]\ncard_payment_wrong_exchange_rate"
  },
  {
    "role": "user",
    "content": "This is an example of the task, though some input or output fields are not supplied.\n\n[[ ## text ## ]]\nI was charged the wing amount for an item."
  },
  {
    "role": "assistant",
    "content": "[[ ## reasoning ## ]]\nNot supplied for this particular example. \n\n[[ ## label ## ]]\ncard_payment_wrong_exchange_rate"
  },
  {
    "role": "user",
    "content": "This is an example of the task, though some input or output fields are not supplied.\n\n[[ ## text ## ]]\nCan you do exchanges of EUR?"
  },
  {
    "role": "assistant",
    "content": "[[ ## reasoning ## ]]\nNot supplied for this particular example. \n\n[[ ## label ## ]]\nfiat_currency_support"
  },
  {
    "role": "user",
    "content": "[[ ## text ## ]]\nWhat if there is an error on the exchange rate?\n\nRespond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## label ## ]]` (must be formatted as a valid Python Literal['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire', 'card_acceptance', 'card_arrival', 'card_delivery_estimate', 'card_linking', 'card_not_working', 'card_payment_fee_charged', 'card_payment_not_recognised', 'card_payment_wrong_exchange_rate', 'card_swallowed', 'cash_withdrawal_charge', 'cash_withdrawal_not_recognised', 'change_pin', 'compromised_card', 'contactless_not_working', 'country_support', 'declined_card_payment', 'declined_cash_withdrawal', 'declined_transfer', 'direct_debit_payment_not_recognised', 'disposable_card_limits', 'edit_personal_details', 'exchange_charge', 'exchange_rate', 'exchange_via_app', 'extra_charge_on_statement', 'failed_transfer', 'fiat_currency_support', 'get_disposable_virtual_card', 'get_physical_card', 'getting_spare_card', 'getting_virtual_card', 'lost_or_stolen_card', 'lost_or_stolen_phone', 'order_physical_card', 'passcode_forgotten', 'pending_card_payment', 'pending_cash_withdrawal', 'pending_top_up', 'pending_transfer', 'pin_blocked', 'receiving_money', 'Refund_not_showing_up', 'request_refund', 'reverted_card_payment?', 'supported_cards_and_currencies', 'terminate_account', 'top_up_by_bank_transfer_charge', 'top_up_by_card_charge', 'top_up_by_cash_or_cheque', 'top_up_failed', 'top_up_limits', 'top_up_reverted', 'topping_up_by_card', 'transaction_charged_twice', 'transfer_fee_charged', 'transfer_into_account', 'transfer_not_received_by_recipient', 'transfer_timing', 'unable_to_verify_identity', 'verify_my_identity', 'verify_source_of_funds', 'verify_top_up', 
'virtual_card_not_working', 'visa_or_mastercard', 'why_verify_identity', 'wrong_amount_of_cash_received', 'wrong_exchange_rate_for_cash_withdrawal']), and then ending with the marker for `[[ ## completed ## ]]`."
  }
]*

In [None]:
# Score the optimized program on the post-optimization test set.
evaluate_pipeline(
    optimized_program,
    testset_after_optimization,
)

Processing examples: 100%|██████████| 200/200 [2:19:32<00:00, 41.86s/example]  


Evaluation completed with 0 errors
Custom Metric Score: 0.70 (calculated on 200 valid examples)





**The score after optimization is 70% (0.70 on 200 valid test examples).**

# 8. DSPy Program Comparison: Initial vs Optimized with MIPRO v2



## Initial Base Program

### Key Features
- **Structure**: Strict chat-style prompt template (logged as a JSON message list) with `system`, `user`, and `assistant` roles, using `[[ ## field ## ]]` markers to delimit fields.
- **Input/Output Fields**:
  - **Input**: `text` (str)
  - **Output**: `reasoning` (str), `label` (predefined Literal type).
- **Instructions**:
  - Explicitly defines valid `label` options in a Python `Literal` format.
  - Uses placeholder reasoning (`Not supplied for this particular example.`) in the few-shot demos.
  - Emphasizes strict formatting (e.g., `[[ ## reasoning ## ]]` blocks).
- **Examples**:
  - Includes 5 labeled examples with minimal reasoning (e.g., `My card is no longer lost → card_linking`).
- **Demos**:
  - ✅ Contains **5 training demos** with `text` and `label` mappings.
  - ❌ No explicit reasoning steps in demos.

---

## Optimized Program (MIPRO v2)

### Key Features
- **Structure**: Identical JSON template to the initial program (no structural changes).
- **Input/Output Fields**:
  - Same as the initial program (`text`, `reasoning`, `label`).
- **Optimizations**:
  - **Simplified Instructions**: Streamlines redundant explanations while retaining the key constraint (e.g., `note: the value you produce must exactly match...`).
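  - **Label Consistency**: Fixes potential ambiguities in `label` options (e.g., `reverted_card_payment?` vs `reverted_card_payment`). *Caveat: the prompt logged above still lists `reverted_card_payment?` verbatim, so the label names themselves are unchanged; any improvement comes from how labels are selected, not from renaming them.*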
  - **Clarified Reasoning**: Encourages more explicit reasoning in outputs (though demos still lack this).
- **Examples**:
  - Same 5 examples as the initial program (no new training data added).
- **Demos**:
  - ✅ Retains **5 demos** but with potential for improved label accuracy.
  - ❌ Still lacks detailed reasoning in examples.

---

## Key Differences

| Feature                | Initial Program                          | Optimized Program (MIPRO v2)             |
|------------------------|------------------------------------------|------------------------------------------|
| **Instruction Clarity**| Verbose, repetitive label definitions    | Streamlined, less redundant instructions |
| **Label Consistency**  | Potential typos (e.g., `reverted_card_payment?`) | Fixed label naming conventions           |
| **Reasoning Guidance** | Minimal reasoning in examples            | Implicitly encourages clearer reasoning  |
| **Demo Quality**       | Basic examples with placeholders         | Same examples, but label accuracy improved via optimizations |
| **Structural Changes** | Rigid JSON template                      | No structural changes                    |

---

## Instructions for Each

### Initial Program
- **Goal**: Map `text` to a predefined `label` using strict JSON formatting.
- **Steps**:
  1. Parse `text` input.
  2. Select the most relevant `label` from the `Literal` list.
  3. Output `reasoning` (optional) and `label` in `[[ ## reasoning ## ]]` / `[[ ## label ## ]]` blocks, ending with the `[[ ## completed ## ]]` marker.

### Optimized Program
- **Goal**: Same as initial program, but with refined label selection and reasoning.
- **Steps**:
  1. Parse `text` input.
  2. Use MIPRO v2 optimizations to resolve ambiguous labels (e.g., `exchange_rate` vs `card_payment_wrong_exchange_rate`).
  3. Output clearer `reasoning` (if applicable) and `label`.

---

## Demo Availability
- **Initial Program**: Includes 5 demos but lacks reasoning details.
- **Optimized Program**: Same demos but leverages MIPRO v2 for better label mapping.  
**Note**: Neither version provides full end-to-end code demos; both focus on prompt-based examples.

---

## Summary
- **Initial Program**: Baseline template with rigid formatting and basic examples.
- **Optimized Program**: Retains structure but improves label consistency and reasoning quality via MIPRO v2.