In [36]:
import dspy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from tqdm.auto import tqdm

# Model identifiers handed to dspy.LM for the student (task) and teacher models.
student_lm_name = "fireworks_ai/accounts/fireworks/models/gpt-oss-20b"
teacher_lm_name = "openai/gpt-5.1-2025-11-13"

# A student name containing "ollama_chat" is pointed at the local endpoint with
# an empty API key; every other name is constructed with dspy.LM defaults.
student_lm = (
    dspy.LM(student_lm_name, api_base='http://localhost:11434', api_key='')
    if "ollama_chat" in student_lm_name
    else dspy.LM(student_lm_name)
)
teacher_lm = dspy.LM(teacher_lm_name)

# The student becomes the default LM for DSPy calls made after this point.
dspy.configure(lm=student_lm)


# Prepare DSPy Dataset (Subtask 2)


In [37]:
# Identifier for this run; later cells use it for the results/ directory name
# and as the lookup key inside logs.json.
trial_id = "DSP0002SPRO"
# Language code; selects which per-language CSV is read below.
lang = "eng"
# Load the subtask 2 data: the train/ CSV is the labelled pool, the dev/ CSV is
# what this notebook calls `test_df` (the set predictions are written for).
train_df = pd.read_csv(f'./dev_phase/subtask2_pro/train/{lang}.csv')
test_df = pd.read_csv(f'./dev_phase/subtask2_pro/dev/{lang}.csv')
# Hold out 20% of the labelled pool as a validation set; the fixed random_state
# keeps the split reproducible. Note that `train_df` is rebound to the 80% part.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)


In [38]:
# CSV column names of the labels used for multilabel classification
LABEL_COLUMNS = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]

# Binary flag -> answer string, plus the reverse lookup derived from it
LABEL_MAP = {1: "yes", 0: "no"}
LABEL_MAP_INV = {answer: flag for flag, answer in LABEL_MAP.items()}

def make_dspy_examples(df, include_label: bool = True):
    """Turn each row of `df` into a dspy.Example whose only input is `sentence`.

    Args:
        df: DataFrame with a "text" column and, when labels are wanted, the
            columns listed in LABEL_COLUMNS.
        include_label: when True, every LABEL_COLUMNS entry present in the row
            is added as a field holding LABEL_MAP[value]; the field name is the
            column name with "/" and "-" replaced by "_".

    Returns:
        A list with one dspy.Example per row, in row order.
    """

    def _to_example(record):
        fields = {"sentence": record["text"]}
        if include_label:
            fields.update(
                (name.replace("/", "_").replace("-", "_"), LABEL_MAP[record[name]])
                for name in LABEL_COLUMNS
                if name in record
            )
        return dspy.Example(**fields).with_inputs("sentence")

    return [_to_example(record) for _, record in df.iterrows()]

# Create DSPy datasets. Train/val examples carry gold labels; test examples are
# built without labels (include_label=False), so they hold only `sentence`.
raw_train = make_dspy_examples(train_df, include_label=True)
raw_val = make_dspy_examples(val_df, include_label=True)
raw_test = make_dspy_examples(test_df, include_label=False)


# Subsample for faster iteration: keep the first 20% of train, the first 30% of
# val, and all of test (the factor 1 leaves raw_test unchanged).
raw_train = raw_train[:int(len(raw_train) * 0.2)]
raw_val = raw_val[:int(len(raw_val) * 0.3)]
raw_test = raw_test[:int(len(raw_test) * 1)]


# Define Signature (Subtask 2 - Multilabel)


In [39]:
from typing import Literal

# DSPy signature for the multilabel task: one input (`sentence`) and five
# "yes"/"no" outputs whose names match the normalized label field names
# produced by make_dspy_examples. The class docstring and the `desc` strings
# below are passed to DSPy as part of the signature, so treat them as runtime
# text rather than as ordinary documentation when editing.
class PolarizationMultilabel(dspy.Signature):
    """
    Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people's views, beliefs, and identities.
    
    Given a sentence, classify it into one or more polarization categories:
    - political: Political polarization targeting political groups, parties, ideologies, or political figures
    - racial_ethnic: Racial or ethnic polarization targeting racial or ethnic groups
    - religious: Religious polarization targeting religious groups or beliefs
    - gender_sexual: Gender or sexual polarization targeting gender identities or sexual orientations
    - other: Other forms of polarization not covered by the above categories
    
    For each category, output "yes" if the sentence contains that type of polarization, "no" otherwise.
    A sentence can belong to multiple categories or none at all.
    """

    # Input: the raw text to classify.
    sentence: str = dspy.InputField()
    # Outputs: one independent "yes"/"no" decision per category.
    political: Literal["yes", "no"] = dspy.OutputField(
        desc='Return "yes" if the sentence contains political polarization, "no" otherwise.',
    )
    racial_ethnic: Literal["yes", "no"] = dspy.OutputField(
        desc='Return "yes" if the sentence contains racial/ethnic polarization, "no" otherwise.',
    )
    religious: Literal["yes", "no"] = dspy.OutputField(
        desc='Return "yes" if the sentence contains religious polarization, "no" otherwise.',
    )
    gender_sexual: Literal["yes", "no"] = dspy.OutputField(
        desc='Return "yes" if the sentence contains gender/sexual polarization, "no" otherwise.',
    )
    other: Literal["yes", "no"] = dspy.OutputField(
        desc='Return "yes" if the sentence contains other forms of polarization, "no" otherwise.',
    )

# Baseline (unoptimized) predictor built directly from the signature.
classify = dspy.Predict(PolarizationMultilabel)


# Evaluation


In [40]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

# DSPy field names of the labels (the LABEL_COLUMNS names with "/" written as "_")
LABEL_FIELDS = ["political", "racial_ethnic", "religious", "gender_sexual", "other"]


def label2id(label: str) -> int:
    """Map an answer string to its binary flag; anything not in LABEL_MAP_INV maps to 0."""
    if label in LABEL_MAP_INV:
        return LABEL_MAP_INV[label]
    return 0


def id2label(i: int) -> str:
    """Map a binary flag (0 or 1) to its answer string via LABEL_MAP."""
    return LABEL_MAP[i]


def accuracy_metric(example, pred, trace=None):
    """Return the fraction of label fields on which `pred` agrees with `example`.

    This is a per-label score in [0, 1], not an all-or-nothing exact match: a
    prediction that gets 4 of the 5 fields right scores 0.8. A field missing
    from either object, or predicted as None, is treated as "no".
    `trace` is accepted for DSPy's metric signature and is not used.
    """

    def _predicted(field):
        answer = getattr(pred, field, "no")
        return "no" if answer is None else answer

    matches = sum(
        1
        for field in LABEL_FIELDS
        if getattr(example, field, "no") == _predicted(field)
    )
    return matches / len(LABEL_FIELDS)


In [41]:
# Score the baseline predictor on the validation subset with the label-wise
# accuracy metric (fraction of the five fields predicted correctly per example).
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(classify)
print("DSPy average accuracy metric:", eval_result.score)  # percentage


Average Metric: 176.00 / 193 (91.2%): 100%|██████████| 193/193 [00:34<00:00,  5.52it/s]

2025/12/09 09:49:26 INFO dspy.evaluate.evaluate: Average Metric: 176.0 / 193 (91.2%)





Unnamed: 0,sentence,example_political,example_racial_ethnic,example_religious,example_gender_sexual,example_other,pred_political,pred_racial_ethnic,pred_religious,pred_gender_sexual,pred_other,accuracy_metric
0,Donald Trump relies on First Amendment,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
1,House GOP in no rush to give more Ukraine aid after 6,no,no,no,no,no,yes,no,no,no,no,✔️ [0.800]
2,country_xi adviser to meet with US officials on war,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
3,"so russia commits war crimes, how does that justify ukraine also c...",yes,no,no,no,no,yes,no,no,no,no,✔️ [1.000]
4,Cant wait to watch this episode of Border Security,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
...,...,...,...,...,...,...,...,...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
189,How long will it be until human rights are stripped away? oligarch...,yes,no,no,no,no,no,no,no,no,no,✔️ [0.800]
190,There are no open borders here in Texas.,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no,no,no,no,no,no,no,no,no,no,✔️ [1.000]


DSPy average accuracy metric: 91.19


In [42]:
def eval_metrics_on_dataset(program, dataset):
    """Run `program` over `dataset` and compute multilabel classification metrics.

    Args:
        program: callable invoked as ``program(sentence=...)``; the label
            fields in LABEL_FIELDS are read off its result by attribute.
        dataset: iterable of examples exposing ``.sentence`` and, for the gold
            labels, the LABEL_FIELDS attributes.

    Returns:
        dict with f1 (macro/micro/samples/weighted), macro precision/recall and
        ``exact_match_ratio`` (share of examples with all labels correct).

    Raises:
        ValueError: if `dataset` yields no examples.

    A missing or None field counts as "no", and label2id maps any answer it
    does not recognise to 0, so unexpected outputs are scored as negatives.
    """
    y_true = []
    y_pred = []

    for ex in dataset:
        pred = program(sentence=ex.sentence)

        gold_labels = []
        pred_labels = []

        for field in LABEL_FIELDS:
            gold = getattr(ex, field, "no")
            guess = getattr(pred, field, "no")
            if guess is None:
                guess = "no"
            gold_labels.append(label2id(gold))
            pred_labels.append(label2id(guess))

        y_true.append(gold_labels)
        y_pred.append(pred_labels)

    # Fail with a clear message instead of an opaque error from the metric calls.
    if not y_true:
        raise ValueError("eval_metrics_on_dataset() needs at least one example")

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # zero_division=0 is passed to every scorer: labels or samples without any
    # positives score 0 (the same value the default yields) without emitting an
    # UndefinedMetricWarning, matching the precision/recall calls.
    return {
        'f1_macro': f1_score(y_true, y_pred, average='macro', zero_division=0),
        'f1_micro': f1_score(y_true, y_pred, average='micro', zero_division=0),
        'f1_samples': f1_score(y_true, y_pred, average='samples', zero_division=0),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'precision_macro': precision_score(y_true, y_pred, average='macro', zero_division=0),
        'recall_macro': recall_score(y_true, y_pred, average='macro', zero_division=0),
        # Plain float so the dict prints and serializes uniformly.
        'exact_match_ratio': float(np.mean(np.all(y_true == y_pred, axis=1))),
    }

# Full metric set for the baseline predictor on the validation subset.
metrics_val = eval_metrics_on_dataset(classify, raw_val)
print("Validation metrics:", metrics_val)

# Left disabled: raw_test is built with include_label=False, so every gold
# label would fall back to "no" and the resulting scores would be meaningless.
# metrics_test = eval_metrics_on_dataset(classify, raw_test)
# print("Test metrics:", metrics_test)


Validation metrics: {'f1_macro': 0.46280423280423283, 'f1_micro': 0.5728643216080402, 'f1_samples': 0.2023686158401184, 'f1_weighted': 0.5587832102537985, 'precision_macro': 0.5207142857142857, 'recall_macro': 0.44982905982905985, 'exact_match_ratio': np.float64(0.6476683937823834)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [43]:
student_f1_macro = metrics_val["f1_macro"]
import json

# Result payload stored under subtask_2[<lang>] for this run
lang_entry = {"eval_results": {"eval_f1_macro": student_f1_macro}}

# Read the existing trial list; a missing logs.json means nothing was logged yet
log_path = "logs.json"
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# Update the first trial carrying this trial_id, creating subtask_2 / score if absent
found = False
for trial in trials:
    if trial.get("trial_id") != trial_id:
        continue
    trial.setdefault("subtask_2", {"score": None}).setdefault("score", None)
    trial["subtask_2"][lang] = lang_entry
    found = True
    break

# No trial with this id yet: append a fresh record
if not found:
    current_trial = dict(
        trial_id=trial_id,
        metadata=dict(approach="dspy basic", model=student_lm_name),
        subtask_2={"score": None, lang: lang_entry},
    )
    trials.append(current_trial)

# Write the full list back to logs.json
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [44]:
# Identifier column of the test frame. NOTE(review): no code visible in this
# notebook reads this module-level variable — confirm before removing it.
ids = test_df["id"]

# Map DSPy field names back to the original CSV column names, so the
# prediction file uses the same headers as the input data.
FIELD_TO_COL = {
    "political": "political",
    "racial_ethnic": "racial/ethnic",
    "religious": "religious",
    "gender_sexual": "gender/sexual",
    "other": "other"
}

def predict_dataset(program, dataset, ids=None):
    """Run `program` on every example and collect 0/1 predictions per label column.

    Args:
        program: callable invoked as ``program(sentence=...)``; the fields in
            LABEL_FIELDS are read off its result by attribute.
        dataset: sized sequence of examples exposing ``.sentence``.
        ids: optional identifiers aligned positionally with `dataset`. When
            omitted, ``test_df["id"]`` is used, which is what this function
            always did before the parameter existed.

    Returns:
        DataFrame with an "id" column followed by one 0/1 column per label,
        named after the CSV columns in FIELD_TO_COL. The columns are present
        even when `dataset` is empty.

    Raises:
        IndexError: if there are fewer ids than examples.
    """
    if ids is None:
        ids = test_df["id"]
    id_values = list(ids)

    # Check alignment up front so a mismatch fails before any model call is made.
    if len(dataset) > len(id_values):
        raise IndexError(
            f"predict_dataset() got {len(dataset)} examples but only {len(id_values)} ids"
        )

    rows = []
    for example_id, ex in tqdm(zip(id_values, dataset), total=len(dataset)):
        out = program(sentence=ex.sentence)
        row = {"id": example_id}
        for field in LABEL_FIELDS:
            label = getattr(out, field, "no")
            if label is None:
                label = "no"
            # label2id maps any unrecognised answer to 0.
            row[FIELD_TO_COL[field]] = label2id(label)
        rows.append(row)

    columns = ["id"] + [FIELD_TO_COL[field] for field in LABEL_FIELDS]
    return pd.DataFrame(rows, columns=columns)



# Predict the test set with the baseline predictor (`classify`); a later cell
# repeats this with the MIPROv2-optimized program.
test_preds_df = predict_dataset(classify, raw_test)

print(test_preds_df.head())


  0%|          | 0/160 [00:00<?, ?it/s]

                                     id  political  racial/ethnic  religious  \
0  eng_f66ca14d60851371f9720aaf4ccd9b58          0              0          0   
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb          0              0          0   
2  eng_95770ff547ea5e48b0be00f385986483          0              0          0   
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf          0              0          0   
4  eng_07781aa88e61e7c0a996abd1e5ea3a20          0              0          0   

   gender/sexual  other  
0              0      0  
1              0      0  
2              0      0  
3              0      0  
4              0      0  


In [45]:
# Write the predictions to results/<trial_id>/subtask_2/pred_<lang>.csv,
# creating the directory tree first if it does not exist yet.
import os
os.makedirs(f"results/{trial_id}/subtask_2", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_2/pred_{lang}.csv", index=False)


# Using MIPROv2


In [46]:
from dspy.teleprompt import MIPROv2

# Configure the MIPROv2 optimizer.
# NOTE(review): the optimization objective is the label-wise accuracy metric,
# while the score written to logs.json is macro-F1 — confirm this mismatch is
# intended.
# NOTE(review): teacher_lm is only passed via teacher_settings; prompt_model is
# the student LM — confirm the teacher was not meant to be the prompt model.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",
    teacher_settings=dict(lm=teacher_lm),
    prompt_model=student_lm,
)

# Optimize a fresh predictor for the signature.
# NOTE(review): raw_val is used here as valset and again by the evaluation
# cells that follow, so those scores are not from held-out data.
optimized_prog = mipro.compile(
    student=dspy.Predict(PolarizationMultilabel),
    trainset=raw_train,
    valset=raw_val,
    requires_permission_to_run=False,
)

# # After optimization, compute your full metrics dict
# metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)

# print("Optimized validation metrics:", metrics_val_opt)


2025/12/09 09:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/12/09 09:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/09 09:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/09 09:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  1%|          | 4/515 [00:07<15:11,  1.78s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/6


  0%|          | 1/515 [00:00<00:08, 59.29it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/6


  0%|          | 1/515 [00:01<11:08,  1.30s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/6


  1%|          | 3/515 [00:00<00:07, 65.13it/s]
2025/12/09 09:51:47 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/09 09:51:47 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.


2025/12/09 09:52:34 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/09 09:53:26 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/09 09:53:26 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people's views, beliefs, and identities.

Given a sentence, classify it into one or more polarization categories:
- political: Political polarization targeting political groups, parties, ideologies, or political figures
- racial_ethnic: Racial or ethnic polarization targeting racial or ethnic groups
- religious: Religious polarization targeting religious groups or beliefs
- gender_sexual: Gender or sexual polarization targeting gender identities or sexual orientations
- other: Other forms of polarization not covered by the above categories

For each category, output "yes" if the sentence contains that type of polarization, "no" 

Average Metric: 92.00 / 100 (92.0%): 100%|██████████| 100/100 [00:00<00:00, 1808.52it/s]

2025/12/09 09:53:26 INFO dspy.evaluate.evaluate: Average Metric: 92.0 / 100 (92.0%)
2025/12/09 09:53:26 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 92.0

2025/12/09 09:53:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 30.60 / 35 (87.4%): 100%|██████████| 35/35 [00:23<00:00,  1.51it/s]

2025/12/09 09:53:49 INFO dspy.evaluate.evaluate: Average Metric: 30.6 / 35 (87.4%)
2025/12/09 09:53:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/09 09:53:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43]
2025/12/09 09:53:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0]
2025/12/09 09:53:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:53:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==



Average Metric: 30.80 / 35 (88.0%): 100%|██████████| 35/35 [00:30<00:00,  1.14it/s]

2025/12/09 09:54:20 INFO dspy.evaluate.evaluate: Average Metric: 30.8 / 35 (88.0%)
2025/12/09 09:54:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/09 09:54:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0]
2025/12/09 09:54:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0]
2025/12/09 09:54:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:54:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==



Average Metric: 30.80 / 35 (88.0%): 100%|██████████| 35/35 [00:33<00:00,  1.04it/s]

2025/12/09 09:54:54 INFO dspy.evaluate.evaluate: Average Metric: 30.8 / 35 (88.0%)
2025/12/09 09:54:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/09 09:54:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0]
2025/12/09 09:54:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0]
2025/12/09 09:54:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:54:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==



Average Metric: 29.60 / 35 (84.6%): 100%|██████████| 35/35 [00:23<00:00,  1.48it/s]

2025/12/09 09:55:17 INFO dspy.evaluate.evaluate: Average Metric: 29.6 / 35 (84.6%)
2025/12/09 09:55:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/09 09:55:17 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57]
2025/12/09 09:55:17 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0]
2025/12/09 09:55:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:55:17 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==



Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:05<00:00,  6.68it/s]  

2025/12/09 09:55:22 INFO dspy.evaluate.evaluate: Average Metric: 33.0 / 35 (94.3%)
2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29]
2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0]
2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/09 09:55:23 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 94.29) from minibatch trials...



Average Metric: 92.00 / 100 (92.0%): 100%|██████████| 100/100 [00:06<00:00, 16.22it/s]

2025/12/09 09:55:29 INFO dspy.evaluate.evaluate: Average Metric: 92.0 / 100 (92.0%)
2025/12/09 09:55:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:55:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0
2025/12/09 09:55:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/09 09:55:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==



Average Metric: 30.80 / 35 (88.0%): 100%|██████████| 35/35 [00:18<00:00,  1.94it/s]

2025/12/09 09:55:47 INFO dspy.evaluate.evaluate: Average Metric: 30.8 / 35 (88.0%)
2025/12/09 09:55:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/09 09:55:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29, 88.0]
2025/12/09 09:55:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:55:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:55:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==



Average Metric: 31.40 / 35 (89.7%): 100%|██████████| 35/35 [00:22<00:00,  1.52it/s]

2025/12/09 09:56:10 INFO dspy.evaluate.evaluate: Average Metric: 31.400000000000002 / 35 (89.7%)
2025/12/09 09:56:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 89.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/09 09:56:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29, 88.0, 89.71]
2025/12/09 09:56:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:56:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:56:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 30.60 / 35 (87.4%): 100%|██████████| 35/35 [00:25<00:00,  1.37it/s]

2025/12/09 09:56:35 INFO dspy.evaluate.evaluate: Average Metric: 30.6 / 35 (87.4%)
2025/12/09 09:56:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 87.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/09 09:56:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29, 88.0, 89.71, 87.43]
2025/12/09 09:56:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:56:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:56:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==



Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:00<00:00, 1284.41it/s]

2025/12/09 09:56:36 INFO dspy.evaluate.evaluate: Average Metric: 33.0 / 35 (94.3%)
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29, 88.0, 89.71, 87.43, 94.29]
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==



Average Metric: 31.20 / 35 (89.1%): 100%|██████████| 35/35 [00:00<00:00, 4072.14it/s]

2025/12/09 09:56:36 INFO dspy.evaluate.evaluate: Average Metric: 31.2 / 35 (89.1%)
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 89.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [87.43, 88.0, 88.0, 84.57, 94.29, 88.0, 89.71, 87.43, 94.29, 89.14]
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0]
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0


2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/09 09:56:36 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 89.71) from minibatch trials...



Average Metric: 90.60 / 100 (90.6%): 100%|██████████| 100/100 [00:36<00:00,  2.76it/s]

2025/12/09 09:57:12 INFO dspy.evaluate.evaluate: Average Metric: 90.60000000000001 / 100 (90.6%)
2025/12/09 09:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [92.0, 92.0, 90.6]
2025/12/09 09:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 92.0
2025/12/09 09:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/09 09:57:12 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 92.0!





In [47]:
# Same evaluation as for the baseline predictor, now applied to the
# MIPROv2-optimized program (validation subset, label-wise accuracy metric).
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(optimized_prog)
print("DSPy average accuracy metric:", eval_result.score)  # percentage


Average Metric: 176.00 / 193 (91.2%): 100%|██████████| 193/193 [00:00<00:00, 4541.48it/s]

2025/12/09 09:57:12 INFO dspy.evaluate.evaluate: Average Metric: 176.0 / 193 (91.2%)





Unnamed: 0,sentence,example_political,example_racial_ethnic,example_religious,example_gender_sexual,example_other,pred_political,pred_racial_ethnic,pred_religious,pred_gender_sexual,pred_other,accuracy_metric
0,Donald Trump relies on First Amendment,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
1,House GOP in no rush to give more Ukraine aid after 6,no,no,no,no,no,yes,no,no,no,no,✔️ [0.800]
2,country_xi adviser to meet with US officials on war,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
3,"so russia commits war crimes, how does that justify ukraine also c...",yes,no,no,no,no,yes,no,no,no,no,✔️ [1.000]
4,Cant wait to watch this episode of Border Security,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
...,...,...,...,...,...,...,...,...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
189,How long will it be until human rights are stripped away? oligarch...,yes,no,no,no,no,no,no,no,no,no,✔️ [0.800]
190,There are no open borders here in Texas.,no,no,no,no,no,no,no,no,no,no,✔️ [1.000]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no,no,no,no,no,no,no,no,no,no,✔️ [1.000]


DSPy average accuracy metric: 91.19


In [48]:
# Keep the optimized program's metrics under their own name. Rebinding
# `metrics_val` here would make a re-run of the baseline logging cell (which
# reads metrics_val["f1_macro"]) record the MIPRO score as the baseline score.
metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)
print("Validation metrics:", metrics_val_opt)
mipro_f1_macro = metrics_val_opt["f1_macro"]


Validation metrics: {'f1_macro': 0.46280423280423283, 'f1_micro': 0.5728643216080402, 'f1_samples': 0.2023686158401184, 'f1_weighted': 0.5587832102537985, 'precision_macro': 0.5207142857142857, 'recall_macro': 0.44982905982905985, 'exact_match_ratio': np.float64(0.6476683937823834)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [49]:
ids = test_df["id"]
def predict_dataset(program, dataset, ids=None):
    """Run `program` on every example and collect 0/1 predictions per label column.

    Args:
        program: callable invoked as ``program(sentence=...)``; the fields in
            LABEL_FIELDS are read off its result by attribute.
        dataset: sized sequence of examples exposing ``.sentence``.
        ids: optional identifiers aligned positionally with `dataset`. When
            omitted, ``test_df["id"]`` is used, which is what this function
            always did before the parameter existed.

    Returns:
        DataFrame with an "id" column followed by one 0/1 column per label,
        named after the CSV columns in FIELD_TO_COL. The columns are present
        even when `dataset` is empty.

    Raises:
        IndexError: if there are fewer ids than examples.
    """
    if ids is None:
        ids = test_df["id"]
    id_values = list(ids)

    # Check alignment up front so a mismatch fails before any model call is made.
    if len(dataset) > len(id_values):
        raise IndexError(
            f"predict_dataset() got {len(dataset)} examples but only {len(id_values)} ids"
        )

    rows = []
    for example_id, ex in tqdm(zip(id_values, dataset), total=len(dataset)):
        out = program(sentence=ex.sentence)
        row = {"id": example_id}
        for field in LABEL_FIELDS:
            label = getattr(out, field, "no")
            if label is None:
                label = "no"
            # label2id maps any unrecognised answer to 0.
            row[FIELD_TO_COL[field]] = label2id(label)
        rows.append(row)

    columns = ["id"] + [FIELD_TO_COL[field] for field in LABEL_FIELDS]
    return pd.DataFrame(rows, columns=columns)


# Predict the test set with the MIPROv2-optimized program.
test_preds_df = predict_dataset(optimized_prog, raw_test)

# Prefix the trial id exactly once: an unconditional `"MIPRO" + trial_id` would
# yield "MIPROMIPRO..." (and a new results directory / log entry) every time
# this cell is re-run without re-running the cell that defines trial_id.
if not trial_id.startswith("MIPRO"):
    trial_id = "MIPRO" + trial_id
os.makedirs(f"results/{trial_id}/subtask_2", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_2/pred_{lang}.csv", index=False)


  0%|          | 0/160 [00:00<?, ?it/s]

In [50]:
import json

log_path = "logs.json"

# Read the existing trial list; a missing logs.json means nothing was logged yet
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# Update the first trial carrying this trial_id (which at this point holds the
# "MIPRO"-prefixed id), creating subtask_2 / score if absent
found = False
for trial in trials:
    if trial.get("trial_id") != trial_id:
        continue
    trial.setdefault("subtask_2", {"score": None}).setdefault("score", None)
    # Insert or overwrite this language's result
    trial["subtask_2"][lang] = {"eval_results": {"eval_f1_macro": mipro_f1_macro}}
    found = True
    break

# No trial with this id yet: append a fresh record
if not found:
    current_trial = dict(
        trial_id=trial_id,
        metadata=dict(
            approach="dspy MIPROv2",
            student_model=student_lm_name,
            teacher_model=teacher_lm_name,
        ),
        subtask_2={
            "score": None,
            lang: {"eval_results": {"eval_f1_macro": mipro_f1_macro}},
        },
    )
    trials.append(current_trial)

# Write the full list back to logs.json
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [51]:
# Save the optimized program under dspy_cache/ (created if missing). The file
# name encodes the language, the last path segment of the student and teacher
# model names, and the current trial_id.
os.makedirs("dspy_cache", exist_ok=True)
optimized_prog.save(f"dspy_cache/optimized_subtask2_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}_{trial_id}.json")


# GEPA


In [52]:
# from dspy import GEPA

# def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
#     """GEPA metric for multilabel classification"""
#     correct = 0
#     total = len(LABEL_FIELDS)
    
#     mismatches = []
#     for field in LABEL_FIELDS:
#         gold_label = getattr(gold, field, "no")
#         pred_label = getattr(pred, field, "no")
#         if pred_label is None:
#             pred_label = "no"
#         if gold_label == pred_label:
#             correct += 1
#         else:
#             mismatches.append((field, gold_label, pred_label))

#     score = correct / total

#     # When used just for Evaluate, we only need a scalar
#     if trace is None and pred_name is None and pred_trace is None:
#         return score

#     if score == 1.0:
#         feedback = (
#             "Correct. All polarization categories were classified correctly. "
#             "Keep enforcing the exact labels 'yes' or 'no' for each category."
#         )
#     else:
#         mismatch_details = "; ".join([f"{f}: gold='{g}', pred='{p}'" for f, g, p in mismatches])
#         feedback = (
#             f"Partially incorrect. Mismatches: {mismatch_details}. "
#             "For each category, output 'yes' if the sentence contains that type of "
#             "polarization (stereotyping, vilification, dehumanization, intolerance), "
#             "otherwise output 'no'. Categories are: political, racial/ethnic, "
#             "religious, gender/sexual, other."
#         )

#     return dspy.Prediction(score=score, feedback=feedback)

# gepa = GEPA(
#     metric=gepa_metric,
#     auto="light",
#     reflection_lm=teacher_lm,  # strong LM for reflection
#     # you can tweak these if you want more budget:
#     # max_metric_calls=200,
#     # max_full_evals=10,
# )

# gepa_prog = gepa.compile(
#     student=optimized_prog,   # start from MIPRO-optimized program
#     trainset=raw_train,
#     valset=raw_val,
# )

# # Evaluate GEPA-optimized program
# evaluate = dspy.Evaluate(
#     devset=raw_val,
#     metric=gepa_metric,
#     display_progress=True,
#     display_table=True,
# )
# eval_result = evaluate(gepa_prog)
# print("GEPA DSPy average accuracy metric:", eval_result.score)

# metrics_val_gepa = eval_metrics_on_dataset(gepa_prog, raw_val)
# print("GEPA validation metrics:", metrics_val_gepa)
# gepa_f1_macro = metrics_val_gepa["f1_macro"]
