In [None]:
import dspy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from tqdm.auto import tqdm

# Model identifiers passed to dspy.LM: the student answers the task, the teacher
# is handed to the optimizer later in the notebook.
student_lm_name = "fireworks_ai/accounts/fireworks/models/gpt-oss-20b"
teacher_lm_name = "openai/gpt-5.1-2025-11-13"

# Student names containing "ollama_chat" are pointed at a local endpoint on port
# 11434 with an empty API key; any other name uses dspy.LM's defaults.
if "ollama_chat" in student_lm_name:
    student_lm = dspy.LM(student_lm_name, api_base='http://localhost:11434', api_key='')
else:
    student_lm = dspy.LM(student_lm_name)
teacher_lm = dspy.LM(teacher_lm_name)

# The student is the default LM for every DSPy call below.
dspy.configure(lm=student_lm)

# Prepare DSPY Dataset (Subtask 1)

In [37]:
# Identifier used for the results directory and the logs.json entry of this run.
trial_id = "DSP0002S"
# Language code selecting which CSV files are read.
lang = "eng"
# Load the subtask 1 train file and the dev file; the dev file is used as the
# test set for which predictions are written out.
train_df = pd.read_csv(f'./dev_phase/subtask1/train/{lang}.csv')
test_df = pd.read_csv(f'./dev_phase/subtask1/dev/{lang}.csv')
# Hold out 20% of the train file as a validation set (fixed seed for repeatability).
# NOTE: this rebinds train_df, so re-running only this line would split the
# already-reduced frame again; the read_csv above resets it when the cell is re-run.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [38]:
# Numeric dataset label -> string label used by the DSPy signature.
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}


def make_dspy_examples(df, include_label: bool = True):
    """Convert each row of `df` into a `dspy.Example` whose only input is `sentence`.

    Args:
        df: DataFrame with a "text" column and, optionally, a numeric
            "polarization" column whose values are keys of POLARIZATION_MAP.
        include_label: When True and the row has a "polarization" entry, the
            mapped string label is attached to the example as `polarization`.

    Returns:
        A list with one example per row, in row order.
    """

    def _row_to_example(row):
        # Start from the input field; the gold label is added only on request.
        fields = {"sentence": row["text"]}
        if include_label and "polarization" in row:
            fields["polarization"] = POLARIZATION_MAP[row["polarization"]]
        return dspy.Example(**fields).with_inputs("sentence")

    return [_row_to_example(row) for _, row in df.iterrows()]

# Create DSPy datasets; the test examples are built without labels.
raw_train = make_dspy_examples(train_df, include_label=True)
raw_val = make_dspy_examples(val_df, include_label=True)
raw_test = make_dspy_examples(test_df, include_label=False)


# Subsample to keep runs cheap: the first 20% of train, the first 30% of val,
# and all of test (the factor 1 keeps every test example).
raw_train = raw_train[:int(len(raw_train) * 0.2)]
raw_val = raw_val[:int(len(raw_val) * 0.3)]
raw_test = raw_test[:int(len(raw_test) * 1)]

# Define Signature (Subtask 1)

In [39]:
from typing import Literal

# DSPy signature for subtask 1: one input sentence, one label restricted to the
# two strings that POLARIZATION_MAP uses.
# NOTE: the class docstring below is the task description attached to the
# signature, so editing its wording changes what the model is asked.
class Polarization(dspy.Signature):
    """
    Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
    Given this sentence, classify it as containing polarization or not."""

    # The text to classify.
    sentence: str = dspy.InputField()
    # The predicted label; the Literal type lists the only two accepted values.
    # NOTE(review): `choices` is passed in addition to the Literal annotation —
    # confirm that OutputField actually uses this keyword.
    polarization: Literal["polarization", "no polarization"] = dspy.OutputField(
        desc='Return "polarization" or "no polarization".',
        choices=["polarization", "no polarization"],
    )
# Baseline (un-optimized) predictor built directly from the signature.
classify = dspy.Predict(Polarization)

# Evaluation

In [40]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Numeric dataset label -> string label produced by the Polarization signature.
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

# Inverse lookup, built once here instead of on every label2id call (label2id
# runs once or twice per example inside the evaluation and prediction loops).
_LABEL_TO_ID = {label: idx for idx, label in POLARIZATION_MAP.items()}


def label2id(label: str) -> int:
    """Return the numeric id for a string label.

    Args:
        label: One of the values of POLARIZATION_MAP.

    Returns:
        The matching key of POLARIZATION_MAP (0 or 1).

    Raises:
        KeyError: If `label` is not one of the known label strings.
    """
    return _LABEL_TO_ID[label]


def id2label(i: int) -> str:
    """Return the string label for a numeric id.

    Args:
        i: A key of POLARIZATION_MAP (0 or 1).

    Returns:
        The corresponding label string.

    Raises:
        KeyError: If `i` is not a key of POLARIZATION_MAP.
    """
    return POLARIZATION_MAP[i]

def accuracy_metric(example, pred, trace=None):
    """Exact-match metric on the `polarization` field.

    Args:
        example: Object carrying the gold label in `example.polarization`.
        pred: Object carrying the predicted label in `pred.polarization`.
        trace: Accepted for compatibility with the metric call signature; unused.

    Returns:
        1 when the two labels are equal, otherwise 0.
    """
    is_correct = example.polarization == pred.polarization
    return int(is_correct)

In [41]:
# Score the baseline `classify` predictor on the validation examples with the
# exact-match metric, showing a progress bar and a per-example results table.
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(classify)
print("DSPy average accuracy metric:", eval_result.score)  # percentage

Average Metric: 41.00 / 50 (82.0%):  25%|██▌       | 49/193 [00:00<00:00, 215.09it/s]

Average Metric: 164.00 / 193 (85.0%): 100%|██████████| 193/193 [00:01<00:00, 171.58it/s]

2025/12/07 14:38:18 INFO dspy.evaluate.evaluate: Average Metric: 164 / 193 (85.0%)





Unnamed: 0,sentence,example_polarization,pred_polarization,accuracy_metric
0,Donald Trump relies on First Amendment,no polarization,no polarization,✔️ [1]
1,House GOP in no rush to give more Ukraine aid after 6,no polarization,no polarization,✔️ [1]
2,Israeli adviser to meet with US officials on war,no polarization,no polarization,✔️ [1]
3,"so russia commits war crimes, how does that justify ukraine also c...",polarization,polarization,✔️ [1]
4,Cant wait to watch this episode of Border Security,no polarization,no polarization,✔️ [1]
...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no polarization,no polarization,✔️ [1]
189,How long will it be until human rights are stripped away? oligarch...,polarization,no polarization,✔️ [0]
190,There are no open borders here in Texas.,no polarization,no polarization,✔️ [1]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no polarization,no polarization,✔️ [1]


DSPy average accuracy metric: 84.97


In [42]:
def eval_metrics_on_dataset(program, dataset):
    """Run `program` over labelled examples and compute classification metrics.

    Args:
        program: Callable invoked as `program(sentence=...)`; its result must
            expose a `polarization` attribute.
        dataset: Iterable of examples with `sentence` and gold `polarization`.

    Returns:
        Dict with keys 'f1_macro', 'accuracy', 'precision', 'recall',
        'f1_binary' and 'f1_micro', computed on the numeric label ids.
    """
    gold_ids = []
    predicted_ids = []

    for example in dataset:
        gold_label = example.polarization
        predicted_label = program(sentence=example.sentence).polarization
        # A missing prediction is counted as the negative class.
        if predicted_label is None:
            predicted_label = "no polarization"
        gold_ids.append(label2id(gold_label))
        predicted_ids.append(label2id(predicted_label))

    scores = {}
    scores['f1_macro'] = f1_score(gold_ids, predicted_ids, average='macro')
    scores['accuracy'] = accuracy_score(gold_ids, predicted_ids)
    scores['precision'] = precision_score(gold_ids, predicted_ids, average='binary')
    scores['recall'] = recall_score(gold_ids, predicted_ids, average='binary')
    scores['f1_binary'] = f1_score(gold_ids, predicted_ids, average='binary')
    scores['f1_micro'] = f1_score(gold_ids, predicted_ids, average='micro')
    return scores

# Full metric set for the baseline predictor on the validation examples.
# This issues one program call per example, in addition to the Evaluate run above.
metrics_val = eval_metrics_on_dataset(classify, raw_val)
print("Validation metrics:", metrics_val)

# metrics_test = eval_metrics_on_dataset(classify, raw_test)
# print("Test metrics:", metrics_test)

Validation metrics: {'f1_macro': 0.8166420966420966, 'accuracy': 0.8497409326424871, 'precision': 0.7192982456140351, 'recall': 0.7592592592592593, 'f1_binary': 0.7387387387387387, 'f1_micro': 0.8497409326424871}


In [43]:
import json

# Keep the baseline macro-F1 under its own name before metrics_val is reused.
student_f1_macro = metrics_val["f1_macro"]

# Result payload stored under subtask_1[lang] for this trial.
lang_entry = {"eval_results": {"eval_f1_macro": student_f1_macro}}

# Read the existing trial log; a missing file means nothing has been logged yet.
log_path = "logs.json"
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# First logged trial carrying the current trial_id, or None when there is none.
existing_trial = next(
    (logged for logged in trials if logged.get("trial_id") == trial_id),
    None,
)

if existing_trial is not None:
    # Make sure the subtask_1 container and its "score" slot exist, then
    # insert or overwrite the entry for this language.
    subtask_entry = existing_trial.setdefault("subtask_1", {"score": None})
    subtask_entry.setdefault("score", None)
    subtask_entry[lang] = lang_entry
else:
    # No trial with this id yet: append a fresh record.
    current_trial = {
        "trial_id": trial_id,
        "metadata": {
            "approach": "dspy basic",
            "model": student_lm_name
        },
        "subtask_1": {
            "score": None,
            lang: lang_entry
        }
    }
    trials.append(current_trial)

# Write the updated list back to logs.json.
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [44]:
ids = test_df["id"]


def predict_dataset(program, dataset):
    """Run `program` on every example and collect numeric predictions per test id.

    The id of each row is read positionally from the module-level `test_df`
    (`test_df.iloc[i]["id"]`), so `dataset` must be in the same order as
    `test_df`.

    Args:
        program: Callable invoked as `program(sentence=...)`; its result must
            expose a `polarization` attribute.
        dataset: Sized sequence of examples with a `sentence` attribute.

    Returns:
        DataFrame with columns "id" and "polarization" (numeric label id).

    Raises:
        KeyError: If the program returns a label string unknown to label2id.
    """
    rows = []
    for i, ex in tqdm(enumerate(dataset), total=len(dataset)):
        out = program(sentence=ex.sentence)
        label = out.polarization
        # A missing prediction falls back to the negative class, matching
        # eval_metrics_on_dataset; without this, label2id(None) raises KeyError
        # and the whole prediction run is lost.
        if label is None:
            label = "no polarization"
        rows.append({
            "id": test_df.iloc[i]["id"],
            "polarization": label2id(label),
        })
    return pd.DataFrame(rows)



# Predict the test set with the chosen program; here that is the baseline `classify`.
test_preds_df = predict_dataset(classify, raw_test)

# Quick look at the first rows of the prediction frame.
print(test_preds_df.head())

  0%|          | 0/160 [00:00<?, ?it/s]

                                     id  polarization
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0
2  eng_95770ff547ea5e48b0be00f385986483             0
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             0
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0


In [45]:
# Write the baseline predictions to results/<trial_id>/subtask_1/pred_<lang>.csv,
# creating the directory tree if it does not exist yet.
import os
os.makedirs(f"results/{trial_id}/subtask_1", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_1/pred_{lang}.csv", index=False)


# Using MIPROv2

In [46]:
from dspy.teleprompt import MIPROv2

# Configure the MIPROv2 optimizer with the exact-match metric and the "light"
# automatic budget. The teacher LM is passed via teacher_settings.
# NOTE(review): prompt_model is the student LM, not the teacher — confirm this
# is the model intended for that role.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",
    teacher_settings=dict(lm=teacher_lm),
    prompt_model=student_lm,
)

# Optimize a fresh Predict(Polarization) module using the subsampled train and
# validation examples; the confirmation prompt is disabled.
optimized_prog = mipro.compile(
    student=dspy.Predict(Polarization),
    trainset=raw_train,
    valset=raw_val,
    requires_permission_to_run=False,
)

# # After optimization, compute your full metrics dict
# metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)

# print("Optimized validation metrics:", metrics_val_opt)

2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  1%|          | 5/515 [00:00<00:14, 34.31it/s]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/6


  1%|          | 3/515 [00:00<00:13, 37.34it/s]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/6


  0%|          | 2/515 [00:00<00:15, 33.05it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/6


  1%|          | 3/515 [00:00<00:12, 41.29it/s]
2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.


2025/12/07 14:38:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
Given this sentence, classify it as containing polarization or not.

2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Classify the following sentence as containing polarization or not.  
Output exactly one of the two labels:  

* **polarity** – if the sentence includes political/ideological slurs, dehumanizing or vilifying language, direct second‑person address, profanity, overt partisan framing, or

Average Metric: 80.00 / 100 (80.0%): 100%|██████████| 100/100 [00:00<00:00, 1406.81it/s]

2025/12/07 14:38:30 INFO dspy.evaluate.evaluate: Average Metric: 80 / 100 (80.0%)
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 80.0

2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:00<00:00, 229.53it/s]

2025/12/07 14:38:30 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)





2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0]
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0]
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==


Average Metric: 22.00 / 35 (62.9%): 100%|██████████| 35/35 [00:00<00:00, 222.81it/s]

2025/12/07 14:38:30 INFO dspy.evaluate.evaluate: Average Metric: 22 / 35 (62.9%)





2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86]
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0]
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==


Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [00:00<00:00, 223.65it/s]

2025/12/07 14:38:30 INFO dspy.evaluate.evaluate: Average Metric: 25 / 35 (71.4%)





2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/07 14:38:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==


Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [00:00<00:00, 220.09it/s]

2025/12/07 14:38:31 INFO dspy.evaluate.evaluate: Average Metric: 25 / 35 (71.4%)
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43]





2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==


Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:00<00:00, 217.99it/s]

2025/12/07 14:38:31 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].





2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...


Average Metric: 79.00 / 100 (79.0%): 100%|██████████| 100/100 [00:00<00:00, 238.65it/s]

2025/12/07 14:38:31 INFO dspy.evaluate.evaluate: Average Metric: 79 / 100 (79.0%)
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0





2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/07 14:38:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==


Average Metric: 20.00 / 35 (57.1%): 100%|██████████| 35/35 [00:00<00:00, 234.58it/s]

2025/12/07 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 20 / 35 (57.1%)





2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0, 57.14]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==


Average Metric: 21.00 / 35 (60.0%): 100%|██████████| 35/35 [00:00<00:00, 209.85it/s]

2025/12/07 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 21 / 35 (60.0%)
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0, 57.14, 60.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): 100%|██████████| 35/35 [00:00<00:00, 204.05it/s]

2025/12/07 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 24 / 35 (68.6%)
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0, 57.14, 60.0, 68.57]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [00:00<00:00, 1126.64it/s]

2025/12/07 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 26 / 35 (74.3%)
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0, 57.14, 60.0, 68.57, 74.29]





2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==


Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 1270.70it/s]

2025/12/07 14:38:32 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 62.86, 71.43, 71.43, 80.0, 57.14, 60.0, 68.57, 74.29, 77.14]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0]
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0


2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/07 14:38:32 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...



Average Metric: 80.00 / 100 (80.0%): 100%|██████████| 100/100 [00:00<00:00, 253.11it/s]

2025/12/07 14:38:33 INFO dspy.evaluate.evaluate: Average Metric: 80 / 100 (80.0%)
2025/12/07 14:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.0, 79.0, 80.0]
2025/12/07 14:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.0





2025/12/07 14:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/07 14:38:33 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!


In [47]:
# Same evaluation setup as for the baseline, now applied to the MIPROv2-optimized
# program. This rebinds `evaluate` and `eval_result` from the baseline run.
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(optimized_prog)
print("DSPy average accuracy metric:", eval_result.score)  # percentage

Average Metric: 3.00 / 5 (60.0%):   2%|▏         | 4/193 [00:00<00:01, 103.29it/s]

Average Metric: 164.00 / 193 (85.0%): 100%|██████████| 193/193 [00:00<00:00, 854.14it/s]

2025/12/07 14:38:33 INFO dspy.evaluate.evaluate: Average Metric: 164 / 193 (85.0%)





Unnamed: 0,sentence,example_polarization,pred_polarization,accuracy_metric
0,Donald Trump relies on First Amendment,no polarization,no polarization,✔️ [1]
1,House GOP in no rush to give more Ukraine aid after 6,no polarization,no polarization,✔️ [1]
2,Israeli adviser to meet with US officials on war,no polarization,no polarization,✔️ [1]
3,"so russia commits war crimes, how does that justify ukraine also c...",polarization,polarization,✔️ [1]
4,Cant wait to watch this episode of Border Security,no polarization,no polarization,✔️ [1]
...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no polarization,no polarization,✔️ [1]
189,How long will it be until human rights are stripped away? oligarch...,polarization,no polarization,✔️ [0]
190,There are no open borders here in Texas.,no polarization,no polarization,✔️ [1]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no polarization,no polarization,✔️ [1]


DSPy average accuracy metric: 84.97


In [48]:
# Full metric set for the optimized program on the validation examples.
# This overwrites `metrics_val`, which previously held the baseline metrics.
metrics_val = eval_metrics_on_dataset(optimized_prog, raw_val)
print("Validation metrics:", metrics_val)
# Macro-F1 of the optimized program, kept under its own name for logging.
mipro_f1_macro = metrics_val["f1_macro"]

Validation metrics: {'f1_macro': 0.8166420966420966, 'accuracy': 0.8497409326424871, 'precision': 0.7192982456140351, 'recall': 0.7592592592592593, 'f1_binary': 0.7387387387387387, 'f1_micro': 0.8497409326424871}


In [49]:
ids = test_df["id"]


def predict_dataset(program, dataset):
    """Predict a numeric polarization id for every example in `dataset`.

    Ids are looked up positionally in the module-level `test_df`, so `dataset`
    must follow the row order of `test_df`.

    Args:
        program: Callable invoked as `program(sentence=...)`; its result must
            expose a `polarization` attribute.
        dataset: Sized sequence of examples with a `sentence` attribute.

    Returns:
        DataFrame with columns "id" and "polarization".
    """
    records = []
    progress = tqdm(enumerate(dataset), total=len(dataset))
    for position, example in progress:
        predicted = program(sentence=example.sentence).polarization
        # A missing prediction is treated as the negative class.
        label = "no polarization" if predicted is None else predicted
        record = {
            "id": test_df.iloc[position]["id"],
            "polarization": label2id(label),
        }
        records.append(record)
    return pd.DataFrame(records)


# Predict the test set with the MIPROv2-optimized program.
test_preds_df = predict_dataset(optimized_prog, raw_test)

# Switch to a MIPRO-specific trial id for the results directory and for the
# cells that follow. The prefix is added only when it is not already present,
# so re-running this cell does not turn the id into "MIPROMIPRO...".
if not trial_id.startswith("MIPRO"):
    trial_id = "MIPRO" + trial_id
# Write the predictions to results/<trial_id>/subtask_1/pred_<lang>.csv.
os.makedirs(f"results/{trial_id}/subtask_1", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_1/pred_{lang}.csv", index=False)

  0%|          | 0/160 [00:00<?, ?it/s]

In [50]:
import json

log_path = "logs.json"

# Read the existing trial log; a missing file means nothing has been logged yet.
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# Update the first logged trial with this trial_id; the `else` branch of the
# loop runs only when no such trial was found.
for trial in trials:
    if trial.get("trial_id") != trial_id:
        continue
    # Make sure the subtask_1 container and its "score" slot exist.
    subtask_entry = trial.setdefault("subtask_1", {"score": None})
    subtask_entry.setdefault("score", None)
    # Insert or overwrite the result for this language.
    subtask_entry[lang] = {
        "eval_results": {
            "eval_f1_macro": mipro_f1_macro
        }
    }
    break
else:
    # No matching trial: append a fresh record for the MIPROv2 run.
    current_trial = {
        "trial_id": trial_id,
        "metadata": {
            "approach": "dspy MIPROv2",
            "student_model": student_lm_name,
            "teacher_model": teacher_lm_name
        },
        "subtask_1": {
            "score": None,
            lang: {
                "eval_results": {
                    "eval_f1_macro": mipro_f1_macro
                }
            }
        }
    }
    trials.append(current_trial)

# Write the updated list back to logs.json.
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [51]:
# Save the optimized program as JSON under dspy_cache/, creating the directory
# if needed. The file name encodes the language, the last path segment of the
# student and teacher model names, and the current trial_id.
os.makedirs("dspy_cache", exist_ok=True)
optimized_prog.save(f"dspy_cache/optimized_subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}_{trial_id}.json")

# GEPA

In [52]:
# from dspy import GEPA
# gepa = GEPA(metric=accuracy_metric,
#                  auto="light",
#                  reflection_lm=teacher_lm)

# gepa_program = gepa.compile(student=optimized_prog,
#                             trainset=raw_train,
#                             valset=raw_val)