In [1]:
import dspy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from tqdm.auto import tqdm

# Model identifiers: the student is the model being optimized, the teacher is
# the stronger model handed to the optimizers further down.
student_lm_name = "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"
teacher_lm_name = "openai/gpt-5.1-2025-11-13"

# An "ollama_chat" student is served from the local Ollama endpoint and needs
# no API key; any other provider goes through the default client settings.
student_lm = (
    dspy.LM(student_lm_name, api_base='http://localhost:11434', api_key='')
    if "ollama_chat" in student_lm_name
    else dspy.LM(student_lm_name)
)
teacher_lm = dspy.LM(teacher_lm_name)

# The student is the default LM for every DSPy call that does not override it.
dspy.configure(lm=student_lm)

# Prepare DSPy Dataset (Subtask 1)

In [2]:
trial_id = "DSP0004S"  # names the results/<trial_id>/ directory and the matching logs.json entry
# Train val test split
lang = "eng"  # language code; selects the per-language CSV files below
# Load the labeled training data and the dev split (used as the test set) for subtask 1
train_df = pd.read_csv(f'./dev_phase/subtask1/train/{lang}.csv')
test_df = pd.read_csv(f'./dev_phase/subtask1/dev/{lang}.csv')
# Split train into train and validation sets
# (train_df is rebound to the 80% portion; the fixed random_state keeps the split reproducible)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [3]:
# Numeric polarization label -> label string used in prompts and predictions
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

def make_dspy_examples(df, include_label: bool = True):
    """Convert the rows of `df` into a list of `dspy.Example` objects.

    Every example carries the row's "text" value as `sentence`, which is marked
    as the only input field. When `include_label` is true and the row has a
    "polarization" entry, its numeric value is translated through
    POLARIZATION_MAP and stored as `polarization`.
    """
    def _row_to_example(row):
        fields = {"sentence": row["text"]}
        if include_label and "polarization" in row:
            fields["polarization"] = POLARIZATION_MAP[row["polarization"]]
        return dspy.Example(**fields).with_inputs("sentence")

    return [_row_to_example(row) for _, row in df.iterrows()]

# Create DSPy datasets (the test examples are built without gold labels)
raw_train = make_dspy_examples(train_df, include_label=True)
raw_val = make_dspy_examples(val_df, include_label=True)
raw_test = make_dspy_examples(test_df, include_label=False)


# Subsample to limit LM calls: first 20% of train, first 30% of val, all of test.
# raw_test is kept complete because predict_dataset pairs predictions with test_df ids by position.
raw_train = raw_train[:int(len(raw_train) * 0.2)]
raw_val = raw_val[:int(len(raw_val) * 0.3)]
raw_test = raw_test[:int(len(raw_test) * 1)]

# Define Signature (Subtask 1)

In [4]:
from typing import Literal

# NOTE: the class docstring below is runtime text, not just documentation. It shows up
# verbatim as instruction candidate 0 in the MIPROv2 log, so editing it changes the prompt.
class Polarization(dspy.Signature):
    """
    Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
    Given this sentence, classify it as containing polarization or not."""

    # Input: the text to classify.
    sentence: str = dspy.InputField()
    # Output: one of the two label strings that POLARIZATION_MAP maps to ids.
    polarization: Literal["polarization", "no polarization"] = dspy.OutputField(
        desc='Return "polarization" or "no polarization".',
        choices=["polarization", "no polarization"],
    )
# Baseline predictor built directly from the signature, before any optimization.
classify = dspy.Predict(Polarization)

# Evaluation

In [5]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Numeric id -> label string (repeats the mapping defined in the dataset-preparation cell).
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}
# Inverse lookup, built once instead of on every label2id() call.
_LABEL_TO_ID = {label: idx for idx, label in POLARIZATION_MAP.items()}

def label2id(label: str) -> int:
    """Return the numeric id for a label string.

    Raises:
        KeyError: if `label` is not one of the POLARIZATION_MAP values
            (this includes None).
    """
    return _LABEL_TO_ID[label]

def id2label(i: int) -> str:
    """Return the label string for a numeric id (KeyError if the id is unknown)."""
    return POLARIZATION_MAP[i]

def accuracy_metric(example, pred, trace=None):
    """Exact-match metric: 1 when the predicted label equals the gold label, else 0.

    `example.polarization` is the gold label and `pred.polarization` is the
    program's output. `trace` is accepted but not used.
    """
    expected = example.polarization
    predicted = pred.polarization
    if expected == predicted:
        return 1
    return 0

In [6]:
# Baseline: score the unoptimized `classify` predictor on the validation subset.
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(classify)
print("DSPy average accuracy metric:", eval_result.score)  # percentage

Average Metric: 166.00 / 193 (86.0%): 100%|██████████| 193/193 [00:00<00:00, 400.37it/s]

2025/12/08 09:04:36 INFO dspy.evaluate.evaluate: Average Metric: 166 / 193 (86.0%)





Unnamed: 0,sentence,example_polarization,pred_polarization,accuracy_metric
0,Donald Trump relies on First Amendment,no polarization,no polarization,✔️ [1]
1,House GOP in no rush to give more Ukraine aid after 6,no polarization,no polarization,✔️ [1]
2,Israeli adviser to meet with US officials on war,no polarization,no polarization,✔️ [1]
3,"so russia commits war crimes, how does that justify ukraine also c...",polarization,polarization,✔️ [1]
4,Cant wait to watch this episode of Border Security,no polarization,no polarization,✔️ [1]
...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no polarization,no polarization,✔️ [1]
189,How long will it be until human rights are stripped away? oligarch...,polarization,polarization,✔️ [1]
190,There are no open borders here in Texas.,no polarization,no polarization,✔️ [1]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no polarization,no polarization,✔️ [1]


DSPy average accuracy metric: 86.01


In [7]:
def eval_metrics_on_dataset(program, dataset):
    """Run `program` on every labeled example and compute classification metrics.

    Each example must provide `sentence` and a gold `polarization` label. A
    prediction of None is counted as "no polarization". Labels are converted
    to numeric ids with label2id before scoring.

    Returns:
        dict with the keys f1_macro, accuracy, precision, recall, f1_binary
        and f1_micro (precision/recall/f1_binary use average='binary').
    """
    gold_ids = []
    pred_ids = []

    for example in dataset:
        gold_label = example.polarization
        pred_label = program(sentence=example.sentence).polarization
        gold_ids.append(label2id(gold_label))
        pred_ids.append(label2id("no polarization" if pred_label is None else pred_label))

    def _f1(average):
        return f1_score(gold_ids, pred_ids, average=average)

    return {
        'f1_macro': _f1('macro'),
        'accuracy': accuracy_score(gold_ids, pred_ids),
        'precision': precision_score(gold_ids, pred_ids, average='binary'),
        'recall': recall_score(gold_ids, pred_ids, average='binary'),
        'f1_binary': _f1('binary'),
        'f1_micro': _f1('micro'),
    }

# Full metric set for the baseline predictor on the validation subset.
metrics_val = eval_metrics_on_dataset(classify, raw_val)
print("Validation metrics:", metrics_val)

# Test-set metrics stay disabled: raw_test is built with include_label=False, so its
# examples carry no gold `polarization` for eval_metrics_on_dataset to read.
# metrics_test = eval_metrics_on_dataset(classify, raw_test)
# print("Test metrics:", metrics_test)

Validation metrics: {'f1_macro': 0.8189682126107347, 'accuracy': 0.8601036269430051, 'precision': 0.7872340425531915, 'recall': 0.6851851851851852, 'f1_binary': 0.7326732673267327, 'f1_micro': 0.8601036269430051}


In [8]:
import json

student_f1_macro = metrics_val["f1_macro"]

# Result payload stored for the current language under subtask_1
lang_entry = {"eval_results": {"eval_f1_macro": student_f1_macro}}

# Start from the trials already recorded in logs.json, or from an empty list
log_path = "logs.json"
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# Update the entry with this trial_id in place; if there is none, append a new one
for trial in trials:
    if trial.get("trial_id") != trial_id:
        continue
    # Make sure subtask_1 and its "score" slot exist before adding the language
    subtask_results = trial.setdefault("subtask_1", {"score": None})
    subtask_results.setdefault("score", None)
    subtask_results[lang] = lang_entry
    found = True
    break
else:
    found = False
    current_trial = {
        "trial_id": trial_id,
        "metadata": {
            "approach": "dspy basic",
            "model": student_lm_name
        },
        "subtask_1": {
            "score": None,
            lang: lang_entry
        }
    }
    trials.append(current_trial)

# Write the full trial list back to logs.json
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [9]:
ids = test_df["id"]
def predict_dataset(program, dataset):
    """Run `program` on every example and collect the predictions for submission.

    Args:
        program: DSPy program called as `program(sentence=...)`; its result
            must expose a `polarization` attribute.
        dataset: sequence of examples with a `sentence` attribute. Ids are
            taken by position from the global `test_df`, so `dataset` must be
            in the same order as `test_df`.

    Returns:
        DataFrame with one row per example and the columns "id" and
        "polarization" (numeric id obtained through label2id).
    """
    rows = []
    for i, ex in tqdm(enumerate(dataset), total=len(dataset)):
        out = program(sentence=ex.sentence)
        label = out.polarization
        # A missing label would make label2id raise KeyError; fall back to the
        # negative class, as eval_metrics_on_dataset does.
        if label is None:
            label = "no polarization"
        rows.append({
            "id": test_df.iloc[i]["id"],
            "polarization": label2id(label),
        })
    return pd.DataFrame(rows)



# Predict with the chosen program: the baseline `classify` here; the MIPROv2 section repeats this with `optimized_prog`
test_preds_df = predict_dataset(classify, raw_test)

print(test_preds_df.head())

  0%|          | 0/160 [00:00<?, ?it/s]

                                     id  polarization
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0
2  eng_95770ff547ea5e48b0be00f385986483             0
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             0
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0


In [10]:
# Write the predictions to results/<trial_id>/subtask_1/pred_<lang>.csv, creating the directories if needed
import os
os.makedirs(f"results/{trial_id}/subtask_1", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_1/pred_{lang}.csv", index=False)


# Using MIPROv2

In [11]:
from dspy.teleprompt import MIPROv2

# MIPROv2 run: per the log below it bootstraps few-shot demo sets, proposes instruction
# candidates, and scores instruction/demo combinations with accuracy_metric.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",  # preset budget; the log reports num_trials: 10, num_fewshot_candidates: 6, num_instruct_candidates: 3
    teacher_settings=dict(lm=teacher_lm),  # presumably the LM used while bootstrapping demos -- TODO confirm against the MIPROv2 docs
    prompt_model=student_lm,  # NOTE(review): instruction proposals come from the student LM, not teacher_lm -- confirm this is intended
)

optimized_prog = mipro.compile(
    student=dspy.Predict(Polarization),  # fresh predictor, separate from the baseline `classify`
    trainset=raw_train,
    valset=raw_val,  # NOTE(review): the same raw_val is used afterwards to report scores
    requires_permission_to_run=False,  # presumably skips the interactive confirmation prompt -- TODO confirm
)

# # After optimization, compute your full metrics dict
# metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)

# print("Optimized validation metrics:", metrics_val_opt)

2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  1%|          | 5/515 [00:00<00:10, 48.82it/s]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/6


  1%|          | 3/515 [00:00<00:07, 69.14it/s]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/6


  0%|          | 2/515 [00:00<00:07, 66.95it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/6


  1%|          | 3/515 [00:00<00:12, 40.55it/s]
2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/08 09:04:42 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.


2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
Given this sentence, classify it as containing polarization or not.

2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are given a single sentence. Classify it as **polarization** or **no polarization** according to the following criteria:

* **polarization** – the sentence contains stereotyping, vilification, dehumanization, de‑individuation, intolerance, hateful or divisive language, loaded or 

Average Metric: 86.00 / 100 (86.0%): 100%|██████████| 100/100 [00:00<00:00, 554.98it/s]

2025/12/08 09:04:43 INFO dspy.evaluate.evaluate: Average Metric: 86 / 100 (86.0%)
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 86.0

2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:00<00:00, 384.22it/s]

2025/12/08 09:04:43 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)





2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==


Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 544.73it/s]

2025/12/08 09:04:43 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 372.76it/s]

2025/12/08 09:04:43 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0]
2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 375.86it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0







2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==


Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:00<00:00, 460.63it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 85.71) from minibatch trials...



Average Metric: 84.00 / 100 (84.0%): 100%|██████████| 100/100 [00:00<00:00, 441.08it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 84 / 100 (84.0%)
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:00<00:00, 373.81it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)





2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71, 80.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==


Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [00:00<00:00, 649.16it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 25 / 35 (71.4%)





2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71, 80.0, 71.43]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==


Average Metric: 23.00 / 35 (65.7%): 100%|██████████| 35/35 [00:00<00:00, 479.03it/s]

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 23 / 35 (65.7%)
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71, 80.0, 71.43, 65.71]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0







2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==


Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:00<00:00, 358.13it/s] 

2025/12/08 09:04:44 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71, 80.0, 71.43, 65.71, 88.57]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==



Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:11<00:00,  3.13it/s]

2025/12/08 09:04:56 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 77.14, 77.14, 77.14, 85.71, 80.0, 71.43, 65.71, 88.57, 85.71]
2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0]
2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/08 09:04:56 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 87.13999999999999) from minibatch trials...



Average Metric: 87.00 / 100 (87.0%): 100%|██████████| 100/100 [00:12<00:00,  7.95it/s]

2025/12/08 09:05:08 INFO dspy.evaluate.evaluate: Average Metric: 87 / 100 (87.0%)
2025/12/08 09:05:08 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 87.0





2025/12/08 09:05:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [86.0, 84.0, 87.0]
2025/12/08 09:05:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 87.0
2025/12/08 09:05:08 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/08 09:05:08 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 87.0!


In [12]:
# Score the MIPROv2-optimized program on the validation subset.
# Rebinds `evaluate` and `eval_result` from the baseline evaluation cell.
# NOTE(review): raw_val was also the valset passed to mipro.compile, so this is not a held-out score.
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=accuracy_metric,
    display_progress=True,
    display_table=True,   # nice overview
)
eval_result = evaluate(optimized_prog)
print("DSPy average accuracy metric:", eval_result.score)  # percentage

Average Metric: 2.00 / 2 (100.0%):   1%|          | 1/193 [00:00<00:00, 864.45it/s]

Average Metric: 165.00 / 193 (85.5%): 100%|██████████| 193/193 [00:44<00:00,  4.38it/s]

2025/12/08 09:05:53 INFO dspy.evaluate.evaluate: Average Metric: 165 / 193 (85.5%)





Unnamed: 0,sentence,example_polarization,pred_polarization,accuracy_metric
0,Donald Trump relies on First Amendment,no polarization,no polarization,✔️ [1]
1,House GOP in no rush to give more Ukraine aid after 6,no polarization,no polarization,✔️ [1]
2,Israeli adviser to meet with US officials on war,no polarization,no polarization,✔️ [1]
3,"so russia commits war crimes, how does that justify ukraine also c...",polarization,no polarization,✔️ [0]
4,Cant wait to watch this episode of Border Security,no polarization,no polarization,✔️ [1]
...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no polarization,no polarization,✔️ [1]
189,How long will it be until human rights are stripped away? oligarch...,polarization,polarization,✔️ [1]
190,There are no open borders here in Texas.,no polarization,no polarization,✔️ [1]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no polarization,no polarization,✔️ [1]


DSPy average accuracy metric: 85.49


In [13]:
# Keep the MIPROv2 metrics under their own name: the baseline logging cell reads
# `metrics_val`, so rebinding it here would make a re-run of that cell log the
# MIPROv2 score under the baseline trial.
metrics_val_mipro = eval_metrics_on_dataset(optimized_prog, raw_val)
print("Validation metrics:", metrics_val_mipro)
mipro_f1_macro = metrics_val_mipro["f1_macro"]

Validation metrics: {'f1_macro': 0.823905109489051, 'accuracy': 0.8549222797927462, 'precision': 0.7241379310344828, 'recall': 0.7777777777777778, 'f1_binary': 0.75, 'f1_micro': 0.8549222797927462}


In [14]:
ids = test_df["id"]
def predict_dataset(program, dataset):
    """Predict a label for every example and return the rows as a DataFrame.

    `program` is called as `program(sentence=...)`; a prediction whose
    `polarization` is None counts as "no polarization". Ids are read by
    position from the global `test_df`, so `dataset` must follow its order.
    The result has the columns "id" and "polarization" (numeric id).
    """
    records = []
    total = len(dataset)
    for position, example in tqdm(enumerate(dataset), total=total):
        predicted = program(sentence=example.sentence).polarization
        records.append({
            "id": test_df.iloc[position]["id"],
            "polarization": label2id("no polarization" if predicted is None else predicted),
        })
    return pd.DataFrame(records)


# Predict on the test set with the MIPROv2-optimized program
test_preds_df = predict_dataset(optimized_prog, raw_test)

# Switch to a MIPRO-specific trial id for the results directory and the log entry.
# Prefix only once, so re-running this cell does not produce "MIPROMIPRO...".
# NOTE: trial_id stays rebound for the rest of the session; re-run the data-loading
# cell before re-running any baseline cell.
if not trial_id.startswith("MIPRO"):
    trial_id = "MIPRO" + trial_id
os.makedirs(f"results/{trial_id}/subtask_1", exist_ok=True)
test_preds_df.to_csv(f"results/{trial_id}/subtask_1/pred_{lang}.csv", index=False)

  0%|          | 0/160 [00:00<?, ?it/s]

In [15]:
import json

log_path = "logs.json"

# Read the trials recorded so far; a missing logs.json means there are none yet
try:
    with open(log_path, "r") as f:
        trials = json.load(f)
except FileNotFoundError:
    trials = []

# Look for a trial with the current trial_id and update its language entry
found = False
for trial in trials:
    if trial.get("trial_id") != trial_id:
        continue
    # Ensure subtask_1 and its "score" slot exist, then insert/replace this language
    subtask_results = trial.setdefault("subtask_1", {"score": None})
    subtask_results.setdefault("score", None)
    subtask_results[lang] = {
        "eval_results": {
            "eval_f1_macro": mipro_f1_macro
        }
    }
    found = True
    break

# No matching trial: record a new one with its metadata
if not found:
    current_trial = {
        "trial_id": trial_id,
        "metadata": {
            "approach": "dspy MIPROv2",
            "student_model": student_lm_name,
            "teacher_model": teacher_lm_name
        },
        "subtask_1": {
            "score": None,
            lang: {
                "eval_results": {
                    "eval_f1_macro": mipro_f1_macro
                }
            }
        }
    }
    trials.append(current_trial)

# Persist the updated trial list
with open(log_path, "w") as f:
    json.dump(trials, f, indent=4)


In [16]:
# save the optimized program
# Create dspy_cache dir if it doesn't exist
# (relies on `os` having been imported by the cell that saves the baseline predictions)
os.makedirs("dspy_cache", exist_ok=True)
# The file name embeds the language, the last path segment of each LM name, and the current trial_id
optimized_prog.save(f"dspy_cache/optimized_subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}_{trial_id}.json")

# GEPA

In [18]:
from dspy import GEPA

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Exact-match metric on the `polarization` label, with optional feedback.

    Compares `gold.polarization` with `pred.polarization`. A prediction whose
    `polarization` attribute is missing or None is treated as
    "no polarization".

    Returns:
        The bare float score (1.0 on a match, 0.0 otherwise) when `trace`,
        `pred_name` and `pred_trace` are all None; otherwise a
        `dspy.Prediction` carrying `score` and a textual `feedback`.
    """
    expected = gold.polarization
    predicted = getattr(pred, "polarization", None)
    # Fall back to the negative class when the program produced no label.
    predicted = "no polarization" if predicted is None else predicted

    score = float(expected == predicted)

    # Plain evaluation passes none of the tracing arguments: return the scalar.
    scalar_only = trace is None and pred_name is None and pred_trace is None
    if scalar_only:
        return score

    if score != 1.0:
        feedback = (
            f"Incorrect. Gold='{expected}', pred='{predicted}'. "
            "If the sentence shows stereotyping, vilification, "
            "dehumanization, group hatred, or incites conflict, "
            "you must output 'polarization'. Otherwise output "
            "'no polarization'. Be strict about borderline cases."
        )
    else:
        feedback = (
            f"Correct. The sentence was labeled '{expected}' and the "
            f"model predicted '{predicted}'. Keep enforcing the exact "
            "labels 'polarization' or 'no polarization'."
        )

    return dspy.Prediction(score=score, feedback=feedback)

# Configure the GEPA optimizer: `gepa_metric` supplies the score (and feedback)
# and the teacher LM is passed as the reflection model.
gepa = GEPA(
    metric=gepa_metric,
    auto="light",
    reflection_lm=teacher_lm,  # strong LM for reflection
    # you can tweak these if you want more budget:
    # max_metric_calls=200,
    # max_full_evals=10,
)

# Run the optimization, seeding it with the MIPRO result rather than the
# unoptimized program.
gepa_prog = gepa.compile(
    student=optimized_prog,   # start from MIPRO-optimized program
    trainset=raw_train,
    valset=raw_val,
)

# Evaluate GEPA-optimized program
# NOTE(review): raw_val is also the `valset` given to gepa.compile above, so
# these scores are measured on data the optimizer already used; they are
# presumably optimistic. Consider a separate held-out labeled split (confirm).
evaluate = dspy.Evaluate(
    devset=raw_val,
    metric=gepa_metric,
    display_progress=True,
    display_table=True,
)
eval_result = evaluate(gepa_prog)
print("GEPA DSPy average accuracy metric:", eval_result.score)

# Recompute the metrics with the notebook's own helper (defined in an earlier
# cell) and keep the macro-F1 entry of its result for later use.
metrics_val_gepa = eval_metrics_on_dataset(gepa_prog, raw_val)
print("GEPA validation metrics:", metrics_val_gepa)
gepa_f1_macro = metrics_val_gepa["f1_macro"]

2025/12/08 09:19:32 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1152 metric calls of the program. This amounts to 1.63 full evals on the train+val set.
2025/12/08 09:19:32 INFO dspy.teleprompt.gepa.gepa: Using 193 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/1152 [00:00<?, ?rollouts/s]2025/12/08 09:19:33 INFO dspy.evaluate.evaluate: Average Metric: 165.0 / 193 (85.5%)
2025/12/08 09:19:33 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.8549222797927462
GEPA Optimization:  17%|█▋        | 193/1152 [00:00<00:04, 221.34rollouts/s]2025/12/08 09:19:33 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.8549222797

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:06<00:00,  2.06s/it]

2025/12/08 09:19:39 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:19:39 INFO dspy.teleprompt.gepa.gepa: Iteration 1: All subsample scores perfect. Skipping.
2025/12/08 09:19:39 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
2025/12/08 09:19:39 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.8549222797927462



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:02<00:00,  1.42it/s]

2025/12/08 09:19:41 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/08 09:20:05 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for self: You are given a single input field:

- sentence: a short piece of text (e.g., a post, comment, or sentence from a speech or article).

Your task is to classify whether this sentence contains **polarization** as defined below.

DEFINITION OF POLARIZATION
Polarization denotes:
- stereotyping of groups or people,
- vilification (strongly insulting or demonizing) of groups or people,
- dehumanization or deindividuation of groups or people,
- intolerance of other people’s views, beliefs, and identities,
- content that incites or expresses division, groupism, hatred, conflict, or intolerance.

In this context, we are focusing on speeches, posts, and articles (or snippets thereof) that are shared on social media and:
- incite or reinforce group-based division or antagonism, or
- target people based on their identity, group membership, or political/collective alignment, or
- express or encourage hatred

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]

2025/12/08 09:21:16 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:21:16 INFO dspy.teleprompt.gepa.gepa: Iteration 3: All subsample scores perfect. Skipping.
2025/12/08 09:21:16 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Reflective mutation did not propose a new candidate
GEPA Optimization:  35%|███▍      | 398/1152 [01:44<03:53,  3.22rollouts/s]2025/12/08 09:21:16 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Selected program 1 score: 0.8341968911917098



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:03<00:00,  1.12s/it]

2025/12/08 09:21:20 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:21:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: All subsample scores perfect. Skipping.
2025/12/08 09:21:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Reflective mutation did not propose a new candidate
2025/12/08 09:21:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 0 score: 0.8549222797927462



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]

2025/12/08 09:21:22 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:21:22 INFO dspy.teleprompt.gepa.gepa: Iteration 5: All subsample scores perfect. Skipping.
2025/12/08 09:21:22 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Reflective mutation did not propose a new candidate
2025/12/08 09:21:22 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Selected program 0 score: 0.8549222797927462



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:03<00:00,  1.01s/it]

2025/12/08 09:21:25 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:21:25 INFO dspy.teleprompt.gepa.gepa: Iteration 6: All subsample scores perfect. Skipping.
2025/12/08 09:21:25 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Reflective mutation did not propose a new candidate
2025/12/08 09:21:25 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Selected program 1 score: 0.8341968911917098



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:10<00:00,  3.34s/it] 

2025/12/08 09:21:35 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/08 09:22:06 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for self: You are given a single input field:

- sentence: a short piece of text (e.g., a post, comment, or sentence from a speech or article).

Your task is to classify whether this sentence contains **polarization** as defined below.

DEFINITION OF POLARIZATION

In this task, "polarization" refers specifically to language that:

- Stereotypes, generalizes, or essentializes groups or people.
- Vilifies (strongly insults, demonizes) groups or people.
- Dehumanizes or de-individuates groups or people (treats them as less than human, as a monolith, or erases individuality).
- Expresses or promotes intolerance of other people’s views, beliefs, and identities.
- Incites, reinforces, or expresses division, groupism, hatred, conflict, or intolerance between social, political, national, ethnic, religious, or ideological groups.

We are focusing on sentences from speeches, posts, and articles (or snippets thereo

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:02<00:00,  1.27it/s] 

2025/12/08 09:23:16 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/08 09:23:38 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for self: You are given a single input field:

- `sentence`: A short text (e.g., a tweet, comment, or sentence from a speech/article).

Your task is to classify whether this sentence contains **polarization** according to the following definition and rules.

--------------------
DEFINITIONS
--------------------

**Polarization** (for this task) denotes any of the following directed at a person or group, or that encourages division between groups:

- **Stereotyping**: Attributing generalized, often negative traits or behaviors to a group (e.g., all members of a group behave or are a certain way).
- **Vilification**: Strongly condemning or demonizing a group or individual, painting them as evil, harmful, or morally corrupt.
- **Dehumanization**: Treating people or groups as less than human, denying their humanity, or reducing them to objects, animals, or targets.
- **Deindividuation**: Speaking of people o

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:04<00:00,  1.37s/it] 

2025/12/08 09:24:49 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/08 09:25:11 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for self: You are given a single input field:

- sentence: a short piece of text (e.g., a social media post, comment, or sentence from a speech or article).

Your task is to classify whether this sentence contains **polarization** as defined below.

DEFINITION OF POLARIZATION

In this task, “polarization” refers to language that contributes to or reflects group-based antagonism, hatred, or division. Concretely, polarization includes:

- Stereotyping groups or people (assigning negative traits to a group as a whole).
- Vilification (strongly insulting, demonizing, or morally condemning) of groups or people based on their group identity.
- Dehumanization or deindividuation of groups or people (implying they are less than human, animals, monsters, etc.).
- Intolerance of other people’s views, beliefs, or identities.
- Content that incites or expresses division, groupism, hatred, conflict, or intolerance bet

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:02<00:00,  1.33it/s]

2025/12/08 09:26:20 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/08 09:26:20 INFO dspy.teleprompt.gepa.gepa: Iteration 10: All subsample scores perfect. Skipping.
2025/12/08 09:26:20 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Reflective mutation did not propose a new candidate
GEPA Optimization:  87%|████████▋ | 1007/1152 [06:48<01:06,  2.17rollouts/s]2025/12/08 09:26:20 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Selected program 1 score: 0.8341968911917098



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:03<00:00,  1.21s/it] 

2025/12/08 09:26:24 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/08 09:26:43 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for self: You are given a single input field:

- sentence: a short piece of text (e.g., a post, comment, or sentence from a speech or article).

Your task is to classify whether this sentence contains **polarization** as defined below.

DEFINITION OF POLARIZATION

Polarization denotes:
- stereotyping of groups or people,
- vilification (strongly insulting or demonizing) of groups or people,
- dehumanization or deindividuation of groups or people,
- intolerance of other people’s views, beliefs, and identities,
- content that incites or expresses division, groupism, hatred, conflict, or intolerance.

In this context, we are focusing on speeches, posts, and articles (or snippets thereof) that are shared on social media and:
- incite or reinforce group-based division or antagonism, or
- target people based on their identity, group membership, or political/collective alignment, or
- express or encourage hatr

Average Metric: 165.00 / 193 (85.5%): 100%|██████████| 193/193 [00:00<00:00, 865.76it/s]

2025/12/08 09:27:42 INFO dspy.evaluate.evaluate: Average Metric: 165.0 / 193 (85.5%)





Unnamed: 0,sentence,example_polarization,pred_polarization,gepa_metric
0,Donald Trump relies on First Amendment,no polarization,no polarization,✔️ [1.000]
1,House GOP in no rush to give more Ukraine aid after 6,no polarization,no polarization,✔️ [1.000]
2,Israeli adviser to meet with US officials on war,no polarization,no polarization,✔️ [1.000]
3,"so russia commits war crimes, how does that justify ukraine also c...",polarization,no polarization,✔️ [0.000]
4,Cant wait to watch this episode of Border Security,no polarization,no polarization,✔️ [1.000]
...,...,...,...,...
188,"Wow, thats awesome. It reminds me of the crazy things the Tamir in...",no polarization,no polarization,✔️ [1.000]
189,How long will it be until human rights are stripped away? oligarch...,polarization,polarization,✔️ [1.000]
190,There are no open borders here in Texas.,no polarization,no polarization,✔️ [1.000]
191,"Ottawa to unveil economic update detailing deficit, new border sec...",no polarization,no polarization,✔️ [1.000]


GEPA DSPy average accuracy metric: 85.49
GEPA validation metrics: {'f1_macro': 0.823905109489051, 'accuracy': 0.8549222797927462, 'precision': 0.7241379310344828, 'recall': 0.7777777777777778, 'f1_binary': 0.75, 'f1_micro': 0.8549222797927462}
