In [1]:
import dspy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

# Model identifiers for the two LMs used in this notebook: the "student" answers
# the classification prompts, the "teacher" is handed to the optimizer later on.
student_lm_name = "openai/gpt-5-nano"
teacher_lm_name = "openai/gpt-5-mini"

# LM client objects built from the identifiers above.
student_lm = dspy.LM(student_lm_name)
teacher_lm = dspy.LM(teacher_lm_name)

# Register the student LM as the DSPy default; teacher_lm is only used where it
# is passed explicitly.
dspy.configure(lm=student_lm)

# Prepare DSPy Dataset

In [2]:
# Train / validation / test split
# Language code used in the data file names and, later, in the output file names.
lang = "arb"
# Load the subtask 1 training file and the dev-phase file; the dev file is what
# the rest of the notebook calls "test".
train_df = pd.read_csv(f'./dev_phase/subtask1/train/{lang}.csv')
test_df = pd.read_csv(f'./dev_phase/subtask1/dev/{lang}.csv')
# Hold out 20% of the training rows as a validation set (fixed seed so the split
# is the same on every run). Note that train_df is rebound to the remaining 80%.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
# Map the numeric polarization label to the string label used by the signature
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

def make_dspy_examples(df, include_label: bool = True):
    """Convert the rows of a dataframe into a list of ``dspy.Example`` objects.

    Each example gets the row's ``text`` column as its ``sentence`` input.
    When the dataframe has an ``id`` column, the row id is stored on the
    example as well, so downstream code can read ``example.id`` (the
    prediction cell does) without relying on leftover kernel state.

    Args:
        df: dataframe with a ``text`` column and, optionally, ``id`` and
            ``polarization`` columns.
        include_label: when True and a ``polarization`` column exists, the
            numeric label is mapped through ``POLARIZATION_MAP`` and stored
            as the example's ``polarization`` field.

    Returns:
        A list of examples, in row order, with ``sentence`` marked as the
        only input field.

    Raises:
        KeyError: if ``text`` is missing, or a label is not a key of
            ``POLARIZATION_MAP``.
    """
    # Column presence is the same for every row, so check it once up front.
    has_id = "id" in df.columns
    has_label = include_label and "polarization" in df.columns

    examples = []
    for _, row in df.iterrows():
        kwargs = dict(
            sentence=row["text"],
        )
        if has_id:
            # Not an input field: it only travels along for bookkeeping.
            kwargs["id"] = row["id"]
        if has_label:
            kwargs["polarization"] = POLARIZATION_MAP[row["polarization"]]
        examples.append(dspy.Example(**kwargs).with_inputs("sentence"))
    return examples

# Create DSPy datasets; the test examples are built without labels.
raw_train = make_dspy_examples(train_df, include_label=True)
raw_val = make_dspy_examples(val_df, include_label=True)
raw_test = make_dspy_examples(test_df, include_label=False)


# Fraction of every split that is kept, to keep LM cost and runtime low while
# experimenting. Set to 1.0 to run on the full data.
SUBSAMPLE_FRACTION = 0.1

# Keep the leading slice of each split. For raw_test this preserves the row
# order of test_df, which the later prediction cell relies on when it looks
# ids up by position.
raw_train = raw_train[:int(len(raw_train) * SUBSAMPLE_FRACTION)]
raw_val = raw_val[:int(len(raw_val) * SUBSAMPLE_FRACTION)]
raw_test = raw_test[:int(len(raw_test) * SUBSAMPLE_FRACTION)]

# Define Signature (Subtask 1)

In [4]:
from typing import Literal

# DSPy signature for subtask 1: one input sentence, one categorical output.
# NOTE: the class docstring is not just documentation. It shows up verbatim as
# "Instruction 0" in the MIPROv2 log further down, i.e. it is the task
# description sent to the LM. Editing it changes model behaviour.
class Polarization(dspy.Signature):
    """
    Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
    Given this sentence, classify it as containing polarization or not."""

    # Input: the text to classify.
    sentence: str = dspy.InputField()
    # Output: restricted to the two label strings that are the values of
    # POLARIZATION_MAP, so predictions can be mapped back to 0/1.
    polarization: Literal["polarization", "no polarization"] = dspy.OutputField(
        desc='Return "polarization" or "no polarization".',
        choices=["polarization", "no polarization"],
    )
# Baseline (un-optimized) predictor built directly from the signature.
classify = dspy.Predict(Polarization)

# Evaluation

In [5]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Same mapping as in the dataset-preparation cell, repeated so the helpers
# below do not depend on that cell's definition.
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

# Reverse lookup (label string -> numeric id), built once instead of on every
# label2id call.
_LABEL_TO_ID = {label: idx for idx, label in POLARIZATION_MAP.items()}

def label2id(label: str) -> int:
    """Map a label string to its numeric id.

    Exact matches are returned directly. Otherwise the label is normalized
    (surrounding whitespace stripped, inner whitespace collapsed, lowercased)
    and looked up again, so an answer such as ``" No Polarization\\n"`` is
    still accepted.

    Args:
        label: ``"polarization"`` or ``"no polarization"``.

    Returns:
        1 for ``"polarization"``, 0 for ``"no polarization"``.

    Raises:
        KeyError: if the label matches neither class, with a message naming
            the offending value and the accepted labels.
    """
    if label in _LABEL_TO_ID:
        return _LABEL_TO_ID[label]
    if isinstance(label, str):
        normalized = " ".join(label.split()).lower()
        if normalized in _LABEL_TO_ID:
            return _LABEL_TO_ID[normalized]
    raise KeyError(
        f"Unknown polarization label {label!r}; expected one of {sorted(_LABEL_TO_ID)}"
    )

def id2label(i: int) -> str:
    """Map a numeric id (0 or 1) to its label string.

    Raises:
        KeyError: if ``i`` is not a key of ``POLARIZATION_MAP``.
    """
    return POLARIZATION_MAP[i]

def accuracy_metric(example, pred, trace=None):
    """Exact-match metric on the ``polarization`` field.

    Args:
        example: gold example; its ``polarization`` attribute is the reference label.
        pred: prediction; its ``polarization`` attribute is the predicted label.
        trace: accepted for compatibility with the metric call signature; unused.

    Returns:
        1 when the two labels compare equal, otherwise 0.
    """
    is_match = example.polarization == pred.polarization
    return int(is_match)

In [6]:
# Baseline: score the un-optimized predictor on the validation examples with
# the exact-match metric, showing a progress bar and a per-example table.
evaluate = dspy.Evaluate(
    metric=accuracy_metric,
    devset=raw_val,
    display_table=True,
    display_progress=True,
)
eval_result = evaluate(classify)
# The score is reported as a percentage (see the "80.6" in the cell output).
print("DSPy average accuracy metric:", eval_result.score)

Average Metric: 54.00 / 67 (80.6%): 100%|██████████| 67/67 [01:05<00:00,  1.02it/s] 

2025/12/04 12:10:48 INFO dspy.evaluate.evaluate: Average Metric: 54 / 67 (80.6%)





Unnamed: 0,sentence,id,example_polarization,pred_polarization,accuracy_metric
0,لطيفة فنانة كبيرة ومحترمة وهي مدرسة متجددة وراقية وجديرة التعليم,arb_7c368a050fe0fa3f2e3ce9381b044a92,no polarization,no polarization,✔️ [1]
1,بعد بيتي يا الكدعان ، اوف اوف لو مكسرينها ، ما قصرتو اسباع \nالعراقي,arb_3807da9263dd05b8d446e28dfad9527f,polarization,no polarization,✔️ [0]
2,ما افتهمت الكويت تقول كاظم الساهر انطلق من الكويت واحد يقول انطلق ...,arb_e3ff5e7a145f2f1af59fe95994a3ecf7,no polarization,no polarization,✔️ [1]
3,من البيضة مهيوب الحر … \nمن لعب لاندية عالمية مثل برايتون و مالمو...,arb_a4c06a913d00fe874c4599d6e77487d3,polarization,no polarization,✔️ [0]
4,الاحتيال المالي يتفاقم.. هل تستطيع البنوك الصمود بمفردها؟,arb_07608e61292eead7cb1d68bd0e593c97,no polarization,no polarization,✔️ [1]
...,...,...,...,...,...
62,صخل حرام اصلا انته عايش اغبرررر اوووف شنو هالنماذج,arb_4ed42d3e85c7e08cfb5bb38fe9612200,polarization,no polarization,✔️ [0]
63,ليش لما كانو يقولو هالمقولة كانو يشاركو بكأس العالم ؟\n\n,arb_5fe0ce58ce66430db91c1ff1b39060a9,no polarization,no polarization,✔️ [1]
64,تحقيق بي بي سي: الإمارات ضالعة في ضربة قاتلة بطائرة مسيرة في ليبيا,arb_9f45e60c6797ce7b80047813d8137668,no polarization,no polarization,✔️ [1]
65,مية بالمية اختراع الذكورة اللي انزرع شيطان في مخهم ضد المراءة من...,arb_ae8662935b5dcb03e5c77582c92be9f4,polarization,polarization,✔️ [1]


DSPy average accuracy metric: 80.6


In [7]:
def eval_metrics_on_dataset(program, dataset):
    """Run ``program`` on every example and compute classification metrics.

    Args:
        program: callable invoked as ``program(sentence=...)``; the result
            must expose a ``polarization`` label string.
        dataset: iterable of examples with ``sentence`` and ``polarization``
            attributes (i.e. labelled examples).

    Returns:
        Dict with the keys ``f1_macro``, ``accuracy``, ``precision``,
        ``recall``, ``f1_binary`` and ``f1_micro``. Precision, recall and
        ``f1_binary`` use binary averaging over the numeric ids.
    """
    gold_ids = []
    pred_ids = []

    # One program call per example; labels are converted to numeric ids.
    for ex in dataset:
        gold_label = ex.polarization
        pred_label = program(sentence=ex.sentence).polarization
        gold_ids.append(label2id(gold_label))
        pred_ids.append(label2id(pred_label))

    metrics = {}
    metrics['f1_macro'] = f1_score(gold_ids, pred_ids, average='macro')
    metrics['accuracy'] = accuracy_score(gold_ids, pred_ids)
    metrics['precision'] = precision_score(gold_ids, pred_ids, average='binary')
    metrics['recall'] = recall_score(gold_ids, pred_ids, average='binary')
    metrics['f1_binary'] = f1_score(gold_ids, pred_ids, average='binary')
    metrics['f1_micro'] = f1_score(gold_ids, pred_ids, average='micro')
    return metrics

# Full metric set for the baseline predictor on the validation examples.
metrics_val = eval_metrics_on_dataset(classify, raw_val)
print("Validation metrics:", metrics_val)

# Left disabled: raw_test is built with include_label=False, so its examples
# carry no `polarization` field and eval_metrics_on_dataset cannot score them.
# metrics_test = eval_metrics_on_dataset(classify, raw_test)
# print("Test metrics:", metrics_test)

Validation metrics: {'f1_macro': 0.8044015270604087, 'accuracy': 0.8059701492537313, 'precision': 0.8571428571428571, 'recall': 0.7272727272727273, 'f1_binary': 0.7868852459016393, 'f1_micro': 0.8059701492537313}


In [8]:
def predict_dataset(program, dataset):
    """Predict a numeric polarization label for every example.

    Args:
        program: callable invoked as ``program(sentence=...)``; the result
            must expose a ``polarization`` label string.
        dataset: iterable of examples exposing ``sentence`` and ``id``.

    Returns:
        DataFrame with one row per example and the columns ``id`` and
        ``polarization`` (numeric id from ``label2id``).
    """
    records = []
    for ex in dataset:
        predicted_label = program(sentence=ex.sentence).polarization
        record = {
            "id": ex.id,
            "polarization": label2id(predicted_label),
        }
        records.append(record)
    return pd.DataFrame(records)


# Baseline predictions: run the un-optimized `classify` predictor on the
# (sub-sampled) test examples. Swap in another program here once one exists,
# e.g. the `optimized_prog` produced by the MIPROv2 cell.
test_preds_df = predict_dataset(classify, raw_test)

# Quick look at the first rows of the prediction frame.
print(test_preds_df.head())


                                     id  polarization
0  arb_67be47e5216d7bee41e17484e619f4e6             0
1  arb_272322e5b265e177613d685e5619e402             0
2  arb_d1ec38dd0ec5d7a4fe28ef8317fc96c1             1
3  arb_fad75310b17c124d98ebc514189ec033             1
4  arb_95caf70cec5bf00c94c35cf7af2a0ab5             1


OSError: Cannot save file into a non-existent directory: 'subtask1_arb_dspy_openai'

In [11]:
# Write the baseline predictions. Only the part of the model name after the
# provider prefix goes into the file name, so the "/" cannot be read as a directory.
# Single quotes are used inside the f-string: reusing the outer double quote
# there is only valid syntax on Python 3.12+.
test_preds_df.to_csv(f"subtask1_{lang}_dspy_{student_lm_name.split('/')[-1]}.csv", index=False)

# Using MIPROv2

In [12]:
from dspy.teleprompt import MIPROv2

# Configure the MIPROv2 optimizer with the exact-match metric. With
# auto="light" the run log below reports 10 trials, 6 few-shot candidate sets
# and 3 instruction candidates.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",
    # The teacher LM is passed through teacher_settings.
    teacher_settings=dict(lm=teacher_lm),
    # NOTE(review): prompt_model is set to the *student* LM, not the teacher.
    # Confirm this is intended.
    prompt_model=student_lm
)

# Optimize a fresh predictor for the Polarization signature. raw_val is the
# set MIPROv2 scores candidates on (the log shows "valset size: 67").
optimized_prog = mipro.compile(
    student=dspy.Predict(Polarization),
    trainset=raw_train,
    valset=raw_val,
)

# After optimization, compute the full metrics dict.
# Caveat: these numbers are measured on raw_val, the same examples used above
# to pick the best candidate, so they are likely optimistic.
metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)

print("Optimized validation metrics:", metrics_val_opt)

2025/12/04 12:14:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 67

2025/12/04 12:14:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/04 12:14:27 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/04 12:14:27 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  1%|▏         | 4/270 [00:16<18:24,  4.15s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/6


  1%|          | 2/270 [00:12<27:33,  6.17s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/6


  0%|          | 1/270 [00:07<34:58,  7.80s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/6


  0%|          | 1/270 [00:03<14:43,  3.28s/it]
2025/12/04 12:15:07 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/04 12:15:07 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/12/04 12:18:59 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/04 12:21:19 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/04 12:21:19 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
Given this sentence, classify it as containing polarization or not.

2025/12/04 12:21:19 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a safety-conscious Arabic text moderation assistant. You will receive one Arabic sentence (dialectal, with informal punctuation, emojis, line breaks, elongations, and Eastern Arabic numerals). Determine whether the sentence contains polarization content (divisive or polarizin

Average Metric: 54.00 / 67 (80.6%): 100%|██████████| 67/67 [00:00<00:00, 1776.11it/s]

2025/12/04 12:21:19 INFO dspy.evaluate.evaluate: Average Metric: 54 / 67 (80.6%)
2025/12/04 12:21:19 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 80.6

2025/12/04 12:21:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:34<00:00,  1.02it/s]

2025/12/04 12:21:54 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/04 12:21:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/04 12:21:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0]
2025/12/04 12:21:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6]
2025/12/04 12:21:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.6


2025/12/04 12:21:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [01:09<00:00,  1.99s/it]

2025/12/04 12:23:03 INFO dspy.evaluate.evaluate: Average Metric: 26 / 35 (74.3%)
2025/12/04 12:23:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/04 12:23:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29]
2025/12/04 12:23:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6]
2025/12/04 12:23:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.6


2025/12/04 12:23:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:24<00:00,  1.43it/s]

2025/12/04 12:23:28 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/04 12:23:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/04 12:23:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14]
2025/12/04 12:23:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6]
2025/12/04 12:23:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.6


2025/12/04 12:23:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:41<00:00,  1.19s/it]

2025/12/04 12:24:10 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/04 12:24:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/04 12:24:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14]
2025/12/04 12:24:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6]
2025/12/04 12:24:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.6


2025/12/04 12:24:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==



Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:28<00:00,  1.22it/s]

2025/12/04 12:24:38 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)
2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57]
2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6]
2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 80.6


2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/04 12:24:38 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 88.57) from minibatch trials...



Average Metric: 57.00 / 67 (85.1%): 100%|██████████| 67/67 [00:35<00:00,  1.91it/s]  

2025/12/04 12:25:14 INFO dspy.evaluate.evaluate: Average Metric: 57 / 67 (85.1%)
2025/12/04 12:25:14 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 85.07
2025/12/04 12:25:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:25:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07
2025/12/04 12:25:14 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/04 12:25:14 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:28<00:00,  1.22it/s]

2025/12/04 12:25:42 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/04 12:25:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/04 12:25:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57, 80.0]
2025/12/04 12:25:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:25:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07


2025/12/04 12:25:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [00:28<00:00,  1.23it/s]

2025/12/04 12:26:11 INFO dspy.evaluate.evaluate: Average Metric: 26 / 35 (74.3%)
2025/12/04 12:26:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/04 12:26:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57, 80.0, 74.29]
2025/12/04 12:26:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:26:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07


2025/12/04 12:26:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:27<00:00,  1.29it/s]

2025/12/04 12:26:38 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57, 80.0, 74.29, 80.0]
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07


2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:00<00:00, 2206.83it/s]

2025/12/04 12:26:38 INFO dspy.evaluate.evaluate: Average Metric: 27 / 35 (77.1%)
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57, 80.0, 74.29, 80.0, 77.14]
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07


2025/12/04 12:26:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): 100%|██████████| 35/35 [00:36<00:00,  1.05s/it]

2025/12/04 12:27:14 INFO dspy.evaluate.evaluate: Average Metric: 24 / 35 (68.6%)
2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [80.0, 74.29, 77.14, 77.14, 88.57, 80.0, 74.29, 80.0, 77.14, 68.57]
2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07]
2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07


2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/04 12:27:14 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...



Average Metric: 52.00 / 67 (77.6%): 100%|██████████| 67/67 [00:30<00:00,  2.16it/s]

2025/12/04 12:27:45 INFO dspy.evaluate.evaluate: Average Metric: 52 / 67 (77.6%)
2025/12/04 12:27:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [80.6, 85.07, 77.61]
2025/12/04 12:27:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 85.07
2025/12/04 12:27:45 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/04 12:27:45 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 85.07!



Optimized validation metrics: {'f1_macro': 0.8499103942652331, 'accuracy': 0.8507462686567164, 'precision': 0.896551724137931, 'recall': 0.7878787878787878, 'f1_binary': 0.8387096774193549, 'f1_micro': 0.8507462686567164}


In [20]:
# Ids of the test rows, in test_df row order.
ids = test_df["id"]
def predict_dataset(program, dataset, ids=None):
    """Predict a numeric polarization label for every example.

    This definition replaces the earlier ``predict_dataset``. Ids are taken
    by position from ``ids`` rather than from the examples themselves.

    Args:
        program: callable invoked as ``program(sentence=...)``; the result
            must expose a ``polarization`` label string.
        dataset: iterable of examples exposing ``sentence``.
        ids: optional sequence of ids aligned by position with ``dataset``.
            Defaults to ``test_df["id"]``, which is only correct when
            ``dataset`` is the leading slice of ``test_df`` in row order
            (as ``raw_test`` is).

    Returns:
        DataFrame with one row per example and the columns ``id`` and
        ``polarization`` (numeric id from ``label2id``).

    Raises:
        IndexError: if ``dataset`` has more examples than there are ids.
    """
    if ids is None:
        # Backward-compatible default: positional lookup in the test frame.
        ids = test_df["id"]
    # Materialize once instead of building a row Series for every example.
    id_list = list(ids)

    rows = []
    for i, ex in enumerate(dataset):
        if i >= len(id_list):
            raise IndexError(
                f"dataset has more examples than ids were provided ({len(id_list)})"
            )
        out = program(sentence=ex.sentence)
        rows.append({
            "id": id_list[i],
            "polarization": label2id(out.polarization),
        })
    return pd.DataFrame(rows)


# Predictions from the MIPROv2-optimized program on the (sub-sampled) test examples.
test_preds_df = predict_dataset(optimized_prog, raw_test)

print(test_preds_df.head())
# Save under a file name that records both model names (the part after the
# provider prefix of each identifier).
test_preds_df.to_csv(f"subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}.csv", index=False)

                                     id  polarization
0  arb_67be47e5216d7bee41e17484e619f4e6             1
1  arb_272322e5b265e177613d685e5619e402             0
2  arb_d1ec38dd0ec5d7a4fe28ef8317fc96c1             1
3  arb_fad75310b17c124d98ebc514189ec033             1
4  arb_95caf70cec5bf00c94c35cf7af2a0ab5             1


In [None]:
# Save the optimized program for future use. The path is computed once and
# reused, and records the language and both model names.
save_path = f"optimized_subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}.json"
optimized_prog.save(save_path)

# Fresh predictor that is meant to receive the saved state.
# NOTE(review): no `.load(save_path)` call is visible in this part of the
# notebook; until one runs, this is just an un-optimized predictor. Confirm.
optimized_prog_loaded = dspy.Predict(Polarization)