In [1]:
import dspy
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from tqdm.auto import tqdm

# Model identifiers for the two roles; both currently name the same model.
student_lm_name = "fireworks_ai/accounts/fireworks/models/glm-4p6"
teacher_lm_name = "fireworks_ai/accounts/fireworks/models/glm-4p6"

# One LM client per role (student first, then teacher).
student_lm, teacher_lm = dspy.LM(student_lm_name), dspy.LM(teacher_lm_name)

# The student LM is the default for every DSPy call that does not override it.
dspy.configure(lm=student_lm)

# Prepare DSPy Dataset

In [2]:
# --- Train / validation / test data for subtask 1 ---
# Language code used to pick the CSV files.
lang = "eng"

# The "train" CSV supplies train + validation rows; the "dev" CSV is used as the test set.
train_df = pd.read_csv(f'./dev_phase/subtask1/train/{lang}.csv')
test_df = pd.read_csv(f'./dev_phase/subtask1/dev/{lang}.csv')

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Numeric polarization label -> string label used inside the DSPy examples
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

def make_dspy_examples(df, include_label: bool = True):
    """Turn every row of ``df`` into a ``dspy.Example`` with ``sentence`` as its only input.

    Args:
        df: DataFrame with a ``text`` column and, optionally, a numeric
            ``polarization`` column.
        include_label: When True and the row has a ``polarization`` entry, the
            example also carries the string label looked up in ``POLARIZATION_MAP``.

    Returns:
        A list of ``dspy.Example`` objects, one per row, in row order.

    Raises:
        KeyError: If a label is requested and its value is not a key of
            ``POLARIZATION_MAP``.
    """
    def _row_to_example(row):
        fields = {"sentence": row["text"]}
        # `in` on a row checks its column labels, not its values.
        if include_label and "polarization" in row:
            fields["polarization"] = POLARIZATION_MAP[row["polarization"]]
        return dspy.Example(**fields).with_inputs("sentence")

    return [_row_to_example(row) for _, row in df.iterrows()]

# Build the DSPy datasets, then keep only a leading slice of each split:
# the first 20% of train, the first 30% of validation, and all of test.
raw_train = make_dspy_examples(train_df, include_label=True)
raw_train = raw_train[: int(len(raw_train) * 0.2)]

raw_val = make_dspy_examples(val_df, include_label=True)
raw_val = raw_val[: int(len(raw_val) * 0.3)]

# Test examples are built without labels; the slice keeps every element.
raw_test = make_dspy_examples(test_df, include_label=False)
raw_test = raw_test[: int(len(raw_test) * 1)]

# Define Signature (Subtask 1)

In [4]:
from typing import Literal

# DSPy signature for subtask 1: one input sentence -> one binary polarization label.
# The class docstring is used as the task instruction sent to the LM, so treat it
# as prompt text: any edit to it changes model behaviour.
class Polarization(dspy.Signature):
    """
    Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
    Given this sentence, classify it as containing polarization or not."""

    # Text to classify.
    sentence: str = dspy.InputField()
    # Predicted label, restricted by the Literal annotation to the two strings that
    # are also the values of POLARIZATION_MAP.
    # NOTE(review): `choices` repeats the Literal options; confirm that
    # dspy.OutputField actually consumes this keyword.
    polarization: Literal["polarization", "no polarization"] = dspy.OutputField(
        desc='Return "polarization" or "no polarization".',
        choices=["polarization", "no polarization"],
    )
# Baseline (un-optimized) predictor built directly from the signature.
classify = dspy.Predict(Polarization)

# Evaluation

In [5]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Numeric label -> string label (same mapping as in the dataset-preparation cell).
POLARIZATION_MAP = {1: "polarization", 0: "no polarization"}

# Inverse mapping, built once here instead of on every label2id() call.
# Rebuild it if POLARIZATION_MAP is ever changed after this cell has run.
_LABEL_TO_ID = {label: idx for idx, label in POLARIZATION_MAP.items()}

def label2id(label: str) -> int:
    """Return the numeric id (0 or 1) of a string polarization label.

    Raises:
        KeyError: If ``label`` is not one of the values of ``POLARIZATION_MAP``.
    """
    return _LABEL_TO_ID[label]

def id2label(i: int) -> str:
    """Return the string polarization label of a numeric id.

    Raises:
        KeyError: If ``i`` is not a key of ``POLARIZATION_MAP``.
    """
    return POLARIZATION_MAP[i]

def accuracy_metric(example, pred, trace=None):
    """Exact-match metric for DSPy evaluation and optimization.

    Args:
        example: Gold example; its ``polarization`` attribute is the reference label.
        pred: Program output; its ``polarization`` attribute is the predicted label.
        trace: Accepted for compatibility with the DSPy metric signature; unused.

    Returns:
        1 when the predicted label equals the gold label, otherwise 0.
    """
    expected = example.polarization
    predicted = pred.polarization
    return 1 if expected == predicted else 0

In [10]:
# Score the un-optimized `classify` predictor on the validation subset.
evaluate = dspy.Evaluate(
    metric=accuracy_metric,
    devset=raw_val,
    display_table=True,     # show a per-example results table
    display_progress=True,  # show a progress bar while evaluating
)
eval_result = evaluate(classify)
# `score` is reported on a 0-100 scale.
print("DSPy average accuracy metric:", eval_result.score)

Average Metric: 62.00 / 73 (84.9%):  36%|███▌      | 73/202 [05:01<07:37,  3.55s/it] 



Average Metric: 165.00 / 202 (81.7%): 100%|██████████| 202/202 [12:46<00:00,  3.79s/it]

2025/12/04 17:45:07 INFO dspy.evaluate.evaluate: Average Metric: 165 / 202 (81.7%)





Unnamed: 0,sentence,example_polarization,pred_polarization,accuracy_metric
0,لطيفة فنانة كبيرة ومحترمة وهي مدرسة متجددة وراقية وجديرة التعليم,no polarization,no polarization,✔️ [1]
1,بعد بيتي يا الكدعان ، اوف اوف لو مكسرينها ، ما قصرتو اسباع \nالعراقي,polarization,polarization,✔️ [1]
2,ما افتهمت الكويت تقول كاظم الساهر انطلق من الكويت واحد يقول انطلق ...,no polarization,no polarization,✔️ [1]
3,من البيضة مهيوب الحر … \nمن لعب لاندية عالمية مثل برايتون و مالمو...,polarization,,✔️ [0]
4,الاحتيال المالي يتفاقم.. هل تستطيع البنوك الصمود بمفردها؟,no polarization,no polarization,✔️ [1]
...,...,...,...,...
197,علم الطاقة \nالابراج \nالتاروت\nالاستبصار\nحرااااااااااام وشرك ولا...,polarization,polarization,✔️ [1]
198,فيديو .. وليد الفراج يعلق على خلع حسن معاذ للشورت شاهد على :,no polarization,no polarization,✔️ [1]
199,الجيش الإسرائيلي يصدر بيانا حول جثث الرهائن الأربعة,no polarization,no polarization,✔️ [1]
200,لطيفة اكبر منافقة تغني لمصر وتتودد للمصريين ايه النفاق ده يا لطيفة,polarization,polarization,✔️ [1]


DSPy average accuracy metric: 81.68


In [6]:
def eval_metrics_on_dataset(program, dataset):
    """Run ``program`` on every example of ``dataset`` and compute classification metrics.

    Predictions that cannot be mapped to a label id (``None``, an empty string,
    or any text other than the two known labels) are counted as
    ``"no polarization"``, so one malformed LM answer cannot abort the whole
    evaluation. String predictions are stripped and lower-cased before lookup.

    Args:
        program: Callable taking ``sentence=...`` and returning an object with a
            ``polarization`` attribute.
        dataset: Iterable of examples exposing ``sentence`` and a gold
            ``polarization`` string label.

    Returns:
        dict with the keys ``f1_macro``, ``accuracy``, ``precision``, ``recall``,
        ``f1_binary`` and ``f1_micro``.

    Raises:
        KeyError: If a *gold* label is not a known label (gold labels are never
            replaced by the fallback).
    """
    fallback_id = label2id("no polarization")
    y_true, y_pred = [], []

    for ex in dataset:
        gold = ex.polarization
        pred = program(sentence=ex.sentence).polarization
        if isinstance(pred, str):
            pred = pred.strip().lower()
        try:
            pred_id = label2id(pred)
        except (KeyError, TypeError):
            # Missing or off-schema prediction: fall back to the negative class.
            pred_id = fallback_id
        y_true.append(label2id(gold))
        y_pred.append(pred_id)

    return {
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='binary'),
        'recall': recall_score(y_true, y_pred, average='binary'),
        'f1_binary': f1_score(y_true, y_pred, average='binary'),
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
    }

# metrics_val = eval_metrics_on_dataset(classify, raw_val)
# print("Validation metrics:", metrics_val)

# metrics_test = eval_metrics_on_dataset(classify, raw_test)
# print("Test metrics:", metrics_test)

In [7]:
ids = test_df["id"]
def predict_dataset(program, dataset, ids=None):
    """Predict a polarization id for every example and pair it with a row id.

    Predictions that cannot be mapped to a label id (``None``, an empty string,
    or any text other than the two known labels) are written as
    ``"no polarization"`` instead of raising, so one malformed LM answer does
    not discard all predictions collected so far. String predictions are
    stripped and lower-cased before lookup.

    Args:
        program: Callable taking ``sentence=...`` and returning an object with a
            ``polarization`` attribute.
        dataset: Sized sequence of examples exposing ``sentence``; it is matched
            to ``ids`` by position.
        ids: Optional sequence of row ids aligned with ``dataset``. Defaults to
            ``test_df["id"]``, which is what this function always used before.

    Returns:
        DataFrame with the columns ``id`` and ``polarization`` (0 or 1).

    Raises:
        IndexError: If ``dataset`` has more examples than there are ids.
    """
    id_values = list(test_df["id"] if ids is None else ids)
    # Fail before spending any LM calls if the ids cannot cover the dataset.
    if len(dataset) > len(id_values):
        raise IndexError(
            f"dataset has {len(dataset)} examples but only {len(id_values)} ids are available"
        )

    fallback_id = label2id("no polarization")
    rows = []
    for i, ex in tqdm(enumerate(dataset), total=len(dataset)):
        label = program(sentence=ex.sentence).polarization
        if isinstance(label, str):
            label = label.strip().lower()
        try:
            label_id = label2id(label)
        except (KeyError, TypeError):
            # Missing or off-schema prediction: fall back to the negative class.
            label_id = fallback_id
        rows.append({
            "id": id_values[i],
            "polarization": label_id,
        })
    return pd.DataFrame(rows)



# Predict the test set with the chosen program; here that is the baseline `classify`
# predictor (swap in another program, e.g. an optimized one, if preferred).
test_preds_df = predict_dataset(classify, raw_test)

# Quick look at the first few predictions.
print(test_preds_df.head())

  0%|          | 0/169 [00:00<?, ?it/s]

                                     id  polarization
0  arb_67be47e5216d7bee41e17484e619f4e6             1
1  arb_272322e5b265e177613d685e5619e402             1
2  arb_d1ec38dd0ec5d7a4fe28ef8317fc96c1             0
3  arb_fad75310b17c124d98ebc514189ec033             1
4  arb_95caf70cec5bf00c94c35cf7af2a0ab5             1


In [8]:
# Single quotes inside the f-string: reusing the enclosing double quotes is only
# valid from Python 3.12 on and is a SyntaxError on earlier versions.
test_preds_df.to_csv(f"subtask1_{lang}_dspy_{student_lm_name.split('/')[-1]}.csv", index=False)

# Using MIPROv2

In [7]:
from dspy.teleprompt import MIPROv2

# Optimizer configuration: exact-match accuracy as the objective, the "light" auto
# budget, the teacher LM for the teacher settings and the student LM as prompt model.
mipro = MIPROv2(
    metric=accuracy_metric,
    auto="light",
    prompt_model=student_lm,
    teacher_settings={"lm": teacher_lm},
)

# Optimize a fresh predictor for the Polarization signature on the train/val subsets.
optimized_prog = mipro.compile(
    student=dspy.Predict(Polarization),
    trainset=raw_train,
    valset=raw_val,
    requires_permission_to_run=False,
)

# Full metrics for the optimized program. The cell that defines
# eval_metrics_on_dataset must have been run in this kernel first,
# otherwise this line raises NameError.
metrics_val_opt = eval_metrics_on_dataset(optimized_prog, raw_val)

print("Optimized validation metrics:", metrics_val_opt)

2025/12/05 11:24:26 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/12/05 11:24:26 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/05 11:24:26 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/05 11:24:26 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  1%|          | 5/428 [00:34<48:09,  6.83s/it]  


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/6


  0%|          | 1/428 [00:11<1:19:45, 11.21s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/6


  1%|          | 4/428 [00:29<52:43,  7.46s/it]  


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/6


  0%|          | 1/428 [00:01<08:40,  1.22s/it]
2025/12/05 11:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/05 11:25:43 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/12/05 11:26:33 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/12/05 11:28:53 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/05 11:28:53 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Polarization denotes stereotyping, vilification, dehumanization, deindividuation, or intolerance of other people’s views, beliefs, and identities. In this study, speeches and articles that are shared on social media that incite division, groupism, hatred, conflict, and intolerance are classified as containing polarization.
Given this sentence, classify it as containing polarization or not.

2025/12/05 11:28:53 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Your task is to classify a given sentence as either 'polarization' or 'no polarization'. A sentence is considered polarizing if it uses language to intentionally create, reinforce, or exacerbate divisions between different social or political groups.

Identify polarizing language by 

Average Metric: 84.00 / 100 (84.0%): 100%|██████████| 100/100 [02:34<00:00,  1.55s/it]

2025/12/05 11:31:28 INFO dspy.evaluate.evaluate: Average Metric: 84 / 100 (84.0%)
2025/12/05 11:31:28 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 84.0

2025/12/05 11:31:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==



Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:30<00:00,  1.14it/s]

2025/12/05 11:31:59 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/12/05 11:31:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2025/12/05 11:31:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71]
2025/12/05 11:31:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0]
2025/12/05 11:31:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 84.0


2025/12/05 11:31:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==



Average Metric: 28.00 / 35 (80.0%): 100%|██████████| 35/35 [00:44<00:00,  1.27s/it]

2025/12/05 11:32:43 INFO dspy.evaluate.evaluate: Average Metric: 28 / 35 (80.0%)
2025/12/05 11:32:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/05 11:32:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0]
2025/12/05 11:32:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0]
2025/12/05 11:32:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 84.0


2025/12/05 11:32:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==



Average Metric: 29.00 / 35 (82.9%): 100%|██████████| 35/35 [00:28<00:00,  1.23it/s]

2025/12/05 11:33:11 INFO dspy.evaluate.evaluate: Average Metric: 29 / 35 (82.9%)
2025/12/05 11:33:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/12/05 11:33:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86]
2025/12/05 11:33:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0]
2025/12/05 11:33:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 84.0


2025/12/05 11:33:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==



Average Metric: 32.00 / 35 (91.4%): 100%|██████████| 35/35 [00:30<00:00,  1.14it/s]

2025/12/05 11:33:42 INFO dspy.evaluate.evaluate: Average Metric: 32 / 35 (91.4%)
2025/12/05 11:33:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/05 11:33:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43]
2025/12/05 11:33:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0]
2025/12/05 11:33:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 84.0


2025/12/05 11:33:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==



Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:28<00:00,  1.24it/s]

2025/12/05 11:34:10 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71]
2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0]
2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 84.0


2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/12/05 11:34:10 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 91.43) from minibatch trials...



Average Metric: 86.00 / 100 (86.0%): 100%|██████████| 100/100 [01:00<00:00,  1.66it/s]

2025/12/05 11:35:11 INFO dspy.evaluate.evaluate: Average Metric: 86 / 100 (86.0%)
2025/12/05 11:35:11 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 86.0
2025/12/05 11:35:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:35:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0
2025/12/05 11:35:11 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/05 11:35:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==



Average Metric: 29.00 / 35 (82.9%): 100%|██████████| 35/35 [00:30<00:00,  1.13it/s]

2025/12/05 11:35:41 INFO dspy.evaluate.evaluate: Average Metric: 29 / 35 (82.9%)
2025/12/05 11:35:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/12/05 11:35:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71, 82.86]
2025/12/05 11:35:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:35:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/05 11:35:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==



Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:23<00:00,  1.52it/s] 

2025/12/05 11:36:04 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/12/05 11:36:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/12/05 11:36:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71, 82.86, 85.71]
2025/12/05 11:36:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:36:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/05 11:36:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [00:25<00:00,  1.39it/s]

2025/12/05 11:36:30 INFO dspy.evaluate.evaluate: Average Metric: 26 / 35 (74.3%)
2025/12/05 11:36:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/12/05 11:36:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71, 82.86, 85.71, 74.29]
2025/12/05 11:36:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:36:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/05 11:36:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==



Average Metric: 29.00 / 35 (82.9%): 100%|██████████| 35/35 [00:29<00:00,  1.19it/s] 

2025/12/05 11:36:59 INFO dspy.evaluate.evaluate: Average Metric: 29 / 35 (82.9%)
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 2'].
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71, 82.86, 85.71, 74.29, 82.86]
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==



Average Metric: 29.00 / 35 (82.9%): 100%|██████████| 35/35 [00:00<00:00, 3912.81it/s]

2025/12/05 11:36:59 INFO dspy.evaluate.evaluate: Average Metric: 29 / 35 (82.9%)
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [85.71, 80.0, 82.86, 91.43, 85.71, 82.86, 85.71, 74.29, 82.86, 82.86]
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0]
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0


2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/12/05 11:36:59 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 85.71) from minibatch trials...



Average Metric: 83.00 / 100 (83.0%): 100%|██████████| 100/100 [01:02<00:00,  1.59it/s]

2025/12/05 11:38:02 INFO dspy.evaluate.evaluate: Average Metric: 83 / 100 (83.0%)
2025/12/05 11:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [84.0, 86.0, 83.0]
2025/12/05 11:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 86.0
2025/12/05 11:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/05 11:38:02 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 86.0!





NameError: name 'eval_metrics_on_dataset' is not defined

In [8]:
# Show the 3 most recent LM interactions (prompt and response) to inspect the prompt in use.
dspy.inspect_history(n=3)





[34m[2025-12-05T11:37:56.280923][0m

[31mSystem message:[0m

Your input fields are:
1. `sentence` (str):
Your output fields are:
1. `polarization` (Literal['polarization', 'no polarization']): Return "polarization" or "no polarization".
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## sentence ## ]]
{sentence}

[[ ## polarization ## ]]
{polarization}        # note: the value you produce must exactly match (no extra characters) one of: polarization; no polarization

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Your task is to classify a given sentence as either 'polarization' or 'no polarization'. A sentence is considered polarizing if it uses language to intentionally create, reinforce, or exacerbate divisions between different social or political groups.
        
        Identify polarizing language by looking for these key indicators:
        *   **Loaded or Derogatory Labels:** Using c

In [9]:
ids = test_df["id"]
def predict_dataset(program, dataset, ids=None):
    """Predict a polarization id for every example and pair it with a row id.

    This cell repeats the definition so that it can be run on its own in a
    fresh kernel. Predictions that cannot be mapped to a label id (``None``,
    an empty string, or any text other than the two known labels) are written
    as ``"no polarization"`` instead of raising, so one malformed LM answer
    does not discard all predictions collected so far. String predictions are
    stripped and lower-cased before lookup.

    Args:
        program: Callable taking ``sentence=...`` and returning an object with a
            ``polarization`` attribute.
        dataset: Sized sequence of examples exposing ``sentence``; it is matched
            to ``ids`` by position.
        ids: Optional sequence of row ids aligned with ``dataset``. Defaults to
            ``test_df["id"]``, which is what this function always used before.

    Returns:
        DataFrame with the columns ``id`` and ``polarization`` (0 or 1).

    Raises:
        IndexError: If ``dataset`` has more examples than there are ids.
    """
    id_values = list(test_df["id"] if ids is None else ids)
    # Fail before spending any LM calls if the ids cannot cover the dataset.
    if len(dataset) > len(id_values):
        raise IndexError(
            f"dataset has {len(dataset)} examples but only {len(id_values)} ids are available"
        )

    fallback_id = label2id("no polarization")
    rows = []
    for i, ex in tqdm(enumerate(dataset), total=len(dataset)):
        label = program(sentence=ex.sentence).polarization
        if isinstance(label, str):
            label = label.strip().lower()
        try:
            label_id = label2id(label)
        except (KeyError, TypeError):
            # Missing or off-schema prediction: fall back to the negative class.
            label_id = fallback_id
        rows.append({
            "id": id_values[i],
            "polarization": label_id,
        })
    return pd.DataFrame(rows)


# Predict the test set with the MIPROv2-optimized program.
test_preds_df = predict_dataset(optimized_prog, raw_test)

# Quick look at the first few predictions, then write them to a CSV whose name
# records the language and the last path component of the student and teacher model names.
print(test_preds_df.head())
test_preds_df.to_csv(f"subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}.csv", index=False)

  0%|          | 0/133 [00:00<?, ?it/s]

                                     id  polarization
0  eng_f66ca14d60851371f9720aaf4ccd9b58             0
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb             0
2  eng_95770ff547ea5e48b0be00f385986483             0
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf             1
4  eng_07781aa88e61e7c0a996abd1e5ea3a20             0


In [10]:
# File that stores the optimized program; the name records the language and the
# last path component of the student and teacher model names.
save_path = f"optimized_subtask1_{lang}_dspy_miprov2_student{student_lm_name.split('/')[-1]}_teacher{teacher_lm_name.split('/')[-1]}.json"

# Save the optimized program for future use.
optimized_prog.save(save_path)

# Fresh predictor with the same signature, meant to receive the saved state.
# NOTE(review): no load call is visible in this part of the notebook; confirm
# that `optimized_prog_loaded.load(save_path)` follows.
optimized_prog_loaded = dspy.Predict(Polarization)