In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from statsmodels.stats.contingency_tables import mcnemar

In [3]:
from scipy import stats
from itertools import combinations

In [4]:
import openai
from openai import OpenAI

In [5]:
import os

In [6]:
import re
import torch
import sklearn
from sklearn.model_selection import train_test_split

In [7]:
df_anno = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [8]:
df_anno['Fokalisierung'] = df_anno['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [9]:
df_anno.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar
0,Schiller,Der Vebrecher aus verlorener Ehre,In der ganzen Geschichte des Menschen ist kein...,zero,
1,Schiller,Der Vebrecher aus verlorener Ehre,Es ist etwas so Einförmiges und doch wieder so...,zero,
2,Schiller,Der Vebrecher aus verlorener Ehre,"Ich weiß, daß von den besten Geschichtschreibe...",internal,
3,Schiller,Der Vebrecher aus verlorener Ehre,"Der Held muß kalt werden wie der Leser, oder, ...",zero,
4,Tieck,Die beiden merkwürdigsten Tage aus Siegmunds L...,"Es war schon gegen Abend, als ein Wagen vor de...",internal,


In [10]:
api_key = os.getenv('MY_OPENAI')

In [11]:
def get_completion(prompt):  
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        messages=[{
        "role": "user",
        "content": prompt,
        }],
        model="o3-mini-2025-01-31",
    )
    return response.choices[0].message.content

In [12]:
prompt_basic = """
### Instruction
Your task is to classify the focalization of the following sentence

###
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''''
Label:
"""

In [13]:
prompt_labels = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal
- external
- zero

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [14]:
prompt_redefin = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text  
Sentence: '''{text}''' 
Label: 
"""

In [15]:
prompt_meta = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously. 
These definitions are redefinitions of the standard understanding of focalization.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [16]:
prompt_templates = [prompt_labels, prompt_redefin, prompt_meta]

In [17]:
def evaluate_prompts_and_predictions(df, prompt_templates):
    """
    Evaluiert verschiedene Prompt-Templates und berechnet Metriken für die Vorhersagen.
    Jeder Prompt wird zweimal über die Daten ausgeführt.
    
    Args:
        df: Pandas DataFrame mit den Spalten 'Absatz' und 'Fokalisierung'
        prompt_templates: Liste der Prompt-Templates
    
    Returns:
        Tuple mit:
            - df: DataFrame mit den gespeicherten Vorhersagen
            - results_df: DataFrame mit den Evaluierungsmetriken für jeden Prompt und jeden Run
    """
    results = []

    for run in range(1, 6):
        print(f"Starte Run {run}/5")

        # Iteration über die Prompt-Templates
        for prompt_idx, template in enumerate(prompt_templates):
            print(f"Verarbeite Prompt-Template {prompt_idx + 1}/{len(prompt_templates)} - Run {run}")
            
            # Spaltenname für Vorhersage definieren
            prediction_col = f'Prediction_{prompt_idx}_Run{run}'
            df[prediction_col] = None
            
            # Iteration über die Zeilen des DataFrames
            for idx, row in df.iterrows():
                prompt = template.format(text=row['Absatz'])
                prediction = get_completion(prompt)
                first_word = prediction.split()[0].lower()
                print(first_word)
                df.at[idx, prediction_col] = first_word
            
            # Metriken berechnen
            metrics = {
                'Prompt': f'Template_{prompt_idx}',
                'Run': run,
                'F1-Score': f1_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Recall': recall_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Precision': precision_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Accuracy': accuracy_score(df['Fokalisierung'], df[prediction_col])
            }
            
            results.append(metrics)
    
    # Ergebnisse in DataFrame umwandeln
    results_df = pd.DataFrame(results)
    return df, results_df

In [18]:
results_03, test = evaluate_prompts_and_predictions(df_anno, prompt_templates)

Starte Run 1/5
Verarbeite Prompt-Template 1/3 - Run 1
zero
zero
internal
internal
zero
external
external
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
external
external
zero
internal
internal
internal
internal
zero
internal
external
internal
external
external
internal
zero
internal
internal
internal
internal
external
zero
external
external
internal
internal
internal
internal
external
internal
internal
internal
external
external
external
external
internal
internal
internal
internal
external
external
external
zero
internal
external
external
zero
external
internal
external
external
zero
external
internal
zero
internal
external
zero
internal
internal
internal
internal
internal
external
external
external
external
external
external
external
external
external
external
external
external
external
external
external
internal
external
internal
external
zero
external
external
external
internal
zero
zero
Verarbeite Prompt-Template 2/3 - Run 1
zero
zero
interna

In [19]:
results_03.to_json("DH_03_results.json", orient="records", indent=4, force_ascii=False)

In [20]:
test

Unnamed: 0,Prompt,Run,F1-Score,Recall,Precision,Accuracy
0,Template_0,1,0.576829,0.566038,0.6413,0.566038
1,Template_1,1,0.716632,0.726415,0.740253,0.726415
2,Template_2,1,0.653581,0.688679,0.694626,0.688679
3,Template_0,2,0.575975,0.556604,0.64521,0.556604
4,Template_1,2,0.676402,0.698113,0.738234,0.698113
5,Template_2,2,0.650274,0.660377,0.667645,0.660377
6,Template_0,3,0.582256,0.575472,0.68066,0.575472
7,Template_1,3,0.625579,0.650943,0.627566,0.650943
8,Template_2,3,0.676144,0.688679,0.690755,0.688679
9,Template_0,4,0.585558,0.566038,0.657878,0.566038


In [21]:
def summarize_prompt_metrics(df):
    summary_rows = []

    # Gruppieren nach Prompt-Template
    grouped = df.groupby("Prompt")

    for prompt, group in grouped:
        f1_median = group["F1-Score"].median()
        f1_min = group["F1-Score"].min()
        f1_max = group["F1-Score"].max()

        precision_mean = group["Precision"].mean()
        precision_std = group["Precision"].std()

        recall_mean = group["Recall"].mean()
        recall_std = group["Recall"].std()

        acc_mean = group["Accuracy"].mean()
        acc_std = group["Accuracy"].std()

        summary_rows.append({
            "Prompt": prompt,
            "F1-Median (Min–Max)": f"{f1_median:.3f} ({f1_min:.3f}–{f1_max:.3f})",
            "Precision (M ± SD)": f"{precision_mean:.3f} ± {precision_std:.3f}",
            "Recall (M ± SD)": f"{recall_mean:.3f} ± {recall_std:.3f}",
            "Accuracy (M ± SD)": f"{acc_mean:.3f} ± {acc_std:.3f}",
        })

    summary_df = pd.DataFrame(summary_rows)
    return summary_df


In [22]:
summary = summarize_prompt_metrics(test)
summary

Unnamed: 0,Prompt,F1-Median (Min–Max),Precision (M ± SD),Recall (M ± SD),Accuracy (M ± SD)
0,Template_0,0.582 (0.576–0.604),0.663 ± 0.021,0.570 ± 0.011,0.570 ± 0.011
1,Template_1,0.676 (0.621–0.717),0.696 ± 0.069,0.689 ± 0.041,0.689 ± 0.041
2,Template_2,0.656 (0.650–0.700),0.689 ± 0.022,0.685 ± 0.022,0.685 ± 0.022


## Optimization – DSPY

In [23]:
import litellm

In [24]:
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
litellm.drop_params = True

# Loading Data

In [26]:
df_anno_train = pd.read_csv('plasticity_focalization_trainset.csv')

In [27]:
df_anno_train['Fokalisierung'] = df_anno_train['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [28]:
df_anno_test = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [29]:
df_anno_test['Fokalisierung'] = df_anno_test['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [30]:
df_anno_train.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
0,Goethe,Die Sängerin Antonelli,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
1,Goethe,Die Sängerin Antonelli,"Eine Sängerin, Antonelli genannt, war zu meine...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
2,Goethe,Die Sängerin Antonelli,Bei ihren bisherigen Verbindungen war ihr Geis...,internal,,https://www.projekt-gutenberg.org/goethe/anton...
3,Goethe,Die Sängerin Antonelli,"Es war ein Genueser, der sich um diese Zeit ei...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
4,Tieck,Das grüne Band,Durch die Thäler und über die Wiesen wandelte ...,external,,https://www.projekt-gutenberg.org/tieck/grueba...


In [31]:
def balanced_sample_by_category(df, category_column, n_per_category=8, random_state=42):
    """
    Gibt ein balanciertes Sample aus dem DataFrame zurück mit n_per_category Einträgen pro Kategorie.

    Args:
        df (pd.DataFrame): Eingabedaten.
        category_column (str): Spaltenname, nach dem kategorisiert werden soll.
        n_per_category (int): Anzahl von Einträgen pro Kategorie.
        random_state (int): Seed für Reproduzierbarkeit.

    Returns:
        pd.DataFrame: Balanciertes Sample.
    """
    # Fehlende Kategorien ausschließen
    df_clean = df.dropna(subset=[category_column])

    # Alle eindeutigen Kategorien abrufen
    categories = df_clean[category_column].unique()

    # Sampling durchführen
    balanced_df = pd.concat([
        df_clean[df_clean[category_column] == cat].sample(
            n=min(n_per_category, len(df_clean[df_clean[category_column] == cat])),
            random_state=random_state
        )
        for cat in categories
    ])

    # Index zurücksetzen
    return balanced_df.reset_index(drop=True)

In [32]:
df_train_balanced = balanced_sample_by_category(df_anno_train, category_column="Fokalisierung", n_per_category=8)

In [33]:
df_train_balanced.describe()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
count,24,24,24,24,0.0,24
unique,14,14,24,3,0.0,14
top,Brentano,Baron Hüpfenstich,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/brentano/hue...
freq,3,3,1,8,,3


# Preparing Dataset

Anleitung zur Erstellung eines Datensets: https://dspy-docs.vercel.app/docs/deep-dive/data-handling/loading-custom-data

In [34]:
from dspy.datasets.dataset import Dataset

In [35]:
train = df_train_balanced[["Absatz", "Fokalisierung"]].copy(deep=True)

In [36]:
train.head()

Unnamed: 0,Absatz,Fokalisierung
0,"Als ich mich in Neapel aufhielt, begegnete das...",internal
1,"Eine große Sorge hatte der gute König jetzt, d...",internal
2,"Es ist doch etwas Schönes, Herrliches, Erhaben...",internal
3,"Eine Sängerin, Antonelli genannt, war zu meine...",internal
4,Einen anderen Weg schlag ich ein; er ist aller...,internal


In [37]:
len(train)

24

In [38]:
class CSVDataset(Dataset):
    def __init__(self, df, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        df=df
        #self._train = df.iloc[0:35].to_dict(orient='records')
        self._dev = df.to_dict(orient='records')

In [39]:
dataset = CSVDataset(train)

In [40]:
len(dataset.dev)

24

# Setting LLM

In [41]:
dspy.settings.configure(
    cache=None
)

In [42]:
gpt_key = os.getenv('MY_OPENAI')

In [43]:
gpt = dspy.LM('o3-mini-2025-01-31', temperature=1.0, max_tokens = 5000, api_key=gpt_key)

In [44]:
dspy.settings.configure(lm=gpt)

# Setting Up Module + checking output

In [45]:
class Determinacy(dspy.Signature):
    """
    Your task is to classify the focalization of the following sentence 
    
    ### Labels
    There are three modes of focalization:
    - internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
    - external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
    - zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
    """
    #context = dspy.InputField(desc="contains annotation guidelines and scoring instructions")
    text_snippet = dspy.InputField(desc="contains a snippet of a narrative text")
    tag = dspy.OutputField(desc="contains only the **label** in lower case")

In [46]:
context = """ 
### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
"""

# Setting Metric

Anleitung zu den Metriken in DSPY: https://dspy-docs.vercel.app/docs/building-blocks/metrics

In [47]:
from dspy.evaluate import Evaluate

In [48]:
def validate_tag(example, pred, trace=None):
    print(example.answer)
    print(pred.tag)
    return example.answer in pred.tag

# Trying out the Signature Optimizer

Anleitung zur Arbeit mit dem Optimizer bei zero-shot: https://dspy-docs.vercel.app/docs/deep-dive/teleprompter/signature-optimizer

In [49]:
class DeterminacyPipe(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.signature = Determinacy
        self.predictor = dspy.ChainOfThought(self.signature)
        
    def forward(self, text_snippet):
        result = self.predictor(text_snippet=text_snippet)
        return dspy.Prediction(
            tag = result.tag
        )      

In [50]:
devset = dataset.dev

In [51]:
evaluate = Evaluate(devset=devset, metric=validate_tag, num_threads=3, display_progress=True, display_table=True)

In [52]:
event_baseline = DeterminacyPipe()
devset_with_input = [dspy.Example({"text_snippet": r["Absatz"], "answer": r["Fokalisierung"]}).with_inputs("context", "text_snippet") for r in devset]

In [53]:
evaluate(event_baseline, devset=devset_with_input)

external
external
zero
external
internal
internal
internalexternal
zero

internal
  0%|                                                    | 0/24 [00:00<?, ?it/s]internal
internal
zerozero
zero
Average Metric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
zero
Average Metric: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 597.99it/s]
zero
internal
internal
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 544.96it/s]
zero
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 647.24it/s]zero
internal
internal
internal
zero
internal
zeroage Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 572.07it/s]
zero
external
internal
internal
internal
zeroage Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 488.08it/s]
zero
externalMetric: 5.00 / 7 (71.4%):  25%|██▎      | 6/24 [00:00<00:00, 485.63it/s]
zero
zeroage Metric: 5.00 / 8 (62.5%):  29%|██▋      | 7/24 [00:00<00:00, 532.84it/s]
zero
Average Metric: 6.00 / 9

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)





Unnamed: 0,text_snippet,answer,tag,validate_tag
0,"Als der Tag anbrach, noch ehe die Sonne aufgegangen war, kam schon...",external,external,✔️ [True]
1,"Als ich mich in Neapel aufhielt, begegnete daselbst eine Geschicht...",internal,internal,✔️ [True]
2,In den letzten Jahrzehnten ist das Interesse an Hungerkünstlern se...,zero,external,
3,"Wie gesagt, die Hand warf mich wieder zur Erde. Bald darauf erfaßt...",internal,internal,✔️ [True]
4,"Aber da keine Krankheit in ihm war, so war der Gedanke nicht graue...",internal,internal,✔️ [True]
5,"Es blieb daher nur noch die andere Seite neben dem Herrenkreuz, un...",external,zero,
6,"In M..., einer bedeutenden Stadt im oberen Italien, ließ die verwi...",zero,zero,✔️ [True]
7,"Die Jugend, welche die beiden Freunde Aeins und Azwei verband, war...",zero,zero,✔️ [True]
8,"Wenn man in jenen Tagen ein Ding durch die Fichtau bringen wollte,...",external,zero,
9,"Einen anderen Weg schlag ich ein; er ist allerdings etwas weit, ab...",internal,internal,✔️ [True]


54.17

# Using Copro

In [54]:
from dspy.teleprompt import COPRO

In [55]:
teleprompter = dspy.teleprompt.COPRO(
    program_mode="basic",
    init_temperature=0.4,  
    breadth=4,
    metric=validate_tag,
)

In [56]:
kwargs = dict(num_threads=5, display_progress=True, display_table=0) # Used in Evaluate class in the optimization process
compiled_prompt_opt = teleprompter.compile(DeterminacyPipe(), trainset=devset_with_input, eval_kwargs=kwargs)

2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/3.
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internal
internal
internalMetric: 2.00 / 2 (100.0%):   4%|▎       | 1/24 [00:00<00:00, 234.80it/s]
zero
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 299.53it/s]external
zero
zero
zero
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 354.91it/s]external
zero
zero
zero
internal
internal
zero
external
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 329.27it/s]internal
internal
Average Metric: 4.00 / 7 (57.1%):  25%|██▎      | 6/24 [00:00<00:00, 417.07it/s]internal
external
zero
internal
internalzero
internal
external
external
external
internal

internal
Average Metric: 5.00 / 8 (62.5%):  29%|██▋      | 7/24 [00:00<00:00, 368.98it/s]zero
internal
zero
zero
internal
internal
zeroexternal
zero
external
external

external
external
external
external
external
Average Metric: 13.00 / 24 (54.2%): 100%|██████| 24/24 [00:00<00:00, 885.71it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
externalMetric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
zero
external
zero
internal
internal
internal
internal
zero
external
zero
zero
internal
internal
zeroage Metric: 2.00 / 2 (100.0%):   4%|▍        | 1/24 [00:00<00:00, 89.09it/s]
zero
internal
internal
internal
internal
zero
zero
zero
zero
internalMetric: 3.00 / 3 (100.0%):   8%|▋       | 2/24 [00:00<00:00, 125.32it/s]
internal
zero
zero
external
zero
external
zero
Average Metric: 4.00 / 4 (100.0%):  12%|█       | 3/24 [00:00<00:00, 155.69it/s]external
zero
zerointernal
internal
external
external

external
zero
zero
Average Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 177.88it/s]external
zero
Average Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 213.71it/s]external
zero
Average Metric: 15.00 / 24 (62.5%): 100%|██████| 24/24 [00:00<00:00, 875.25it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



  0%|                                                    | 0/24 [00:00<?, ?it/s]zero
external
externalMetric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
external
externalzeroic: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 490.45it/s]
zero

internal
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 628.02it/s]zero
zero
external
zero
Average Metric: 2.00 / 5 (40.0%):  17%|█▌       | 4/24 [00:00<00:00, 559.52it/s]internal
internal
internal
internal
Average Metric: 3.00 / 6 (50.0%):  21%|█▉       | 5/24 [00:00<00:00, 550.19it/s]
internal
internal
internal
zero
external
internalzeroic: 4.00 / 7 (57.1%):  25%|██▎      | 6/24 [00:00<00:00, 510.25it/s]
internal
internal
internal

external
internal
internal
zero
internal
externalMetric: 4.00 / 8 (50.0%):  29%|██▋      | 7/24 [00:00<00:00, 438.82it/s]
zero
zero
external
zero
zero
internal
internal
external
external
external
zero
external
internal
Average Metric: 5.00 / 9 (55.6%):  33%|███      |

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 12 / 24 (50.0%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



external
external
zero
external
internal
internal
internal
internal
internal
internal
external
zero
zero
zero
external
zero
internal
zero
internal
internal
internal
internal
zero
zero
external
internal
zero
zero
external
zero
zero
zero
external
zero
internal
internal
external
internal
external
zero
zero
zero
zero
internal
internal
internal
zero
internal
Average Metric: 13.00 / 24 (54.2%): 100%|█████| 24/24 [00:00<00:00, 5798.24it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/3.
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
zeroage Metric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
external
internal
internal
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 376.15it/s]external
external
external
internal
zero
zero
zero
zero
internal
internal
externalinternal
internal

zero
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 259.45it/s]internal
internal
zero
internal
internalzero
internal
zero
internal

external
external
external
internalMetric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 236.13it/s]
internal
zero
external
external
zero
externalMetric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 252.51it/s]
external
external
zero
zeroexternalic: 4.00 / 7 (57.1%):  25%|██▎      | 6/24 [00:00<00:00, 279.12it/s]
zero
internal
internal

external
Average Metric: 12.00 / 24 (50.0%): 100%|██████| 24/24 [00:00<00:00, 947.42it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 12 / 24 (50.0%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



zeroexternal                                             | 0/24 [00:00<?, ?it/s]
internal

external
Average Metric: 0.00 / 2 (0.0%):   4%|▍         | 1/24 [00:00<00:00, 211.15it/s]
zero
Average Metric: 0.00 / 3 (0.0%):   8%|▊         | 2/24 [00:00<00:00, 314.06it/s]
internal
internal
internal
zerointernal
internal
external
zero

zero
zero
zero
Average Metric: 1.00 / 4 (25.0%):  12%|█▏       | 3/24 [00:00<00:00, 262.49it/s]internal
internal
internal
internal
Average Metric: 2.00 / 5 (40.0%):  17%|█▌       | 4/24 [00:00<00:00, 301.27it/s]zero
internal
zero
internal
internal
internal
zero
zero
external
internal
internal
external
zeroage Metric: 3.00 / 6 (50.0%):  21%|█▉       | 5/24 [00:00<00:00, 281.99it/s]
external
zero
external
external
zero
external
external
internalexternal
zero
Average Metric: 3.00 / 7 (42.9%):  25%|██▎      | 6/24 [00:00<00:00, 280.26it/s]
internal
external
internal
Average Metric: 11.00 / 24 (45.8%): 100%|██████| 24/24 [00:00<00:00, 946.96it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 11 / 24 (45.8%)





2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.


internalzero
external

internal
external
Final answer: zero
zero
zero
zero
zero
external
external
internal
Answer: internal
  0%|                                                    | 0/24 [00:00<?, ?it/s]external
final answer: zero

Focalization: internal
internal
zero
Average Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
final answer: zero
internal
internal

external
zero
external
Average Metric: 0.00 / 2 (0.0%):   4%|▍         | 1/24 [00:00<00:00, 326.35it/s]external
Answer: external
internal
final answer: internal
Average Metric: 1.00 / 3 (33.3%):   8%|▊        | 2/24 [00:00<00:00, 502.49it/s]zero
zero
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 581.28it/s]external
Final answer: zero
zero
zero
external
Final answer: zero
zero
zero
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 477.66it/s]external
Final Answer: internal
internal
Answer: internal
Average Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:0

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)





2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
externalMetric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
external
zerointernalic: 2.00 / 2 (100.0%):   4%|▎       | 1/24 [00:00<00:00, 221.32it/s]
internal
Average Metric: 3.00 / 3 (100.0%):   8%|▋       | 2/24 [00:00<00:00, 312.54it/s]
external
zero
zero
Average Metric: 4.00 / 4 (100.0%):  12%|█       | 3/24 [00:00<00:00, 367.37it/s]
external
Average Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 374.18it/s]internal
internal

external
internal
internal
external
zero
internal
internal
internal
internal
Average Metric: 5.00 / 6 (83.3%):  21%|█▉       | 5/24 [00:00<00:00, 320.80it/s]zero
external
zero
external
internal
external
zero
internal
external
internal
external
external
zerozeroexternal
external

external
internal
internal
Average Metric: 6.00 / 7 (85.7%):  25%|██▎      | 6/24 [00:00<00:00, 286.85it/s]
external
external
external
external
zero
Average Metr

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)





2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 3/3.
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


external                                                 | 0/24 [00:00<?, ?it/s]
internal
Average Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]internal
internal
internalMetric: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 111.62it/s]
[[ ## proposed_instruction ## ]]
[[ ## proposed_prefix_for_output_field ## ]] Label: zero [[ ## completed ## ]]
zeroage Metric: 1.00 / 3 (33.3%):   8%|▊        | 2/24 [00:00<00:00, 172.37it/s]
[[ ## proposed_instruction ## ]]
[[ ## proposed_prefix_for_output_field ## ]] external [[ ## completed ## ]]
Average Metric: 1.00 / 4 (25.0%):  12%|█▏       | 3/24 [00:00<00:00, 217.96it/s]
zero
Average Metric: 1.00 / 5 (20.0%):  17%|█▌       | 4/24 [00:00<00:00, 278.24it/s]zero
zero
internal
[[ ## proposed_instruction ## ]]
[[ ## proposed_prefix_for_output_field ## ]]result: internal[[ ## completed ## ]]
Average Metric: 2.00 / 6 (33.3%):  21%|█▉       | 5/24 [00:00<00:00, 303.53it/s]

2025/04/24 21:00:54 ERROR dspy.utils.parallelizer: Error for Example({'text_snippet': 'In M..., einer bedeutenden Stadt im oberen Italien, ließ die verwitwete Marquise von\xa0O..., eine Dame von vortrefflichem Ruf, und Mutter von mehreren wohlerzogenen Kindern, durch die Zeitungen bekannt machen: daß sie, ohne ihr Wissen, in andre Umstände gekommen sei, daß der Vater zu dem Kinde, das sie gebären würde, sich melden solle; und daß sie, aus Familienrücksichten, entschlossen wäre, ihn zu heiraten. Die Dame, die einen so sonderbaren, den Spott der Welt reizenden Schritt, beim Drang unabänderlicher Umstände, mit solcher Sicherheit tat, war die Tochter des Herrn von\xa0G..., Kommandanten der Zitadelle bei\xa0M... Sie hatte, vor ungefähr drei Jahren, ihren Gemahl, den Marquis von\xa0O..., dem sie auf das innigste und zärtlichste zugetan war, auf einer Reise verloren, die er, in Geschäften der Familie, nach Paris gemacht hatte. Auf Frau von\xa0G...s, ihrer würdigen Mutter, Wunsch, hatte sie, n

internal
internal
zero
external
external
zero
Average Metric: 3.00 / 7 (42.9%):  25%|██▎      | 6/24 [00:00<00:00, 269.08it/s]

2025/04/24 21:00:54 ERROR dspy.utils.parallelizer: Error for Example({'text_snippet': 'Es ist doch etwas Schönes, Herrliches, Erhabenes um das Leben! – »O\xa0du süße Gewohnheit des Daseins!« ruft jener niederländische Held in der Tragödie aus. So auch ich, aber nicht wie der Held in dem schmerzlichen Augenblick, als er sich davon trennen soll – nein! – in dem Moment, da mich eben die volle Lust des Gedankens durchdringt, daß ich in jene süße Gewohnheit nun ganz und gar hineingekommen und durchaus nicht willens bin, jemals wieder hinauszukommen. – Ich meine nämlich, die geistige Kraft, die unbekannte Macht, oder wie man sonst das über uns waltende Prinzip nennen mag, welches mir besagte Gewohnheit ohne meine Zustimmung gewissermaßen aufgedrungen hat, kann unmöglich schlechtere Gesinnungen haben als der freundliche Mann, bei dem ich in Kondition gegangen, und der mir das Gericht Fische, das er mir vorgesetzt, niemals vor der Nase wegzieht, wenn es mir eben recht wohlschmeckt.', 'answer':

internalinternal
zero

[[ ## proposed_instruction ## ]]
[[ ## proposed_prefix_for_output_field ## ]] final: internal [[ ## completed ## ]]
zero
focalization: internal
Average Metric: 3.00 / 7 (42.9%):  29%|██▋      | 7/24 [00:00<00:00, 283.84it/s]external
zero
zero
internal
externalMetric: 3.00 / 8 (37.5%):  33%|███      | 8/24 [00:00<00:00, 304.35it/s]
external
Average Metric: 4.00 / 10 (40.0%):  42%|██▉    | 10/24 [00:00<00:00, 338.95it/s]external
zero
zero
external
Average Metric: 5.00 / 11 (45.5%):  46%|███▏   | 11/24 [00:00<00:00, 339.76it/s]

2025/04/24 21:00:54 ERROR dspy.utils.parallelizer: Error for Example({'text_snippet': 'Eine große Sorge hatte der gute König jetzt, die plagte ihn sehr, er hatte seiner Gemahlin versprochen, er wolle, wenn sie vor dem Kinde sterbe, Mutterstelle an ihm vertreten. Wie er das machen sollte, wenn er Wort halten sollte, wußte er nun gar nicht, er ließ auch darüber stark nachdenken. Und siehe da, nach einer halben Stunde kam der Hofnachdenker herein und sprach: »Ihro Majestät, haben Sie etwas heraus?« Der König sagte: »Haben Sie etwas?« Der Nachdenker sagte: »Ihro Majestät, ich habe nichts heraus,« und der König sagte: »Und habe auch nichts.« Da sagte der Nachdenker: »Da haben wir also alle beide nichts heraus,« und nun gingen sie wieder frisch ans Nachdenken. Nach einer Stunde kamen sie ebenso zusammen und gingen ebenso auseinander.', 'answer': 'internal'}) (input_keys={'context', 'text_snippet'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` for traceback.


externalexternal
internal
Average Metric: 5.00 / 13 (38.5%):  54%|███▊   | 13/24 [00:00<00:00, 382.41it/s]
Label: zero
Average Metric: 5.00 / 13 (38.5%):  58%|████   | 14/24 [00:00<00:00, 408.61it/s]

2025/04/24 21:00:54 ERROR dspy.utils.parallelizer: Error for Example({'text_snippet': 'Eine dieser Herausforderungen Gottes bestand darin, sich auf dem Turmgeländer, mit dem Blick nach unten, durch langsamen Druck der Muskeln in die Höhe zu heben und schwankend auf den Händen stehenzubleiben; jeder, der dieses Akrobatenkunststück zu ebener Erde ausgeführt hat, wird wissen, wieviel Selbstvertrauen, Kühnheit und Glück dazu gehören, es auf einem fußbreiten Steinstreifen in Turmhöhe zu wiederholen. Es muß auch gesagt werden, daß viele wilde und geschickte Burschen sich dessen nicht unterfingen, obgleich sie zu ebener Erde auf ihren Händen geradezu lustwandeln konnten. Zum Beispiel Aeins tat es nicht. Dagegen war Azwei, und das mag gut zu seiner Einführung als Erzähler dienen, in seiner Knabenzeit der Erfinder dieser Gesinnungsprobe gewesen. Es war schwer, einen Körper zu finden wie den seinen. Er trug nicht die Muskeln des Sports wie der Körper vieler, sondern schien einfach und mühelos vo

Average Metric: 6.00 / 20 (30.0%): 100%|███████| 24/24 [00:00<00:00, 655.65it/s]

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 6.0 / 24 (25.0%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



external                                                 | 0/24 [00:00<?, ?it/s]
internal
externalMetric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
Focalization: zero
internalMetric: 0.00 / 2 (0.0%):   4%|▍         | 1/24 [00:00<00:00, 190.87it/s]
internal
zero
Answer: external
Average Metric: 0.00 / 3 (0.0%):   8%|▊         | 2/24 [00:00<00:00, 274.16it/s]internal
Answer: external
external
Focalization: zero
zero
answer: zero
internal
internal
Average Metric: 1.00 / 4 (25.0%):  12%|█▏       | 3/24 [00:00<00:00, 277.21it/s]zero
Focalization: zero
internalinternal
Answer: internal

Final Answer: internal
zeroage Metric: 2.00 / 6 (33.3%):  21%|█▉       | 5/24 [00:00<00:00, 344.85it/s]
zero
internal
Result: internal
internal
external
zero
Final: internal
zero
external
zero
Answer: external
Average Metric: 3.00 / 8 (37.5%):  29%|██▋      | 7/24 [00:00<00:00, 326.67it/s]external
external
externalexternal
answer: zero

Focalization: zero
zero
Final Answer: zero
intern

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 11 / 24 (45.8%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



external                                                 | 0/24 [00:00<?, ?it/s]
external
externalMetric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
zero
internalzeroic: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 197.43it/s]
zero
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 262.82it/s]
internal
zero
external
zeroage Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 310.15it/s]
external
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 351.19it/s]internal
external
Average Metric: 3.00 / 6 (50.0%):  21%|█▉       | 5/24 [00:00<00:00, 406.61it/s]zero
external
internal
internal
internalinternal3.00 / 7 (42.9%):  25%|██▎      | 6/24 [00:00<00:00, 423.35it/s]
internal
Average Metric: 3.00 / 8 (37.5%):  29%|██▋      | 7/24 [00:00<00:00, 449.81it/s]
internal
zero
external
Average Metric: 5.00 / 10 (50.0%):  38%|███     | 9/24 [00:00<00:00, 513.70it/s]zero
external
Average Metric: 5.00 / 11 (45.5%):  42%|██▉  

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 12 / 24 (50.0%)
2025/04/24 21:00:54 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



internalexternal                                         | 0/24 [00:00<?, ?it/s]
Final classification: internal

external
Average Metric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]internal
Classification: external
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 194.57it/s]
zero
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 251.26it/s]zero
Classification: external
zero
zero
zero
zero
Average Metric: 3.00 / 6 (50.0%):  21%|█▉       | 5/24 [00:00<00:00, 347.18it/s]zero
external
Average Metric: 4.00 / 7 (57.1%):  25%|██▎      | 6/24 [00:00<00:00, 381.17it/s]external
zero
internal
internal
internal
The classification is: internal
zero
external
internal
internal
Average Metric: 5.00 / 8 (62.5%):  29%|██▋      | 7/24 [00:00<00:00, 351.62it/s]zero
zero
internal
external
internal
internal
zero
zero
Average Metric: 6.00 / 10 (60.0%):  38%|███     | 9/24 [00:00<00:00, 385.51it/s]external
external
external
external
zeroexternal
zer

2025/04/24 21:00:54 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)





In [57]:
compiled_prompt_opt

predictor.predict = Predict(StringSignature(text_snippet -> reasoning, tag
    instructions="You are given a sentence and need to determine its focalization type based on the definitions provided. Read the sentence carefully, analyze whether the perceptual process is internally linked to a character's viewpoint, merely possible from a character’s perspective, or entirely detached from any character. Then, choose one of three categories: “internal”, “external”, or “zero”. When providing your answer, use the exact output structure described: first output the field with the tag `[[ ## proposed_instruction ## ]]` (already provided above), then output the field `[[ ## proposed_prefix_for_output_field ## ]]` that you are to display right before the final answer. Ensure that your final answer only indicates the chosen focalization label."
    text_snippet = Field(annotation=str required=True json_schema_extra={'desc': 'contains a snippet of a narrative text', '__dspy_field_type': 'input', 'pr

# Test with new Prompt

In [58]:
results = []
for text_snippet in df_anno_test.Absatz:
    response = compiled_prompt_opt(text_snippet=text_snippet)
    print(response.tag)
    results.append(response.tag)

zero
zero
internal
internal
external
external
external
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
external
zero
zero
internal
internal
internal
internal
external
external
zero
internal
external
zero
internal
zero
internal
internal
internal
internal
zero
zero
external
external
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
external
external
external
zero
zero
zero
zero
external
external
internal
zero
external
external
external
zero
internal
internal
zero
internal
internal
zero
external
zero
zero
zero
external
zero
external
external
external
external
external
external
external
external
internal
internal
internal
external
zero
external
zero
internal
internal
zero
zero


In [59]:
results_1 = []
for text in results:
    if "internal focalization" in text:
        results_1.append("internal")
    elif "internal" in text:
        results_1.append("internal")
    elif "external focalization" in text:
        results_1.append("external")
    elif "external" in text:
        results_1.append("external")
    elif "zero focalization" in text:
        results_1.append("zero")
    elif "zero" in text:
        results_1.append("zero")
    else:
        results_1.append("NaN")

In [60]:
predictions_dspy = pd.Series(results_1)

In [61]:
predictions_dspy

0          zero
1          zero
2      internal
3      internal
4      external
         ...   
101        zero
102    internal
103    internal
104        zero
105        zero
Length: 106, dtype: object

In [62]:
ground_truth = df_anno_test.Fokalisierung

In [63]:
ground_truth

0          zero
1          zero
2      internal
3          zero
4      internal
         ...   
101    internal
102    internal
103    internal
104    internal
105    internal
Name: Fokalisierung, Length: 106, dtype: object

In [64]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [65]:
pd.DataFrame(list(zip([f1_score(ground_truth, predictions_dspy, average="weighted")],
                      [recall_score(ground_truth, predictions_dspy,  average="weighted")],
                      [precision_score(ground_truth, predictions_dspy, average="weighted")],
                      [accuracy_score(ground_truth, predictions_dspy,)])),
                      columns = ["F1", "Recall", "Precision", "Accuracy"])

Unnamed: 0,F1,Recall,Precision,Accuracy
0,0.665187,0.650943,0.694786,0.650943


In [66]:
compiled_prompt_opt.save('03_dspied.pkl')

## Statistischer Test: McNemar

In [67]:
from typing import Optional
from statsmodels.stats.contingency_tables import mcnemar

In [68]:
# Funktion zum Vergleichen der Vorhersagen mit dem Goldstandard
def compare_with_gold(predictions, gold_standard):
    return [pred == gold for pred, gold in zip(predictions, gold_standard)]

In [69]:
# Funktion zum Erstellen einer Kontingenztafel
def create_contingency_table(results_a, results_b):
    both_correct = sum(a and b for a, b in zip(results_a, results_b))
    only_a_correct = sum(a and not b for a, b in zip(results_a, results_b))
    only_b_correct = sum(b and not a for a, b in zip(results_a, results_b))
    both_incorrect = sum(not a and not b for a, b in zip(results_a, results_b))
    return np.array([[both_correct, only_a_correct],
                     [only_b_correct, both_incorrect]])


In [70]:
# McNemar-Test durchführen
def run_mcnemar_test(results_a, results_b):
    table = create_contingency_table(results_a, results_b)
    return mcnemar(table, exact=True)


In [71]:
def generate_mcnemar_comparisons(
    results_gpt4o,
    df_anno,
    num_templates=3,
    runs=[1, 2],
    optimized_predictions: Optional[pd.Series] = None
):
    """
    Führt McNemar-Vergleiche zwischen verschiedenen Prompt-Vorhersagen durch,
    inklusive eines optionalen optimierten Prompts.

    Args:
        results_gpt4o: DataFrame mit Prediction-Spalten für alle Templates/Runs
        df_anno: DataFrame mit Goldstandard ('Fokalisierung')
        num_templates: Anzahl der verwendeten Template-Prompts
        runs: Liste der Runs (z. B. [1, 2])
        optimized_predictions: (optional) pd.Series mit Optimized-Prompt-Vorhersagen

    Returns:
        comparison_df: DataFrame mit allen McNemar-Test-Ergebnissen
    """
    gold = df_anno["Fokalisierung"]
    comparison_results = []

    for run in runs:
        # Vorhersagen der Templates
        predictions = {
            f"T{template_idx+1}": compare_with_gold(
                results_gpt4o[f"Prediction_{template_idx}_Run{run}"], gold
            )
            for template_idx in range(num_templates)
        }

        # Optional: Optimierten Prompt ergänzen
        if optimized_predictions is not None:
            predictions["Optimized"] = compare_with_gold(optimized_predictions, gold)

        # Paarweise Vergleiche
        templates = list(predictions.keys())
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                t1, t2 = templates[i], templates[j]
                result = run_mcnemar_test(predictions[t1], predictions[t2])
                comparison_results.append({
                    "Run": run,
                    "Comparison": f"{t1} vs {t2}",
                    "Prompt_A": t1,
                    "Prompt_B": t2,
                    "p-value": result.pvalue
                })

    comparison_df = pd.DataFrame(comparison_results)
    return comparison_df


In [72]:
comparison_df = generate_mcnemar_comparisons(
    results_gpt4o=results_03,
    df_anno=df_anno,
    num_templates=3,
    runs=[1, 2],
    optimized_predictions=predictions_dspy
)

In [73]:
print(comparison_df)

    Run       Comparison Prompt_A   Prompt_B   p-value
0     1         T1 vs T2       T1         T2  0.004551
1     1         T1 vs T3       T1         T3  0.053252
2     1  T1 vs Optimized       T1  Optimized  0.122078
3     1         T2 vs T3       T2         T3  0.387695
4     1  T2 vs Optimized       T2  Optimized  0.133801
5     1  T3 vs Optimized       T3  Optimized  0.541256
6     2         T1 vs T2       T1         T2  0.016674
7     2         T1 vs T3       T1         T3  0.080143
8     2  T1 vs Optimized       T1  Optimized  0.052479
9     2         T2 vs T3       T2         T3  0.423950
10    2  T2 vs Optimized       T2  Optimized  0.424356
11    2  T3 vs Optimized       T3  Optimized  1.000000


In [74]:
comparison_df.to_excel("mcnemar_vergleiche_03.xlsx", index=False)

In [75]:
def filter_significant_comparisons(comparison_df, alpha=0.05):
    """
    Gibt alle Vergleichspaare mit p-Wert < alpha (standardmäßig 0.05) zurück.
    
    Args:
        comparison_df: DataFrame mit den Spalten 'Run', 'Comparison', 'Prompt_A', 'Prompt_B', 'p-value'
        alpha: Signifikanzniveau (Default: 0.05)
        
    Returns:
        DataFrame mit nur signifikant unterschiedlichen Prompt-Paaren
    """
    significant_df = comparison_df[comparison_df["p-value"] < alpha].copy()
    return significant_df.sort_values(by=["Run", "p-value"])

In [76]:
significant_comparisons = filter_significant_comparisons(comparison_df)
significant_comparisons

Unnamed: 0,Run,Comparison,Prompt_A,Prompt_B,p-value
0,1,T1 vs T2,T1,T2,0.004551
6,2,T1 vs T2,T1,T2,0.016674
