In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from statsmodels.stats.contingency_tables import mcnemar

In [3]:
from scipy import stats
from itertools import combinations

In [4]:
import openai
from openai import OpenAI

In [5]:
import os

In [6]:
import re
import torch
import sklearn
from sklearn.model_selection import train_test_split

In [7]:
df_anno = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [8]:
df_anno['Fokalisierung'] = df_anno['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [9]:
df_anno.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar
0,Schiller,Der Vebrecher aus verlorener Ehre,In der ganzen Geschichte des Menschen ist kein...,zero,
1,Schiller,Der Vebrecher aus verlorener Ehre,Es ist etwas so Einförmiges und doch wieder so...,zero,
2,Schiller,Der Vebrecher aus verlorener Ehre,"Ich weiß, daß von den besten Geschichtschreibe...",internal,
3,Schiller,Der Vebrecher aus verlorener Ehre,"Der Held muß kalt werden wie der Leser, oder, ...",zero,
4,Tieck,Die beiden merkwürdigsten Tage aus Siegmunds L...,"Es war schon gegen Abend, als ein Wagen vor de...",internal,


In [10]:
api_key = os.getenv('MY_OPENAI')

In [11]:
def get_completion(prompt):  
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        temperature = 0.1,
        messages=[{
        "role": "user",
        "content": prompt,
        }],
        model="gpt-4.1-2025-04-14",
    )
    return response.choices[0].message.content

In [12]:
prompt_basic = """
### Instruction
Your task is to classify the focalization of the following sentence

###
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''''
Label:
"""

In [13]:
prompt_labels = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal
- external
- zero

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [14]:
prompt_redefin = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text  
Sentence: '''{text}''' 
Label: 
"""

In [15]:
prompt_meta = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously. 
These definitions are redefinitions of the standard understanding of focalization.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [16]:
prompt_templates = [prompt_labels, prompt_redefin, prompt_meta]

In [17]:
def evaluate_prompts_and_predictions(df, prompt_templates):
    """
    Evaluiert verschiedene Prompt-Templates und berechnet Metriken für die Vorhersagen.
    Jeder Prompt wird zweimal über die Daten ausgeführt.
    
    Args:
        df: Pandas DataFrame mit den Spalten 'Absatz' und 'Fokalisierung'
        prompt_templates: Liste der Prompt-Templates
    
    Returns:
        Tuple mit:
            - df: DataFrame mit den gespeicherten Vorhersagen
            - results_df: DataFrame mit den Evaluierungsmetriken für jeden Prompt und jeden Run
    """
    results = []

    for run in range(1, 6):
        print(f"Starte Run {run}/5")

        # Iteration über die Prompt-Templates
        for prompt_idx, template in enumerate(prompt_templates):
            print(f"Verarbeite Prompt-Template {prompt_idx + 1}/{len(prompt_templates)} - Run {run}")
            
            # Spaltenname für Vorhersage definieren
            prediction_col = f'Prediction_{prompt_idx}_Run{run}'
            df[prediction_col] = None
            
            # Iteration über die Zeilen des DataFrames
            for idx, row in df.iterrows():
                prompt = template.format(text=row['Absatz'])
                prediction = get_completion(prompt)
                first_word = prediction.split()[0].lower()
                print(first_word)
                df.at[idx, prediction_col] = first_word
            
            # Metriken berechnen
            metrics = {
                'Prompt': f'Template_{prompt_idx}',
                'Run': run,
                'F1-Score': f1_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Recall': recall_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Precision': precision_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Accuracy': accuracy_score(df['Fokalisierung'], df[prediction_col])
            }
            
            results.append(metrics)
    
    # Ergebnisse in DataFrame umwandeln
    results_df = pd.DataFrame(results)
    return df, results_df

In [18]:
results_gpt41, test = evaluate_prompts_and_predictions(df_anno, prompt_templates)

Starte Run 1/5
Verarbeite Prompt-Template 1/3 - Run 1
zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
internal
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
zero
zero
zero
zero
external
internal
internal
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
external
internal
internal
external
external
external
zero
internal
internal
internal
zero
internal
internal
internal
zero
internal
internal
zero
Verarbeite Prompt-Template 2/3 - Run 1
zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
internal
zero
zero
internal
zero
internal
zero
internal
internal
internal
internal
internal
internal
zero
internal
zero
zero
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
zero
internal
internal
zero
Starte Run 2/5
Verarbeite Prompt-Template 1/3 - Run 2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
internal
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
zero
zero
zero
zero
external
internal
internal
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
internal
internal
zero
external
external
zero
internal
internal
internal
zero
internal
internal
internal
zero
internal
internal
zero
Verarbeite Prompt-Template 2/3 - Run 2
zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
intern

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
internal
zero
zero
internal
zero
internal
zero
internal
internal
internal
internal
internal
internal
zero
internal
zero
zero
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
zero
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
zero
Starte Run 3/5
Verarbeite Prompt-Template 1/3 - Run 3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
internal
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
zero
zero
zero
zero
external
internal
internal
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
external
external
zero
internal
internal
internal
zero
internal
internal
internal
zero
internal
internal
zero
Verarbeite Prompt-Template 2/3 - Run 3
zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
internal
zero
zero
internal
zero
internal
zero
internal
internal
internal
internal
internal
internal
zero
internal
zero
zero
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
zero
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
zero
Starte Run 4/5
Verarbeite Prompt-Template 1/3 - Run 4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
internal
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
external
zero
zero
internal
zero
zero
zero
zero
zero
external
internal
internal
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
external
external
external
external
zero
internal
internal
internal
zero
internal
internal
internal
zero
internal
internal
zero
Verarbeite Prompt-Template 2/3 - Run 4
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
internal
zero
zero
internal
zero
internal
zero
internal
internal
internal
internal
internal
internal
zero
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
internal
internal
internal
zero
zero
internal
internal
internal
internal
internal
zero
Starte Run 5/5
Verarbeite Prompt-Template 1/3 - Run 5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
internal
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
zero
zero
zero
zero
external
internal
internal
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
internal
internal
zero
external
external
zero
internal
internal
internal
zero
internal
internal
internal
zero
internal
zero
zero
Verarbeite Prompt-Template 2/3 - Run 5
zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
z

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
internal
zero
internal
zero
zero
internal
zero
internal
zero
internal
internal
internal
internal
internal
internal
zero
internal
zero
zero
zero
zero
zero
zero
internal
internal
internal
internal
internal
internal
zero
internal
internal
internal
zero
zero
internal
zero
internal
internal
internal
zero


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
results_gpt41.to_json("DH_gpt41_results", orient="records", indent=4, force_ascii=False)

In [20]:
test

Unnamed: 0,Prompt,Run,F1-Score,Recall,Precision,Accuracy
0,Template_0,1,0.689562,0.707547,0.747109,0.707547
1,Template_1,1,0.63924,0.716981,0.579619,0.716981
2,Template_2,1,0.613891,0.688679,0.558872,0.688679
3,Template_0,2,0.660458,0.688679,0.728867,0.688679
4,Template_1,2,0.646994,0.726415,0.586201,0.726415
5,Template_2,2,0.613891,0.688679,0.558872,0.688679
6,Template_0,3,0.652996,0.688679,0.726161,0.688679
7,Template_1,3,0.622126,0.698113,0.562984,0.698113
8,Template_2,3,0.613891,0.688679,0.558872,0.688679
9,Template_0,4,0.68255,0.707547,0.749724,0.707547


In [21]:
def summarize_prompt_metrics(df):
    summary_rows = []

    # Gruppieren nach Prompt-Template
    grouped = df.groupby("Prompt")

    for prompt, group in grouped:
        f1_median = group["F1-Score"].median()
        f1_min = group["F1-Score"].min()
        f1_max = group["F1-Score"].max()

        precision_mean = group["Precision"].mean()
        precision_std = group["Precision"].std()

        recall_mean = group["Recall"].mean()
        recall_std = group["Recall"].std()

        acc_mean = group["Accuracy"].mean()
        acc_std = group["Accuracy"].std()

        summary_rows.append({
            "Prompt": prompt,
            "F1-Median (Min–Max)": f"{f1_median:.3f} ({f1_min:.3f}–{f1_max:.3f})",
            "Precision (M ± SD)": f"{precision_mean:.3f} ± {precision_std:.3f}",
            "Recall (M ± SD)": f"{recall_mean:.3f} ± {recall_std:.3f}",
            "Accuracy (M ± SD)": f"{acc_mean:.3f} ± {acc_std:.3f}",
        })

    summary_df = pd.DataFrame(summary_rows)
    return summary_df


In [22]:
summary = summarize_prompt_metrics(test)
summary

Unnamed: 0,Prompt,F1-Median (Min–Max),Precision (M ± SD),Recall (M ± SD),Accuracy (M ± SD)
0,Template_0,0.660 (0.652–0.690),0.735 ± 0.012,0.694 ± 0.013,0.694 ± 0.013
1,Template_1,0.630 (0.606–0.647),0.571 ± 0.012,0.706 ± 0.018,0.706 ± 0.018
2,Template_2,0.614 (0.614–0.630),0.562 ± 0.007,0.692 ± 0.008,0.692 ± 0.008


## Optimization – DSPY

In [23]:
import litellm

In [24]:
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
litellm.drop_params = True

# Loading Data

In [26]:
df_anno_train = pd.read_csv('plasticity_focalization_trainset.csv')

In [27]:
df_anno_train['Fokalisierung'] = df_anno_train['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [28]:
df_anno_test = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [29]:
df_anno_test['Fokalisierung'] = df_anno_test['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [30]:
df_anno_train.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
0,Goethe,Die Sängerin Antonelli,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
1,Goethe,Die Sängerin Antonelli,"Eine Sängerin, Antonelli genannt, war zu meine...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
2,Goethe,Die Sängerin Antonelli,Bei ihren bisherigen Verbindungen war ihr Geis...,internal,,https://www.projekt-gutenberg.org/goethe/anton...
3,Goethe,Die Sängerin Antonelli,"Es war ein Genueser, der sich um diese Zeit ei...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
4,Tieck,Das grüne Band,Durch die Thäler und über die Wiesen wandelte ...,external,,https://www.projekt-gutenberg.org/tieck/grueba...


In [31]:
def balanced_sample_by_category(df, category_column, n_per_category=8, random_state=42):
    """
    Gibt ein balanciertes Sample aus dem DataFrame zurück mit n_per_category Einträgen pro Kategorie.

    Args:
        df (pd.DataFrame): Eingabedaten.
        category_column (str): Spaltenname, nach dem kategorisiert werden soll.
        n_per_category (int): Anzahl von Einträgen pro Kategorie.
        random_state (int): Seed für Reproduzierbarkeit.

    Returns:
        pd.DataFrame: Balanciertes Sample.
    """
    # Fehlende Kategorien ausschließen
    df_clean = df.dropna(subset=[category_column])

    # Alle eindeutigen Kategorien abrufen
    categories = df_clean[category_column].unique()

    # Sampling durchführen
    balanced_df = pd.concat([
        df_clean[df_clean[category_column] == cat].sample(
            n=min(n_per_category, len(df_clean[df_clean[category_column] == cat])),
            random_state=random_state
        )
        for cat in categories
    ])

    # Index zurücksetzen
    return balanced_df.reset_index(drop=True)

In [32]:
df_train_balanced = balanced_sample_by_category(df_anno_train, category_column="Fokalisierung", n_per_category=8)

In [33]:
df_train_balanced.describe()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
count,24,24,24,24,0.0,24
unique,14,14,24,3,0.0,14
top,Brentano,Baron Hüpfenstich,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/brentano/hue...
freq,3,3,1,8,,3


# Preparing Dataset

Anleitung zur Erstellung eines Datensets: https://dspy-docs.vercel.app/docs/deep-dive/data-handling/loading-custom-data

In [34]:
from dspy.datasets.dataset import Dataset

In [35]:
train = df_train_balanced[["Absatz", "Fokalisierung"]].copy(deep=True)

In [36]:
train.head()

Unnamed: 0,Absatz,Fokalisierung
0,"Als ich mich in Neapel aufhielt, begegnete das...",internal
1,"Eine große Sorge hatte der gute König jetzt, d...",internal
2,"Es ist doch etwas Schönes, Herrliches, Erhaben...",internal
3,"Eine Sängerin, Antonelli genannt, war zu meine...",internal
4,Einen anderen Weg schlag ich ein; er ist aller...,internal


In [37]:
len(train)

24

In [38]:
class CSVDataset(Dataset):
    def __init__(self, df, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        df=df
        #self._train = df.iloc[0:35].to_dict(orient='records')
        self._dev = df.to_dict(orient='records')

In [39]:
dataset = CSVDataset(train)

In [40]:
len(dataset.dev)

24

# Setting LLM

In [41]:
dspy.settings.configure(
    cache=None
)

In [42]:
gpt_key = os.getenv('MY_OPENAI')

In [43]:
gpt = dspy.LM('gpt-4.1-2025-04-14', api_key=gpt_key)

In [44]:
dspy.settings.configure(lm=gpt)

# Setting Up Module + checking output

In [45]:
class Determinacy(dspy.Signature):
    """
    Your task is to classify the focalization of the following sentence 
    
    ### Labels
    There are three modes of focalization:
    - internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
    - external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
    - zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
    """
    #context = dspy.InputField(desc="contains annotation guidelines and scoring instructions")
    text_snippet = dspy.InputField(desc="contains a snippet of a narrative text")
    tag = dspy.OutputField(desc="contains only the **label** in lower case")

In [46]:
context = """ 
### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
"""

# Setting Metric

Anleitung zu den Metriken in DSPY: https://dspy-docs.vercel.app/docs/building-blocks/metrics

In [47]:
from dspy.evaluate import Evaluate

In [48]:
def validate_tag(example, pred, trace=None):
    print(example.answer)
    print(pred.tag)
    return example.answer in pred.tag

# Trying out the Signature Optimizer

Anleitung zur Arbeit mit dem Optimizer bei zero-shot: https://dspy-docs.vercel.app/docs/deep-dive/teleprompter/signature-optimizer

In [49]:
class DeterminacyPipe(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.signature = Determinacy
        self.predictor = dspy.ChainOfThought(self.signature)
        
    def forward(self, text_snippet):
        result = self.predictor(text_snippet=text_snippet)
        return dspy.Prediction(
            tag = result.tag
        )      

In [50]:
devset = dataset.dev

In [51]:
evaluate = Evaluate(devset=devset, metric=validate_tag, num_threads=3, display_progress=True, display_table=True)

In [52]:
event_baseline = DeterminacyPipe()
devset_with_input = [dspy.Example({"text_snippet": r["Absatz"], "answer": r["Fokalisierung"]}).with_inputs("context", "text_snippet") for r in devset]

In [53]:
evaluate(event_baseline, devset=devset_with_input)

internal                                                 | 0/24 [00:00<?, ?it/s]
internal
externalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:04<01:32,  4.02s/it]
internal
zeroage Metric: 1.00 / 2 (50.0%):   8%|▊         | 2/24 [00:05<00:53,  2.42s/it]
zero
externalMetric: 2.00 / 3 (66.7%):  12%|█▎        | 3/24 [00:06<00:40,  1.94s/it]
zero
internalMetric: 2.00 / 4 (50.0%):  17%|█▋        | 4/24 [00:09<00:46,  2.30s/it]
internal
internalMetric: 3.00 / 5 (60.0%):  21%|██        | 5/24 [00:12<00:46,  2.47s/it]
internal
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:14<00:40,  2.23s/it]
zero
externalMetric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:16<00:37,  2.18s/it]
zero
zeroage Metric: 5.00 / 8 (62.5%):  33%|███▎      | 8/24 [00:16<00:27,  1.74s/it]
zero
internalMetric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:17<00:18,  1.25s/it]
internal
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:18<00:19,  1.42s/it]
internal
zeroage Metric: 8.00 / 11 (72.

2025/04/24 11:08:37 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)





Unnamed: 0,text_snippet,answer,tag,validate_tag
0,"Als der Tag anbrach, noch ehe die Sonne aufgegangen war, kam schon...",external,internal,
1,"Als ich mich in Neapel aufhielt, begegnete daselbst eine Geschicht...",internal,internal,✔️ [True]
2,In den letzten Jahrzehnten ist das Interesse an Hungerkünstlern se...,zero,zero,✔️ [True]
3,"Wie gesagt, die Hand warf mich wieder zur Erde. Bald darauf erfaßt...",internal,internal,✔️ [True]
4,"Aber da keine Krankheit in ihm war, so war der Gedanke nicht graue...",internal,internal,✔️ [True]
5,"Es blieb daher nur noch die andere Seite neben dem Herrenkreuz, un...",external,zero,
6,"In M..., einer bedeutenden Stadt im oberen Italien, ließ die verwi...",zero,zero,✔️ [True]
7,"Die Jugend, welche die beiden Freunde Aeins und Azwei verband, war...",zero,zero,✔️ [True]
8,"Wenn man in jenen Tagen ein Ding durch die Fichtau bringen wollte,...",external,zero,
9,"Einen anderen Weg schlag ich ein; er ist allerdings etwas weit, ab...",internal,internal,✔️ [True]


58.33

# Using Copro

In [54]:
from dspy.teleprompt import COPRO

In [55]:
teleprompter = dspy.teleprompt.COPRO(
    program_mode="basic",
    init_temperature=0.4,  
    breadth=4,
    metric=validate_tag,
)

In [56]:
kwargs = dict(num_threads=5, display_progress=True, display_table=0) # Used in Evaluate class in the optimization process
compiled_prompt_opt = teleprompter.compile(DeterminacyPipe(), trainset=devset_with_input, eval_kwargs=kwargs)

2025/04/24 11:08:45 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/3.
2025/04/24 11:08:45 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:04<01:49,  4.78s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:05<00:47,  2.17s/it]
internal
externalMetric: 3.00 / 3 (100.0%):   8%|▊        | 2/24 [00:05<00:47,  2.17s/it]
internal
zeroage Metric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:05<00:20,  1.01s/it]
zero
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:05<00:13,  1.37it/s]
zero
internalMetric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:08<00:22,  1.22s/it]
internal
externalMetric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:08<00:16,  1.01it/s]
zero
internalMetric: 5.00 / 8 (62.5%):  33%|███▎      | 8/24 [00:08<00:11,  1.35it/s]
internal
internalMetric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:11<00:19,  1.33s/it]
internal
zeroage Metric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:12<00:16,  1.15s/it]
zero
internalMetric: 8.00 / 11 

2025/04/24 11:09:09 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 11:09:09 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<01:05,  2.85s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:03<00:31,  1.42s/it]
internal
zeroage Metric: 3.00 / 3 (100.0%):   8%|▊        | 2/24 [00:03<00:31,  1.42s/it]
zero
externalMetric: 4.00 / 4 (100.0%):  17%|█▌       | 4/24 [00:03<00:11,  1.72it/s]
zero
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:03<00:10,  1.78it/s]
zero
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:06<00:22,  1.25s/it]
zero
zeroage Metric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:06<00:15,  1.10it/s]
zero
externalMetric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:07<00:13,  1.19it/s]
zero
internalMetric: 6.00 / 9 (66.7%):  33%|███▎      | 8/24 [00:07<00:13,  1.19it/s]
internal
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:07<00:06,  2.11it/s]
internal
internalMetric: 8.00 / 11 (72.7%)

2025/04/24 11:09:33 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)
2025/04/24 11:09:33 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:03<01:11,  3.11s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:03<00:35,  1.61s/it]
internal
zeroage Metric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:04<00:24,  1.19s/it]
zero
externalMetric: 4.00 / 4 (100.0%):  17%|█▌       | 4/24 [00:05<00:22,  1.12s/it]
zero
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:07<00:28,  1.52s/it]
internal
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:07<00:19,  1.07s/it]
zero
externalMetric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:09<00:21,  1.26s/it]
zero
zeroage Metric: 5.00 / 8 (62.5%):  33%|███▎      | 8/24 [00:09<00:14,  1.09it/s]
zero
internalMetric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:11<00:17,  1.14s/it]
internal
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:11<00:11,  1.21it/s]
zero
internalMetric: 7.00 / 11 (63.6%)

2025/04/24 11:10:01 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)
2025/04/24 11:10:01 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



external
internal
internal
internal
zero
zero
zero
zero
internal
internal
external
zero
external
zero
internal
internal
zero
zero
internal
internal
internal
internal
zero
internal
external
internal
zero
zero
external
external
zero
zero
external
zero
internal
internal
external
internal
external
zero
internal
internal
zero
internal
zero
internal
internal
internal
Average Metric: 14.00 / 24 (58.3%): 100%|█████| 24/24 [00:00<00:00, 4044.49it/s]

2025/04/24 11:10:01 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)





2025/04/24 11:10:05 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/3.
2025/04/24 11:10:05 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<00:56,  2.45s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   4%|▍        | 1/24 [00:02<00:56,  2.45s/it]
internal
zeroage Metric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:03<00:19,  1.07it/s]
zero
externalMetric: 4.00 / 4 (100.0%):  17%|█▌       | 4/24 [00:03<00:14,  1.37it/s]
zero
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:04<00:14,  1.35it/s]
external
zeroage Metric: 5.00 / 6 (83.3%):  25%|██▌       | 6/24 [00:05<00:13,  1.35it/s]
zero
internalMetric: 6.00 / 7 (85.7%):  29%|██▉       | 7/24 [00:05<00:10,  1.57it/s]
internal
externalMetric: 7.00 / 8 (87.5%):  33%|███▎      | 8/24 [00:06<00:13,  1.22it/s]
zero
zeroage Metric: 7.00 / 9 (77.8%):  38%|███▊      | 9/24 [00:07<00:09,  1.51it/s]
zero
internalMetric: 8.00 / 10 (80.0%):  42%|███▎    | 10/24 [00:08<00:13,  1.04it/s]
internal
zeroage Metric: 9.00 / 11 (81.

2025/04/24 11:10:23 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 11:10:23 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:03<01:17,  3.38s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:03<00:33,  1.52s/it]
internal
zeroage Metric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:03<00:19,  1.08it/s]
zero
externalMetric: 4.00 / 4 (100.0%):  12%|█▏       | 3/24 [00:03<00:19,  1.08it/s]
internal
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:04<00:10,  1.83it/s]
zero
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:06<00:19,  1.09s/it]
zero
internalMetric: 5.00 / 7 (71.4%):  25%|██▌       | 6/24 [00:06<00:19,  1.09s/it]
internal
externalMetric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:06<00:10,  1.57it/s]
zero
zeroage Metric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:08<00:11,  1.30it/s]
zero
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:08<00:10,  1.32it/s]
internal
internalMetric: 8.00 / 11 (72

2025/04/24 11:10:42 INFO dspy.evaluate.evaluate: Average Metric: 16 / 24 (66.7%)
2025/04/24 11:10:42 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
zeroage Metric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<00:53,  2.32s/it]
zero
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:03<00:39,  1.81s/it]
internal
externalMetric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:05<00:38,  1.82s/it]
zero
externalMetric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:05<00:23,  1.19s/it]
internal
zeroage Metric: 3.00 / 5 (60.0%):  21%|██        | 5/24 [00:06<00:17,  1.11it/s]
zero
internalMetric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:07<00:15,  1.14it/s]
internal
externalMetric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:07<00:13,  1.21it/s]
zero
zeroage Metric: 5.00 / 8 (62.5%):  33%|███▎      | 8/24 [00:09<00:15,  1.01it/s]
zero
internalMetric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:09<00:12,  1.20it/s]
internal
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:09<00:09,  1.48it/s]
internal
internalMetric: 8.00 / 11 (72

2025/04/24 11:11:17 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)
2025/04/24 11:11:17 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<00:56,  2.44s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:02<00:27,  1.24s/it]
internal
externalMetric: 3.00 / 3 (100.0%):   8%|▊        | 2/24 [00:02<00:27,  1.24s/it]
zero
externalMetric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:03<00:14,  1.38it/s]
external
zeroage Metric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:05<00:18,  1.01it/s]
zero
internalMetric: 5.00 / 6 (83.3%):  25%|██▌       | 6/24 [00:05<00:14,  1.23it/s]
internal
zeroage Metric: 6.00 / 7 (85.7%):  29%|██▉       | 7/24 [00:07<00:19,  1.14s/it]
zero
externalMetric: 7.00 / 8 (87.5%):  29%|██▉       | 7/24 [00:07<00:19,  1.14s/it]
zero
zeroage Metric: 7.00 / 9 (77.8%):  38%|███▊      | 9/24 [00:08<00:11,  1.25it/s]
zero
internalMetric: 8.00 / 10 (80.0%):  42%|███▎    | 10/24 [00:08<00:09,  1.50it/s]
zero
internalMetric: 8.00 / 11 (72.7%)

2025/04/24 11:11:37 INFO dspy.evaluate.evaluate: Average Metric: 17 / 24 (70.8%)





2025/04/24 11:11:46 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 3/3.
2025/04/24 11:11:46 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


zero|                                                    | 0/24 [00:00<?, ?it/s]
zero
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:03<01:15,  3.30s/it]
external
internalMetric: 1.00 / 2 (50.0%):   4%|▍         | 1/24 [00:03<01:15,  3.30s/it]
internal
internalMetric: 2.00 / 3 (66.7%):  12%|█▎        | 3/24 [00:04<00:26,  1.26s/it]
internal
externalMetric: 3.00 / 4 (75.0%):  12%|█▎        | 3/24 [00:04<00:26,  1.26s/it]
external
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:05<00:16,  1.13it/s]
zero
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:05<00:12,  1.45it/s]
zero
internalMetric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:06<00:13,  1.28it/s]
internal
zeroage Metric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:07<00:13,  1.17it/s]
zero
externalMetric: 7.00 / 9 (77.8%):  33%|███▎      | 8/24 [00:07<00:13,  1.17it/s]
zero
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:09<00:11,  1.22it/s]
internal
zeroage Metric: 8.00 / 11 (72.

2025/04/24 11:12:05 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 11:12:05 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



external                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 0.00 / 1 (0.0%):   4%|▍          | 1/24 [00:01<00:45,  2.00s/it]
internal
internalMetric: 1.00 / 2 (50.0%):   4%|▍         | 1/24 [00:02<00:45,  2.00s/it]
internal
externalMetric: 2.00 / 3 (66.7%):  12%|█▎        | 3/24 [00:03<00:26,  1.25s/it]
external
internalMetric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:04<00:18,  1.07it/s]
internal
zeroage Metric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:04<00:12,  1.49it/s]
zero
zeroage Metric: 5.00 / 6 (83.3%):  21%|██        | 5/24 [00:04<00:12,  1.49it/s]
zero
zeroage Metric: 6.00 / 7 (85.7%):  29%|██▉       | 7/24 [00:05<00:09,  1.86it/s]
zero
internalMetric: 7.00 / 8 (87.5%):  33%|███▎      | 8/24 [00:06<00:12,  1.26it/s]
internal
externalMetric: 8.00 / 9 (88.9%):  33%|███▎      | 8/24 [00:06<00:12,  1.26it/s]
zero
internalMetric: 8.00 / 10 (80.0%):  38%|███▍     | 9/24 [00:06<00:11,  1.26it/s]
internal
zeroage Metric: 9.00 / 11

2025/04/24 11:12:20 INFO dspy.evaluate.evaluate: Average Metric: 17 / 24 (70.8%)
2025/04/24 11:12:20 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



external                                                 | 0/24 [00:00<?, ?it/s]
external
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<00:52,  2.30s/it]
internal
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:02<00:27,  1.25s/it]
internal
externalMetric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:03<00:25,  1.20s/it]
external
internalMetric: 4.00 / 4 (100.0%):  17%|█▌       | 4/24 [00:04<00:21,  1.07s/it]
internal
zeroage Metric: 5.00 / 5 (100.0%):  17%|█▌       | 4/24 [00:04<00:21,  1.07s/it]
zero
zeroage Metric: 6.00 / 6 (100.0%):  25%|██▎      | 6/24 [00:05<00:11,  1.61it/s]
zero
internalMetric: 7.00 / 7 (100.0%):  29%|██▋      | 7/24 [00:05<00:09,  1.86it/s]
internal
internalMetric: 8.00 / 8 (100.0%):  33%|███      | 8/24 [00:07<00:12,  1.25it/s]
internal
externalMetric: 9.00 / 9 (100.0%):  38%|███▍     | 9/24 [00:07<00:12,  1.22it/s]
zero
zeroage Metric: 9.00 / 10 (90.0%):  42%|███▎    | 10/24 [00:08<00:09,  1.55it/s]
zero
zeroage Metric: 10.00 / 1

2025/04/24 11:12:39 INFO dspy.evaluate.evaluate: Average Metric: 16 / 24 (66.7%)
2025/04/24 11:12:39 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:02<00:56,  2.46s/it]
internal
zeroage Metric: 2.00 / 2 (100.0%):   4%|▍        | 1/24 [00:02<00:56,  2.46s/it]
zero
externalMetric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:03<00:20,  1.03it/s]
zero
externalMetric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:03<00:14,  1.33it/s]
external
zeroage Metric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:04<00:13,  1.44it/s]
zero
internalMetric: 5.00 / 6 (83.3%):  25%|██▌       | 6/24 [00:05<00:16,  1.06it/s]
internal
externalMetric: 6.00 / 7 (85.7%):  25%|██▌       | 6/24 [00:05<00:16,  1.06it/s]
zero
internalMetric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:06<00:10,  1.59it/s]
internal
zeroage Metric: 7.00 / 9 (77.8%):  38%|███▊      | 9/24 [00:08<00:13,  1.08it/s]
zero
internalMetric: 8.00 / 10 (80.0%):  42%|███▎    | 10/24 [00:08<00:10,  1.37it/s]
internal
internalMetric: 9.00 / 11 (81

2025/04/24 11:12:56 INFO dspy.evaluate.evaluate: Average Metric: 17 / 24 (70.8%)





In [57]:
compiled_prompt_opt

predictor.predict = Predict(StringSignature(text_snippet -> reasoning, tag
    instructions='You will be given a sentence. Your task is to identify its mode of focalization by choosing one of the following labels: "internal", "external", or "zero". Use these definitions to guide your decision:\n- Internal focalization: The sentence presents the world or events through the direct perspective, thoughts, or sensory experiences of a character involved in the scene.\n- External focalization: The sentence depicts observable actions or perceptions that could be experienced by a character, but does not reveal any character’s inner thoughts, feelings, or explicit viewpoint.\n- Zero focalization: The sentence provides information about the story world that is not accessible to any character’s perception or consciousness at the moment—such as omniscient narration, background facts, or events no character could witness in real time.\n\nCarefully analyze the sentence for clues about perspective, ac

# Test with new Prompt

In [58]:
results = []
for text_snippet in df_anno_test.Absatz:
    response = compiled_prompt_opt(text_snippet=text_snippet)
    print(response.tag)
    results.append(response.tag)

zero
zero
internal
zero
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
external
zero
zero
internal
internal
internal
internal
zero
internal
zero
internal
external
external
internal
internal
internal
internal
internal
internal
external
zero
external
zero
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
internal
external
internal
internal
internal
internal
zero
zero
zero
zero
zero
zero
zero
zero
zero
external
zero
internal
zero
internal
internal
zero
internal
external
internal
internal
internal
internal
internal
internal
internal
internal
external
zero
zero
zero
external
zero
internal
internal
internal
external
external
external
internal
internal
internal
internal
external
zero
internal
internal
internal
internal
internal
zero


In [59]:
results_1 = []
for text in results:
    if "internal focalization" in text:
        results_1.append("internal")
    elif "internal" in text:
        results_1.append("internal")
    elif "external focalization" in text:
        results_1.append("external")
    elif "external" in text:
        results_1.append("external")
    elif "zero focalization" in text:
        results_1.append("zero")
    elif "zero" in text:
        results_1.append("zero")
    else:
        results_1.append("NaN")

In [60]:
predictions_dspy = pd.Series(results_1)

In [61]:
predictions_dspy

0          zero
1          zero
2      internal
3          zero
4      internal
         ...   
101    internal
102    internal
103    internal
104    internal
105        zero
Length: 106, dtype: object

In [62]:
ground_truth = df_anno_test.Fokalisierung

In [63]:
ground_truth

0          zero
1          zero
2      internal
3          zero
4      internal
         ...   
101    internal
102    internal
103    internal
104    internal
105    internal
Name: Fokalisierung, Length: 106, dtype: object

In [64]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [65]:
pd.DataFrame(list(zip([f1_score(ground_truth, predictions_dspy, average="weighted")],
                      [recall_score(ground_truth, predictions_dspy,  average="weighted")],
                      [precision_score(ground_truth, predictions_dspy, average="weighted")],
                      [accuracy_score(ground_truth, predictions_dspy,)])),
                      columns = ["F1", "Recall", "Precision", "Accuracy"])

Unnamed: 0,F1,Recall,Precision,Accuracy
0,0.762909,0.773585,0.763847,0.773585


In [66]:
compiled_prompt_opt.save('gpt-41_dspied.pkl')

## Statistischer Test: McNemar

In [67]:
from typing import Optional
from statsmodels.stats.contingency_tables import mcnemar

In [68]:
# Funktion zum Vergleichen der Vorhersagen mit dem Goldstandard
def compare_with_gold(predictions, gold_standard):
    return [pred == gold for pred, gold in zip(predictions, gold_standard)]

In [69]:
# Funktion zum Erstellen einer Kontingenztafel
def create_contingency_table(results_a, results_b):
    both_correct = sum(a and b for a, b in zip(results_a, results_b))
    only_a_correct = sum(a and not b for a, b in zip(results_a, results_b))
    only_b_correct = sum(b and not a for a, b in zip(results_a, results_b))
    both_incorrect = sum(not a and not b for a, b in zip(results_a, results_b))
    return np.array([[both_correct, only_a_correct],
                     [only_b_correct, both_incorrect]])


In [70]:
# McNemar-Test durchführen
def run_mcnemar_test(results_a, results_b):
    table = create_contingency_table(results_a, results_b)
    return mcnemar(table, exact=True)


In [71]:
def generate_mcnemar_comparisons(
    results_gpt4o,
    df_anno,
    num_templates=3,
    runs=[1, 2],
    optimized_predictions: Optional[pd.Series] = None
):
    """
    Führt McNemar-Vergleiche zwischen verschiedenen Prompt-Vorhersagen durch,
    inklusive eines optionalen optimierten Prompts.

    Args:
        results_gpt4o: DataFrame mit Prediction-Spalten für alle Templates/Runs
        df_anno: DataFrame mit Goldstandard ('Fokalisierung')
        num_templates: Anzahl der verwendeten Template-Prompts
        runs: Liste der Runs (z. B. [1, 2])
        optimized_predictions: (optional) pd.Series mit Optimized-Prompt-Vorhersagen

    Returns:
        comparison_df: DataFrame mit allen McNemar-Test-Ergebnissen
    """
    gold = df_anno["Fokalisierung"]
    comparison_results = []

    for run in runs:
        # Vorhersagen der Templates
        predictions = {
            f"T{template_idx+1}": compare_with_gold(
                results_gpt4o[f"Prediction_{template_idx}_Run{run}"], gold
            )
            for template_idx in range(num_templates)
        }

        # Optional: Optimierten Prompt ergänzen
        if optimized_predictions is not None:
            predictions["Optimized"] = compare_with_gold(optimized_predictions, gold)

        # Paarweise Vergleiche
        templates = list(predictions.keys())
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                t1, t2 = templates[i], templates[j]
                result = run_mcnemar_test(predictions[t1], predictions[t2])
                comparison_results.append({
                    "Run": run,
                    "Comparison": f"{t1} vs {t2}",
                    "Prompt_A": t1,
                    "Prompt_B": t2,
                    "p-value": result.pvalue
                })

    comparison_df = pd.DataFrame(comparison_results)
    return comparison_df


In [73]:
comparison_df = generate_mcnemar_comparisons(
    results_gpt4o=results_gpt41,
    df_anno=df_anno,
    num_templates=3,
    runs=[1, 2],
    optimized_predictions=predictions_dspy
)

In [74]:
print(comparison_df)

    Run       Comparison Prompt_A   Prompt_B   p-value
0     1         T1 vs T2       T1         T2  1.000000
1     1         T1 vs T3       T1         T3  0.803619
2     1  T1 vs Optimized       T1  Optimized  0.167068
3     1         T2 vs T3       T2         T3  0.250000
4     1  T2 vs Optimized       T2  Optimized  0.179565
5     1  T3 vs Optimized       T3  Optimized  0.049042
6     2         T1 vs T2       T1         T2  0.423950
7     2         T1 vs T3       T1         T3  1.000000
8     2  T1 vs Optimized       T1  Optimized  0.063568
9     2         T2 vs T3       T2         T3  0.125000
10    2  T2 vs Optimized       T2  Optimized  0.332306
11    2  T3 vs Optimized       T3  Optimized  0.049042


In [75]:
comparison_df.to_excel("mcnemar_vergleiche_gpt41.xlsx", index=False)

In [76]:
def filter_significant_comparisons(comparison_df, alpha=0.05):
    """
    Gibt alle Vergleichspaare mit p-Wert < alpha (standardmäßig 0.05) zurück.
    
    Args:
        comparison_df: DataFrame mit den Spalten 'Run', 'Comparison', 'Prompt_A', 'Prompt_B', 'p-value'
        alpha: Signifikanzniveau (Default: 0.05)
        
    Returns:
        DataFrame mit nur signifikant unterschiedlichen Prompt-Paaren
    """
    significant_df = comparison_df[comparison_df["p-value"] < alpha].copy()
    return significant_df.sort_values(by=["Run", "p-value"])

In [77]:
significant_comparisons = filter_significant_comparisons(comparison_df)
significant_comparisons

Unnamed: 0,Run,Comparison,Prompt_A,Prompt_B,p-value
5,1,T3 vs Optimized,T3,Optimized,0.049042
11,2,T3 vs Optimized,T3,Optimized,0.049042
