In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from statsmodels.stats.contingency_tables import mcnemar

In [3]:
from scipy import stats
from itertools import combinations

In [4]:
import openai
from openai import OpenAI

In [5]:
import os

In [6]:
import re
import torch
import sklearn
from sklearn.model_selection import train_test_split

In [7]:
df_anno = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [8]:
df_anno['Fokalisierung'] = df_anno['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [9]:
df_anno.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar
0,Schiller,Der Vebrecher aus verlorener Ehre,In der ganzen Geschichte des Menschen ist kein...,zero,
1,Schiller,Der Vebrecher aus verlorener Ehre,Es ist etwas so Einförmiges und doch wieder so...,zero,
2,Schiller,Der Vebrecher aus verlorener Ehre,"Ich weiß, daß von den besten Geschichtschreibe...",internal,
3,Schiller,Der Vebrecher aus verlorener Ehre,"Der Held muß kalt werden wie der Leser, oder, ...",zero,
4,Tieck,Die beiden merkwürdigsten Tage aus Siegmunds L...,"Es war schon gegen Abend, als ein Wagen vor de...",internal,


In [10]:
api_key = os.getenv('MY_OPENAI')

In [11]:
def get_completion(prompt):  
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        temperature = 0.1,
        messages=[{
        "role": "user",
        "content": prompt,
        }],
        model="gpt-4o",
    )
    return response.choices[0].message.content

In [12]:
prompt_basic = """
### Instruction
Your task is to classify the focalization of the following sentence

###
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''''
Label:
"""

In [13]:
prompt_labels = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal
- external
- zero

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [14]:
prompt_redefin = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text  
Sentence: '''{text}''' 
Label: 
"""

In [15]:
prompt_meta = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously. 
These definitions are redefinitions of the standard understanding of focalization.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [16]:
prompt_templates = [prompt_labels, prompt_redefin, prompt_meta]

In [17]:
def evaluate_prompts_and_predictions(df, prompt_templates):
    """
    Evaluiert verschiedene Prompt-Templates und berechnet Metriken für die Vorhersagen.
    Jeder Prompt wird zweimal über die Daten ausgeführt.
    
    Args:
        df: Pandas DataFrame mit den Spalten 'Absatz' und 'Fokalisierung'
        prompt_templates: Liste der Prompt-Templates
    
    Returns:
        Tuple mit:
            - df: DataFrame mit den gespeicherten Vorhersagen
            - results_df: DataFrame mit den Evaluierungsmetriken für jeden Prompt und jeden Run
    """
    results = []

    for run in range(1, 6):
        print(f"Starte Run {run}/5")

        # Iteration über die Prompt-Templates
        for prompt_idx, template in enumerate(prompt_templates):
            print(f"Verarbeite Prompt-Template {prompt_idx + 1}/{len(prompt_templates)} - Run {run}")
            
            # Spaltenname für Vorhersage definieren
            prediction_col = f'Prediction_{prompt_idx}_Run{run}'
            df[prediction_col] = None
            
            # Iteration über die Zeilen des DataFrames
            for idx, row in df.iterrows():
                prompt = template.format(text=row['Absatz'])
                prediction = get_completion(prompt)
                first_word = prediction.split()[0].lower()
                print(first_word)
                df.at[idx, prediction_col] = first_word
            
            # Metriken berechnen
            metrics = {
                'Prompt': f'Template_{prompt_idx}',
                'Run': run,
                'F1-Score': f1_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Recall': recall_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Precision': precision_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Accuracy': accuracy_score(df['Fokalisierung'], df[prediction_col])
            }
            
            results.append(metrics)
    
    # Ergebnisse in DataFrame umwandeln
    results_df = pd.DataFrame(results)
    return df, results_df

In [18]:
results_gpt4o, test = evaluate_prompts_and_predictions(df_anno, prompt_templates)

Starte Run 1/5
Verarbeite Prompt-Template 1/3 - Run 1
zero
zero
zero
internal
external
internal
internal
internal
internal
internal
internal
internal
internal
zero
internal
internal
external
zero
external
zero
internal
internal
internal
internal
external
external
external
zero
external
external
external
external
internal
internal
internal
internal
external
external
external
external
internal
internal
internal
internal
internal
internal
internal
internal
zero
zero
external
external
internal
internal
internal
internal
external
external
zero
external
external
external
external
zero
external
internal
external
internal
zero
external
external
zero
external
external
internal
internal
internal
internal
internal
internal
external
external
external
external
external
internal
external
external
zero
internal
external
external
external
external
external
internal
internal
internal
external
zero
internal
zero
zero
internal
internal
internal
Verarbeite Prompt-Template 2/3 - Run 1
zero
zero
zero
intern

In [82]:
results_gpt4o.to_json("DH_gpt4o_results", orient="records", indent=4, force_ascii=False)

In [20]:
test

Unnamed: 0,Prompt,Run,F1-Score,Recall,Precision,Accuracy
0,Template_0,1,0.574324,0.566038,0.62342,0.566038
1,Template_1,1,0.618868,0.688679,0.56761,0.688679
2,Template_2,1,0.626415,0.698113,0.573899,0.698113
3,Template_0,2,0.595919,0.584906,0.626357,0.584906
4,Template_1,2,0.647046,0.707547,0.67655,0.707547
5,Template_2,2,0.635094,0.707547,0.579414,0.707547
6,Template_0,3,0.596809,0.584906,0.639392,0.584906
7,Template_1,3,0.663746,0.726415,0.691637,0.726415
8,Template_2,3,0.64636,0.707547,0.674333,0.707547
9,Template_0,4,0.589169,0.575472,0.635581,0.575472


In [74]:
import pandas as pd
import numpy as np

def summarize_prompt_metrics(df):
    summary_rows = []

    # Gruppieren nach Prompt-Template
    grouped = df.groupby("Prompt")

    for prompt, group in grouped:
        f1_median = group["F1-Score"].median()
        f1_min = group["F1-Score"].min()
        f1_max = group["F1-Score"].max()

        precision_mean = group["Precision"].mean()
        precision_std = group["Precision"].std()

        recall_mean = group["Recall"].mean()
        recall_std = group["Recall"].std()

        acc_mean = group["Accuracy"].mean()
        acc_std = group["Accuracy"].std()

        summary_rows.append({
            "Prompt": prompt,
            "F1-Median (Min–Max)": f"{f1_median:.3f} ({f1_min:.3f}–{f1_max:.3f})",
            "Precision (M ± SD)": f"{precision_mean:.3f} ± {precision_std:.3f}",
            "Recall (M ± SD)": f"{recall_mean:.3f} ± {recall_std:.3f}",
            "Accuracy (M ± SD)": f"{acc_mean:.3f} ± {acc_std:.3f}",
        })

    summary_df = pd.DataFrame(summary_rows)
    return summary_df


In [75]:
summary = summarize_prompt_metrics(test)
summary

Unnamed: 0,Prompt,F1-Median (Min–Max),Precision (M ± SD),Recall (M ± SD),Accuracy (M ± SD)
0,Template_0,0.596 (0.574–0.621),0.639 ± 0.018,0.583 ± 0.014,0.583 ± 0.014
1,Template_1,0.639 (0.619–0.664),0.648 ± 0.050,0.702 ± 0.016,0.702 ± 0.016
2,Template_2,0.635 (0.622–0.654),0.633 ± 0.052,0.702 ± 0.014,0.702 ± 0.014


## Optimization – DSPY

In [21]:
import litellm

In [22]:
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
litellm.drop_params = True

# Loading Data

In [24]:
df_anno_train = pd.read_csv('plasticity_focalization_trainset.csv')

In [25]:
df_anno_train['Fokalisierung'] = df_anno_train['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [26]:
df_anno_test = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [27]:
df_anno_test['Fokalisierung'] = df_anno_test['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [28]:
df_anno_train.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
0,Goethe,Die Sängerin Antonelli,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
1,Goethe,Die Sängerin Antonelli,"Eine Sängerin, Antonelli genannt, war zu meine...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
2,Goethe,Die Sängerin Antonelli,Bei ihren bisherigen Verbindungen war ihr Geis...,internal,,https://www.projekt-gutenberg.org/goethe/anton...
3,Goethe,Die Sängerin Antonelli,"Es war ein Genueser, der sich um diese Zeit ei...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
4,Tieck,Das grüne Band,Durch die Thäler und über die Wiesen wandelte ...,external,,https://www.projekt-gutenberg.org/tieck/grueba...


In [29]:
def balanced_sample_by_category(df, category_column, n_per_category=8, random_state=42):
    """
    Gibt ein balanciertes Sample aus dem DataFrame zurück mit n_per_category Einträgen pro Kategorie.

    Args:
        df (pd.DataFrame): Eingabedaten.
        category_column (str): Spaltenname, nach dem kategorisiert werden soll.
        n_per_category (int): Anzahl von Einträgen pro Kategorie.
        random_state (int): Seed für Reproduzierbarkeit.

    Returns:
        pd.DataFrame: Balanciertes Sample.
    """
    # Fehlende Kategorien ausschließen
    df_clean = df.dropna(subset=[category_column])

    # Alle eindeutigen Kategorien abrufen
    categories = df_clean[category_column].unique()

    # Sampling durchführen
    balanced_df = pd.concat([
        df_clean[df_clean[category_column] == cat].sample(
            n=min(n_per_category, len(df_clean[df_clean[category_column] == cat])),
            random_state=random_state
        )
        for cat in categories
    ])

    # Index zurücksetzen
    return balanced_df.reset_index(drop=True)

In [30]:
df_train_balanced = balanced_sample_by_category(df_anno_train, category_column="Fokalisierung", n_per_category=8)

In [31]:
df_train_balanced.describe()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
count,24,24,24,24,0.0,24
unique,14,14,24,3,0.0,14
top,Brentano,Baron Hüpfenstich,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/brentano/hue...
freq,3,3,1,8,,3


# Preparing Dataset

Anleitung zur Erstellung eines Datensets: https://dspy-docs.vercel.app/docs/deep-dive/data-handling/loading-custom-data

In [32]:
from dspy.datasets.dataset import Dataset

In [33]:
train = df_train_balanced[["Absatz", "Fokalisierung"]].copy(deep=True)

In [34]:
train.head()

Unnamed: 0,Absatz,Fokalisierung
0,"Als ich mich in Neapel aufhielt, begegnete das...",internal
1,"Eine große Sorge hatte der gute König jetzt, d...",internal
2,"Es ist doch etwas Schönes, Herrliches, Erhaben...",internal
3,"Eine Sängerin, Antonelli genannt, war zu meine...",internal
4,Einen anderen Weg schlag ich ein; er ist aller...,internal


In [35]:
len(train)

24

In [36]:
class CSVDataset(Dataset):
    def __init__(self, df, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        df=df
        #self._train = df.iloc[0:35].to_dict(orient='records')
        self._dev = df.to_dict(orient='records')

In [37]:
dataset = CSVDataset(train)

In [38]:
len(dataset.dev)

24

# Setting LLM

In [39]:
dspy.settings.configure(
    cache=None
)

In [40]:
gpt_key = os.getenv('MY_OPENAI')

In [41]:
gpt = dspy.LM('gpt-4o', api_key=gpt_key)

In [42]:
dspy.settings.configure(lm=gpt)

# Setting Up Module + checking output

In [43]:
class Determinacy(dspy.Signature):
    """
    Your task is to classify the focalization of the following sentence 
    
    ### Labels
    There are three modes of focalization:
    - internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
    - external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
    - zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
    """
    #context = dspy.InputField(desc="contains annotation guidelines and scoring instructions")
    text_snippet = dspy.InputField(desc="contains a snippet of a narrative text")
    tag = dspy.OutputField(desc="contains only the **label** in lower case")

In [44]:
context = """ 
### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
"""

# Setting Metric

Anleitung zu den Metriken in DSPY: https://dspy-docs.vercel.app/docs/building-blocks/metrics

In [45]:
from dspy.evaluate import Evaluate

In [46]:
def validate_tag(example, pred, trace=None):
    print(example.answer)
    print(pred.tag)
    return example.answer in pred.tag

# Trying out the Signature Optimizer

Anleitung zur Arbeit mit dem Optimizer bei zero-shot: https://dspy-docs.vercel.app/docs/deep-dive/teleprompter/signature-optimizer

In [47]:
class DeterminacyPipe(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.signature = Determinacy
        self.predictor = dspy.ChainOfThought(self.signature)
        
    def forward(self, text_snippet):
        result = self.predictor(text_snippet=text_snippet)
        return dspy.Prediction(
            tag = result.tag
        )      

In [48]:
devset = dataset.dev

In [49]:
evaluate = Evaluate(devset=devset, metric=validate_tag, num_threads=3, display_progress=True, display_table=True)

In [50]:
event_baseline = DeterminacyPipe()
devset_with_input = [dspy.Example({"text_snippet": r["Absatz"], "answer": r["Fokalisierung"]}).with_inputs("context", "text_snippet") for r in devset]

In [51]:
evaluate(event_baseline, devset=devset_with_input)

external                                                 | 0/24 [00:00<?, ?it/s]
internal
zerointernalic: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
internal

zero
externalMetric: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 269.09it/s]
external
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 372.28it/s]internal
external
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 419.85it/s]zero
zero
internal
internal
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 446.51it/s]zero
external
internal
internal
Average Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 454.00it/s]external
zero
internal
internal
zero
internal
internal
zero
internal
internal
zeroage Metric: 5.00 / 7 (71.4%):  25%|██▎      | 6/24 [00:00<00:00, 405.43it/s]
external
zero
zero
external
external
zero
zero
external
internal
Average Metric: 5.00 / 8 (62.5%):  29%|██▋      | 7/24 [00:00<00:00, 358.09it/s]zero
zero
externalinte

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)





Unnamed: 0,text_snippet,answer,tag,validate_tag
0,"Als der Tag anbrach, noch ehe die Sonne aufgegangen war, kam schon...",external,internal,
1,"Als ich mich in Neapel aufhielt, begegnete daselbst eine Geschicht...",internal,external,
2,In den letzten Jahrzehnten ist das Interesse an Hungerkünstlern se...,zero,zero,✔️ [True]
3,"Wie gesagt, die Hand warf mich wieder zur Erde. Bald darauf erfaßt...",internal,internal,✔️ [True]
4,"Aber da keine Krankheit in ihm war, so war der Gedanke nicht graue...",internal,internal,✔️ [True]
5,"Es blieb daher nur noch die andere Seite neben dem Herrenkreuz, un...",external,external,✔️ [True]
6,"In M..., einer bedeutenden Stadt im oberen Italien, ließ die verwi...",zero,zero,✔️ [True]
7,"Die Jugend, welche die beiden Freunde Aeins und Azwei verband, war...",zero,external,
8,"Wenn man in jenen Tagen ein Ding durch die Fichtau bringen wollte,...",external,zero,
9,"Einen anderen Weg schlag ich ein; er ist allerdings etwas weit, ab...",internal,internal,✔️ [True]


54.17

# Using Copro

In [52]:
from dspy.teleprompt import COPRO

In [53]:
teleprompter = dspy.teleprompt.COPRO(
    program_mode="basic",
    init_temperature=0.4,  
    breadth=4,
    metric=validate_tag,
)

In [54]:
kwargs = dict(num_threads=5, display_progress=True, display_table=0) # Used in Evaluate class in the optimization process
compiled_prompt_opt = teleprompter.compile(DeterminacyPipe(), trainset=devset_with_input, eval_kwargs=kwargs)

2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/3.
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
external focalization
externalMetric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
external focalization
zero
external focalization
external
external focalization
internal
internal focalization
externalMetric: 0.00 / 2 (0.0%):   4%|▍         | 1/24 [00:00<00:00, 132.82it/s]
zero focalization
zero
zero focalization
internal
internal focalization
Average Metric: 1.00 / 3 (33.3%):   8%|▊        | 2/24 [00:00<00:00, 173.41it/s]zero
external focalization
internal
internal focalization
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 225.09it/s]
internal focalization
zero
internal focalization
internal
internal focalization
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 247.85it/s]internal
external focalization
zero
internal focalization
zerozero
internal focalization
Average Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 266.69it/s]exte

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 12 / 24 (50.0%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



internalexternal                                         | 0/24 [00:00<?, ?it/s]
external focalization
internal
internal focalization
Average Metric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]zero
external

external focalization
Average Metric: 2.00 / 2 (100.0%):   4%|▎       | 1/24 [00:00<00:00, 144.38it/s]
zero focalization
external
external focalization
external
external
Average Metric: 3.00 / 3 (100.0%):   8%|▋       | 2/24 [00:00<00:00, 211.06it/s]
external focalization
internal
internal focalization
internal
internal focalization
zero
internal focalization
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 232.72it/s]zero
external
zero
internal focalization
Average Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 264.80it/s]internal
internal focalization
Average Metric: 5.00 / 6 (83.3%):  21%|█▉       | 5/24 [00:00<00:00, 301.46it/s]
internal focalization
external
internal focalization
external
external focalization
Average Met

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
external focalization
Average Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]internal
internal focalization
external
external focalization
external
zero focalization
internal
internal focalization
Average Metric: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 147.65it/s]zero
external focalization
zero
zero focalization
internal
internal focalization
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 212.18it/s]internal
internal focalization

external focalization
external
external focalization
zero
internal focalization
Average Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 223.13it/s]internal
internal focalization
internal
internal focalization
zero
external focalization
external
internal focalization
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 252.47it/s]external
external focalization
zero
internal focalization
exter

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



external
internal
zero
zero
internal
internal
internal
external
internal
internal
external
zero
zero
external
external
external
internal
zero
internal
internal
internal
internal
zero
zero
external
internal
zero
zero
external
external
zero
zero
external
zero
internal
internal
external
internal
external
zero
zero
zero
internal
internal
zero
external
zero
internal
Average Metric: 13.00 / 24 (54.2%): 100%|█████| 24/24 [00:00<00:00, 5908.51it/s]

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/3.
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
external focalization
internalMetric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
internal focalization
external
internal focalization
zero
external focalization
external
external focalization
internal
internal focalization
internal
internal focalization
external
external focalization
zero
external focalization
Average Metric: 1.00 / 2 (50.0%):   4%|▍         | 1/24 [00:00<00:00, 90.84it/s]zero
internal focalization
zero
zero focalization
internal
internal focalization
internalzero
internal focalization

internal focalization
zeroage Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 134.49it/s]
internal focalization
external
internal focalization
zero
internal focalization
external
external focalization
internal
internal focalization
externalMetric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 154.01it/s]
external focalization
zero
zero focalization
Average Metric: 

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.



zero|                                                    | 0/24 [00:00<?, ?it/s]
external focalization
externalMetric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
external focalization
externalinternal1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 198.65it/s]
internal focalization
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 299.54it/s]
external focalization
internal
external focalization
zero
zero focalization
internal
internal focalization
external
external focalization
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 261.37it/s]zero
external focalization
internalinternal
internal focalization

internal focalization
Average Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 269.52it/s]zero
internal focalization
Average Metric: 5.00 / 6 (83.3%):  21%|█▉       | 5/24 [00:00<00:00, 319.82it/s]zero
internal focalization
Average Metric: 6.00 / 7 (85.7%):  25%|██▎      | 6/24 [00:00<00:00, 341.94it/s]external

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
external focalization
zeroage Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]
external focalization
external
internal focalization
Average Metric: 0.00 / 3 (0.0%):   8%|▊         | 2/24 [00:00<00:00, 394.18it/s]
external focalization
zero
external focalization
Average Metric: 0.00 / 4 (0.0%):  12%|█▎        | 3/24 [00:00<00:00, 448.70it/s]zero
zero focalization
internal
internal focalization
Average Metric: 2.00 / 6 (33.3%):  21%|█▉       | 5/24 [00:00<00:00, 528.20it/s]external
external focalization
zero
internal focalization
internal
internal focalization
internal
internal focalization

internal focalization
zero
internal focalization
zero
internal focalization
internal
internal focalization
internal
internal focalization
zeroage Metric: 3.00 / 7 (42.9%):  25%|██▎      | 6/24 [00:00<00:00, 380.01it/s]
internal focalization
externalexternal
internal focalization

external focaliz

2025/04/24 10:39:33 INFO dspy.evaluate.evaluate: Average Metric: 13 / 24 (54.2%)
2025/04/24 10:39:33 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



zero|                                                    | 0/24 [00:00<?, ?it/s]
external focalization
Average Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]internal
internal focalization
external
external focalization
external
external focalization
internal
external focalization
zero
external focalization
zero
external focalization
Average Metric: 0.00 / 2 (0.0%):   4%|▍         | 1/24 [00:00<00:00, 106.07it/s]external
external focalization
internal
internal focalization
internal
internal focalization
Average Metric: 1.00 / 3 (33.3%):   8%|▊        | 2/24 [00:00<00:00, 165.18it/s]zero
internal focalization
internal
internal focalization
zero
zero focalization
internal
internal focalization
zeroage Metric: 2.00 / 4 (50.0%):  12%|█▏       | 3/24 [00:00<00:00, 201.57it/s]
internal focalization
Average Metric: 2.00 / 5 (40.0%):  17%|█▌       | 4/24 [00:00<00:00, 233.73it/s]internal
internal focalization
external
internal focalization
zeroage Metric: 2.00 / 6 (33

2025/04/24 10:39:34 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 10:39:34 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 3/3.
2025/04/24 10:39:34 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #1/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal focalization
internalMetric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
internal focalization
externalMetric: 2.00 / 2 (100.0%):   4%|▎       | 1/24 [00:00<00:00, 231.22it/s]
external focalization
zero
external focalization
zeroage Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 329.44it/s]
zero focalization
internal
external focalization
external
internal focalization
zero
external focalization
external
external focalization
zero
zero focalization
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 255.01it/s]zero
internal focalization
internal
internal focalization
internal
internal focalization
internal
internal focalization
zero
internal focalization
internal
internal focalization
external
external focalization
zerozero
internal focalization

external focalization
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 221.60it/s

2025/04/24 10:39:34 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)
2025/04/24 10:39:34 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #2/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal focalization
internalMetric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
internal focalization
external
external focalization
zero
external focalization
Average Metric: 1.00 / 2 (50.0%):   4%|▍        | 1/24 [00:00<00:00, 132.99it/s]external
internal focalization
zero
external focalization
Average Metric: 2.00 / 3 (66.7%):   8%|▊        | 2/24 [00:00<00:00, 197.21it/s]internal
external focalization
internalexternal
external focalization
Average Metric: 3.00 / 4 (75.0%):  12%|█▏       | 3/24 [00:00<00:00, 253.39it/s]
internal focalization
zero
zero focalization
Average Metric: 4.00 / 5 (80.0%):  17%|█▌       | 4/24 [00:00<00:00, 300.43it/s]
internal focalization
internal
internal focalization
zero
zero focalization
zeroage Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 318.55it/s]
internal focalization
zero
zero focalization
internal
internal focalization
Average 

2025/04/24 10:39:34 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)





2025/04/24 10:39:34 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #3/4 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
external focalization
Average Metric: 0.00 / 1 (0.0%):   0%|                   | 0/24 [00:00<?, ?it/s]external
external focalization
internal
internal focalization
internal
internal focalization
zero
external focalization
external
external focalization
external
external focalization
zero
external focalization
internal
internal focalization
internal
internal focalization
internal
internal focalization
zero
external focalization
zero
internal focalization
zero
internal focalization
zero
internal focalization
zero
zero focalization
external
external focalization
internal
internal focalization
external
internal focalization
zero
zero focalization
external
internal focalization
internal
internal focalization
external
external focalization
external
external focalization
Average Metric: 15.00 / 24 (62.5%): 100%|██████| 24/24 [00:00<00:00, 964.39it/s]

2025/04/24 10:39:34 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)
2025/04/24 10:39:34 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #4/4 for Predictor 1 of 1.



internal                                                 | 0/24 [00:00<?, ?it/s]
internal focalization
zeroage Metric: 1.00 / 1 (100.0%):   0%|                 | 0/24 [00:00<?, ?it/s]
zero focalization
internalexternal2.00 / 2 (100.0%):   4%|▎       | 1/24 [00:00<00:00, 166.34it/s]
external focalization

zero focalization
Average Metric: 3.00 / 3 (100.0%):   8%|▋       | 2/24 [00:00<00:00, 255.52it/s]internal
internal focalization
zero
external focalization
external
external focalization
Average Metric: 3.00 / 5 (60.0%):  17%|█▌       | 4/24 [00:00<00:00, 352.27it/s]zero
external focalization
zero
zero focalization
internal
internal focalization
internal
internal focalization
internal
internal focalization
Average Metric: 4.00 / 6 (66.7%):  21%|█▉       | 5/24 [00:00<00:00, 334.56it/s]zero
internal focalization
internal
internal focalization
external
internal focalization
zero
internal focalization
Average Metric: 5.00 / 7 (71.4%):  25%|██▎      | 6/24 [00:00<00:00, 318.39it/s]zero
in

2025/04/24 10:39:34 INFO dspy.evaluate.evaluate: Average Metric: 15 / 24 (62.5%)





In [55]:
compiled_prompt_opt

predictor.predict = Predict(StringSignature(text_snippet -> reasoning, tag
    instructions="Your task is to determine the focalization mode of a given sentence. Focalization refers to the perspective through which a narrative is presented. You must decide if the sentence is internally, externally, or zero focalized based on the following criteria:\n\n- **Internal Focalization**: The narrative is presented from the perspective of a character, involving their thoughts, perceptions, or emotions.\n- **External Focalization**: The narrative is presented from an observer's perspective, focusing on observable actions and events without delving into the internal thoughts or feelings of characters.\n- **Zero Focalization**: The narrative is presented with an omniscient viewpoint, describing events and circumstances without being tied to any character's perspective.\n\nAnalyze the sentence and classify it under one of these three modes."
    text_snippet = Field(annotation=str required=True jso

# Test with new Prompt

In [56]:
results = []
for text_snippet in df_anno_test.Absatz:
    response = compiled_prompt_opt(text_snippet=text_snippet)
    print(response.tag)
    results.append(response.tag)

external
zero focalization
external focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
external
internal focalization
internal focalization
external focalization
internal focalization
internal focalization
internal focalization
external
external
external focalization
internal
zero focalization
external focalization
internal focalization
external focalization
internal focalization
internal focalization
internal focalization
internal focalization
external focalization
external
external focalization
external
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
external focalization
external
external
ext

In [57]:
results_1 = []
for text in results:
    if "internal focalization" in text:
        results_1.append("internal")
    elif "internal" in text:
        results_1.append("internal")
    elif "external focalization" in text:
        results_1.append("external")
    elif "external" in text:
        results_1.append("external")
    elif "zero focalization" in text:
        results_1.append("zero")
    elif "zero" in text:
        results_1.append("zero")
    else:
        results_1.append("NaN")

In [58]:
predictions_dspy = pd.Series(results_1)

In [59]:
predictions_dspy

0      external
1          zero
2      external
3      internal
4      internal
         ...   
101    internal
102    external
103    internal
104    internal
105    internal
Length: 106, dtype: object

In [60]:
ground_truth = df_anno_test.Fokalisierung

In [61]:
ground_truth

0          zero
1          zero
2      internal
3          zero
4      internal
         ...   
101    internal
102    internal
103    internal
104    internal
105    internal
Name: Fokalisierung, Length: 106, dtype: object

In [62]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [63]:
pd.DataFrame(list(zip([f1_score(ground_truth, predictions_dspy, average="weighted")],
                      [recall_score(ground_truth, predictions_dspy,  average="weighted")],
                      [precision_score(ground_truth, predictions_dspy, average="weighted")],
                      [accuracy_score(ground_truth, predictions_dspy,)])),
                      columns = ["F1", "Recall", "Precision", "Accuracy"])

Unnamed: 0,F1,Recall,Precision,Accuracy
0,0.497815,0.528302,0.586171,0.528302


In [64]:
compiled_prompt_opt.save('gpt-40_dspied.pkl')

## Statistischer Test: McNemar

In [65]:
from typing import Optional
from statsmodels.stats.contingency_tables import mcnemar

In [66]:
# Funktion zum Vergleichen der Vorhersagen mit dem Goldstandard
def compare_with_gold(predictions, gold_standard):
    return [pred == gold for pred, gold in zip(predictions, gold_standard)]

In [67]:
# Funktion zum Erstellen einer Kontingenztafel
def create_contingency_table(results_a, results_b):
    both_correct = sum(a and b for a, b in zip(results_a, results_b))
    only_a_correct = sum(a and not b for a, b in zip(results_a, results_b))
    only_b_correct = sum(b and not a for a, b in zip(results_a, results_b))
    both_incorrect = sum(not a and not b for a, b in zip(results_a, results_b))
    return np.array([[both_correct, only_a_correct],
                     [only_b_correct, both_incorrect]])


In [68]:
# McNemar-Test durchführen
def run_mcnemar_test(results_a, results_b):
    table = create_contingency_table(results_a, results_b)
    return mcnemar(table, exact=True)


In [76]:
def generate_mcnemar_comparisons(
    results_gpt4o,
    df_anno,
    num_templates=3,
    runs=[1, 2, 3, 4, 5],
    optimized_predictions: Optional[pd.Series] = None
):
    """
    Führt McNemar-Vergleiche zwischen verschiedenen Prompt-Vorhersagen durch,
    inklusive eines optionalen optimierten Prompts.

    Args:
        results_gpt4o: DataFrame mit Prediction-Spalten für alle Templates/Runs
        df_anno: DataFrame mit Goldstandard ('Fokalisierung')
        num_templates: Anzahl der verwendeten Template-Prompts
        runs: Liste der Runs (z. B. [1, 2])
        optimized_predictions: (optional) pd.Series mit Optimized-Prompt-Vorhersagen

    Returns:
        comparison_df: DataFrame mit allen McNemar-Test-Ergebnissen
    """
    gold = df_anno["Fokalisierung"]
    comparison_results = []

    for run in runs:
        # Vorhersagen der Templates
        predictions = {
            f"T{template_idx+1}": compare_with_gold(
                results_gpt4o[f"Prediction_{template_idx}_Run{run}"], gold
            )
            for template_idx in range(num_templates)
        }

        # Optional: Optimierten Prompt ergänzen
        if optimized_predictions is not None:
            predictions["Optimized"] = compare_with_gold(optimized_predictions, gold)

        # Paarweise Vergleiche
        templates = list(predictions.keys())
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                t1, t2 = templates[i], templates[j]
                result = run_mcnemar_test(predictions[t1], predictions[t2])
                comparison_results.append({
                    "Run": run,
                    "Comparison": f"{t1} vs {t2}",
                    "Prompt_A": t1,
                    "Prompt_B": t2,
                    "p-value": result.pvalue
                })

    comparison_df = pd.DataFrame(comparison_results)
    return comparison_df


In [77]:
comparison_df = generate_mcnemar_comparisons(
    results_gpt4o=results_gpt4o,
    df_anno=df_anno,
    num_templates=3,
    runs=[1, 2, 3, 4, 5],
    optimized_predictions=predictions_dspy
)

In [78]:
print(comparison_df)

    Run       Comparison Prompt_A   Prompt_B   p-value
0     1         T1 vs T2       T1         T2  0.059584
1     1         T1 vs T3       T1         T3  0.054076
2     1  T1 vs Optimized       T1  Optimized  0.480682
3     1         T2 vs T3       T2         T3  1.000000
4     1  T2 vs Optimized       T2  Optimized  0.013718
5     1  T3 vs Optimized       T3  Optimized  0.013283
6     2         T1 vs T2       T1         T2  0.047031
7     2         T1 vs T3       T1         T3  0.053252
8     2  T1 vs Optimized       T1  Optimized  0.286279
9     2         T2 vs T3       T2         T3  1.000000
10    2  T2 vs Optimized       T2  Optimized  0.005402
11    2  T3 vs Optimized       T3  Optimized  0.005402
12    3         T1 vs T2       T1         T2  0.023703
13    3         T1 vs T3       T1         T3  0.053252
14    3  T1 vs Optimized       T1  Optimized  0.237885
15    3         T2 vs T3       T2         T3  0.500000
16    3  T2 vs Optimized       T2  Optimized  0.001914
17    3  T

In [83]:
comparison_df.to_excel("mcnemar_vergleiche_gpt40.xlsx", index=False)

In [80]:
def filter_significant_comparisons(comparison_df, alpha=0.05):
    """
    Gibt alle Vergleichspaare mit p-Wert < alpha (standardmäßig 0.05) zurück.
    
    Args:
        comparison_df: DataFrame mit den Spalten 'Run', 'Comparison', 'Prompt_A', 'Prompt_B', 'p-value'
        alpha: Signifikanzniveau (Default: 0.05)
        
    Returns:
        DataFrame mit nur signifikant unterschiedlichen Prompt-Paaren
    """
    significant_df = comparison_df[comparison_df["p-value"] < alpha].copy()
    return significant_df.sort_values(by=["Run", "p-value"])

In [81]:
significant_comparisons = filter_significant_comparisons(comparison_df)
significant_comparisons

Unnamed: 0,Run,Comparison,Prompt_A,Prompt_B,p-value
5,1,T3 vs Optimized,T3,Optimized,0.013283
4,1,T2 vs Optimized,T2,Optimized,0.013718
10,2,T2 vs Optimized,T2,Optimized,0.005402
11,2,T3 vs Optimized,T3,Optimized,0.005402
6,2,T1 vs T2,T1,T2,0.047031
16,3,T2 vs Optimized,T2,Optimized,0.001914
17,3,T3 vs Optimized,T3,Optimized,0.005402
12,3,T1 vs T2,T1,T2,0.023703
23,4,T3 vs Optimized,T3,Optimized,0.003658
22,4,T2 vs Optimized,T2,Optimized,0.00956
