In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from statsmodels.stats.contingency_tables import mcnemar

In [3]:
from scipy import stats
from itertools import combinations

In [4]:
import anthropic

In [5]:
import os

In [6]:
import re
import torch
import sklearn
from sklearn.model_selection import train_test_split

In [7]:
df_anno = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [8]:
df_anno['Fokalisierung'] = df_anno['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [9]:
df_anno.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar
0,Schiller,Der Vebrecher aus verlorener Ehre,In der ganzen Geschichte des Menschen ist kein...,zero,
1,Schiller,Der Vebrecher aus verlorener Ehre,Es ist etwas so Einförmiges und doch wieder so...,zero,
2,Schiller,Der Vebrecher aus verlorener Ehre,"Ich weiß, daß von den besten Geschichtschreibe...",internal,
3,Schiller,Der Vebrecher aus verlorener Ehre,"Der Held muß kalt werden wie der Leser, oder, ...",zero,
4,Tieck,Die beiden merkwürdigsten Tage aus Siegmunds L...,"Es war schon gegen Abend, als ein Wagen vor de...",internal,


In [10]:
api_key = os.getenv('MY_ANTHROPIC')

In [11]:
client = anthropic.Anthropic(
    api_key = api_key
)

In [12]:
def get_completion(prompt):
    message = client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        temperature=0.1,
        messages=[
            {"role": "user", "content": prompt}
        ])
    return message.content[0].text

In [13]:
prompt_basic = """
### Instruction
Your task is to classify the focalization of the following sentence

###
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''''
Label:
"""

In [14]:
prompt_labels = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal
- external
- zero

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [15]:
prompt_redefin = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text  
Sentence: '''{text}''' 
Label: 
"""

In [16]:
prompt_meta = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously. 
These definitions are redefinitions of the standard understanding of focalization.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [17]:
prompt_templates = [prompt_labels, prompt_redefin, prompt_meta]

In [18]:
def evaluate_prompts_and_predictions(df, prompt_templates):
    """
    Evaluiert verschiedene Prompt-Templates und berechnet Metriken für die Vorhersagen.
    Jeder Prompt wird zweimal über die Daten ausgeführt.
    
    Args:
        df: Pandas DataFrame mit den Spalten 'Absatz' und 'Fokalisierung'
        prompt_templates: Liste der Prompt-Templates
    
    Returns:
        Tuple mit:
            - df: DataFrame mit den gespeicherten Vorhersagen
            - results_df: DataFrame mit den Evaluierungsmetriken für jeden Prompt und jeden Run
    """
    results = []

    for run in range(1, 6):
        print(f"Starte Run {run}/5")

        # Iteration über die Prompt-Templates
        for prompt_idx, template in enumerate(prompt_templates):
            print(f"Verarbeite Prompt-Template {prompt_idx + 1}/{len(prompt_templates)} - Run {run}")
            
            # Spaltenname für Vorhersage definieren
            prediction_col = f'Prediction_{prompt_idx}_Run{run}'
            df[prediction_col] = None
            
            # Iteration über die Zeilen des DataFrames
            for idx, row in df.iterrows():
                prompt = template.format(text=row['Absatz'])
                prediction = get_completion(prompt)
                first_word = prediction.split()[0].lower()
                print(first_word)
                df.at[idx, prediction_col] = first_word
            
            # Metriken berechnen
            metrics = {
                'Prompt': f'Template_{prompt_idx}',
                'Run': run,
                'F1-Score': f1_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Recall': recall_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Precision': precision_score(df['Fokalisierung'], df[prediction_col], average='weighted'),
                'Accuracy': accuracy_score(df['Fokalisierung'], df[prediction_col])
            }
            
            results.append(metrics)
    
    # Ergebnisse in DataFrame umwandeln
    results_df = pd.DataFrame(results)
    return df, results_df

In [19]:
results_haiku, test = evaluate_prompts_and_predictions(df_anno, prompt_templates)

Starte Run 1/5
Verarbeite Prompt-Template 1/3 - Run 1
zero
zero
internal
zero
external
external
external
internal
internal
internal
internal
internal
internal
zero
internal
internal
zero
zero
zero
external
zero
internal
internal
internal
external
external
zero
external
external
external
external
external
internal
internal
internal
internal
external
external
external
external
internal
internal
internal
internal
internal
internal
internal
internal
external
external
zero
external
internal
internal
internal
internal
external
zero
zero
zero
external
external
external
zero
external
external
zero
external
zero
external
external
zero
external
external
internal
internal
internal
internal
internal
internal
external
external
external
external
external
zero
external
external
external
external
external
external
external
external
internal
internal
internal
internal
external
external
internal
internal
external
internal
internal
zero
Verarbeite Prompt-Template 2/3 - Run 1
zero
zero
zero
zero
external


In [20]:
results_haiku.to_json("DH_haiku_results.json", orient="records", indent=4, force_ascii=False)

In [21]:
test

Unnamed: 0,Prompt,Run,F1-Score,Recall,Precision,Accuracy
0,Template_0,1,0.681871,0.660377,0.767731,0.660377
1,Template_1,1,0.757626,0.754717,0.762855,0.754717
2,Template_2,1,0.753257,0.754717,0.75438,0.754717
3,Template_0,2,0.672165,0.650943,0.762793,0.650943
4,Template_1,2,0.784411,0.783019,0.78802,0.783019
5,Template_2,2,0.753257,0.754717,0.75438,0.754717
6,Template_0,3,0.670319,0.650943,0.752835,0.650943
7,Template_1,3,0.766906,0.764151,0.771985,0.764151
8,Template_2,3,0.753257,0.754717,0.75438,0.754717
9,Template_0,4,0.662156,0.641509,0.757536,0.641509


In [22]:
def summarize_prompt_metrics(df):
    summary_rows = []

    # Gruppieren nach Prompt-Template
    grouped = df.groupby("Prompt")

    for prompt, group in grouped:
        f1_median = group["F1-Score"].median()
        f1_min = group["F1-Score"].min()
        f1_max = group["F1-Score"].max()

        precision_mean = group["Precision"].mean()
        precision_std = group["Precision"].std()

        recall_mean = group["Recall"].mean()
        recall_std = group["Recall"].std()

        acc_mean = group["Accuracy"].mean()
        acc_std = group["Accuracy"].std()

        summary_rows.append({
            "Prompt": prompt,
            "F1-Median (Min–Max)": f"{f1_median:.3f} ({f1_min:.3f}–{f1_max:.3f})",
            "Precision (M ± SD)": f"{precision_mean:.3f} ± {precision_std:.3f}",
            "Recall (M ± SD)": f"{recall_mean:.3f} ± {recall_std:.3f}",
            "Accuracy (M ± SD)": f"{acc_mean:.3f} ± {acc_std:.3f}",
        })

    summary_df = pd.DataFrame(summary_rows)
    return summary_df


In [23]:
summary = summarize_prompt_metrics(test)
summary

Unnamed: 0,Prompt,F1-Median (Min–Max),Precision (M ± SD),Recall (M ± SD),Accuracy (M ± SD)
0,Template_0,0.672 (0.662–0.682),0.762 ± 0.007,0.653 ± 0.008,0.653 ± 0.008
1,Template_1,0.767 (0.758–0.784),0.774 ± 0.009,0.766 ± 0.010,0.766 ± 0.010
2,Template_2,0.753 (0.753–0.764),0.757 ± 0.005,0.757 ± 0.004,0.757 ± 0.004


## Optimization – DSPY

In [24]:
import litellm

In [25]:
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
litellm.drop_params = True

# Loading Data

In [27]:
df_anno_train = pd.read_csv('plasticity_focalization_trainset.csv')

In [28]:
df_anno_train['Fokalisierung'] = df_anno_train['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [29]:
df_anno_test = pd.read_csv('plasticity_2025_Anno_DEU_test_2nd_run.csv')

In [30]:
df_anno_test['Fokalisierung'] = df_anno_test['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [31]:
df_anno_train.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
0,Goethe,Die Sängerin Antonelli,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
1,Goethe,Die Sängerin Antonelli,"Eine Sängerin, Antonelli genannt, war zu meine...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
2,Goethe,Die Sängerin Antonelli,Bei ihren bisherigen Verbindungen war ihr Geis...,internal,,https://www.projekt-gutenberg.org/goethe/anton...
3,Goethe,Die Sängerin Antonelli,"Es war ein Genueser, der sich um diese Zeit ei...",internal,,https://www.projekt-gutenberg.org/goethe/anton...
4,Tieck,Das grüne Band,Durch die Thäler und über die Wiesen wandelte ...,external,,https://www.projekt-gutenberg.org/tieck/grueba...


In [32]:
def balanced_sample_by_category(df, category_column, n_per_category=8, random_state=42):
    """
    Gibt ein balanciertes Sample aus dem DataFrame zurück mit n_per_category Einträgen pro Kategorie.

    Args:
        df (pd.DataFrame): Eingabedaten.
        category_column (str): Spaltenname, nach dem kategorisiert werden soll.
        n_per_category (int): Anzahl von Einträgen pro Kategorie.
        random_state (int): Seed für Reproduzierbarkeit.

    Returns:
        pd.DataFrame: Balanciertes Sample.
    """
    # Fehlende Kategorien ausschließen
    df_clean = df.dropna(subset=[category_column])

    # Alle eindeutigen Kategorien abrufen
    categories = df_clean[category_column].unique()

    # Sampling durchführen
    balanced_df = pd.concat([
        df_clean[df_clean[category_column] == cat].sample(
            n=min(n_per_category, len(df_clean[df_clean[category_column] == cat])),
            random_state=random_state
        )
        for cat in categories
    ])

    # Index zurücksetzen
    return balanced_df.reset_index(drop=True)

In [33]:
df_train_balanced = balanced_sample_by_category(df_anno_train, category_column="Fokalisierung", n_per_category=8)

In [34]:
df_train_balanced.describe()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar,Link
count,24,24,24,24,0.0,24
unique,14,14,24,3,0.0,14
top,Brentano,Baron Hüpfenstich,"Als ich mich in Neapel aufhielt, begegnete das...",internal,,https://www.projekt-gutenberg.org/brentano/hue...
freq,3,3,1,8,,3


# Preparing Dataset

Anleitung zur Erstellung eines Datensets: https://dspy-docs.vercel.app/docs/deep-dive/data-handling/loading-custom-data

In [35]:
from dspy.datasets.dataset import Dataset

In [36]:
train = df_train_balanced[["Absatz", "Fokalisierung"]].copy(deep=True)

In [37]:
train.head()

Unnamed: 0,Absatz,Fokalisierung
0,"Als ich mich in Neapel aufhielt, begegnete das...",internal
1,"Eine große Sorge hatte der gute König jetzt, d...",internal
2,"Es ist doch etwas Schönes, Herrliches, Erhaben...",internal
3,"Eine Sängerin, Antonelli genannt, war zu meine...",internal
4,Einen anderen Weg schlag ich ein; er ist aller...,internal


In [38]:
len(train)

24

In [39]:
class CSVDataset(Dataset):
    def __init__(self, df, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        
        df=df
        #self._train = df.iloc[0:35].to_dict(orient='records')
        self._dev = df.to_dict(orient='records')

In [40]:
dataset = CSVDataset(train)

In [41]:
len(dataset.dev)

24

# Setting LLM

In [42]:
dspy.settings.configure(
    cache=None
)

In [43]:
haiku = dspy.LM('claude-3-7-sonnet-latest', api_key=api_key)

In [44]:
dspy.settings.configure(lm=haiku)

# Setting Up Module + checking output

In [45]:
class Determinacy(dspy.Signature):
    """
    Your task is to classify the focalization of the following sentence 
    
    ### Labels
    There are three modes of focalization:
    - internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
    - external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
    - zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
    """
    #context = dspy.InputField(desc="contains annotation guidelines and scoring instructions")
    text_snippet = dspy.InputField(desc="contains a snippet of a narrative text")
    tag = dspy.OutputField(desc="contains only the **label** in lower case")

In [46]:
context = """ 
### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.
"""

# Setting Metric

Anleitung zu den Metriken in DSPY: https://dspy-docs.vercel.app/docs/building-blocks/metrics

In [47]:
from dspy.evaluate import Evaluate

In [48]:
def validate_tag(example, pred, trace=None):
    print(example.answer)
    print(pred.tag)
    return example.answer in pred.tag

# Trying out the Signature Optimizer

Anleitung zur Arbeit mit dem Optimizer bei zero-shot: https://dspy-docs.vercel.app/docs/deep-dive/teleprompter/signature-optimizer

In [49]:
class DeterminacyPipe(dspy.Module):
    def __init__(self):
        super().__init__()
        
        self.signature = Determinacy
        self.predictor = dspy.ChainOfThought(self.signature)
        
    def forward(self, text_snippet):
        result = self.predictor(text_snippet=text_snippet)
        return dspy.Prediction(
            tag = result.tag
        )      

In [50]:
devset = dataset.dev

In [51]:
evaluate = Evaluate(devset=devset, metric=validate_tag, num_threads=3, display_progress=True, display_table=True)

In [52]:
event_baseline = DeterminacyPipe()
devset_with_input = [dspy.Example({"text_snippet": r["Absatz"], "answer": r["Fokalisierung"]}).with_inputs("context", "text_snippet") for r in devset]

In [53]:
evaluate(event_baseline, devset=devset_with_input)

internal                                                 | 0/24 [00:00<?, ?it/s]
internal
zeroage Metric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:07<02:46,  7.25s/it]
zero
externalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:07<01:13,  3.33s/it]
zero
externalMetric: 2.00 / 3 (66.7%):  12%|█▎        | 3/24 [00:08<00:40,  1.93s/it]
zero
internalMetric: 2.00 / 4 (50.0%):  17%|█▋        | 4/24 [00:13<01:08,  3.44s/it]
internal
internalMetric: 3.00 / 5 (60.0%):  21%|██        | 5/24 [00:14<00:43,  2.31s/it]
internal
zeroage Metric: 4.00 / 6 (66.7%):  25%|██▌       | 6/24 [00:14<00:32,  1.78s/it]
zero
zeroage Metric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:20<00:53,  3.15s/it]
zero
externalMetric: 6.00 / 8 (75.0%):  29%|██▉       | 7/24 [00:20<00:53,  3.15s/it]
zero
internalMetric: 6.00 / 9 (66.7%):  38%|███▊      | 9/24 [00:21<00:26,  1.76s/it]
internal
internalMetric: 7.00 / 10 (70.0%):  42%|███▎    | 10/24 [00:26<00:37,  2.66s/it]
internal
zeroage Metric: 8.00 / 11 (72.7%):

2025/04/24 22:36:23 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)





Unnamed: 0,text_snippet,answer,tag,validate_tag
0,"Als der Tag anbrach, noch ehe die Sonne aufgegangen war, kam schon...",external,zero,
1,"Als ich mich in Neapel aufhielt, begegnete daselbst eine Geschicht...",internal,internal,✔️ [True]
2,In den letzten Jahrzehnten ist das Interesse an Hungerkünstlern se...,zero,zero,✔️ [True]
3,"Wie gesagt, die Hand warf mich wieder zur Erde. Bald darauf erfaßt...",internal,internal,✔️ [True]
4,"Aber da keine Krankheit in ihm war, so war der Gedanke nicht graue...",internal,internal,✔️ [True]
5,"Es blieb daher nur noch die andere Seite neben dem Herrenkreuz, un...",external,zero,
6,"In M..., einer bedeutenden Stadt im oberen Italien, ließ die verwi...",zero,zero,✔️ [True]
7,"Die Jugend, welche die beiden Freunde Aeins und Azwei verband, war...",zero,zero,✔️ [True]
8,"Wenn man in jenen Tagen ein Ding durch die Fichtau bringen wollte,...",external,zero,
9,"Einen anderen Weg schlag ich ein; er ist allerdings etwas weit, ab...",internal,internal,✔️ [True]


58.33

# Using Copro

In [54]:
from dspy.teleprompt import COPRO

In [55]:
teleprompter = dspy.teleprompt.COPRO(
    program_mode="basic",
    init_temperature=0.4,  
    breadth=4,
    metric=validate_tag,
)

In [56]:
kwargs = dict(num_threads=5, display_progress=True, display_table=0) # Used in Evaluate class in the optimization process
compiled_prompt_opt = teleprompter.compile(DeterminacyPipe(), trainset=devset_with_input, eval_kwargs=kwargs)

2025/04/24 22:36:32 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 1/3.
2025/04/24 22:36:32 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #1/2 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal focalization
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:06<02:25,  6.34s/it]
internal focalization
internalMetric: 2.00 / 2 (100.0%):   4%|▍        | 1/24 [00:06<02:25,  6.34s/it]
internal focalization
zeroage Metric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:07<00:41,  2.00s/it]
zero focalization
externalMetric: 4.00 / 4 (100.0%):  17%|█▌       | 4/24 [00:08<00:33,  1.70s/it]
external focalization
externalMetric: 5.00 / 5 (100.0%):  21%|█▉       | 5/24 [00:09<00:26,  1.41s/it]
external focalization
zeroage Metric: 6.00 / 6 (100.0%):  25%|██▎      | 6/24 [00:13<00:39,  2.19s/it]
zero focalization
externalMetric: 7.00 / 7 (100.0%):  29%|██▋      | 7/24 [00:14<00:32,  1.93s/it]
zero focalization
zeroage Metric: 7.00 / 8 (87.5%):  33%|███▎      | 8/24 [00:15<00:24,  1.55s/it]
zero focalization
internalMetric: 8.00 / 9 (88.9%):  33%|███▎      | 8/24 [00:15<00:24,  1.55s/it]
internal

2025/04/24 22:37:08 INFO dspy.evaluate.evaluate: Average Metric: 18 / 24 (75.0%)
2025/04/24 22:37:08 INFO dspy.teleprompt.copro_optimizer: At Depth 1/3, Evaluating Prompt Candidate #2/2 for Predictor 1 of 1.



external
zero
zero
zero
internal
internal
internal
internal
internal
internal
zero
zero
external
zero
internal
internal
internal
internal
zero
zero
internal
zero
zero
zero
external
internal
zero
zero
external
zero
zero
zero
external
zero
internal
zero
external
external
external
zero
external
zero
zero
internal
zero
zero
internal
internal
Average Metric: 14.00 / 24 (58.3%): 100%|█████| 24/24 [00:00<00:00, 3376.38it/s]

2025/04/24 22:37:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 24 (58.3%)





2025/04/24 22:37:19 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 2/3.
2025/04/24 22:37:19 INFO dspy.teleprompt.copro_optimizer: At Depth 2/3, Evaluating Prompt Candidate #1/1 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
internalMetric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:06<02:35,  6.77s/it]
internal
zeroage Metric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:07<01:06,  3.03s/it]
zero
internalMetric: 3.00 / 3 (100.0%):  12%|█▏       | 3/24 [00:07<00:40,  1.93s/it]
internal
externalMetric: 4.00 / 4 (100.0%):  12%|█▏       | 3/24 [00:07<00:40,  1.93s/it]
zero focalization
externalMetric: 4.00 / 5 (80.0%):  21%|██        | 5/24 [00:08<00:17,  1.09it/s]
external
externalMetric: 5.00 / 6 (83.3%):  25%|██▌       | 6/24 [00:13<00:40,  2.23s/it]
zero
zeroage Metric: 5.00 / 7 (71.4%):  29%|██▉       | 7/24 [00:14<00:31,  1.83s/it]
zero focalization
Average Metric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:14<00:21,  1.35s/it]zero
zero focalization
internalMetric: 7.00 / 9 (77.8%):  33%|███▎      | 8/24 [00:14<00:21,  1.35s/it]
internal
internalMetric: 8.00 / 10 (80.0%):  42%|███▎    | 10/24 [00:15<00:13,  1.07it

2025/04/24 22:37:58 INFO dspy.evaluate.evaluate: Average Metric: 17 / 24 (70.8%)





2025/04/24 22:38:09 INFO dspy.teleprompt.copro_optimizer: Iteration Depth: 3/3.
2025/04/24 22:38:09 INFO dspy.teleprompt.copro_optimizer: At Depth 3/3, Evaluating Prompt Candidate #1/1 for Predictor 1 of 1.


internal                                                 | 0/24 [00:00<?, ?it/s]
internal
zeroage Metric: 1.00 / 1 (100.0%):   4%|▍        | 1/24 [00:06<02:37,  6.85s/it]
zero
internalMetric: 2.00 / 2 (100.0%):   8%|▊        | 2/24 [00:07<01:11,  3.24s/it]
internal
internalMetric: 3.00 / 3 (100.0%):   8%|▊        | 2/24 [00:07<01:11,  3.24s/it]
zero
externalMetric: 3.00 / 4 (75.0%):  17%|█▋        | 4/24 [00:07<00:25,  1.29s/it]
external
externalMetric: 4.00 / 5 (80.0%):  17%|█▋        | 4/24 [00:07<00:25,  1.29s/it]
external
internalMetric: 5.00 / 6 (83.3%):  25%|██▌       | 6/24 [00:12<00:33,  1.85s/it]
internal
externalMetric: 6.00 / 7 (85.7%):  29%|██▉       | 7/24 [00:14<00:29,  1.75s/it]
zero
internalMetric: 6.00 / 8 (75.0%):  33%|███▎      | 8/24 [00:16<00:30,  1.88s/it]
internal
zeroage Metric: 7.00 / 9 (77.8%):  38%|███▊      | 9/24 [00:19<00:34,  2.32s/it]
zero
zeroage Metric: 8.00 / 10 (80.0%):  42%|███▎    | 10/24 [00:21<00:29,  2.10s/it]
zero
zeroage Metric: 9.00 / 11 (81.

2025/04/24 22:38:53 INFO dspy.evaluate.evaluate: Average Metric: 17 / 24 (70.8%)





In [57]:
compiled_prompt_opt

predictor.predict = Predict(StringSignature(text_snippet -> reasoning, tag
    instructions="# Narrative Focalization Analysis\n\nYou are a literary analysis expert specializing in narrative perspective and focalization. Your task is to carefully analyze the given sentence and determine which type of focalization it demonstrates.\n\n## Focalization Types\nFocalization refers to the perspective through which a narrative is presented. There are three main types:\n\n1. **Internal Focalization**: The narrative is presented through the perceptual lens of a character within the story. The text explicitly shows what a character perceives, thinks, or feels. Look for sensory descriptions, thoughts, emotions, or judgments that clearly belong to a character's perspective.\n\n2. **External Focalization**: The narrative describes events that could be perceived by a character, but without explicitly presenting their internal thoughts or feelings. The narrator observes the character from the outside,

# Test with new Prompt

In [58]:
results = []
for text_snippet in df_anno_test.Absatz:
    response = compiled_prompt_opt(text_snippet=text_snippet)
    print(response.tag)
    results.append(response.tag)

zero focalization
zero focalization
zero
zero focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
zero focalization
zero focalization
external focalization
zero focalization
internal focalization
internal focalization
internal focalization
internal focalization
zero focalization
external focalization
external focalization
external focalization
external focalization
external focalization
internal focalization
internal
internal
internal focalization
internal focalization
internal focalization
zero focalization
external focalization
external focalization
zero focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
internal focalization
external focalization
external
zero 

In [59]:
results_1 = []
for text in results:
    if "internal focalization" in text:
        results_1.append("internal")
    elif "internal" in text:
        results_1.append("internal")
    elif "external focalization" in text:
        results_1.append("external")
    elif "external" in text:
        results_1.append("external")
    elif "zero focalization" in text:
        results_1.append("zero")
    elif "zero" in text:
        results_1.append("zero")
    else:
        results_1.append("NaN")

In [60]:
predictions_dspy = pd.Series(results_1)

In [61]:
predictions_dspy

0          zero
1          zero
2          zero
3          zero
4      internal
         ...   
101    internal
102    external
103    internal
104    internal
105        zero
Length: 106, dtype: object

In [62]:
ground_truth = df_anno_test.Fokalisierung

In [63]:
ground_truth

0          zero
1          zero
2      internal
3          zero
4      internal
         ...   
101    internal
102    internal
103    internal
104    internal
105    internal
Name: Fokalisierung, Length: 106, dtype: object

In [64]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [65]:
pd.DataFrame(list(zip([f1_score(ground_truth, predictions_dspy, average="weighted")],
                      [recall_score(ground_truth, predictions_dspy,  average="weighted")],
                      [precision_score(ground_truth, predictions_dspy, average="weighted")],
                      [accuracy_score(ground_truth, predictions_dspy,)])),
                      columns = ["F1", "Recall", "Precision", "Accuracy"])

Unnamed: 0,F1,Recall,Precision,Accuracy
0,0.753935,0.745283,0.772708,0.745283


In [66]:
compiled_prompt_opt.save('haiku_dspied.pkl')

## Statistischer Test: McNemar

In [67]:
from typing import Optional
from statsmodels.stats.contingency_tables import mcnemar

In [68]:
# Funktion zum Vergleichen der Vorhersagen mit dem Goldstandard
def compare_with_gold(predictions, gold_standard):
    return [pred == gold for pred, gold in zip(predictions, gold_standard)]

In [69]:
# Funktion zum Erstellen einer Kontingenztafel
def create_contingency_table(results_a, results_b):
    both_correct = sum(a and b for a, b in zip(results_a, results_b))
    only_a_correct = sum(a and not b for a, b in zip(results_a, results_b))
    only_b_correct = sum(b and not a for a, b in zip(results_a, results_b))
    both_incorrect = sum(not a and not b for a, b in zip(results_a, results_b))
    return np.array([[both_correct, only_a_correct],
                     [only_b_correct, both_incorrect]])


In [70]:
# McNemar-Test durchführen
def run_mcnemar_test(results_a, results_b):
    table = create_contingency_table(results_a, results_b)
    return mcnemar(table, exact=True)


In [71]:
def generate_mcnemar_comparisons(
    results_gpt4o,
    df_anno,
    num_templates=3,
    runs=[1, 2, 3, 4, 5],
    optimized_predictions: Optional[pd.Series] = None
):
    """
    Führt McNemar-Vergleiche zwischen verschiedenen Prompt-Vorhersagen durch,
    inklusive eines optionalen optimierten Prompts.

    Args:
        results_gpt4o: DataFrame mit Prediction-Spalten für alle Templates/Runs
        df_anno: DataFrame mit Goldstandard ('Fokalisierung')
        num_templates: Anzahl der verwendeten Template-Prompts
        runs: Liste der Runs (z. B. [1, 2])
        optimized_predictions: (optional) pd.Series mit Optimized-Prompt-Vorhersagen

    Returns:
        comparison_df: DataFrame mit allen McNemar-Test-Ergebnissen
    """
    gold = df_anno["Fokalisierung"]
    comparison_results = []

    for run in runs:
        # Vorhersagen der Templates
        predictions = {
            f"T{template_idx+1}": compare_with_gold(
                results_gpt4o[f"Prediction_{template_idx}_Run{run}"], gold
            )
            for template_idx in range(num_templates)
        }

        # Optional: Optimierten Prompt ergänzen
        if optimized_predictions is not None:
            predictions["Optimized"] = compare_with_gold(optimized_predictions, gold)

        # Paarweise Vergleiche
        templates = list(predictions.keys())
        for i in range(len(templates)):
            for j in range(i + 1, len(templates)):
                t1, t2 = templates[i], templates[j]
                result = run_mcnemar_test(predictions[t1], predictions[t2])
                comparison_results.append({
                    "Run": run,
                    "Comparison": f"{t1} vs {t2}",
                    "Prompt_A": t1,
                    "Prompt_B": t2,
                    "p-value": result.pvalue
                })

    comparison_df = pd.DataFrame(comparison_results)
    return comparison_df


In [72]:
comparison_df = generate_mcnemar_comparisons(
    results_gpt4o=results_haiku,
    df_anno=df_anno,
    num_templates=3,
    runs=[1, 2, 3, 4, 5],
    optimized_predictions=predictions_dspy
)

In [73]:
print(comparison_df)

    Run       Comparison Prompt_A   Prompt_B   p-value
0     1         T1 vs T2       T1         T2  0.052479
1     1         T1 vs T3       T1         T3  0.063915
2     1  T1 vs Optimized       T1  Optimized  0.093140
3     1         T2 vs T3       T2         T3  1.000000
4     1  T2 vs Optimized       T2  Optimized  1.000000
5     1  T3 vs Optimized       T3  Optimized  1.000000
6     2         T1 vs T2       T1         T2  0.006611
7     2         T1 vs T3       T1         T3  0.043285
8     2  T1 vs Optimized       T1  Optimized  0.063915
9     2         T2 vs T3       T2         T3  0.250000
10    2  T2 vs Optimized       T2  Optimized  0.454498
11    2  T3 vs Optimized       T3  Optimized  1.000000
12    3         T1 vs T2       T1         T2  0.022656
13    3         T1 vs T3       T1         T3  0.043285
14    3  T1 vs Optimized       T1  Optimized  0.052479
15    3         T2 vs T3       T2         T3  1.000000
16    3  T2 vs Optimized       T2  Optimized  0.803619
17    3  T

In [74]:
comparison_df.to_excel("mcnemar_vergleiche_haiku.xlsx", index=False)

In [75]:
def filter_significant_comparisons(comparison_df, alpha=0.05):
    """
    Gibt alle Vergleichspaare mit p-Wert < alpha (standardmäßig 0.05) zurück.
    
    Args:
        comparison_df: DataFrame mit den Spalten 'Run', 'Comparison', 'Prompt_A', 'Prompt_B', 'p-value'
        alpha: Signifikanzniveau (Default: 0.05)
        
    Returns:
        DataFrame mit nur signifikant unterschiedlichen Prompt-Paaren
    """
    significant_df = comparison_df[comparison_df["p-value"] < alpha].copy()
    return significant_df.sort_values(by=["Run", "p-value"])

In [76]:
significant_comparisons = filter_significant_comparisons(comparison_df)
significant_comparisons

Unnamed: 0,Run,Comparison,Prompt_A,Prompt_B,p-value
6,2,T1 vs T2,T1,T2,0.006611
7,2,T1 vs T3,T1,T3,0.043285
12,3,T1 vs T2,T1,T2,0.022656
13,3,T1 vs T3,T1,T3,0.043285
18,4,T1 vs T2,T1,T2,0.014633
19,4,T1 vs T3,T1,T3,0.014633
20,4,T1 vs Optimized,T1,Optimized,0.03469
24,5,T1 vs T2,T1,T2,0.03469
