In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from statsmodels.stats.contingency_tables import mcnemar

In [2]:
from scipy import stats
from itertools import combinations

In [3]:
import openai
from openai import OpenAI

In [4]:
import os

In [5]:
# Load the annotated paragraphs; the gold focalization label lives in the 'Fokalisierung' column.
df_anno = pd.read_csv('DH_2025_Anno_DEU_2nd_run.csv')

In [6]:
# Normalize the German gold labels to the English label names used in the prompts.
# Missing annotations (None / NaN) are mapped to 'zero' as well.
df_anno['Fokalisierung'] = df_anno['Fokalisierung'].replace({
    'intern': 'internal',
    'extern': 'external',
    'null': 'zero',
    None: 'zero',
    np.nan: 'zero'
})

In [7]:
# Preview the first rows to check the loaded columns and the normalized labels.
df_anno.head()

Unnamed: 0,Autor,Titel,Absatz,Fokalisierung,Kommentar
0,Schiller,Der Vebrecher aus verlorener Ehre,In der ganzen Geschichte des Menschen ist kein...,zero,
1,Schiller,Der Vebrecher aus verlorener Ehre,Es ist etwas so Einförmiges und doch wieder so...,zero,
2,Schiller,Der Vebrecher aus verlorener Ehre,"Ich weiß, daß von den besten Geschichtschreibe...",internal,
3,Schiller,Der Vebrecher aus verlorener Ehre,"Der Held muß kalt werden wie der Leser, oder, ...",zero,
4,Tieck,Die beiden merkwürdigsten Tage aus Siegmunds L...,"Es war schon gegen Abend, als ein Wagen vor de...",internal,


In [8]:
# Read the OpenAI API key from the environment so it is never stored in the notebook.
# NOTE(review): os.getenv returns None when MY_OPENAI is unset; nothing here checks for that.
api_key = os.getenv('MY_OPENAI')

In [9]:
def get_completion(prompt, model="o1-mini", client=None):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only (user) message.
        model: Name of the chat model to query.
        client: Optional client object exposing ``chat.completions.create``.
            When omitted, one ``OpenAI`` client built from the module-level
            ``api_key`` is created on first use and reused for later calls,
            instead of constructing a new client for every request.

    Returns:
        The content of the first choice as a string. An empty string is
        returned when the API reports no content, so callers can always treat
        the result as text.
    """
    if client is None:
        # Cache the client on the function object, keyed by the key it was built
        # with, so re-running the api_key cell with a new key is still honoured.
        cached = getattr(get_completion, "_cached_client", None)
        if cached is None or cached[0] != api_key:
            cached = (api_key, OpenAI(api_key=api_key))
            get_completion._cached_client = cached
        client = cached[1]

    response = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": prompt,
        }],
        model=model,
    )
    content = response.choices[0].message.content
    return content if content is not None else ""

In [10]:
# Baseline prompt without a label inventory; it is currently not included in `prompt_templates`.
# The sentence is delimited by three single quotes on each side (a stray fourth closing quote
# was removed). The wording, including the 'explenations' spelling, is deliberately kept
# identical to the other templates so the prompts stay comparable.
prompt_basic = """
### Instruction
Your task is to classify the focalization of the following sentence

###
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}'''
Label:
"""

In [11]:
# Template that lists only the three label names, without definitions.
# `{text}` is filled with the paragraph via str.format.
prompt_labels = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal
- external
- zero

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [12]:
# Template that adds a one-sentence definition for each of the three labels.
# `{text}` is filled with the paragraph via str.format.
prompt_redefin = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text  
Sentence: '''{text}''' 
Label: 
"""

In [13]:
# Same label definitions as `prompt_redefin`, plus one extra sentence telling the model
# that these definitions are redefinitions of the standard understanding of focalization.
prompt_meta = """
### Instruction
Your task is to classify the focalization of the following sentence

### Labels
There are three modes of focalization:
- internal: A text passage is internally focalized precisely when a perceptual process is part of the depicted event and is presented from the perspective of a character.
- external: A text passage is externally focalized precisely when a perceptual process is part of the depicted event and could be presented from the perspective of a character.
- zero: A text passage is zero focalized precisely when circumstances of the narrated world are described as if they were independent of a particular perceptual process of a person or are not possible for a person to perceive synchronously. 
These definitions are redefinitions of the standard understanding of focalization.

####
Only respond with one word representing the mode of focalization, do NOT give explenations or generate more text 
Sentence: '''{text}''' 
Label: 
"""

In [14]:
# Templates to evaluate, in order; the list position becomes the suffix of the
# 'Prediction_<i>' column and of the 'Template_<i>' metric row. `prompt_basic` is not included.
prompt_templates = [prompt_labels, prompt_redefin, prompt_meta]

In [15]:
def _normalize_label(prediction):
    """Reduce a raw model completion to a bare, lowercase label token.

    Takes the first whitespace-separated word, strips surrounding punctuation
    and markdown decoration (e.g. "Internal." or "**zero**") and lowercases it.
    Returns an empty string when the completion is None or contains no word, so
    a bad response is scored as a wrong prediction instead of raising.
    """
    words = (prediction or "").split()
    if not words:
        return ""
    return words[0].strip(".,;:!?\"'`*#()[]{}<>").lower()


def evaluate_prompts_and_predictions(df, prompt_templates):
    """Query the model with every prompt template and score the predictions.

    For each template, every paragraph in ``df['Absatz']`` is inserted into the
    template, sent through ``get_completion``, and the normalized first word of
    the reply is stored in a new column ``Prediction_<i>`` (``i`` is the
    position of the template in ``prompt_templates``). The predictions are then
    compared with the gold labels in ``df['Fokalisierung']``.

    Args:
        df: DataFrame with the columns 'Absatz' (text) and 'Fokalisierung'
            (gold label). It is modified in place: one 'Prediction_<i>' column
            is added (or overwritten) per template.
        prompt_templates: List of format strings containing a ``{text}`` field.

    Returns:
        A tuple ``(df, results_df)``: the same DataFrame object that was passed
        in (now holding the prediction columns), and a DataFrame with one row
        per template containing the weighted F1, recall and precision plus the
        accuracy.
    """
    results = []

    for prompt_idx, template in enumerate(prompt_templates):
        print(f"Verarbeite Prompt-Template {prompt_idx + 1}/{len(prompt_templates)}")

        pred_col = f'Prediction_{prompt_idx}'
        # Reset the column so predictions from an earlier run never leak into the metrics.
        df[pred_col] = None

        for idx, row in df.iterrows():
            prompt = template.format(text=row['Absatz'])

            # Query the model and keep only the normalized label token.
            label = _normalize_label(get_completion(prompt))
            print(label)

            # Written row by row so predictions already obtained survive an interrupted run.
            df.at[idx, pred_col] = label

        gold = df['Fokalisierung']
        predicted = df[pred_col]
        results.append({
            'Prompt': f'Template_{prompt_idx}',
            'F1-Score': f1_score(gold, predicted, average='weighted'),
            'Recall': recall_score(gold, predicted, average='weighted'),
            'Precision': precision_score(gold, predicted, average='weighted'),
            'Accuracy': accuracy_score(gold, predicted)
        })

    results_df = pd.DataFrame(results)

    return df, results_df

In [16]:
# The function returns (frame with predictions, metrics frame), so `results_o1mini` holds the
# per-paragraph predictions and `test` the per-template metrics. df_anno itself is modified in
# place and gains the 'Prediction_<i>' columns.
results_o1mini, test = evaluate_prompts_and_predictions(df_anno, prompt_templates)

Verarbeite Prompt-Template 1/3
zero
zero
internal
zero
external
external
external
internal
internal
internal
internal
internal
internal
internal
internal
internal
zero
external
zero
internal
internal
internal
internal
internal
internal
external
internal
external
external
zero
internal
external
internal
internal
internal
internal
zero
external
external
zero
internal
internal
internal
internal
internal
internal
internal
internal
external
zero
internal
external
internal
internal
internal
internal
external
external
external
zero
zero
zero
external
external
external
internal
external
external
zero
zero
external
zero
external
external
internal
internal
internal
internal
internal
internal
external
zero
zero
external
zero
external
external
external
external
external
external
external
external
external
external
internal
external
internal
external
zero
external
external
external
internal
external
zero
Verarbeite Prompt-Template 2/3
zero
zero
zero
internal
external
internal
internal
internal
inte

In [17]:
# Persist the per-paragraph predictions as JSON records (note: the file name has no extension).
results_o1mini.to_json("DH_o1mini_results", orient="records", indent=4, force_ascii=False)

In [18]:
# Per-template metrics: weighted F1 / recall / precision and accuracy.
test

Unnamed: 0,Prompt,F1-Score,Recall,Precision,Accuracy
0,Template_0,0.586123,0.566038,0.64289,0.566038
1,Template_1,0.635171,0.622642,0.657107,0.622642
2,Template_2,0.621288,0.622642,0.629142,0.622642


## Statistischer Test 1: McNemar

In [19]:
def compare_with_gold(predictions, gold_standard):
    """Compare predictions with the gold standard, instance by instance.

    The two sequences are paired positionally (pairing stops at the shorter
    one). Returns a list with one entry per pair: True where the prediction
    equals the gold label, False otherwise.
    """
    matches = []
    for predicted, expected in zip(predictions, gold_standard):
        matches.append(predicted == expected)
    return matches

In [20]:
# Per-paragraph correctness (True/False) for each template's predictions against the gold labels.
# results_1/2/3 correspond to the columns Prediction_0/1/2.
results_1 = compare_with_gold(results_o1mini["Prediction_0"], df_anno["Fokalisierung"])
results_2 = compare_with_gold(results_o1mini["Prediction_1"], df_anno["Fokalisierung"])
results_3 = compare_with_gold(results_o1mini["Prediction_2"], df_anno["Fokalisierung"])

In [21]:
def create_contingency_table(results_a, results_b):
    """Cross-tabulate two per-instance correctness vectors into a 2x2 table.

    The inputs are paired positionally. The returned numpy array is laid out as

        [[both correct,   only A correct],
         [only B correct, both incorrect]]
    """
    counts = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }
    # One pass over the pairs; each pair falls into exactly one of the four cells.
    for a_ok, b_ok in zip(results_a, results_b):
        counts[(bool(a_ok), bool(b_ok))] += 1

    first_row = [counts[(True, True)], counts[(True, False)]]
    second_row = [counts[(False, True)], counts[(False, False)]]
    return np.array([first_row, second_row])

In [22]:
def run_mcnemar_test(results_a, results_b):
    """Run McNemar's test (exact variant) on two per-instance correctness vectors.

    The vectors are first cross-tabulated with `create_contingency_table`; the
    result object of statsmodels' `mcnemar` is returned unchanged.
    """
    return mcnemar(create_contingency_table(results_a, results_b), exact=True)

In [23]:
# Pairwise McNemar tests between the three templates (no multiple-comparison correction is applied).
test_1_2 = run_mcnemar_test(results_1, results_2)
test_1_3 = run_mcnemar_test(results_1, results_3)
test_2_3 = run_mcnemar_test(results_2, results_3)

In [24]:
# Report the p-values. Templates are numbered from 1 here, i.e. "Template 1/2/3"
# corresponds to the columns Prediction_0/1/2 and the metric rows Template_0/1/2.
print("McNemar-Test Ergebnisse:")
print(f"Template 1 vs Template 2: p-Wert = {test_1_2.pvalue}")
print(f"Template 1 vs Template 3: p-Wert = {test_1_3.pvalue}")
print(f"Template 2 vs Template 3: p-Wert = {test_2_3.pvalue}")

McNemar-Test Ergebnisse:
Template 1 vs Template 2: p-Wert = 0.4050322461407632
Template 1 vs Template 3: p-Wert = 0.37708558747544885
Template 2 vs Template 3: p-Wert = 1.0


## Statistischer Test 2: Gepaarter t-Test und Wilcoxon-Vorzeichen-Rang-Test

In [25]:
def analyze_statistical_significance(df, results_df):
    """Test pairwise whether two prompt templates differ in per-instance correctness.

    For every pair of templates, each instance is scored 1 if the prediction
    equals the gold label and 0 otherwise (this is per-instance accuracy, not
    an F1 score). The two 0/1 vectors are compared with a paired t-test and a
    Wilcoxon signed-rank test.

    Args:
        df: DataFrame with the gold labels in 'Fokalisierung' and one
            'Prediction_<i>' column per template.
        results_df: Metrics DataFrame whose 'Prompt' column holds names of the
            form 'Template_<i>'; the number after the underscore selects the
            matching 'Prediction_<i>' column.

    Returns:
        DataFrame with one row per template pair: the comparison label, the
        difference of the mean correctness (first minus second), both p-values,
        and two flags that are True when at least one of the two tests is
        below 0.05 / 0.01.
    """
    # Imported locally under a distinct name: the notebook later rebinds the global
    # name `stats` to a DataFrame, which would break `stats.ttest_rel` on a re-run.
    from scipy import stats as scipy_stats

    gold = df['Fokalisierung']
    test_results = []

    for prompt1, prompt2 in combinations(results_df['Prompt'], 2):
        # 'Template_<i>' -> <i>, used to pick the prediction column.
        idx1 = int(prompt1.split('_')[1])
        idx2 = int(prompt2.split('_')[1])

        # Per-instance correctness: 1 where the prediction matches the gold label, else 0.
        correct_1 = (df[f'Prediction_{idx1}'] == gold).astype(int).to_numpy()
        correct_2 = (df[f'Prediction_{idx2}'] == gold).astype(int).to_numpy()

        if np.array_equal(correct_1, correct_2):
            # No instance is scored differently, so neither test is defined
            # (zero variance / no non-zero differences). Report p = 1.0: there
            # is no evidence of a difference.
            t_pvalue = 1.0
            w_pvalue = 1.0
        else:
            _, t_pvalue = scipy_stats.ttest_rel(correct_1, correct_2)
            _, w_pvalue = scipy_stats.wilcoxon(correct_1, correct_2)

        mean_diff = np.mean(correct_1) - np.mean(correct_2)

        p_values = [t_pvalue, w_pvalue]
        test_results.append({
            'Prompt_Comparison': f'{prompt1} vs {prompt2}',
            'Mean_Difference': mean_diff,
            'T_Test_p_value': t_pvalue,
            'Wilcoxon_p_value': w_pvalue,
            'Significant_0.05': any(p < 0.05 for p in p_values),
            'Significant_0.01': any(p < 0.01 for p in p_values)
        })

    return pd.DataFrame(test_results)

In [26]:
# NOTE(review): this rebinds `stats`, which an earlier cell imported as the scipy.stats module.
# After this cell has run, `stats` is the result DataFrame, not the module.
stats = analyze_statistical_significance(df_anno, test)

In [27]:
# Show the pairwise significance table.
stats

Unnamed: 0,Prompt_Comparison,Mean_Difference,T_Test_p_value,Wilcoxon_p_value,Significant_0.05,Significant_0.01
0,Template_0 vs Template_1,-0.056604,0.319609,0.317311,False,False
1,Template_0 vs Template_2,-0.056604,0.290992,0.288844,False,False
2,Template_1 vs Template_2,0.0,1.0,1.0,False,False
