In [2]:
from difflib import SequenceMatcher
import pandas as pd
import re

In [6]:
# Read the four model-output CSVs into DataFrames. The file names pair a
# prompting setup (zeroshot / oneshot) with a model (llama3 / gemma2).
df_zero_llama3, df_one_llama3, df_zero_gemma2, df_one_gemma2 = map(
    pd.read_csv,
    [
        'output_zeroshot_llama3.csv',
        'output_oneshot_llama3.csv',
        'output_zeroshot_gemma2.csv',
        'output_oneshot_gemma2.csv',
    ],
)

In [3]:
def clean_text(text):
    """Normalise ``text`` for comparison.

    Removes every ``<<<`` and ``>>>`` marker in a single left-to-right pass,
    trims surrounding whitespace, and lower-cases the result.
    """
    without_markers = re.sub(r'<<<|>>>', '', text)
    trimmed = without_markers.strip()
    return trimmed.lower()

def extract_aligned_prediction(mt_text, raw_output, threshold=0.8):
    """Return the line of ``raw_output`` that best matches ``mt_text``.

    ``mt_text`` and every line of ``raw_output`` are normalised with
    ``clean_text`` and compared with ``difflib.SequenceMatcher.ratio()``.

    Args:
        mt_text: Reference text to look for.
        raw_output: Multi-line text whose lines are the candidates.
        threshold: Minimum similarity ratio (inclusive) the best line must
            reach to be returned. Defaults to 0.8.

    Returns:
        The best-scoring line with surrounding whitespace stripped (markers
        and letter case are left as they appear in ``raw_output``), or None
        when either argument is not a string (such as the float NaN pandas
        uses for a missing cell) or no line reaches ``threshold``.
    """
    if not isinstance(mt_text, str) or not isinstance(raw_output, str):
        return None

    cleaned_mt = clean_text(mt_text)

    best_match = None
    best_ratio = 0.0

    for line in raw_output.strip().split('\n'):
        # autojunk=False: difflib's automatic junk heuristic marks frequently
        # repeated items as junk once the second sequence has 200+ items. On
        # character sequences that covers most letters of a long line and can
        # push the ratio of a genuine match far below the threshold.
        matcher = SequenceMatcher(
            None, cleaned_mt, clean_text(line), autojunk=False
        )
        ratio = matcher.ratio()

        # Strict '>' keeps the earliest line when several lines tie.
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = line.strip()

    return best_match if best_ratio >= threshold else None


In [7]:
# Add a 'prediction_hall' column: for each row, the line of 'raw_output' that
# extract_aligned_prediction aligns with 'mt_text' (None when it returns none).
df_one_gemma2['prediction_hall'] = df_one_gemma2.apply(
    lambda record: extract_aligned_prediction(
        record['mt_text'],
        record['raw_output'],
    ),
    axis='columns',
)


In [9]:
# Add a 'prediction_hall' column: for each row, the line of 'raw_output' that
# extract_aligned_prediction aligns with 'mt_text' (None when it returns none).
df_one_llama3['prediction_hall'] = df_one_llama3.apply(
    lambda record: extract_aligned_prediction(
        record['mt_text'],
        record['raw_output'],
    ),
    axis='columns',
)

In [11]:
# Add a 'prediction_hall' column: for each row, the line of 'raw_output' that
# extract_aligned_prediction aligns with 'mt_text' (None when it returns none).
df_zero_llama3['prediction_hall'] = df_zero_llama3.apply(
    lambda record: extract_aligned_prediction(
        record['mt_text'],
        record['raw_output'],
    ),
    axis='columns',
)

In [12]:
# Add a 'prediction_hall' column: for each row, the line of 'raw_output' that
# extract_aligned_prediction aligns with 'mt_text' (None when it returns none).
df_zero_gemma2['prediction_hall'] = df_zero_gemma2.apply(
    lambda record: extract_aligned_prediction(
        record['mt_text'],
        record['raw_output'],
    ),
    axis='columns',
)

In [20]:
import re

def extract_prediction_label(output_text):
    """Return the first known hallucination label found in ``output_text``.

    The text is lower-cased and the candidate labels are tried in a fixed
    order (no, small, partial, full). The first label whose pattern matches
    anywhere in the text is returned, so when several labels are present the
    list order decides which one wins, not their position in the text.

    Returns None when ``output_text`` is not a string or no label matches.
    """
    if not isinstance(output_text, str):
        return None

    lowered = output_text.lower()
    candidates = (
        "no_hallucination",
        "small_hallucination",
        "partial_hallucination",
        "full_hallucination",
    )

    # Pattern: optional number + optional separator + label.
    return next(
        (
            label
            for label in candidates
            if re.search(rf"(\d+[\._]?\s*)?{label}", lowered)
        ),
        None,
    )


In [21]:
# Collect the four frames so they can be processed in one loop.
df_list = [
    df_one_gemma2,
    df_one_llama3,
    df_zero_gemma2,
    df_zero_llama3,
]

# Derive 'prediction_label' from each frame's 'raw_output' column; the frames
# are modified in place.
for df in df_list:
    df['prediction_label'] = df['raw_output'].map(extract_prediction_label)

In [24]:
# File-name suffixes, listed in the same order as the frames in df_list.
df_names = ['one_gemma2', 'one_llama3', 'zero_gemma2', 'zero_llama3']

# Write each frame to 'clean_output_<name>.csv' without the index column.
for name, df in zip(df_names, df_list):
    output_path = f'clean_output_{name}.csv'
    df.to_csv(output_path, index=False)