In [None]:
from loguru import logger

import pandas as pd

from slt_positional_bias.dataset import generate_merged_data_frame, sort_data_frame, store_df_as_parquet, load_parquet_as_df

# Build the input frame with the project helper (called without arguments).
df = generate_merged_data_frame()

# Pass the merged frame through `sort_data_frame`; the result is the frame that the
# shortening cell below copies and reads row by row.
# NOTE(review): the meaning of the positional arguments 4 and 36 is not visible in this
# notebook -- confirm against `sort_data_frame` and consider naming them as constants.
df_sorted = sort_data_frame(df, 4, 36)

In [None]:
# --- LLM endpoint configuration -------------------------------------------------
# The API key is read from the environment instead of being written into the
# notebook, so it cannot end up in version control or in a shared copy.
import os

from openai import OpenAI

API_URL = "https://api.helmholtz-blablador.fz-juelich.de/v1/"
# Set this before starting the kernel, e.g. `export BLABLADOR_API_KEY=...`.
API_KEY = os.environ.get("BLABLADOR_API_KEY", "")
API_MODEL = "alias-large"

# Fail here with a clear message rather than on the first request of the long
# shortening loop further down.
if not API_KEY:
    raise RuntimeError(
        "No API key configured: set the BLABLADOR_API_KEY environment variable "
        "before running this notebook."
    )

# OpenAI-compatible client pointed at the endpoint configured above.
client = OpenAI(
    api_key=API_KEY,
    base_url=API_URL,
)

In [None]:
import pandas as pd
from tqdm import tqdm

import re

# How many documents should be shortened (upper bound; clamped to the frame size below).
n_to_clean = 1600

# Work on a copy so the sorted source frame keeps the original texts.
df_cleaned = df_sorted.copy()

# System prompt sent with every request.
system_prompt = (
    "You are a language model specialized in content distillation. "
    "Your goal is to reduce passages to around 200 tokens while preserving all critical information. "
    "You must avoid removing any content that could be essential to understanding the topic, its implications, or its technical depth. You should use the same vocabularies used in the original document."
)

# Matches the closing tag of a reasoning block regardless of case. Splitting the
# *original* string on it keeps the casing of the answer intact (lower-casing the
# whole output just to find the tag would also lower-case the shortened text).
think_close_tag = re.compile(r"</think>", flags=re.IGNORECASE)

# Rows are read by position, so they must be written by position as well: after
# sorting, the index labels are not guaranteed to be 0..n-1, and a label-based
# write could hit the wrong row or append a new one.
doc_col_pos = df_cleaned.columns.get_loc('doc')

# Never iterate past the end of the frame when it holds fewer rows than requested.
n_rows = min(n_to_clean, len(df_sorted))

# Shortening loop: one chat-completion request per document.
for idx in tqdm(range(n_rows)):
    row = df_sorted.iloc[idx]
    topic = row['topic']
    original_text = row['doc']

    # NOTE(review): "use the of vocabularies" below is a typo in the prompt text; it is
    # left untouched here because changing the prompt changes what the model receives.
    user_prompt = f"""
    You are given a passage from a document. The document is related to the topic: "{topic}".

    Your task is to shorten the passage to approximately 200 tokens while retaining all essential and topic-relevant information. 
    Focus on preserving technical content, facts, processes, and insights that directly support or explain the topic. 
    You should use the of vocabularies used in the original document.

    Avoid:
    - Structural or editorial elements (like tables of contents, section numbers, or headings)
    - Redundant or general-purpose filler text

    Keep:
    - All technical explanations
    - Relevant data or findings
    - Descriptions of methods, applications, or implications related to the topic

    Input passage:
    \"\"\"{original_text}\"\"\"

    Shortened version (~200 tokens, no critical information lost):
    """.strip()

    # LLM request
    response = client.chat.completions.create(
        model=API_MODEL,
        temperature=0.3,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # `content` may be None (e.g. an empty completion); treat that as empty text
    # instead of failing on `.strip()`.
    raw_output = (response.choices[0].message.content or "").strip()

    # Keep only what follows the last closing reasoning tag, if there is one.
    cleaned_text = think_close_tag.split(raw_output)[-1].strip()

    if not cleaned_text:
        # Do not replace a document with an empty string; keep the original text.
        logger.warning(f"Empty model output for row {idx}; keeping the original text.")
        continue

    # Write the shortened text back into the copy, at the same position it was read from.
    df_cleaned.iloc[idx, doc_col_pos] = cleaned_text

store_df_as_parquet(df_cleaned, "output_cleaned_df")

