In [None]:
import os
import pandas as pd
from tqdm import tqdm
from tenacity import retry, wait_random_exponential, stop_after_attempt
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline
)
import torch
import zipfile

In [None]:
# Fetch the CLEF 2025 CheckThat! Lab repository; INPUT_CSV (config cell) points
# at a file inside this checkout under /content/clef2025-checkthat-lab.
# NOTE(review): this assumes the notebook's working directory is /content
# (e.g. Google Colab) so the clone lands where INPUT_CSV expects it — confirm.
!git clone https://gitlab.com/checkthat_lab/clef2025-checkthat-lab.git

In [None]:
# --- Run configuration ------------------------------------------------------
# Test posts to normalize (read from the cloned CheckThat! Lab checkout).
# NOTE(review): the input file is "test-te.csv" while the outputs are named
# "task2_ell" and the generation prompt is written in Greek — confirm that the
# intended language file is being read.
INPUT_CSV = "/content/clef2025-checkthat-lab/task2/data/test/test-te.csv"

# Submission artefacts: the CSV of predictions and the zip that wraps it.
OUTPUT_CSV = "task2_ell.csv"
OUTPUT_ZIP = "task2_ell.zip"

# Hugging Face checkpoint used for generation.
# Previously tried alternative: 'facebook/mbart-large-50-many-to-many-mmt'
MODEL_NAME = "Qwen/Qwen2.5-0.5B"

In [None]:
# Load the test posts and fail fast if the expected text column is missing,
# since every later cell reads df['post'].
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)
has_post_column = 'post' in df.columns
assert has_post_column, "CSV must have 'post' column"
# Preview the first rows as the cell output.
df.head(n=5)

In [None]:
# Load the tokenizer and causal LM for the configured checkpoint.
# NOTE(review): trust_remote_code=True lets the Hub repository run its own
# Python code at load time — confirm it is actually required for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device_id = 0
else:
    device_id = -1

# Greedy decoding (no sampling), capped at 128 newly generated tokens.
generation_options = {
    'do_sample': False,
    'max_new_tokens': 128,
}
generator = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device_id,
    **generation_options,
)

@retry(wait=wait_random_exponential(min=1, max=10),
       stop=stop_after_attempt(3))
def gerar_normalized_claim(post_text: str) -> str:
    """Generate a normalized claim for a single social-media post.

    Wraps ``post_text`` in a Greek instruction prompt, runs the module-level
    ``generator`` text-generation pipeline on it and returns only the text
    the model produced after the prompt.

    The tenacity decorator re-invokes the function when it raises, for at
    most 3 attempts, waiting a randomized exponential back-off (configured
    with ``min=1``, ``max=10``) between attempts.

    Args:
        post_text: Raw text of the post to summarize.

    Returns:
        The generated continuation with surrounding whitespace stripped.

    Raises:
        Exception: Whatever tenacity raises once all attempts have failed
            (with the decorator's default settings this is expected to be
            ``tenacity.RetryError`` wrapping the last failure).
    """
    # Must be an f-string: as a plain string the literal text "{post_text}"
    # was sent to the model and the actual post was never used.
    prompt = (
        f"""Σου δίνεται μια ανοργάνωτη και ανεπίσημη ανάρτηση στα κοινωνικά δίκτυα.
Περίληψέ την σε μια σαφή και συνοπτική δήλωση, χωρίς να προσθέσεις επιπλέον πληροφορίες.
Ανάρτηση: {post_text}
Κανονικοποιημένη δήλωση:"""

    )
    # return_full_text=False: by default the pipeline returns prompt + completion,
    # which would put the whole instruction text into the "normalized claim".
    out = generator(prompt, return_full_text=False)
    return out[0]['generated_text'].strip()


In [None]:
# Normalize every post. Exactly one entry is appended per input row (an empty
# string on failure) so the list stays aligned with df when it is assigned as
# a column below.
normalized = []
for post in tqdm(df['post'], desc="Processing"):
    try:
        normalized.append(gerar_normalized_claim(post))
    except Exception as e:
        # str(post): a missing CSV cell is read by pandas as a float NaN, and
        # slicing it directly would raise TypeError inside this handler and
        # abort the whole loop instead of logging and continuing.
        tqdm.write(f"Error in post [{str(post)[:30]}…]: {e}")
        normalized.append("")

df['normalized claim'] = normalized

In [None]:
# Write the two submission columns to CSV (without the index column), then
# wrap that CSV in a deflate-compressed zip archive.
submission = df[['post', 'normalized claim']]
submission.to_csv(OUTPUT_CSV, index=False)

with zipfile.ZipFile(OUTPUT_ZIP, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(OUTPUT_CSV)

print(f"✅ Generated file: {OUTPUT_ZIP}")
