In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_lg")

from tqdm import tqdm
tqdm.pandas()   

from src import config
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.utils import cleaning_spacy, cleaning_spacy_batch

tasks_path = config.TASKS_PATH
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
gs_path = config.GS_PATH
lang = 'fra'
task_name = "monolingual"

print("Loading Fact Checks...")
fc = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english")
print("Loading Fact Checks (English + Clean)...")
# fc_eng = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

print("Loading Posts...")
posts = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english")
print("Loading Posts (English + Clean)...")
# posts_eng = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

Loading Fact Checks...
Loading Fact Checks (English + Clean)...
Loading Posts...
Loading Posts (English + Clean)...


In [62]:
df_fc = fc.df
df_train_posts = posts.df_train
df_dev_posts = posts.df_dev

df_train_mini = df_train_posts.iloc[:20]
df_fc_mini = df_fc.iloc[:100]

In [63]:
from datasets import Dataset

ds_fc = Dataset.from_pandas(df_fc)
ds_train = Dataset.from_pandas(df_train_posts)
ds_dev = Dataset.from_pandas(df_dev_posts)

In [64]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.67s/it]
Device set to use cuda:0


In [70]:
from datasets import Dataset
import numpy as np

def get_scores_hf(dataset, post, pipe, terminators):
    """
    Compute similarity scores between a post and fact checks using a Hugging Face dataset.

    Args:
        dataset (Dataset): Hugging Face dataset containing fact checks with a 'full_text' column.
        post (str): The post text to compare against.
        pipe (callable): The model pipeline for generating scores.
        terminators (list): List of token IDs marking the end of the response.

    Returns:
        list: Indices of the dataset fact checks sorted by similarity scores.
    """
    def generate_scores(batch):
        # Prepare messages for the pipeline
        batch_messages = [
            [
                {"role": "system", "content": "You are an expert saying if one text is a fact check of another text. You can only answer with a number from 0 to 1. (e.g. 0.5)"},
                {"role": "user", "content": f"From 0 to 1, rate how much the following text:\n{fc}\nis a fact check of the following text?:\n{post}"}
            ]
            for fc in batch['full_text']
        ]

        # Process in batches
        outputs = pipe(
            batch_messages,
            max_new_tokens=2,
            temperature=0.01,
            eos_token_id=terminators,
            pad_token_id=pipe.tokenizer.eos_token_id
        )

        # Extract scores from the pipeline's output
        batch_scores = []
        for output in outputs:
            try:
                score = float(output[0]["generated_text"][-1]["content"])
            except ValueError:
                score = 0.0
            batch_scores.append(score)
        return {"scores": batch_scores}

    # Apply the function to the dataset
    dataset = dataset.map(generate_scores, batched=True, batch_size=16)

    # Sort indices by scores
    sorted_indices = np.argsort(dataset["scores"])[::-1]
    return sorted_indices

# Example usage
dataset = Dataset.from_pandas(df_fc_mini)
post_0 = df_train_mini.iloc[0]["full_text"]
sorted_indices = get_scores_hf(dataset, post_0, pipe, terminators)


Map: 100%|██████████| 100/100 [00:04<00:00, 24.72 examples/s]


In [71]:
sorted_indices

array([99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86, 85, 84, 83,
       82, 81, 80, 79, 78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66,
       65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
       48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32,
       31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
       14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0])

In [None]:
df_