# Load Data

In [1]:
#!/usr/bin/env python
from tqdm import tqdm
import json

import sys
import os
# sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from datetime import datetime
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.models import EmbeddingModel

current_time = datetime.now().strftime("%Y%m%d-%H%M%S")

tasks_path = "data/complete_data/tasks.json"
posts_path = "data/complete_data/posts.csv"
fact_checks_path = "data/complete_data/fact_checks.csv"
gs_path = "data/complete_data/pairs.csv"
output_path = "data/out"
output_path = os.path.join(output_path, __name__, current_time)
if not os.path.exists(output_path):
    os.makedirs(output_path)
    
langs = ['fra', 'spa', 'eng', 'por', 'tha', 'deu', 'msa', 'ara']
model_name = '/home/bsc/bsc830651/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb'

d_out = {}
# for lang in tqdm(langs, desc="Languages"):

posts = TextConcatPosts(posts_path, tasks_path, task_name="monolingual", gs_path=gs_path, lang="spa")
fact_checks = TextConcatFactCheck(fact_checks_path, tasks_path, task_name="monolingual", lang="spa")

df_fc = fact_checks.df
df_posts_train = posts.df_train
df_posts_dev = posts.df_dev


  from tqdm.autonotebook import tqdm, trange


In [2]:
# df_fc.head()

In [3]:
# df_posts_train.head()

In [4]:
# df_fc

# Create a subset for demos

In [22]:
import pandas as pd
df_posts_train_mini = df_posts_train.head(25)
ls_fcs = df_posts_train_mini.gs.sum()

print(len(set(ls_fcs)))

df_fc_train_mini = df_fc[df_fc.index.isin(ls_fcs)]
df_fc_train_negatives = df_fc[~df_fc.index.isin(ls_fcs)].sample(n=len(df_fc_train_mini))
df_fc_train_mini = pd.concat([df_fc_train_mini, df_fc_train_negatives], axis=0).sample(frac=1.0, random_state=42)
# df_fc_train_mini["full_text"] = df_fc_train_mini.apply(lambda x: f"(INDEX {x.name}) " + x["full_text"], axis=1)
print(df_fc_train_mini.shape)
df_fc_train_mini.head()

23
(46, 4)


Unnamed: 0_level_0,claim,instances,title,full_text
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107584,Poner el aire acondicionado de un coche sin ab...,[https://www.newtral.es/vuelve-el-bulo-que-adv...,Vuelve el bulo del benceno y el aire acondicio...,vuelve el bulo del benceno y el aire acondicio...
136751,Un bebé de 7 meses en una clínica de abortos,[https://factual.afp.com/no-esta-imagen-no-mue...,"No, esta imagen no muestra un feto de siete me...","no, esta imagen no muestra un feto de siete me..."
22976,Algo grave está ocurriendo en Argentina,[https://factual.afp.com/la-desinformacion-de-...,La desinformación de un video viral sobre la p...,la desinformación de un video viral sobre la p...
54301,Este es un tuit de Ingrid Betancourt contra Pi...,[https://factual.afp.com/doc.afp.com.328Z7TP#5...,Tuit de la candidata colombiana Ingrid Betanco...,tuit de la candidata colombiana ingrid betanco...
81905,Las mascarillas quirúrgicas contienen teflón q...,[https://www.newtral.es/bulo-mascarillas-quiru...,"No, las mascarillas quirúrgicas homologadas no...","no, las mascarillas quirúrgicas homologadas no..."


# Instance model

In [23]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# messages = [
#     {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#     {"role": "user", "content": "Who are you?"},
# ]
# outputs = pipe(
#     messages,
#     max_new_tokens=256,
#     temperature=0.9,
#     eos_token_id=terminators,

# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.71s/it]


# Inferences

In [31]:
import numpy as np

post_0 = df_posts_train_mini.iloc[0]["full_text"]
fcs = df_fc_train_mini["full_text"].tolist()
str_fcs = "\n".join(fcs)

def get_scores(post, fcs):
    scores = []
    for i in range(len(fcs)):
        fc_i = fcs[i]
        messages = [
        {"role": "system", "content": "You are an expert saying if one text is a fact check of another text. You can only answer with a number from 0 to 1. (e.g. 0.5)"},
        {"role": "user", "content": "From 0 to 1, rate how much the following text:\n" + fc_i + "\nis a fact check of the following text?:\n" + post},
        ]

        outputs = pipe(
            messages,
            max_new_tokens=256,
            temperature=0.9,
            eos_token_id=terminators,
            pad_token_id=pipe.tokenizer.eos_token_id

        )

        score = outputs[0]["generated_text"][-1]["content"]
        try:
            score = float(score)
        except:
            score = 0.0
            
        scores.append(score)
    
    idxs_scores = np.argsort(scores)
    idxs = df_fc_train_mini.iloc[idxs_scores].index
    
    return idxs

idxs = get_scores(post_0, fcs)

In [32]:
idxs

Index([ 22976,  50769,  37261,  81905,  50712,  50470,  64138,  89415, 195200,
        49970,  85559,  35570,  80126,  59446,  52752,  52561,  94446,  61806,
         3332,   3857,  91520, 148668,  37995, 102320, 153875, 100265,  80729,
       147754,  74481,  50916,  84406, 107584,  80744,  64137, 114073,  52407,
        84148,  64144, 136751,  51854,   3700,   7173,  56968,  84361, 101118,
        54301],
      dtype='int64', name='fact_check_id')

In [26]:
import os
os.cpu_count()

160

In [33]:
import pandas as pd
import swifter
from tqdm.auto import tqdm
tqdm.pandas()

# Copy DataFrame
df_posts_train_mini_out = df_posts_train_mini.copy()

# Apply function with progress
df_posts_train_mini_out["candidates"] = df_posts_train_mini_out["full_text"].progress_apply(lambda x: get_scores(x, fcs))
df_posts_train_mini_out["candidates"] = df_posts_train_mini_out["candidates"].apply(lambda x: list(x[::-1]))

100%|██████████| 25/25 [01:06<00:00,  2.68s/it]


In [45]:
df_posts_train_mini_out.apply(lambda x: len(list(set(x["gs"]).intersection(set(x["candidates"][:10])))) > 0, axis=1).mean()

np.float64(0.52)

In [10]:
import pandas as pd
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable tokenizers parallelism
# Function to apply in parallel with progress tracking
def apply_with_progress(df, func, fcs):
    # Initialize the progress bar
    total = len(df)
    tqdm_iterator = tqdm(total=total, desc="Processing", unit="item")

    # Define a function to execute and update progress
    def execute_and_update(text):
        result = func(text, fcs)
        tqdm_iterator.update(1)  # Update the progress bar
        return result

    # Use ProcessPoolExecutor for parallel execution
    with ProcessPoolExecutor() as executor:
        # Submit tasks to the executor
        futures = {executor.submit(execute_and_update, text): i for i, text in enumerate(df["full_text"])}

        # Collect results as they complete
        results = [None] * total
        for future in tqdm(as_completed(futures), total=total, desc="Processing", unit="item"):
            index = futures[future]
            results[index] = future.result()

    tqdm_iterator.close()  # Close the progress bar after completion
    return results

# Copy DataFrame
df_posts_train_mini_out = df_posts_train_mini.copy()

# Apply function with progress
df_posts_train_mini_out["scores"] = apply_with_progress(df_posts_train_mini_out, get_scores, fcs)



Processing:   0%|          | 0/50 [00:00<?, ?item/s]


AttributeError: Can't pickle local object 'apply_with_progress.<locals>.execute_and_update'

In [16]:
import swifter
from tqdm.auto import tqdm

# Set up progress bar
tqdm.pandas(desc="my bar!")

# Copy DataFrame
df_posts_train_mini_out = df_posts_train_mini.copy()

# Apply function in parallel using swifter
df_posts_train_mini_out["scores"] = df_posts_train_mini_out.swifter.progress_bar(True).apply(lambda x: get_scores(x["full_text"], fcs), axis=1)


KeyboardInterrupt: 

In [11]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.47s/it]


In [14]:
from transformers import DataCollatorForSeq2Seq, pipeline
import torch
from tqdm import tqdm

# Create the data collator for padding and batching
data_collator = DataCollatorForSeq2Seq(
    tokenizer=pipe.tokenizer,
    model=pipe.model,
    padding=True,
    max_length=512,  # Adjust this based on the max input length your model can handle
    return_tensors="pt",  # Use PyTorch tensors for efficient GPU processing
)

# Prepare the data for batching
def prepare_data(post, fcs):
    message_pairs = []
    for fc_i in fcs:
        system_msg = "You are an expert saying if one text is a fact check of another text. You can only answer with a number from 0 to 1. (e.g. 0.5)"
        user_msg = f"From 0 to 1, rate how much the following text:\n{fc_i}\nis a fact check of the following text?:\n{post}"

        # Tokenize the message pair (system and user messages)
        pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
        tokenized_input = pipe.tokenizer(
            system_msg + " " + user_msg,
            truncation=True,
            return_tensors="pt"
        )
        message_pairs.append(tokenized_input)
    return message_pairs

# Update get_scores to process data in batches
def get_scores(post, fcs, batch_size=8):
    # Prepare the data
    tokenized_messages = prepare_data(post, fcs)

    # Split into batches
    scores = []
    for i in tqdm(range(0, len(tokenized_messages), batch_size)):
        print(tokenized_messages)
        batch = tokenized_messages[i:i + batch_size]
        # print(batch[0]["input_ids"].reshape(-1).shape)
        # Use data collator to pad the batch
        inputs = data_collator(batch)

        # Move inputs to the appropriate device (GPU/CPU)
        inputs = {key: val.to(pipe.model.device) for key, val in inputs.items()}

        # Perform the inference in batch
        outputs = pipe(
            inputs['input_ids'],
            max_new_tokens=256,
            temperature=0.9,
            eos_token_id=terminators,
            pad_token_id=pipe.tokenizer.eos_token_id
        )

        # Extract and process the scores
        for output in outputs:
            score = output["generated_text"][-1]["content"]
            try:
                score = float(score)
            except:
                score = 0.0
            scores.append(score)

    return scores

# Use the get_scores function to get fact-checking scores
post_0 = df_posts_train_mini.iloc[0]["full_text"]
fcs = df_fc_train_mini["full_text"].tolist()
scores = get_scores(post_0, fcs)



  0%|          | 0/13 [00:00<?, ?it/s]




[{'input_ids': tensor([[128000,   2675,    527,    459,   6335,   5605,    422,    832,   1495,
            374,    264,   2144,   1817,    315,   2500,   1495,     13,   1472,
            649,   1193,   4320,    449,    264,   1396,    505,    220,     15,
            311,    220,     16,     13,    320,     68,   1326,     13,    220,
             15,     13,     20,      8,   5659,    220,     15,    311,    220,
             16,     11,   4478,   1268,   1790,    279,   2768,   1495,    512,
           4355,  87251,    938,   5697,  86285,   3233,     64,  45815,  55988,
             64,   1541,  12712,    409,  29340,    265,    409,   9467,    359,
           5670,  32150,    658,  84117,     12,    777,   1208,  87251,    938,
           5697,    409,  86285,   3244,    912,  55988,     64,   1541,  12712,
            409,  29340,    265,    409,   9467,    359,   5670,  32150,    658,
          84117,     12,    777,    198,    285,    264,   2144,   1817,    315,
            2

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [128]:
import re

# Step 1: System message template
system_msg = {
    "role": "system", 
    "content": "You are an expert saying if one text is a fact check of another text. You can EXCLUSIVELY answer with a number from 0 to 1. (e.g. 0.5)"
}

# Step 2: Create message pairs for each fact check comparison
ls_user_msg = [
    [
        system_msg,  # The system message remains the same for all pairs
        {
            "role": "user", 
            "content": f"From 0 to 1, rate how much the following text:\n{fc_i}\nis a fact check of the following text?:\n{post_0}"
        }
    ]
    for fc_i in fcs  # Assumes fcs is a list of fact-check texts
]

# Step 3: Run the pipeline for each message pair
outputs = pipe(
    ls_user_msg,  # The batch of message pairs
    max_new_tokens=256,
    temperature=0.9,
    eos_token_id=terminators,  # Custom end-of-sequence token IDs
    pad_token_id=pipe.tokenizer.eos_token_id  # Padding token
)

nums = [re.search(pattern="\d+\.\d+", string=output[0]["generated_text"][-1]["content"]) for output in outputs]
# Outputs will contain the model's responses to each fact-check comparison.
scores = [float(num.group(0)) if num is not None else 0.0 for num in nums]
print(scores)


: 

In [112]:
for output in outputs:
    print(output[0]["generated_text"][-1]["content"])

0.8
0.05
0.05
0.8
0.3
0.2
0.6
0.85
0.8
0.6
0.05
0.4
0.8
0.
0
0.3
0.2
0.7
0.
0
0.2
0.7
0
0.2
0.8
0.6
0.25
0.85
0
0.3
0.8
0.8
0.2
0
0.0
0.75
0.2
0
0.75
0.2
0
0.0

Nota: El primer texto parece ser un anuncio publicitario para un supuesto suplemento o remedio para la diabetes tipo 2, mientras que el segundo texto parece ser un llamado a la acción para apoyar una causa específica. No hay relación lógica entre los dos textos, por lo que no parece ser una fact-check de uno a otro.
0.0
0.9
0.2
0
0.2
0.8
0.0
0
0.
0
0.8
0.6
0
0.8
0.05
0.7
0.2
0
0.2
0.5
0.
0
0.01
0.
0.8
0.7
0.7
0.0
0.7
0.7
0.0
0.7
0.05
0.6
0.
0.3
0
0.8
0.
0.2
0.3
0.7
0.
0
0.2
0.85
0.3
0.75
0.8
0.3
0.8
0.1
0
0.
0.4
0.8
0.2
0.8
0.2
0.05


In [104]:
outputs[2][0]["generated_text"][-1]["content"]

'0.05'

In [94]:
len(outputs)

2

In [80]:
outputs[0]["generated_text"]

[{'role': 'system',
  'content': 'You are an expert saying if one text is a fact check of another text. You can only answer with a number from 0 to 1. (e.g. 0.5)'},
 {'role': 'user',
  'content': 'From 0 to 1, rate how much the following text:\nla cruz roja japonesa sí acepta donaciones de sangre de vacunados contra el covid-19 la cruz roja de japón no acepta donaciones de sangre de vacunados contra el covid-19\nis a fact check of the following text?:\n"Bienaventurados los perseguidos por mi causa " Ellos alcanzarán el cielo. Son religiosas apresadas por defender a los niños presos en jaulas, hijos de los emigrantes detenidos en los EEUU, para deportarlos. Ayuden, conpartiendo este hecho, ya que ningun periódico lo da a conocer. '},
 {'role': 'user',
  'content': 'From 0 to 1, rate how much the following text:\nde nuevo, the new york times no publicó que duque tiene 10 billones de pesos en paraísos fiscales the new york times publicó que duque tenía dinero en paraísos fiscales\nis a fa

In [70]:
scores = [float(sc) for sc in scores]

df_fc_train_mini_out = df_fc_train_mini.copy()
df_fc_train_mini_out["score"] = scores
df_fc_train_mini_out.sort_values("score", ascending=False).iloc[:50].index

Index([ 79869, 126734,  66224,  93801,  80107, 114073,  51355,  66416, 137103,
        80062,  65106,  50769, 110778,  53353,  38931,  35570,  53969,  50639,
        64138, 101108, 148668,  73029,  89069, 100265, 196380,  22736,  84617,
        60098, 110837,  52407,  53363,  23369,  52463,  80744,  80572,  61806,
        65641,  59401,  54221,  52932, 114800,  37995,  84582,  70127,  52561,
       147300,  56968,  84564, 153875,  35511],
      dtype='int64', name='fact_check_id')

In [66]:
df_posts_train_mini.iloc[0]

ocr          "Bienaventurados los perseguidos por mi causa ...
verdicts                                                      
text                                                          
fb                                                           1
tw                                                           0
ig                                                           0
full_text    "Bienaventurados los perseguidos por mi causa ...
gs                                                     [80729]
Name: 4, dtype: object

In [57]:
str_fcs = "\n".join(fcs)

messages = [{"role": "system", "content": "You are an expert saying if one text is a fact check of another text"},
            {"role": "system", "content": "your context is the following list of indexed fact checks:\n" + str_fcs},

                {"role": "user", "content": "Provide the indexes of the 10 fact-checks most likely to be associated to:\n" + post_0},
                ]

outputs = pipe(messages, max_new_tokens=10000)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [58]:
post_0

'"Bienaventurados los perseguidos por mi causa " Ellos alcanzarán el cielo. Son religiosas apresadas por defender a los niños presos en jaulas, hijos de los emigrantes detenidos en los EEUU, para deportarlos. Ayuden, conpartiendo este hecho, ya que ningun periódico lo da a conocer. '

In [59]:
outputs[0]["generated_text"][-1]["content"]

'Después de analizar los fact-checks proporcionados, he identificado los 10 con los que se asocia más con el texto proporcionado:\n\n1. **INDEX 137124**: La imagen de un fotógrafo siendo apuntado es de 2009, sin relación con las protestas en colombia.\n2. **INDEX 50769**: La frase “coman lo que quieran en semana santa” no la dijo el cura uruguayo “gordo” verde.\n3. **INDEX 137616**: La receta con manzanilla, eucalipto, limón, jengibre y miel no cura el covid-19.\n4. **INDEX 119700**: No hay registro del emir de Dubái, Mohamed bin Rashid, diciendo que su bisnieto Andrés Mauricio López Obrador andará en camello.\n5. **INDEX 142168**: No, el expresidente colombiano Álvaro Uribe no integró originalmente este ranking sobre mortíferos “dictadores”.\n6. **INDEX 118404**: Esta imagen de ‘mangas de agua’ no fue tomada en Guerrero, sino en Estados Unidos en 2020.\n7. **INDEX 195269**: “Contraloría autoriza controles en domicilios sin autorización de los moradores”: #real.\n8. **INDEX 137401**: N

In [38]:
for i in range(len(df_fc_train_mini)):
    print(f"SCORE: {scores[i]}:", fcs[i])
    print("\n\n")

SCORE: 0: (INDEX 79869) la cruz roja japonesa sí acepta donaciones de sangre de vacunados contra el covid-19 la cruz roja de japón no acepta donaciones de sangre de vacunados contra el covid-19



SCORE: 0: (INDEX 50142) no, el covid-19 no es "absolutamente inofensivo" para los niños el covid-19 es "absolutamente inofensivo" para los niños



SCORE: 0: (INDEX 27791) no, el exministro corcuera no es el autor de este audio que critica la gestión del gobierno ante el coronavirus audio del exministro corcuera que critica la gestión del gobierno ante el coronavirus



SCORE: 0: (INDEX 137133) la foto de una caravana de migrantes no muestra a argentinos llegando a perú, fue tomada en eslovenia un millón de argentinos están migrando a perú



SCORE: 0: (INDEX 100801) la aparición de tecnologías de comunicación no se vincula a brotes epidémicos patrón de brotes virales cuando hay una mejora/actualización del campo electromagnético



SCORE: 0: (INDEX 93801) no hay registro de que maduro haya e