In [1]:
import argparse
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from dataloader import get_data
from utils import read_json
from metrics import calculate_ndcg
import torch

from transformers import DPRConfig, DPRContextEncoder, DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from torch.utils.data import DataLoader, TensorDataset

import faiss
from tqdm import tqdm
import pytrec_eval
import sys
import os
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Check for Apple M2 GPU and set device
if torch.backends.mps.is_available():
    device = torch.device("mps")  # Use Metal Performance Shaders (MPS) for Apple GPUs
    print("Using Apple M2 GPU (MPS)")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

batch_size = 4

Using device: cuda


In [2]:

df_fact_checks, df_posts, df_fact_check_post_mapping = get_data('./data')
tasks = read_json(f"./data/tasks.json")


In [20]:
tasks.keys()

dict_keys(['monolingual', 'crosslingual'])

In [7]:
df_fact_checks

Unnamed: 0_level_0,claim,instances,title
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"( Are avocados good for you?, Are avocados go...","[(1525653998.0, https://metafact.io/factchecks...",
1,"( Can animals have headaches?, Can animals ha...","[(1617955634.0, https://metafact.io/factchecks...",
2,"( Can we help prevent Alzheimer's with diet?, ...","[(1525653998.0, https://metafact.io/factchecks...",
3,( Do any benefits of alcohol outweigh the risk...,"[(1525653998.0, https://metafact.io/factchecks...",
4,"( Does acupuncture work for headaches?, Does ...","[(1617955595.0, https://metafact.io/factchecks...",
...,...,...,...
205744,(🇫🇷 في فرنسا ، يقرر رجال الشرطة العسكرية والمد...,"[(1617976680.0, https://factuel.afp.com/ar/Fre...",(هذا الفيديو ليس لتحرّك الشرطة الفرنسيّة ضدّ ا...
205745,(👆This little beautiful girl was seen in Manga...,"[(1576281540.0, https://youturn.in/articles/ch...",(மங்களூரில் பிச்சை எடுக்கும் குழுவில் மீட்கப்ப...
205747,(📌إيطاليين و أجانب رجال و نساء ، أطفال و عجزة ...,"[(1616693700.0, https://factuel.afp.com/ar/thi...",(هذه الصور لطابورٍ أمام مركز توزيع مساعدات غذا...
205749,(🔵Confirmado... Amanhã acabarão as mensagens g...,"[(1570924680.0, https://www.boatos.org/tecnolo...",(WhatsApp vai cobrar 0.37 centavos por mensage...


In [8]:
df_posts

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[(1608571882.0, fb)]",[(! Dreister Impf-Fake von Markus Söder! Es is...,[False information],
1,"[(1586139153.0, fb)]",[(!! WARNING !! A new thing circulating now. P...,[False information],
2,"[(1610052141.0, fb), (1610072448.0, fb)]","[(""Actually, he's a damn sight better than any...",[Missing context],
3,"[(1645187790.0, ig)]","[(""Australia 50 MILLONES de dosis de ""vacuna"" ...",[False],
4,"[(1581697500.0, fb)]","[(""Bienaventurados los perseguidos por mi caus...",[],
...,...,...,...,...
28085,"[(1651921264.0, fb), (1651882168.0, fb)]",[],[Partly false information],(🧐Se separó el presidente...la constitución de...
28087,"[(1653138895.0, fb)]",[(bruising runny ed 1 e Contents of the pack a...,[Missing context],"(🧐🧐🧐, 🧐🧐🧐, [(eng, 1.0)])"
28089,"[(1657134606.0, fb)]",[],[Partly false information],"(🧬Robert Malone, inventeur de la technologie A..."
28090,"[(1646255245.0, tw)]",[(Number of Covid-19 Deaths 4500 4000 3500 300...,[],(🧵Enquanto você se distrai com a invasão da Rú...


In [9]:
df_fact_check_post_mapping

Unnamed: 0,post_id,fact_check_id
0,2228,33
1,2228,23568
2,2228,194577
3,2229,33
4,2229,23568
...,...,...
25738,14765,201987
25739,17076,201987
25740,18203,201987
25741,25641,201991


In [3]:
TASK = "crosslingual" #args.task
LANG =  "not rec" # "eng" # args.lang
SPLIT = "train" # args.split

print(f"Task: {TASK}, Language: {LANG}, Split: {SPLIT}")

posts_split = tasks[TASK][f'posts_{SPLIT}']
print(f"Number of posts in {SPLIT} set:", len(posts_split))

fact_checks = tasks[TASK]['fact_checks']
print("Number of fact checks:", len(fact_checks))

## filter dataframes
df_posts_split = df_posts[df_posts.index.isin(posts_split)]
assert len(df_posts_split) == len(posts_split)

df_fact_checks = df_fact_checks[df_fact_checks.index.isin(fact_checks)]
assert len(df_fact_checks) == len(fact_checks)



Task: crosslingual, Language: not rec, Split: train
Number of posts in train set: 4972
Number of fact checks: 153743


In [23]:
# tasks["crosslingual"].keys()

dict_keys(['fact_checks', 'posts_train', 'posts_dev'])

In [4]:
TASK = "monolingual" #args.task
LANG = "eng" # args.lang
SPLIT = "train" # args.split

print(f"Task: {TASK}, Language: {LANG}, Split: {SPLIT}")

posts_split = tasks[TASK][LANG][f'posts_{SPLIT}']
print(f"Number of posts in {SPLIT} set:", len(posts_split))

fact_checks = tasks[TASK][LANG]['fact_checks']
print("Number of fact checks:", len(fact_checks))

## filter dataframes
df_posts_split = df_posts[df_posts.index.isin(posts_split)]
assert len(df_posts_split) == len(posts_split)

df_fact_checks = df_fact_checks[df_fact_checks.index.isin(fact_checks)]
assert len(df_fact_checks) == len(fact_checks)



Task: monolingual, Language: eng, Split: train
Number of posts in train set: 4351
Number of fact checks: 85734


In [4]:
def save_dataframe_to_csv(df, file_path, index=True, encoding='utf-8'):
    """
    Saves a pandas DataFrame to a CSV file, including the index by default.

    Parameters:
    - df (pd.DataFrame): The DataFrame to save.
    - file_path (str): The path where the CSV file will be saved.
    - index (bool): Whether to include the DataFrame index in the CSV file. Default is True.
    - encoding (str): The encoding to use for the CSV file. Default is 'utf-8'.

    Returns:
    - None
    """
    try:
        df.to_csv(file_path, index=index, encoding=encoding)
        print(f"DataFrame successfully saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the DataFrame: {e}")

In [26]:
df_posts_split

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,"[(1645187790.0, ig)]","[(""Australia 50 MILLONES de dosis de ""vacuna"" ...",[False],
16,"[(1633129058.0, fb)]","[(""Estrictamente y hablando con sentido, la co...",[Partly false information],
30,"[(1598378047.0, fb)]","[(""No es necesario creer en Dios para ser una ...",[False information],
60,"[(1631046537.0, fb)]",[(#Artés Presidente #Save Palestine ... [USER]...,[Partly false information],
62,"[(1649941805.0, fb)]",[(#CNN: Child soldiers are ok if they are to d...,[Altered photo],
...,...,...,...,...
28056,"[(1643613736.0, fb)]","[(Foto: EPA Es ist nur eine Frage von Zeit, bi...",[Altered photo],"(🤮, 🤮, [(und, 1.0)])"
28074,"[(1629072979.0, fb), (1629072967.0, fb)]",[],[False information],(🦑 Ladrão desembarcou hoje em recife olha a mu...
28087,"[(1653138895.0, fb)]",[(bruising runny ed 1 e Contents of the pack a...,[Missing context],"(🧐🧐🧐, 🧐🧐🧐, [(eng, 1.0)])"
28090,"[(1646255245.0, tw)]",[(Number of Covid-19 Deaths 4500 4000 3500 300...,[],(🧵Enquanto você se distrai com a invasão da Rú...


In [5]:
df_posts_split['query_ids'] = df_posts_split.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['query_ids'] = df_posts_split.index


In [6]:
df_fact_checks['doc_ids'] = df_fact_checks.index

In [7]:
## Extract the source language 

# concat all OCR text from source language (0th index)
df_posts_split['ocr_all_srclang'] = df_posts_split['ocr'].apply(lambda x: ' '.join([i[0] for i in x]) if x else "")

# extract text from source language (0th index)
df_posts_split['text_srclang'] = df_posts_split['text'].apply(lambda x: x[0] if x else "")

# query: OCR + text
df_posts_split['query'] = df_posts_split['ocr_all_srclang'] + ' ' + df_posts_split['text_srclang']

# extract claim and title from source language (0th index)
df_fact_checks['claim_srclang'] = df_fact_checks['claim'].apply(lambda x: x[0] if x else "")
df_fact_checks['title_srclang'] = df_fact_checks['title'].apply(lambda x: x[0] if x else "")

# doc: claim + title
df_fact_checks['doc'] = df_fact_checks['claim_srclang'] + ' ' + df_fact_checks['title_srclang']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['ocr_all_srclang'] = df_posts_split['ocr'].apply(lambda x: ' '.join([i[0] for i in x]) if x else "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['text_srclang'] = df_posts_split['text'].apply(lambda x: x[0] if x else "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
import pandas as pd
from transformers import pipeline

# Function to expand document using LLM
def expand_document(doc, model_name="llama2"):
    # Initialize the LLM pipeline
    generator = pipeline('text-generation', model=model_name)
    
    # Create the prompt
    prompt = f"Expand the following fact-checked claims and title with additional context such that if there is some social media post we can retrieve this document for fact checking only if this document is relevant. Do not repeat the present words in the claim or question. Add two or three sentences and should not be long:\n\nClaim:{doc}\n\nContext:"
    # prompts = [f"Expand the following fact-checked claims and title with additional context such that if there is some social media post we can retrieve this document for fact checking only if this document is relevant. Do not repeat the present words in the claims\n\n{doc}\n\nContext:" for doc in batch]
        
    # Generate the expanded context
    expanded_context = generator(prompt, max_length=512, num_return_sequences=1)[0]['generated_text']
    
    # Concatenate the original doc with the expanded context
    expanded_doc = f"{doc} {expanded_context}"
    
    return expanded_doc

# Function to add expanded_doc column to DataFrame
def add_expanded_doc_column(df, model_name="llama2"):
    df['expanded_doc'] = df['doc'].apply(lambda x: expand_document(x, model_name))
    return df



In [9]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

In [13]:
df_fact_checks["instances"][2]

[(1525653998.0,
  'https://metafact.io/factchecks/173-can-we-help-prevent-alzheimer-s-with-diet')]

In [None]:
# Example usage
if __name__ == "__main__":
    # Assuming df_fact_checks is already loaded and contains the 'doc' column
    
    # Add the expanded_doc column
    df_fact_checks = add_expanded_doc_column(df_fact_checks, model_name="meta-llama/Llama-3.2-1B")
    
    # Display the DataFrame with the new expanded_doc column
    print(df_fact_checks)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with

In [None]:
save_dataframe_to_csv(df_fact_checks,"exp_docs_crosslingual_dev.csv")
save_dataframe_to_csv(df_posts_split ,"exp_queries_crosslingual_dev.csv")

In [41]:
# save_dataframe_to_csv(df_fact_checks,"docs_monolingual_dev.csv")
# save_dataframe_to_csv(df_posts_split ,"queries_monolingual_dev.csv")

DataFrame successfully saved to docs_monolingual_dev.csv
DataFrame successfully saved to queries_monolingual_dev.csv
