In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_lg")

from tqdm import tqdm
tqdm.pandas()   

from src import config
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.utils import cleaning_spacy, cleaning_spacy_batch

# Dataset locations are read from the project config module.
tasks_path = config.TASKS_PATH
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
gs_path = config.GS_PATH

lang = 'eng'
task_name = "crosslingual"

# Keyword arguments shared by every dataset loader constructed below.
dataset_kwargs = dict(
    tasks_path=tasks_path,
    task_name=task_name,
    lang=lang,
    version="english",
)


def clean_batch(texts):
    """Apply `cleaning_spacy_batch` to `texts` with the module-level `nlp` pipeline."""
    return cleaning_spacy_batch(texts, nlp)


print("Loading Fact Checks...")
fc = TextConcatFactCheck(fact_checks_path, **dataset_kwargs)

# NOTE(review): the cleaned fact-check loader is disabled, but its progress
# message is still printed; nothing is loaded after this print.
print("Loading Fact Checks (English + Clean)...")
# fc_eng = TextConcatFactCheck(fact_checks_path, cleaning_function=clean_batch, **dataset_kwargs)

print("Loading Posts...")
posts = TextConcatPosts(posts_path, gs_path=gs_path, **dataset_kwargs)

print("Loading Posts (English + Clean)...")
posts_eng = TextConcatPosts(
    posts_path,
    gs_path=gs_path,
    cleaning_function=clean_batch,
    **dataset_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm


Loading Fact Checks...
Loading Fact Checks (English + Clean)...
Loading Posts...
Loading Posts (English + Clean)...


100%|██████████| 4972/4972 [00:05<00:00, 869.80it/s] 


In [8]:
# Train/dev splits from the loader that was built without a cleaning function.
df_train_orig = posts.df_train
df_dev_orig = posts.df_dev
# Dev split from the loader that was built with the spaCy cleaning function.
df_dev_clean = posts_eng.df_dev

# `df_fc_orig` stays bound to the loader's own frame, while `df_fc` is an
# independent copy. Previously both names pointed at the same object, so any
# column added to `df_fc` silently appeared on `df_fc_orig` (and `fc.df`) too.
df_fc_orig = fc.df
df_fc = fc.df.copy()

In [7]:
# Table of (fact_check_id, post_id) pairs loaded from disk.
df_pairs = pd.read_csv("data/complete_data/pairs.csv")

# Non-null count of every remaining column, grouped by fact check.
# NOTE(review): despite its name, this holds the number of posts per fact
# check (not fact checks per post) - consider renaming.
fcs_per_post = df_pairs.groupby("fact_check_id").count()

# Display the fact checks that have exactly 14 post_id entries.
post_counts = fcs_per_post["post_id"]
fcs_per_post[post_counts == 14]

Unnamed: 0_level_0,post_id
fact_check_id,Unnamed: 1_level_1
36164,14


In [10]:
import re

# Host part of an http(s) URL with any leading "www." removed. The capture
# stops at the first "/", "?", "#" or whitespace, so path segments that happen
# to contain dots are never absorbed and URLs without a path still match.
# Written as a raw string so the backslashes are regex escapes, not (invalid)
# Python string escapes.
_URL_HOST_RE = re.compile(r"https?://(?:www\.)?([^/?#\s]+)")


def extract_domains(urls):
    """Return the distinct hosts found in `urls`, in first-seen order.

    Args:
        urls: iterable of URL strings (one fact check's `instances` entry).

    Returns:
        list[str]: one entry per distinct host; strings in which no http(s)
        URL is found are skipped.
    """
    found = (_URL_HOST_RE.search(url) for url in urls)
    hosts = (match.group(1) for match in found if match)
    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    return list(dict.fromkeys(hosts))


# assign() builds a new frame instead of writing a column in place, so
# re-running this cell is idempotent and the frame loaded earlier is not mutated.
df_fc = df_fc.assign(webs=df_fc["instances"].apply(extract_domains))
df_fc

Unnamed: 0_level_0,claim,instances,title,full_text,webs
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Are avocados good for you?,[https://metafact.io/factchecks/175-are-avocad...,,Are avocados good for you?,[metafact.io]
1,Can animals have headaches?,[https://metafact.io/factchecks/1754-can-anima...,,Can animals have headaches?,[metafact.io]
2,Can we help prevent Alzheimer's with diet?,[https://metafact.io/factchecks/173-can-we-hel...,,Can we help prevent Alzheimer's with diet?,[metafact.io]
3,Do any benefits of alcohol outweigh the risks?,[https://metafact.io/factchecks/172-do-any-ben...,,Do any benefits of alcohol outweigh the risks?,[metafact.io]
4,Does acupuncture work for headaches?,[https://metafact.io/factchecks/1752-does-acup...,,Does acupuncture work for headaches?,[metafact.io]
...,...,...,...,...,...
205744,"🇫🇷 In France, the military and civilian police...",[https://factuel.afp.com/ar/French-police-demo...,This video is not for the French police's acti...,This video is not for the French police's acti...,[factuel.afp.com]
205745,👆This little beautiful girl was seen in Mangal...,[https://youturn.in/articles/child-in-mangalor...,A child rescued from a group of beggars in Man...,A child rescued from a group of beggars in Man...,[youturn.in]
205747,"📌 Italians and foreigners, men and women, chil...",[https://factuel.afp.com/ar/this-photo-shows-a...,These photos of a queue in front of a food aid...,These photos of a queue in front of a food aid...,[factuel.afp.com]
205749,🔵Confirmed... Tomorrow the free messages will ...,[https://www.boatos.org/tecnologia/whatsapp-co...,WhatsApp will charge 0.37 cents per message st...,WhatsApp will charge 0.37 cents per message st...,[boatos.org]


In [17]:
# Load the saved predictions and transpose them so that each row corresponds
# to one top-level key of the JSON file (presumably a post id - verify against
# the file that produced it).
preds_wide = pd.read_json(
    "output/contrastive/contrastive_eng_multi_multi/crosslingual/20241213-223605/crosslingual_predictions.json"
).T

# Collapse every row into a single list-valued "preds" column keyed by post_id.
df_preds = (
    preds_wide
    .apply(lambda row: list(row.values), axis=1)
    .reset_index()
    .rename(columns={"index": "post_id", 0: "preds"})
)


def _any_gold_predicted(row):
    """Return True when at least one id in row["gs"] also occurs in row["preds"]."""
    return len(set(row["gs"]) & set(row["preds"])) > 0


# Attach the predictions to the dev posts and flag each post as a hit or miss.
df_dev_preds = df_dev_orig.merge(df_preds, on="post_id")
df_dev_preds["correct"] = df_dev_preds.apply(_any_gold_predicted, axis=1)

# One row per (post, gold fact check, web). The "correct" flag was computed per
# post before exploding, so it is repeated on every exploded row of that post.
df_dev_preds = (
    df_dev_preds
    .explode("gs")
    .merge(df_fc, left_on="gs", right_on="fact_check_id")
    .explode("webs")
)

# Split into missed and hit subsets.
is_correct = df_dev_preds["correct"]
df_dev_preds_inc = df_dev_preds[~is_correct]
df_dev_preds_cor = df_dev_preds[is_correct]

df_dev_preds_inc


Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text_x,gs,preds,correct,claim,instances,title,full_text_y,webs
13,1139,"Today more than ever, Chileans and the world r...",False information,,spa,1,0,0,"Today more than ever, Chileans and the world r...",34144,"[54497, 53903, 60130, 85387, 37262, 53783, 100...",False,CNN reported that the Pinochetists are increasing,[https://factual.afp.com/doc.afp.com.328K7ZB#8...,Supposed news about Pinochetists uses as a bas...,Supposed news about Pinochetists uses as a bas...,factual.afp.com
17,1339,The tragedy of Nova Scotia una favola plù big ...,False information,,spa,1,0,0,The tragedy of Nova Scotia una favola plù big ...,54008,"[73115, 24434, 54026, 194327, 11565, 154251, 4...",False,This magazine predicted what the world would l...,[https://factual.afp.com/la-imagen-muestra-un-...,"The image shows a futuristic vehicle, not a pr...","The image shows a futuristic vehicle, not a pr...",factual.afp.com
20,1589,Photos of sleeping in the office not at all Th...,False information,,other,1,0,0,Photos of sleeping in the office not at all Th...,69259,"[24505, 103714, 175767, 96949, 38653, 119789, ...",False,Image shows public sector worker in Sri Lanka,[https://factcheck.afp.com/photo-actually-show...,This photo actually shows a bank employee in I...,This photo actually shows a bank employee in I...,factcheck.afp.com
24,1753,First with a hood and stick in hand... After a...,False information,,spa,1,0,0,First with a hood and stick in hand... After a...,107154,"[107155, 76276, 44655, 144096, 193440, 69487, ...",False,Police infiltrated the demonstrations after th...,[https://factual.afp.com/las-fotos-que-senalan...,The photos that indicate a police infiltrated ...,The photos that indicate a police infiltrated ...,factual.afp.com
30,2300,[Breaking News] Japan's Fukushima Nuclear Powe...,Partly false information,,other,1,0,0,[Breaking News] Japan's Fukushima Nuclear Powe...,15081,"[108060, 104230, 171682, 104249, 68832, 39452,...",False,A massive fire broke out at the Fukushima nucl...,[https://factcheck.afp.com/doc.afp.com.326V4PP...,Social media users share misleading Fukushima ...,Social media users share misleading Fukushima ...,factcheck.afp.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,26723,,,A fire broke out at a nuclear power plant in U...,other,1,0,0,A fire broke out at a nuclear power plant in ...,105648,"[144966, 53865, 54276, 167657, 50938, 168781, ...",False,Photos show Ukrainian nuclear power plant on f...,[https://factcheck.afp.com/doc.afp.com.326V4VQ...,Old photos circulate after Russian forces atta...,Old photos circulate after Russian forces atta...,factcheck.afp.com
377,26969,,False information,🌀This image was created by Professor of Neurol...,por,1,0,0,🌀This image was created by Professor of Neuro...,53919,"[69474, 68366, 137997, 53949, 68395, 193416, 7...",False,This image was created by Professor of Neurolo...,[https://checamos.afp.com/esta-ilusao-de-optic...,This optical illusion was not created by a pro...,This optical illusion was not created by a pro...,checamos.afp.com
380,27075,,Partly false information,👆 Scare India and get vaccinated 🤦🏻 ♂️ Complet...,other,1,0,0,👆 Scare India and get vaccinated 🤦🏻 ♂️ Comple...,144939,"[136011, 52400, 141972, 142079, 143029, 146148...",False,Video shows vaccination arranged on the black ...,[https://factcheck.afp.com/video-shows-failed-...,The video shows a failed Covid-19 vaccination ...,The video shows a failed Covid-19 vaccination ...,factcheck.afp.com
383,27482,090177e196ea1800\Approved Approved On: 30-Apr-...,,🔴A confidential Pfizer report has been leaked....,fra,0,1,0,090177e196ea1800\Approved Approved On: 30-Apr-...,137239,"[44169, 46039, 59050, 41393, 201522, 149369, 1...",False,A Pfizer report reveals a hundred side effects...,[https://factuel.afp.com/doc.afp.com.324T2DV#9...,"No, a ""confidential report"" from Pfizer does n...","No, a ""confidential report"" from Pfizer does n...",factuel.afp.com


In [16]:
# Number of rows per web among the rows flagged as not correct.
webs_incorrect = df_dev_preds_inc["webs"]
webs_incorrect.value_counts()

webs
factcheck.afp.com       44
factual.afp.com         14
factuel.afp.com         10
checamos.afp.com         9
faktencheck.afp.com      6
periksafakta.afp.com     3
semakanfakta.afp.com     2
newsmobile.in            1
africacheck.org          1
correctiv.org            1
rappler.com              1
Name: count, dtype: int64

In [18]:
# Number of rows per web among the rows flagged as correct.
webs_correct = df_dev_preds_cor["webs"]
webs_correct.value_counts()

webs
factcheck.afp.com            179
factual.afp.com               59
checamos.afp.com              20
factuel.afp.com               18
factcheckthailand.afp.com      6
faktencheck.afp.com            5
periksafakta.afp.com           5
politifact.com                 1
africacheck.org                1
indiatoday.in                  1
verifica.efe.com               1
newtral.es                     1
newsmobile.in                  1
semakanfakta.afp.com           1
correctiv.org                  1
thequint.com                   1
cinjenice.afp.com              1
leadstories.com                1
Name: count, dtype: int64