In [56]:
import json
import pandas as pd
import glob

# Recursively collect every "*predictions.json" file stored under a
# "monolingual" / "crosslingual" directory of the final evaluation folder.
monolingual_files = glob.glob("nbs/final_eval" + '/**/monolingual/**/*predictions.json', recursive=True)
crosslingual_files = glob.glob("nbs/final_eval" + '/**/crosslingual/**/*predictions.json', recursive=True)

# Load the task split definition and the dev gold standards. Context managers
# are used so the file handles are closed deterministically instead of being
# left to the garbage collector.
with open("data/splits/tasks.json") as fh:
    d_tasks = json.load(fh)
with open("nbs/final_eval/mono_dev_gs.json") as fh:
    mono_gs = json.load(fh)
with open("nbs/final_eval/cross_dev_gs.json") as fh:
    cross_gs = json.load(fh)

# Predictions file inspected in the rest of the notebook.
# NOTE(review): unlike the paths above, this one has no "nbs/final_eval" prefix,
# so it resolves against the current working directory -- confirm it is intended.
cross_preds = "official/contrastive/snowflake_mv2/crosslingual/20250131-000403/crosslingual_predictions.json"

def get_gs(d_task, task_name, d_gs):
    """Build a long-format gold-standard table for the dev posts of one task.

    Parameters
    ----------
    d_task : dict
        Task split definition. For ``task_name == "crosslingual"`` the entry
        ``d_task["crosslingual"]`` is wrapped as a single column named
        ``"cross"``; for any other task ``d_task[task_name]`` is converted to a
        frame directly. In both cases the resulting frame must contain a
        ``"posts_dev"`` row holding the dev post ids.
    task_name : str
        Key of the task inside ``d_task``.
    d_gs : dict
        Mapping from post id (as a string) to the gold ids of that post.

    Returns
    -------
    pandas.DataFrame
        Columns ``lang``, ``post_id`` and ``gs`` with one row per
        (post, gold id) pair. ``lang`` and ``post_id`` are strings. Posts
        missing from ``d_gs`` get a missing value in ``gs``.
    """
    if task_name == "crosslingual":
        # Read from the `d_task` argument. The previous code reached for the
        # notebook-level `d_tasks` global here, which silently ignored the
        # argument and raised NameError wherever that global did not exist.
        df_gs = pd.DataFrame({"cross": d_task["crosslingual"]})
    else:
        df_gs = pd.DataFrame(d_task[task_name])

    # Keep only the dev posts, then go from one list of posts per column
    # to one row per post.
    df_gs = (
        df_gs.loc["posts_dev", :]
        .reset_index()
        .explode("posts_dev")
        .rename(columns={"index": "lang", "posts_dev": "post_id"})
        .astype(str)  # post ids become strings so they can be looked up in d_gs
    )
    df_gs["gs"] = df_gs["post_id"].map(d_gs)
    # One row per gold id.
    return df_gs.explode("gs")

def get_preds(file):
    """Load a predictions JSON file into a long-format frame.

    Parameters
    ----------
    file : str or path-like
        Path to a JSON object whose keys are post ids and whose values are
        equally long lists of predicted ids.

    Returns
    -------
    pandas.DataFrame
        Columns ``post_id`` and ``pred`` with one row per (post, prediction)
        pair, in the order stored in the file.
    """
    # Close the handle deterministically instead of leaking it.
    with open(file) as fh:
        raw_preds = json.load(fh)

    return (
        pd.DataFrame(raw_preds)
        .T  # one row per post, one column per prediction rank
        .apply(list, axis=1)  # collapse every row into a single list
        .explode()
        .reset_index()
        .rename(columns={"index": "post_id", 0: "pred"})
    )

# Gold standard of the crosslingual dev posts, one row per (post, gold id).
df_gs_cross = get_gs(d_task=d_tasks, task_name="crosslingual", d_gs=cross_gs)

# Predictions of the selected run, one row per (post, predicted id).
df_cross_preds = get_preds(file=cross_preds)

In [23]:
# Display the long-format predictions (columns: post_id, pred).
df_cross_preds

Unnamed: 0,post_id,pred
0,34,6399
1,34,77464
2,34,77462
3,34,116777
4,34,73564
...,...,...
5515,28059,43041
5516,28059,40361
5517,28059,153751
5518,28059,139255


In [24]:
# Pair every gold id of a post with every prediction made for that post and
# flag the pairs in which the prediction equals the gold id.
df_cross = df_gs_cross.merge(df_cross_preds, on="post_id").assign(
    correct=lambda pairs: pairs["gs"] == pairs["pred"]
)

# True for a post as soon as one of its pairs is correct.
has_hit = df_cross.groupby("post_id")["correct"].any()

# Ids of the posts for which no prediction matched any gold id ...
ls_check_inc = has_hit[~has_hit].index

# ... and every (gold, prediction) pair belonging to those posts.
df_check_inc = df_cross.loc[df_cross["post_id"].isin(ls_check_inc)]
df_check_inc

Unnamed: 0,lang,post_id,gs,pred,correct
20,cross,116,106143,101502,False
21,cross,116,106143,104228,False
22,cross,116,106143,68966,False
23,cross,116,106143,167778,False
24,cross,116,106143,104307,False
...,...,...,...,...,...
6375,cross,27558,86293,91030,False
6376,cross,27558,86293,150274,False
6377,cross,27558,86293,90987,False
6378,cross,27558,86293,141885,False


In [67]:
from src.datasets import TextConcatPosts, TextConcatFactCheck
from src import config
# Input locations, all taken from the project configuration module.
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
tasks_path = config.PHASE1_TASKS_PATH
gs_path = config.GS_PATH

# Settings forwarded to both dataset constructors.
# NOTE(review): what `lang` controls is defined in src.datasets -- confirm there.
task_name = "crosslingual"
lang = "eng"

posts = TextConcatPosts(
    posts_path,
    tasks_path,
    task_name=task_name,
    gs_path=gs_path,
    lang=lang,
)
fact_checks = TextConcatFactCheck(
    fact_checks_path,
    tasks_path,
    task_name=task_name,
    lang=lang,
)


In [68]:
# Grab the frames exposed by the two dataset objects and show the posts one.
df_posts, df_fact_checks = posts.df, fact_checks.df

df_posts

Unnamed: 0_level_0,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,"""Australia 50 MILLONES de dosis de ""vacuna"" re...",False,,fra,0,0,1,"""Australia 50 MILLONES de dosis de ""vacuna"" re...",[50973]
16,"""Estrictamente y hablando con sentido, la conq...",Partly false information,,spa,1,0,0,"""Estrictamente y hablando con sentido, la conq...",[38000]
30,"""No es necesario creer en Dios para ser una bu...",False information,,spa,1,0,0,"""No es necesario creer en Dios para ser una bu...",[3857]
60,#Artés Presidente #Save Palestine ... [USER] M...,Partly false information,,spa,1,0,0,#Artés Presidente #Save Palestine ... [USER] M...,[50802]
62,#CNN: Child soldiers are ok if they are to def...,Altered photo,,eng,1,0,0,#CNN: Child soldiers are ok if they are to def...,"[34139, 134205]"
...,...,...,...,...,...,...,...,...,...
27750,"La ""mujer"" de Macron operada de la próstata!!,...",False information,🙄,eng,1,0,0,"La ""mujer"" de Macron operada de la próstata!!,...",[]
27794,,Missing context,"🚨 NOTICIERO DE PORTUGAL : ""La inmensa mayoría ...",spa,1,0,0,"🚨 NOTICIERO DE PORTUGAL : ""La inmensa mayoría...",[]
27995,,,🤔👌ईरान के यूवक ने मक्का 🕋मे दूध चढाया ओर बोला ...,hin,1,0,0,🤔👌ईरान के यूवक ने मक्का 🕋मे दूध चढाया ओर बोला...,[]
28013,,False information,🤝🙏🌹♦️ *जय जननी* कोलंबिया (अमेरिका🇺🇸) की सड़कों...,hin,1,0,0,🤝🙏🌹♦️ *जय जननी* कोलंबिया (अमेरिका🇺🇸) की सड़को...,[]


In [69]:
# Work on a private copy of the posts frame: assigning a new index to
# `posts.df` itself would silently change the frame held by the dataset object.
df_posts = posts.df.copy()
df_fact_checks = fact_checks.df

# get_gs() stores post ids as strings, so the index is cast to str to make the
# post-id based lookups further down line up.
df_posts.index = df_posts.index.astype(str)

In [70]:
# Lookup tables from frame index to the value of the "lan" column, one for the
# posts and one for the fact checks.
d_langs_posts = dict(df_posts["lan"].items())
d_langs_fact_checks = dict(df_fact_checks["lan"].items())

In [71]:
# Annotate the pairs of the posts without any correct prediction with the
# "lan" value of the post, of the gold fact check and of the predicted one.
# `assign` returns a new frame and evaluates its keyword arguments in order,
# so `lang_post` already sees the string-typed `post_id`.
df_cross = df_check_inc.assign(
    post_id=lambda pairs: pairs["post_id"].astype(str),
    lang_post=lambda pairs: pairs["post_id"].map(d_langs_posts),
    lang_gs=lambda pairs: pairs["gs"].map(d_langs_fact_checks),
    lang_pred=lambda pairs: pairs["pred"].map(d_langs_fact_checks),
)


In [72]:
# Display the failed posts together with the lang_post / lang_gs / lang_pred columns.
df_cross

Unnamed: 0,lang,post_id,gs,pred,correct,lang_post,lang_gs,lang_pred
20,cross,116,106143,101502,False,sin,eng,msa
21,cross,116,106143,104228,False,sin,eng,eng
22,cross,116,106143,68966,False,sin,eng,eng
23,cross,116,106143,167778,False,sin,eng,ara
24,cross,116,106143,104307,False,sin,eng,eng
...,...,...,...,...,...,...,...,...
6375,cross,27558,86293,91030,False,hin,eng,eng
6376,cross,27558,86293,150274,False,hin,eng,eng
6377,cross,27558,86293,90987,False,hin,eng,eng
6378,cross,27558,86293,141885,False,hin,eng,eng
