In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load("en_core_web_lg")

from tqdm import tqdm
tqdm.pandas()   

from src import config
from src.datasets import TextConcatFactCheck, TextConcatPosts
from src.utils import cleaning_spacy, cleaning_spacy_batch

tasks_path = config.TASKS_PATH
posts_path = config.POSTS_PATH
fact_checks_path = config.FACT_CHECKS_PATH
gs_path = config.GS_PATH
lang = 'eng'
task_name = "crosslingual"

print("Loading Fact Checks...")
fc = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english")
print("Loading Fact Checks (English + Clean)...")
# fc_eng = TextConcatFactCheck(fact_checks_path, tasks_path=tasks_path, task_name=task_name, lang=lang, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

print("Loading Posts...")
posts = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english")
print("Loading Posts (English + Clean)...")
# posts_eng = TextConcatPosts(posts_path, tasks_path=tasks_path, task_name=task_name, lang=lang, gs_path=gs_path, version="english", cleaning_function=lambda x: cleaning_spacy_batch(x, nlp))

  from .autonotebook import tqdm as notebook_tqdm


Loading Fact Checks...
Loading Fact Checks (English + Clean)...
Loading Posts...
Loading Posts (English + Clean)...


In [2]:
df_train_orig = posts.df_train
df_dev_orig = posts.df_dev
# df_dev_clean = posts_eng.df_dev

df_fc_orig = fc.df
df_fc_clean = fc.df

In [3]:
df_pairs = pd.read_csv("data/complete_data/pairs.csv")
fcs_per_post = df_pairs.groupby("fact_check_id").count()
# .value_counts()
fcs_per_post[fcs_per_post["post_id"] == 14]

Unnamed: 0_level_0,post_id
fact_check_id,Unnamed: 1_level_1
36164,14


In [4]:
df_pairs.groupby("post_id").count().sort_values("fact_check_id", ascending=False)

Unnamed: 0_level_0,fact_check_id
post_id,Unnamed: 1_level_1
5767,10
15782,6
16882,6
16570,6
17569,6
...,...
9764,1
9760,1
9758,1
9756,1


In [5]:
for idx, val in df_fc_clean.loc[36164].to_dict().items():
    print(idx, val)

claim These are decisions made by Donald Trump
instances ['https://factuel.afp.com/attention-cette-liste-de-decisions-ou-dactions-attribuees-trump-contient-de-nombreuses-fausses#a3773bec82d816dc110bed03746976c3']
title Beware, this list of decisions or actions attributed to Trump contains many false claims
full_text Beware, this list of decisions or actions attributed to Trump contains many false claims These are decisions made by Donald Trump


In [6]:
df_fc_clean["full_text"].str.lower().str.contains("trump").sum()

np.int64(8478)

In [7]:
df_fc_clean["full_text"].str.lower().str.contains("covid").sum()

np.int64(14572)

In [8]:
total_text = " ".join(df_fc_clean["full_text"].to_list())


In [9]:
total_text[-1000:]

"aid WhatsApp will charge 0.37 cents per message starting tomorrow #rumor 🔵Confirmed... Tomorrow the free messages will end, and they will start charging for whatsapp at 0.37 cents. Resend this message to more than 3 groups, and it will be free for life. Pay attention to the ball, because it will turn green, do Post mixes true data with incorrect numbers about BNDES financing in PT governments : Projeto Comprova 🕋🗃 WE OPEN THE BLACK BOX OF BNDES.......\n✅ DURING THE PT GOVERNMENTS OF LULARAPIO 🚩🚫 AND DILMANTA, THE QUESTIONS THAT NEVER SHUT UP.......\n🛑WHAT IS THE VALUE OF ROUND IN BNDES???\n🛑 WHAT DAMAGE HAS THE LEFT DONE IN OUR COUNTRY???\n❇️NOW, WITH THE ELECTION OF 🔰JAIR BOLSONARO, THE BNDES BLACK BOX🕋🗃 IS BEING OPENED, WITH THE DISCLOSURE.....✔OF THE 50 LARGEST DEBTOR COMPANIES (THE LIST OF ALL DEBTOR COMPANIES IS STILL MISSING) THE VALUES ARE BEGINNING TO BE DISCLOSED, AND AMAZING! IT'S A LOT OF TAX MONEY FROM BRAZILIAN WORKERS 🇧🇷 IT'S BILLIONS OF REAIS STOLEN FROM THE BRAZILIAN P

In [11]:
import json

def clean_verdicts(verdict):
    verdict = verdict.replace(".", "").lower()
    if "partly" in verdict:
        return "false information"
    else:
        return verdict
    
df_preds = pd.read_json("output/contrastive/contrastive_eng_multi_multi/crosslingual/20241213-223605/crosslingual_predictions.json").T
# df_preds = df_preds.reset_index().rename(columns={"index": "post_id"})
df_preds = df_preds.apply(lambda x: list(x.values), axis=1).reset_index().rename(columns={"index":"post_id", 0: "preds"})
df_preds

Unnamed: 0,post_id,preds
0,30,"[3857, 103383, 107672, 36649, 100294, 164481, ..."
1,163,"[121296, 89577, 141000, 37428, 132402, 193293,..."
2,167,"[134543, 170525, 174058, 164325, 142461, 17489..."
3,189,"[72119, 103593, 115143, 132134, 150999, 154230..."
4,403,"[69200, 107791, 167606, 163781, 41363, 121360,..."
...,...,...
380,27511,"[52432, 14119, 56655, 56654, 124795, 27838, 46..."
381,27708,"[27184, 151067, 72341, 101579, 84153, 126546, ..."
382,27731,"[136764, 136762, 136763, 169611, 163987, 14492..."
383,27955,"[121296, 77861, 176517, 138249, 141762, 10632,..."


In [12]:

df_dev_preds = df_dev_orig.merge(df_preds, on="post_id")
df_dev_preds["verdicts"] = df_dev_preds["verdicts"].apply(clean_verdicts)

# df_dev_preds = df_dev_preds[["post_id", "ocr", "text", "full_text", "verdicts", "preds", "gs"]]
df_dev_preds["correct"] = df_dev_preds.apply(lambda x: len(list(set(x["gs"]).intersection(set(x["preds"])))) > 0, axis=1)
df_dev_preds_inc = df_dev_preds[df_dev_preds["correct"] == 0]
df_dev_preds_inc

Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs,preds,correct
13,1139,"Today more than ever, Chileans and the world r...",false information,,spa,1,0,0,"Today more than ever, Chileans and the world r...",[34144],"[54497, 53903, 60130, 85387, 37262, 53783, 100...",False
17,1339,The tragedy of Nova Scotia una favola plù big ...,false information,,spa,1,0,0,The tragedy of Nova Scotia una favola plù big ...,[54008],"[73115, 24434, 54026, 194327, 11565, 154251, 4...",False
20,1589,Photos of sleeping in the office not at all Th...,false information,,other,1,0,0,Photos of sleeping in the office not at all Th...,[69259],"[24505, 103714, 175767, 96949, 38653, 119789, ...",False
24,1753,First with a hood and stick in hand... After a...,false information,,spa,1,0,0,First with a hood and stick in hand... After a...,[107154],"[107155, 76276, 44655, 144096, 193440, 69487, ...",False
30,2300,[Breaking News] Japan's Fukushima Nuclear Powe...,false information,,other,1,0,0,[Breaking News] Japan's Fukushima Nuclear Powe...,[15081],"[108060, 104230, 171682, 104249, 68832, 39452,...",False
...,...,...,...,...,...,...,...,...,...,...,...,...
370,26723,,,A fire broke out at a nuclear power plant in U...,other,1,0,0,A fire broke out at a nuclear power plant in ...,[105648],"[144966, 53865, 54276, 167657, 50938, 168781, ...",False
371,26969,,false information,🌀This image was created by Professor of Neurol...,por,1,0,0,🌀This image was created by Professor of Neuro...,[53919],"[69474, 68366, 137997, 53949, 68395, 193416, 7...",False
374,27075,,false information,👆 Scare India and get vaccinated 🤦🏻 ♂️ Complet...,other,1,0,0,👆 Scare India and get vaccinated 🤦🏻 ♂️ Comple...,[144939],"[136011, 52400, 141972, 142079, 143029, 146148...",False
377,27482,090177e196ea1800\Approved Approved On: 30-Apr-...,,🔴A confidential Pfizer report has been leaked....,fra,0,1,0,090177e196ea1800\Approved Approved On: 30-Apr-...,[137239],"[44169, 46039, 59050, 41393, 201522, 149369, 1...",False


In [12]:
df_dev_preds.apply(lambda x: 36164 in x["gs"], axis=1).sum()

np.int64(0)

In [13]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [14]:
df_dev_clean.loc[3771, :]

ocr          years in 2022. That makes him the oldest land ...
verdicts                                          Partly False
text         #ascoisasinteressantes #sabiocurioso #like4lik...
lan                                                        eng
fb                                                           0
tw                                                           0
ig                                                           1
full_text    year 2022 make old land animal know alive toda...
gs                                                    [124055]
Name: 3771, dtype: object

In [17]:
from collections import Counter

def display_content(df):
    spacy_classes = ["PERSON", "ORG", "GPE", "NORP", "FAC", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART"]
    for i, row in df.iterrows():
        print(row["post_id"])
        print("OCR:", row["ocr"])
        print("Text:", row["text"])
        print("Correct:", row["correct"])
        
        if row["fb"]:
            source = "Facebook"
        elif row["tw"]:
            source = "Twitter"
        elif row["ig"]:
            source = "Instagram"
        
        print("Source:", source)
        print("Clean En:", df_dev_clean.loc[row["post_id"], "full_text"])
        print("Verdicts:", df_dev_clean.loc[row["post_id"], "verdicts"])
        print("Entities:",  Counter([subent.lemma_ for ent in nlp(df_dev_orig.loc[row["post_id"], "full_text"]).ents if ent.label_ in spacy_classes for subent in ent]))

        # print(row["preds"])
        # print(row["gs"])
        
        print("\nFACT CHECKS")
        for f in row["gs"]:
            print("SP", df_fc_orig.loc[f, "full_text"])
            print("EN", df_fc_clean.loc[f, "full_text"])
            print(Counter([subent.lemma_ for ent in nlp(df_fc_orig.loc[f, "full_text"]).ents if ent.label_ in spacy_classes for subent in ent]))
            
        print("\nPREDICTIONS")
        for p in row["preds"]:
            print(df_fc_orig.loc[p, "full_text"])
            print(df_fc_clean.loc[p, "full_text"])
            print(Counter([subent.lemma_ for ent in nlp(df_fc_orig.loc[p, "full_text"]).ents if (ent.label_ in spacy_classes) for subent in ent]))
            print("")
            
        print("="*50)
        
display_content(df_dev_preds)

30
OCR: "You don't have to believe in God to be a good person. In a way, the idea God's tradition is out of date. One it can be spiritual but not religious. Nope it is necessary to go to church and give money. For many, nature can be a church. Some of the best people in history did not believe in God, while many of the worst acts were done in your name." - Pope Francisco - }}
Text: 
Correct: True
Source: Facebook
Clean En: believe god good person way idea god tradition date spiritual religious nope necessary church money nature church good people history believe god bad act pope francisco
Verdicts: False information
Entities: Counter({'Francisco': 1})

FACT CHECKS
SP There is no evidence that Pope Francis said that "it is not necessary to believe in God" "It is not necessary to believe in God to be a good person" - Pope Francis
EN There is no evidence that Pope Francis said that "it is not necessary to believe in God" "It is not necessary to believe in God to be a good person" - Pope F

Counter({'Myanmar': 2, 'chinese': 1, 'China': 1})

Photo of Covid-19 lockdown in mainland China misleadingly linked to Hong Kong pandemic relief Photo of people from Chinese mainland heading to Hong Kong for pandemic aid
Photo of Covid-19 lockdown in mainland China misleadingly linked to Hong Kong pandemic relief Photo of people from Chinese mainland heading to Hong Kong for pandemic aid
Counter({'Hong': 2, 'Kong': 2, 'China': 1, 'chinese': 1})

These images show delivery of Chinese medical supplies in Ghana China sends COVID-19 vaccines to Nigeria
These images show delivery of Chinese medical supplies in Ghana China sends COVID-19 vaccines to Nigeria
Counter({'chinese': 1, 'Ghana': 1, 'China': 1, 'covid-19': 1, 'vaccine': 1, 'Nigeria': 1})

Video shows children wearing hazmat suits in northern China, not 'Shanghai during lockdown' Video shows children going to school during Shanghai's Covid lockdown
Video shows children wearing hazmat suits in northern China, not 'Shanghai during lock

In [1]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.34s/it]
Device set to use cuda:0


In [96]:
# ocr = "years in 2022. That makes him the oldest land animal known alive today."
# text = "#ascoisasinteressantes #sabiocurioso #like4like #food #instalike #likeforlike #family #travel #fitness #igers #tagsforlikes #follow4follow #nofilter #life #filosofia #estoicismo #beauty #amazing #instamood #instagram #sun #photo #mphotographyusic #beach #followforfollow #bestoftheday"

ocr = "First with a hood and stick in hand... After arresting protesters... What is it called??? or Carlos Roman Hamu"
text = ""
messages = [
        {"role": "system", "content": "You are an expert recovering the image of a post from a social media platform only using the text and OCR of the post. do it in less than 512 tokens."},
        {"role": "user", "content": "The OCR of the post is: " + ocr + " The text of the post is: " + text + "The post is from Facebook."},
        {"role": "system", "content": "Complete the post with the information from the image. Be concise and provide proper names if necessary. Be 100% confident in your answer. Provide only the description of the image."},
        ]

outputs = pipe(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            eos_token_id=terminators,
            pad_token_id=pipe.tokenizer.eos_token_id

        )

In [97]:
outputs

[{'generated_text': [{'role': 'system',
    'content': 'You are an expert recovering the image of a post from a social media platform only using the text and OCR of the post. do it in less than 512 tokens.'},
   {'role': 'user',
    'content': 'The OCR of the post is: First with a hood and stick in hand... After arresting protesters... What is it called??? or Carlos Roman Hamu The text of the post is: The post is from Facebook.'},
   {'role': 'system',
    'content': 'Complete the post with the information from the image. Be concise and provide proper names if necessary. Be 100% confident in your answer. Provide only the description of the image.'},
   {'role': 'assistant',
    'content': 'Based on the provided OCR and text, I believe the post is referring to the "Rodney Riot" or more specifically, the "Rodney Riot of 1965" or the "Rodney Riot of 1967". However, the most likely answer is the "Rodney Riot of 1965" or the "Rodney Riot of 1967" is not correct. \n\nThe most likely answer i

In [176]:

df_train_orig.verdicts.value_counts()

verdicts
False information                        2229
                                          949
Partly false information                  671
Missing context                           217
False information.                        160
Altered photo                             109
Partly false information.                  69
Missing context.                           43
False information and graphic content      42
False                                      32
Partly False                               29
Altered video                              11
Missing Context                            10
 Sensitive content                          8
Altered photo/video.                        4
Altered Photo/Video                         3
Altered photo/video                         1
Name: count, dtype: int64

In [175]:
df_dev_preds.groupby("verdicts").agg({"correct": ["mean", "sum"], "full_text": "count"})

Unnamed: 0_level_0,correct,correct,full_text
Unnamed: 0_level_1,mean,sum,count
verdicts,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
,0.722222,52,72
sensitive content,1.0,1,1
altered photo,0.714286,10,14
altered video,0.5,1,2
false,0.666667,2,3
false information,0.765,153,200
false information and graphic content,1.0,5,5
missing context,0.888889,24,27
partly false,0.770492,47,61


In [147]:
df_dev_preds_inc[df_dev_preds_inc["verdicts"].str.contains("False information.")]

Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs,preds,correct
112,9262,||| E PARLER Donald J. Trump [USER] 3 hours ag...,False information.,El acto de insurrección fue firmado hoy. El ej...,spa,1,0,0,||| E PARLER Donald J. Trump [USER] 3 hours ag...,[47219],"[109388, 133923, 108825, 108851, 109399, 10940...",False
148,12991,,False information.,LA FRANCE CHERCHE LES TERRORISTES DANS LE SOUS...,fra,1,0,0,LA FRANCE CHERCHE LES TERRORISTES DANS LE SOU...,[36479],"[36740, 36483, 83488, 36673, 36748, 36717, 852...",False
162,14835,,False information.,NASA航拍璀璨夜景，🌌让你用上帝的视角看看我们颜值爆表的地球君🌍💗 *因疫情14行动管制 ...,other,1,0,0,NASA航拍璀璨夜景，🌌让你用上帝的视角看看我们颜值爆表的地球君🌍💗 *因疫情14行动管制...,[91834],"[115830, 163366, 69462, 131896, 103277, 106206...",False
178,16096,VICE PRESIDENT COMELEC REPORTED ABOUT 54.4M re...,False information.,PLANO lahat yan ni NOYNOY AQUINO gagawin niya ...,other,1,0,0,VICE PRESIDENT COMELEC REPORTED ABOUT 54.4M re...,[104225],"[32603, 32589, 32591, 35156, 86842, 83122, 326...",False
241,21776,18.52.00 III FBI warned about 'biosecurity ris...,False information.,ada perkembangan baru US baru saja menangkap c...,msa,1,0,0,18.52.00 III FBI warned about 'biosecurity ris...,[23694],"[49075, 49076, 98639, 110179, 55934, 48962, 37...",False
266,23759,,False information.,کرونا وائرس دنیا کے ہر ملک میں پھیل گیا سوائے ...,other,1,0,0,کرونا وائرس دنیا کے ہر ملک میں پھیل گیا سوائے...,[73727],"[42432, 142702, 73755, 73716, 73739, 73756, 61...",False
287,24115,sprieta abged ਸ S COMMO PARTS OF MART2 Itz a b...,False information.,पहचानो इन देशद्रोही झामपंथी गद्दारों को एक #वृ...,other,1,0,0,sprieta abged ਸ S COMMO PARTS OF MART2 Itz a b...,[82940],"[34793, 69739, 105401, 17287, 72133, 141509, 7...",False


In [148]:
df_dev_preds_inc[df_dev_preds_inc["verdicts"].str.contains("Missing context.")]

Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs,preds,correct
313,24736,ADANA Fortune ෆෝලික් ඇllurial s Coconut Oil 20...,Missing context.,මතකද හොරෙන් ආනයනය කළා කියලා ලංකා ආණ්ඩුව පාසැල්...,other,1,0,0,ADANA Fortune ෆෝලික් ඇllurial s Coconut Oil 20...,[103909],"[62407, 61465, 35611, 73202, 66510, 48001, 156...",False


In [129]:
df_dev_preds_inc.loc[:, "text"] = df_dev_preds_inc.loc[:, "text"].apply(lambda x: None if x == "" else x)
df_dev_preds_inc.loc[:, "ocr"] = df_dev_preds_inc.loc[:, "ocr"].apply(lambda x: None if x == "" else x)
df_dev_preds_inc.isnull().sum() / df_dev_preds_inc.shape[0]


post_id      0.000000
ocr          0.472222
verdicts     0.000000
text         0.027778
lan          0.000000
fb           0.000000
tw           0.000000
ig           0.000000
full_text    0.000000
gs           0.000000
preds        0.000000
correct      0.000000
dtype: float64

In [131]:
df_dev_preds.shape

(339, 12)

In [134]:
df_dev_preds.groupby("correct").agg({"fb": ["sum"], "tw": ["sum"], "ig": ["sum",]}).T.apply(lambda x: x / x.sum(), axis=1)

Unnamed: 0,correct,False,True
fb,sum,0.099359,0.900641
tw,sum,0.210526,0.789474
ig,sum,0.076923,0.923077


In [135]:
df_dev_preds_inc[df_dev_preds_inc["tw"] == 1]

Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs,preds,correct
39,3239,,,#CoronaVirusInNigeria all you need to know [URL],eng,0,1,0,#CoronaVirusInNigeria all you need to know [URL],[34786],"[94072, 132196, 105339, 69868, 143378, 34228, ...",False
164,12019,"Geert Vanden Bossche, DMV, PhD, independent vi...",,In this open letter I am appealing to the [USE...,eng,0,1,0,"Geert Vanden Bossche, DMV, PhD, independent vi...",[87396],"[34739, 31264, 31174, 34515, 195132, 199626, 1...",False
247,18729,,,Strange how this keeps happening. Will someone...,eng,0,1,0,Strange how this keeps happening. Will someon...,[29276],"[62385, 144338, 42696, 63453, 72388, 20049, 52...",False
289,20444,وددار ده شمال دوستان P NA-53 Islamabad Sa سوشل...,,"Usman Mirza, the known rapist &amp; a certifie...",eng,0,1,0,وددار ده شمال دوستان P NA-53 Islamabad Sa سوشل...,[69120],"[104381, 103820, 141774, 70075, 97315, 23325, ...",False


In [136]:
df_dev_preds_inc[df_dev_preds_inc["fb"] == 1]

Unnamed: 0,post_id,ocr,verdicts,text,lan,fb,tw,ig,full_text,gs,preds,correct
19,1702,Patrick Novick 15h- It's hard to trust pollste...,Missing context,,eng,1,0,0,Patrick Novick 15h- It's hard to trust pollste...,[39666],"[108067, 42045, 41564, 4666, 75793, 197133, 55...",False
41,3595,,Missing context,#RussiaUkraineWar: Two young Ukrainian childre...,eng,1,0,0,#RussiaUkraineWar: Two young Ukrainian childr...,[104881],"[112674, 134660, 112675, 68999, 24657, 15881, ...",False
50,4287,,,*UPDATE* South Africa: Military Taking Control...,eng,1,0,0,*UPDATE* South Africa: Military Taking Contro...,[26614],"[148742, 121252, 81727, 23145, 114960, 121234,...",False
61,5045,,False information,A photojournalist rolls down tears while captu...,eng,1,0,0,A photojournalist rolls down tears while capt...,[104957],"[105665, 17236, 104960, 69659, 105664, 153506,...",False
62,5076,,Partly false information,A real life Hero! His wife passed away during ...,eng,1,0,0,A real life Hero! His wife passed away during...,[86410],"[102976, 106017, 123995, 103078, 68567, 69192,...",False
69,5727,,False information,Almost 1 million descend on iconic Bondi Beach...,eng,1,0,0,Almost 1 million descend on iconic Bondi Beac...,[132150],"[132203, 103313, 155041, 144413, 69110, 69700,...",False
76,6331,11:53 AM Forwarded just now! 11:53 AM,False information,Ay! Kayo naman! Huwak fake News hah!,other,1,0,0,11:53 AM Forwarded just now! 11:53 AM Ay! Kayo...,[103842],"[102730, 62397, 62385, 102756, 91450, 62416, 5...",False
140,11220,,False information,Happen at sengkang..please look after your kids,eng,1,0,0,Happen at sengkang..please look after your kids,[142072],"[104360, 104215, 132160, 132404, 146685, 14226...",False
144,11351,,False information,"Here’s your problem, they’re going to eat us o...",eng,1,0,0,"Here’s your problem, they’re going to eat us ...",[12526],"[104307, 52200, 8073, 132173, 132892, 132271, ...",False
145,11491,NIZAMIYA HOTEL & BAKE HOUSE NING HALL +94 4 (U...,,Hotel Nizamiya🤢 කොහෙද දන් නෑ... 👉දැනුවත් වෙන්න...,other,1,0,0,NIZAMIYA HOTEL & BAKE HOUSE NING HALL +94 4 (U...,[112378],"[25386, 35072, 148549, 41561, 138462, 103859, ...",False


In [115]:
df_dev_preds_inc[df_dev_preds_inc["ocr"].isnull()]

Unnamed: 0,post_id,ocr,text,full_text,verdicts,preds,gs,correct
39,3239,,#CoronaVirusInNigeria all you need to know [URL],#CoronaVirusInNigeria all you need to know [URL],,"[94072, 132196, 105339, 69868, 143378, 34228, ...",[34786],0
41,3595,,#RussiaUkraineWar: Two young Ukrainian childre...,#RussiaUkraineWar: Two young Ukrainian childr...,Missing context,"[112674, 134660, 112675, 68999, 24657, 15881, ...",[104881],0
50,4287,,*UPDATE* South Africa: Military Taking Control...,*UPDATE* South Africa: Military Taking Contro...,,"[148742, 121252, 81727, 23145, 114960, 121234,...",[26614],0
61,5045,,A photojournalist rolls down tears while captu...,A photojournalist rolls down tears while capt...,False information,"[105665, 17236, 104960, 69659, 105664, 153506,...",[104957],0
62,5076,,A real life Hero! His wife passed away during ...,A real life Hero! His wife passed away during...,Partly false information,"[102976, 106017, 123995, 103078, 68567, 69192,...",[86410],0
69,5727,,Almost 1 million descend on iconic Bondi Beach...,Almost 1 million descend on iconic Bondi Beac...,False information,"[132203, 103313, 155041, 144413, 69110, 69700,...",[132150],0
140,11220,,Happen at sengkang..please look after your kids,Happen at sengkang..please look after your kids,False information,"[104360, 104215, 132160, 132404, 146685, 14226...",[142072],0
144,11351,,"Here’s your problem, they’re going to eat us o...","Here’s your problem, they’re going to eat us ...",False information,"[104307, 52200, 8073, 132173, 132892, 132271, ...",[12526],0
184,13843,,Look at this what a disgrace,Look at this what a disgrace,False information,"[106104, 104215, 52200, 69860, 103697, 62027, ...",[12526],0
194,14758,,"My response to [USER] , [USER] & John Oliver [...","My response to [USER] , [USER] & John Oliver ...",,"[150015, 131591, 146095, 62981, 132404, 58266,...",[58213],0
