In [8]:
import os
import json
import argparse
import numpy as np
import pytrec_eval
from utils import read_json
from dataloader import get_data

In [9]:
def save_dataframe_to_csv(df, file_path, index=True, encoding='utf-8'):
    """
    Saves a pandas DataFrame to a CSV file, including the index by default.

    Parameters:
    - df (pd.DataFrame): The DataFrame to save.
    - file_path (str): The path where the CSV file will be saved.
    - index (bool): Whether to include the DataFrame index in the CSV file. Default is True.
    - encoding (str): The encoding to use for the CSV file. Default is 'utf-8'.

    Returns:
    - None
    """
    try:
        df.to_csv(file_path, index=index, encoding=encoding)
        print(f"DataFrame successfully saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the DataFrame: {e}")

In [13]:
TASK = 'monolingual'
LANG = 'eng'
SPLIT = 'train'

print(f"Task: {TASK}, Language: {LANG}, Split: {SPLIT}")

df_fact_checks, df_posts, df_fact_check_post_mapping = get_data('./data')
tasks = read_json(f"./data/tasks.json")



Task: monolingual, Language: eng, Split: train


In [23]:
tasks[TASK][LANG].keys() #[f'posts_{SPLIT}']

dict_keys(['fact_checks', 'posts_train', 'posts_dev'])

In [25]:
posts_split = tasks[TASK][LANG][f'posts_{SPLIT}']
print(f"Number of posts in {SPLIT} set:", len(posts_split))



Number of posts in train set: 4351


In [32]:
# posts_split

In [28]:
fact_checks = tasks[TASK][LANG]['fact_checks']
print("Number of fact checks:", len(fact_checks))



Number of fact checks: 85734


In [30]:
# fact_checks

In [33]:
## filter dataframes
df_posts_split = df_posts[df_posts.index.isin(posts_split)]
assert len(df_posts_split) == len(posts_split)

df_fact_checks = df_fact_checks[df_fact_checks.index.isin(fact_checks)]
assert len(df_fact_checks) == len(fact_checks)


In [129]:
TASK = 'monolingual'
LANG = 'eng'
SPLIT = 'train'

df_fact_checks, df_posts, df_fact_check_post_mapping = get_data('')
tasks = read_json(f"tasks.json")

posts_split = tasks[TASK][LANG][f'posts_{SPLIT}']
print(f"Number of posts in {SPLIT} set:", len(posts_split))

fact_checks = tasks[TASK][LANG]['fact_checks']
print("Number of fact checks:", len(fact_checks))

## filter dataframes
df_posts_split = df_posts[df_posts.index.isin(posts_split)]
assert len(df_posts_split) == len(posts_split)

df_fact_checks = df_fact_checks[df_fact_checks.index.isin(fact_checks)]
assert len(df_fact_checks) == len(fact_checks)

Number of posts in train set: 4351
Number of fact checks: 85734


In [38]:
df_posts_split

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,"[(1610052141.0, fb), (1610072448.0, fb)]","[(""Actually, he's a damn sight better than any...",[Missing context],
5,"[(1595754861.0, fb)]","[(""Cigarette smoking does not cause cancer."" -...",[False information],
13,"[(1473438408.0, fb)]","[(""Environmentalists"" Say Fracking is Evil 53-...",[Partly false information],
14,"[(1601665028.0, fb)]","[(""Environmentalists"" Say Fracking is Evil SAP...",[Partly false information],
15,"[(1603642996.0, fb), (1603523078.0, fb)]","[(""Environmentalists"" Say Fracking is Evil Thi...",[Partly false information],
...,...,...,...,...
28044,"[(1592947214.0, fb)]",[(COVID has killed 1.6M people in the world an...,[False information],"(🤦🏾 ♂️🙏🏾🙏🏾, 🤦🏾 ♂️🙏🏾🙏🏾, [(eng, 1.0)])"
28048,"[(1615388401.0, fb)]",[],[False information],(🤩Why Every Judge On Shark Tank Backed This Pr...
28061,"[(None, fb)]","[(Boris Johnson's dad, Stanley, wrote a novel ...",[],"(🤷, 🤷, [(eng, 1.0)])"
28063,"[(1657130896.0, fb)]",[(= HealthSite.com HEALTH A-Z NO MORE DENGUE D...,[Partly false information],"(🤷🏼 ♀️, 🤷🏼 ♀️, [(eng, 1.0)])"


In [42]:
df_posts_split['query'][62]

KeyError: 'query'

In [40]:
df_posts_split['query_ids'] = df_posts_split.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['query_ids'] = df_posts_split.index


In [43]:
df_posts_split

Unnamed: 0_level_0,instances,ocr,verdicts,text,query_ids
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,"[(1610052141.0, fb), (1610072448.0, fb)]","[(""Actually, he's a damn sight better than any...",[Missing context],,2
5,"[(1595754861.0, fb)]","[(""Cigarette smoking does not cause cancer."" -...",[False information],,5
13,"[(1473438408.0, fb)]","[(""Environmentalists"" Say Fracking is Evil 53-...",[Partly false information],,13
14,"[(1601665028.0, fb)]","[(""Environmentalists"" Say Fracking is Evil SAP...",[Partly false information],,14
15,"[(1603642996.0, fb), (1603523078.0, fb)]","[(""Environmentalists"" Say Fracking is Evil Thi...",[Partly false information],,15
...,...,...,...,...,...
28044,"[(1592947214.0, fb)]",[(COVID has killed 1.6M people in the world an...,[False information],"(🤦🏾 ♂️🙏🏾🙏🏾, 🤦🏾 ♂️🙏🏾🙏🏾, [(eng, 1.0)])",28044
28048,"[(1615388401.0, fb)]",[],[False information],(🤩Why Every Judge On Shark Tank Backed This Pr...,28048
28061,"[(None, fb)]","[(Boris Johnson's dad, Stanley, wrote a novel ...",[],"(🤷, 🤷, [(eng, 1.0)])",28061
28063,"[(1657130896.0, fb)]",[(= HealthSite.com HEALTH A-Z NO MORE DENGUE D...,[Partly false information],"(🤷🏼 ♀️, 🤷🏼 ♀️, [(eng, 1.0)])",28063


In [44]:
df_fact_checks['doc_ids'] = df_fact_checks.index

In [47]:
df_fact_checks

Unnamed: 0_level_0,claim,instances,title,doc_ids
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"( Are avocados good for you?, Are avocados go...","[(1525653998.0, https://metafact.io/factchecks...",,0
1,"( Can animals have headaches?, Can animals ha...","[(1617955634.0, https://metafact.io/factchecks...",,1
2,"( Can we help prevent Alzheimer's with diet?, ...","[(1525653998.0, https://metafact.io/factchecks...",,2
3,( Do any benefits of alcohol outweigh the risk...,"[(1525653998.0, https://metafact.io/factchecks...",,3
4,"( Does acupuncture work for headaches?, Does ...","[(1617955595.0, https://metafact.io/factchecks...",,4
...,...,...,...,...
202885,"(€500,000 COVID-19 fraud was traced to Nigeria...","[(1609372740.0, https://dubawa.org/how-true-is...","(How true is the claim that €500,000 COVID-19 ...",202885
202887,"(₹2000 note to be discarded from October 10, 2...","[(1570454615.0, https://www.boomlive.in/₹2000-...",(₹2000 Note To Be Discontinued From Oct 10? Wh...,202887
205741,(𝐀𝐝𝐢𝐝𝐚𝐬 is giving away 3000 Free Pair of Shoes...,"[(1599387035.0, https://hindi.asianetnews.com/...",(adidas मुफ्त दे रहा है 3100 जूते और टीशर्ट्स?...,205741
205742,(𝐔𝐒𝐀 𝐌𝐈𝐋𝐈𝐓𝐀𝐑𝐘 𝐀𝐓 𝐓𝐇𝐄 𝐖𝐇𝐈𝐓𝐄 𝐇𝐎𝐔𝐒𝐄 𝐀𝐑𝐑𝐄𝐒𝐓𝐈𝐍𝐆 𝐂𝐎𝐍...,"[(1654875008.0, https://leadstories.com/hoax-a...",(Fact Check: US Military Did NOT Arrest Congre...,205742


In [51]:
df_fact_checks['claim'][1]

(' Can animals have headaches?',
 ' Can animals have headaches?',
 [('eng', 1.0)])

In [66]:
doc_list = df_fact_checks['doc'].astype(str).tolist()

In [68]:
len(doc_list)

153743

In [130]:
## Extract the source language 

# concat all OCR text from source language (0th index)
df_posts_split['ocr_all_srclang'] = df_posts_split['ocr'].apply(lambda x: ' '.join([i[0] for i in x]) if x else "")

# extract text from source language (0th index)
df_posts_split['text_srclang'] = df_posts_split['text'].apply(lambda x: x[0] if x else "")

# query: OCR + text
df_posts_split['query'] = df_posts_split['ocr_all_srclang'] + ' ' + df_posts_split['text_srclang']

# extract claim and title from source language (0th index)
df_fact_checks['claim_srclang'] = df_fact_checks['claim'].apply(lambda x: x[0] if x else "")
df_fact_checks['title_srclang'] = df_fact_checks['title'].apply(lambda x: x[0] if x else "")

# doc: claim + title
df_fact_checks['doc'] = df_fact_checks['claim_srclang'] + ' ' + df_fact_checks['title_srclang']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['ocr_all_srclang'] = df_posts_split['ocr'].apply(lambda x: ' '.join([i[0] for i in x]) if x else "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_posts_split['text_srclang'] = df_posts_split['text'].apply(lambda x: x[0] if x else "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [134]:
len(df_fact_checks)

85734

In [135]:
len(df_posts_split)

4351

In [52]:
save_dataframe_to_csv(df_fact_checks,"docs_monolingual.csv")

DataFrame successfully saved to docs_monolingual.csv


In [53]:
save_dataframe_to_csv(df_posts_split ,"queries_monolingual.csv")

DataFrame successfully saved to queries_monolingual.csv


In [56]:
!pwd

/gypsum/work1/allan/anijasure/multilingual_semeval/IRMultiLingualSemEval


In [58]:
import pandas as pd
df_docs = pd.read_csv("docs_monolingual.csv")
doc_ids = df_docs['doc_ids'].tolist()

In [59]:
df_queries = pd.read_csv("queries_monolingual.csv")
query_ids = df_queries['query_ids'].tolist()

In [61]:
df_docs.columns

Index(['fact_check_id', 'claim', 'instances', 'title', 'doc_ids'], dtype='object')

In [95]:
df_docs['doc_ids'][45603]

52474

In [92]:
df_docs[df_docs['doc_ids'] == 45603]

Unnamed: 0,fact_check_id,claim,instances,title,claim_srclang,title_srclang,doc,doc_ids
39716,45603,('Diesel price has been reduced by Rs 8 in Del...,"[(1627948680.0, 'https://newsmobile.in/article...",('Fact Check: Old News About Diesel Price Redu...,Diesel price has been reduced by Rs 8 in Delhi,Fact Check: Old News About Diesel Price Reduct...,Diesel price has been reduced by Rs 8 in Delhi...,45603


In [91]:
df_queries[df_queries['query_ids'] == 3]

Unnamed: 0,post_id,instances,ocr,verdicts,text,ocr_all_srclang,text_srclang,query,query_ids
0,3,"[(1645187790.0, 'ig')]","[('""Australia 50 MILLONES de dosis de ""vacuna""...",['False'],,"""Australia 50 MILLONES de dosis de ""vacuna"" re...",,"""Australia 50 MILLONES de dosis de ""vacuna"" re...",3
1,16,"[(1633129058.0, 'fb')]","[('""Estrictamente y hablando con sentido, la c...",['Partly false information'],,"""Estrictamente y hablando con sentido, la conq...",,"""Estrictamente y hablando con sentido, la conq...",16
2,30,"[(1598378047.0, 'fb')]","[('""No es necesario creer en Dios para ser una...",['False information'],,"""No es necesario creer en Dios para ser una bu...",,"""No es necesario creer en Dios para ser una bu...",30
3,60,"[(1631046537.0, 'fb')]",[('#Artés Presidente #Save Palestine ... [USER...,['Partly false information'],,#Artés Presidente #Save Palestine ... [USER] M...,,#Artés Presidente #Save Palestine ... [USER] M...,60
4,62,"[(1649941805.0, 'fb')]","[(""#CNN: Child soldiers are ok if they are to ...",['Altered photo'],,#CNN: Child soldiers are ok if they are to def...,,#CNN: Child soldiers are ok if they are to def...,62
...,...,...,...,...,...,...,...,...,...
4967,28056,"[(1643613736.0, 'fb')]","[('Foto: EPA Es ist nur eine Frage von Zeit, b...",['Altered photo'],"('🤮', '🤮', [('und', 1.0)])","Foto: EPA Es ist nur eine Frage von Zeit, bis ...",🤮,"Foto: EPA Es ist nur eine Frage von Zeit, bis ...",28056
4968,28074,"[(1629072979.0, 'fb'), (1629072967.0, 'fb')]",[],['False information'],('🦑 Ladrão desembarcou hoje em recife olha a m...,,🦑 Ladrão desembarcou hoje em recife olha a mul...,🦑 Ladrão desembarcou hoje em recife olha a mu...,28074
4969,28087,"[(1653138895.0, 'fb')]",[('bruising runny ed 1 e Contents of the pack ...,['Missing context'],"('🧐🧐🧐', '🧐🧐🧐', [('eng', 1.0)])",bruising runny ed 1 e Contents of the pack and...,🧐🧐🧐,bruising runny ed 1 e Contents of the pack and...,28087
4970,28090,"[(1646255245.0, 'tw')]",[('Number of Covid-19 Deaths 4500 4000 3500 30...,[],('🧵Enquanto você se distrai com a invasão da R...,Number of Covid-19 Deaths 4500 4000 3500 3000 ...,🧵Enquanto você se distrai com a invasão da Rús...,Number of Covid-19 Deaths 4500 4000 3500 3000 ...,28090


In [86]:
len(query_ids)

4972

In [90]:
df_docs

Unnamed: 0,fact_check_id,claim,instances,title,claim_srclang,title_srclang,doc,doc_ids
0,0,"(' Are avocados good for you?', ' Are avocados...","[(1525653998.0, 'https://metafact.io/factcheck...",,Are avocados good for you?,,Are avocados good for you?,0
1,1,"(' Can animals have headaches?', ' Can animals...","[(1617955634.0, 'https://metafact.io/factcheck...",,Can animals have headaches?,,Can animals have headaches?,1
2,2,"("" Can we help prevent Alzheimer's with diet?""...","[(1525653998.0, 'https://metafact.io/factcheck...",,Can we help prevent Alzheimer's with diet?,,Can we help prevent Alzheimer's with diet?,2
3,3,(' Do any benefits of alcohol outweigh the ris...,"[(1525653998.0, 'https://metafact.io/factcheck...",,Do any benefits of alcohol outweigh the risks?,,Do any benefits of alcohol outweigh the risks?,3
4,4,"(' Does acupuncture work for headaches?', ' Do...","[(1617955595.0, 'https://metafact.io/factcheck...",,Does acupuncture work for headaches?,,Does acupuncture work for headaches?,4
...,...,...,...,...,...,...,...,...
153738,205744,('🇫🇷 في فرنسا ، يقرر رجال الشرطة العسكرية والم...,"[(1617976680.0, 'https://factuel.afp.com/ar/Fr...",('هذا الفيديو ليس لتحرّك الشرطة الفرنسيّة ضدّ ...,🇫🇷 في فرنسا ، يقرر رجال الشرطة العسكرية والمدن...,هذا الفيديو ليس لتحرّك الشرطة الفرنسيّة ضدّ ال...,🇫🇷 في فرنسا ، يقرر رجال الشرطة العسكرية والمدن...,205744
153739,205745,('👆This little beautiful girl was seen in Mang...,"[(1576281540.0, 'https://youturn.in/articles/c...",('மங்களூரில் பிச்சை எடுக்கும் குழுவில் மீட்கப்...,👆This little beautiful girl was seen in Mangal...,மங்களூரில் பிச்சை எடுக்கும் குழுவில் மீட்கப்பட...,👆This little beautiful girl was seen in Mangal...,205745
153740,205747,('📌إيطاليين و أجانب رجال و نساء ، أطفال و عجزة...,"[(1616693700.0, 'https://factuel.afp.com/ar/th...",('هذه الصور لطابورٍ أمام مركز توزيع مساعدات غذ...,📌إيطاليين و أجانب رجال و نساء ، أطفال و عجزة ا...,هذه الصور لطابورٍ أمام مركز توزيع مساعدات غذائ...,📌إيطاليين و أجانب رجال و نساء ، أطفال و عجزة ا...,205747
153741,205749,('🔵Confirmado... Amanhã acabarão as mensagens ...,"[(1570924680.0, 'https://www.boatos.org/tecnol...",('WhatsApp vai cobrar 0.37 centavos por mensag...,🔵Confirmado... Amanhã acabarão as mensagens gr...,WhatsApp vai cobrar 0.37 centavos por mensagem...,🔵Confirmado... Amanhã acabarão as mensagens gr...,205749


In [124]:
qrels = {}
run = {} 
run[str(3)] = {} 
qrels[str(3)] = {}
qrels['3']['50973'] = 1 
run['3']['52474'] = '0.69548714'

In [115]:
type(qrels)

dict

In [116]:
run

{'3': {'52474': 0.69548714, '50973': 0.6737824}}

In [125]:
evaluator = pytrec_eval.RelevanceEvaluator(qrels,['P_3', 'P_5', 'P_10', 'map_cut_3', 'map_cut_5', 'map_cut_10', 'ndcg_cut_3', 'ndcg_cut_5', 'ndcg_cut_10'])
results_scores = evaluator.evaluate(run)
print(results_scores)

TypeError: Unable to extract query/object scores.

In [103]:
type(qrels)

dict

In [106]:
evaluator

<pytrec_eval.RelevanceEvaluator at 0x71a6e4293d80>