In [2]:
"""
A quick script that loads the dataset files. Using plain pandas alone is tricky because of the newline characters in the text data; they are handled here by the `parse_col` function.
"""

import ast
import os

import pandas as pd

# Directory that holds the trial CSV files. The path is relative, so it is
# resolved against the current working directory of the kernel.
our_dataset_path = 'data/toy_data/sample_data'

posts_path = os.path.join(our_dataset_path, 'trial_posts.csv')
fact_checks_path = os.path.join(our_dataset_path, 'trial_fact_checks.csv')
fact_check_post_mapping_path = os.path.join(our_dataset_path, 'trial_data_mapping.csv')

# Fail fast if an input file is missing, and say which one and where we looked,
# instead of raising a bare AssertionError with no context.
for path in [posts_path, fact_checks_path, fact_check_post_mapping_path]:
    assert os.path.isfile(path), (
        f"Dataset file not found: {path!r} (working directory: {os.getcwd()!r})"
    )

# Text cells are stored as Python-literal strings (tuples/lists) and may contain raw line
# breaks. `ast.literal_eval` cannot parse a string literal that spans several physical lines,
# e.g. `ast.literal_eval('("\n")')` effectively tries to interpret the following code:
#
# ```
# ("
# ")
# ```
#
# This raises a SyntaxError exception. By escaping the line breaks first we force the parser
# to read them as escape sequences inside the literal. There might be some other way to do
# this more systematically, but it is a workable fix for now.

def parse_col(s):
    """Parse one CSV cell containing a Python literal into the corresponding object.

    Parameters
    ----------
    s : str
        Raw cell text. Falsy values (e.g. the '' left by `fillna('')`) are
        returned unchanged.

    Returns
    -------
    object
        Whatever `ast.literal_eval` produces for the escaped text, or `s`
        itself when `s` is falsy.

    Raises
    ------
    SyntaxError, ValueError
        Propagated from `ast.literal_eval` when the text is not a valid literal.
    """
    if not s:
        return s
    # The Python parser treats a carriage return as a line terminator as well,
    # so escape it together with the line feed; otherwise text with CR or CRLF
    # line endings would still break the string literal.
    escaped = s.replace('\r', '\\r').replace('\n', '\\n')
    return ast.literal_eval(escaped)

# Fact checks: missing cells become '' (which `parse_col` returns unchanged),
# the frame is keyed by fact-check id, and the literal-encoded columns are decoded.
df_fact_checks = (
    pd.read_csv(fact_checks_path)
    .fillna('')
    .set_index('fact_check_id')
)
for col in ('claim', 'instances', 'title'):
    df_fact_checks[col] = df_fact_checks[col].apply(parse_col)

# Posts: same treatment, keyed by post id, with their own literal-encoded columns.
df_posts = (
    pd.read_csv(posts_path)
    .fillna('')
    .set_index('post_id')
)
for col in ('instances', 'ocr', 'verdicts', 'text'):
    df_posts[col] = df_posts[col].apply(parse_col)

# The post/fact-check mapping is used exactly as `read_csv` returns it; no column is parsed.
df_fact_check_post_mapping = pd.read_csv(fact_check_post_mapping_path)


In [4]:
# Preview the first rows of the parsed fact-check table (indexed by fact_check_id).
df_fact_checks.head()

Unnamed: 0_level_0,claim,instances,title
fact_check_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,(South Africa's PRASA hiring workers for railw...,"[(1655739360.0, https://factcheck.afp.com/doc....",(Facebook posts falsely claim rail operator in...
1,(“Merck Scraps COVID Vaccines; Says It’s More ...,"[(1611619140.0, https://healthfeedback.org/cla...",(COVID-19 vaccines currently in use stimulate ...
2,"(Photos show NATO leaders in Madrid, Spain, Ph...","[(1657251780.0, https://factcheck.afp.com/doc....",(Doctored photos of world leaders with 'Satani...
3,"(Bus crash claims 18 lives in Milima, Zambia, ...","[(1579275420.0, https://factcheck.afp.com/no-d...",(No deaths were reported but several people su...
4,(U.S. pharmaceutical company Merck said it scr...,"[(1611871602.0, https://www.usatoday.com/story...",(Fact check: Merck discontinues COVID-19 vacci...


In [24]:
# Inspect a single parsed `claim` cell. Judging by the output it is a 3-tuple that looks like
# (text, second copy or translation of the text, [(language, score), ...]) — TODO confirm
# the field meanings against the dataset documentation.
df_fact_checks.loc[0, "claim"]

("South Africa's PRASA hiring workers for railway",
 "South Africa's PRASA hiring workers for railway",
 [('eng', 1.0)])

In [5]:
# Preview the first rows of the parsed posts table (indexed by post_id).
df_posts.head()

Unnamed: 0_level_0,instances,ocr,verdicts,text
post_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[(1593519481.0, ig)]",[],[],(La Filarmonica de Paris vivenciando un bolero...
1,"[(1620729739.0, ig)]",[(MOM\nImahalo\nPatriot news\n@Grahmptr\n...\n...,[Partly False],
2,"[(1585630842.0, fb)]",[],[False information],(Así luce la Torre Latinoamericana de cdmx des...
3,"[(1648580403.0, fb)]",[],[False information],(El hombre fingió ser sordo y mudo durante 62 ...
4,"[(1515949200.0, fb)]",[],[Missing context],"(A PESAR DE SUS LOGROS, ESTO NO ES NOTICIA.\nH..."


In [30]:
# Length of every cell in the posts table: the element count of each parsed tuple/list.
# Cells that the loading step left as '' (missing values) count as 0.
# NOTE: `DataFrame.map` is the element-wise successor of `applymap` (pandas >= 2.1).
df_post_lens = df_posts.map(len)
df_post_lens.describe()

Unnamed: 0,instances,ocr,verdicts,text
count,47.0,47.0,47.0,47.0
mean,1.148936,0.255319,0.851064,2.489362
std,0.465259,0.440755,0.359875,1.139648
min,1.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,3.0
50%,1.0,0.0,1.0,3.0
75%,1.0,0.5,1.0,3.0
max,3.0,1.0,1.0,3.0


In [31]:
# Length of every cell in the fact-check table (element count of each parsed tuple/list),
# followed by summary statistics per column.
# NOTE: `DataFrame.map` is the element-wise successor of `applymap` (pandas >= 2.1).
df_fc_lens = df_fact_checks.map(len)
df_fc_lens.describe()

Unnamed: 0,claim,instances,title
count,50.0,50.0,50.0
mean,3.0,1.0,3.0
std,0.0,0.0,0.0
min,3.0,1.0,3.0
25%,3.0,1.0,3.0
50%,3.0,1.0,3.0
75%,3.0,1.0,3.0
max,3.0,1.0,3.0


In [11]:
# Keep only the pairs tagged "eng-eng" (presumably English post / English fact check —
# confirm against the dataset documentation) and preview the first few.
is_eng_pair = df_fact_check_post_mapping["pair_lang"] == "eng-eng"
df_fact_check_post_mapping.loc[is_eng_pair].head()

Unnamed: 0,post_id,fact_check_id,pair_lang
20,6,16,eng-eng
21,1,26,eng-eng
22,16,3,eng-eng
23,24,9,eng-eng
24,33,38,eng-eng


In [18]:
def display_row(df, i):
    """Print every column of the row labelled `i`, one `name: value` line per column.

    df: DataFrame to read from.
    i: index label of the row; each value is looked up with `df.loc[i, column]`.
    """
    # Iterating a DataFrame yields its column labels in order.
    for column_name in df:
        cell = df.loc[i, column_name]
        print(f"{column_name}: {cell}")

# Post 6 and fact check 16 are listed as a pair in df_fact_check_post_mapping
# (first row of the eng-eng preview), so print both rows for a side-by-side read.
display_row(df_posts, 6)
display_row(df_fact_checks, 16)


instances: [(1608091401.0, 'fb')]
ocr: []
verdicts: ['False information']
text: ("I'm old enough to remember when hydroxychloroquine was a kook theory from a kook president.", "I'm old enough to remember when hydroxychloroquine was a kook theory from a kook president.", [('eng', 1.0)])
claim: ('The AMA “reversed course”, now “giving the green light to doctors prescribing HCQ to their COVID patients.”', 'The AMA “reversed course”, now “giving the green light to doctors prescribing HCQ to their COVID patients.”', [('eng', 1.0)])
instances: [(1608249540.0, 'https://healthfeedback.org/claimreview/the-american-medical-association-does-not-reject-nor-support-hydroxychloroquine-as-a-treatment-for-covid-19-nor-did-it-change-its-position-on-the-use-matter/#82043a81c975548cfd63ba8f15be4769')]
title: ('The American Medical Association does not reject nor support hydroxychloroquine as a treatment for COVID-19, nor did it change its position on the use matter', 'The American Medical Association doe

In [20]:
# Post 1 and fact check 26 are listed as a pair in df_fact_check_post_mapping
# (second row of the eng-eng preview); this post has OCR text but an empty `text` cell.
display_row(df_posts, 1)
display_row(df_fact_checks, 26)

instances: [(1620729739.0, 'ig')]
ocr: [('MOM\nImahalo\nPatriot news\n@Grahmptr\n...\n....\nJUST IN: US Census Bureau\nConfirms HUGE CONFLICT in\nTotal Number of Voters in\n2020 Election-More People\nVoted Than Reported in 2020\nCensus Data. there is a\ndiscrepancy of nearly four\nmillion votes', 'MOM\nImahalo\nPatriot news\n@Grahmptr\n...\n....\nJUST IN: US Census Bureau\nConfirms HUGE CONFLICT in\nTotal Number of Voters in\n2020 Election-More People\nVoted Than Reported in 2020\nCensus Data. there is a\ndiscrepancy of nearly four\nmillion votes', [('eng', 0.9305148124694824), ('fil', 0.03528721258044243)])]
verdicts: ['Partly False']
text: 
claim: ('Census Bureau confirms conflict in total number of voters in 2020 election', 'Census Bureau confirms conflict in total number of voters in 2020 election', [('eng', 1.0)])
instances: [(1621021140.0, 'https://factcheck.afp.com/census-bureau-voter-tally-self-reported-not-official-election-data#ef1a31ac2b741f5621f5d1e962410144')]
title: ('Cen