In [1]:
# sources:
# MultiFC - evidence based fact checking: https://competitions.codalab.org/competitions/21163 (pending)
# PHEME - rumour detection: https://figshare.com/articles/PHEME_dataset_for_Rumour_Detection_and_Veracity_Classification/6392078
# Fake news: https://www.kaggle.com/mrisdal/fake-news
# Liar, Liar Pants On Fire - fake news: https://github.com/thiagorainmaker77/liar_dataset
# Fake News Challenge: https://github.com/FakeNewsChallenge/fnc-1-baseline 
# Fever - fake news workshops: https://fever.ai/

# PHEME is absolutely massive and distributed across many files

#import files
import zipfile, tarfile
import requests

# Open every locally stored dataset archive read-only; members are read lazily later.
fake = zipfile.ZipFile('../data/fake-news.zip', 'r')
fnc = zipfile.ZipFile('../data/fnc-1-master.zip', 'r')
liar = zipfile.ZipFile('../data/liar_dataset.zip', 'r')
pheme = zipfile.ZipFile('../data/PHEME.zip', 'r')
# FEVER is fetched over HTTP. A timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() fails here (instead of later, when an
# HTTP error page would be fed to the JSON-lines parser).
fever = requests.get('https://s3-eu-west-1.amazonaws.com/fever.public/fever2-fixers-dev.jsonl',
                     timeout=30)
fever.raise_for_status()
rumor = zipfile.ZipFile('../data/rumor-citation.zip', 'r')

# List the contents of each zip archive so the member names are visible below.
zips = [fake, fnc, liar, pheme, rumor]
for file in zips:
    file.printdir()
    print('\n')

# ZipFile.extract writes the member to the current working directory and returns
# the extracted path, which is then opened as a tar archive.
pheme_tar = tarfile.open(pheme.extract('PHEME_veracity.tar.bz2'))

print(fever)
print("\n")
print(pheme_tar)

File Name                                             Modified             Size
fake.csv                                       2019-09-20 02:14:32     56680002


File Name                                             Modified             Size
fnc-1-master/                                  2017-06-14 23:58:08            0
fnc-1-master/README.md                         2017-06-14 23:58:08          962
fnc-1-master/competition_test_bodies.csv       2017-06-14 23:58:08      2045680
fnc-1-master/competition_test_stances.csv      2017-06-14 23:58:08      2177588
fnc-1-master/competition_test_stances_unlabeled.csv 2017-06-14 23:58:08      1940688
fnc-1-master/scorer.py                         2017-06-14 23:58:08         4617
fnc-1-master/test_bodies.csv                   2017-06-14 23:58:08      2045680
fnc-1-master/test_stances_unlabeled.csv        2017-06-14 23:58:08      1940688
fnc-1-master/train_bodies.csv                  2017-06-14 23:58:08      3752301
fnc-1-master/train_stances.csv   

In [4]:
# Read each dataset out of its archive and turn it into a pandas DataFrame.

import io
import json

import pandas as p


def _open_text(archive, member):
    """Return a text-mode stream over `member` inside the zip `archive`."""
    # NOTE(review): no encoding is passed, so TextIOWrapper uses its default —
    # confirm that matches the encoding of the dataset files.
    return io.TextIOWrapper(archive.open(member))


# Column names supplied to read_table for both LIAR splits.
LIAR_COLUMNS = ['id', 'label', 'statement',
                'subject', 'speaker', 'job',
                'state', 'party', 'barely_true_c',
                'false_c', 'half_true_c', 'mostly_true_c',
                'pants_on_fire_c', 'venue']

# Step 1: open text streams over the archive members.
fake_csv = _open_text(fake, 'fake.csv')
fnc_train_csvs = [
    _open_text(fnc, 'fnc-1-master/train_bodies.csv'),
    _open_text(fnc, 'fnc-1-master/train_stances.csv'),
]
fnc_test_csvs = [
    _open_text(fnc, 'fnc-1-master/test_bodies.csv'),
    _open_text(fnc, 'fnc-1-master/test_stances_unlabeled.csv'),
]
liar_train_tsv = _open_text(liar, 'train.tsv')
liar_test_tsv = _open_text(liar, 'test.tsv')
rumor_csvs = (
    _open_text(rumor, 'emergent.csv'),
    _open_text(rumor, 'politifact.csv'),
    _open_text(rumor, 'snopes.csv'),
)

# Step 2: parse the streams. Index 0 of each FNC list is bodies, index 1 is stances.
fake_df = p.read_csv(fake_csv)
fnc_train_dfs = [p.read_csv(stream) for stream in fnc_train_csvs]
fnc_test_dfs = [p.read_csv(stream) for stream in fnc_test_csvs]
liar_train_df = p.read_table(liar_train_tsv, names=LIAR_COLUMNS)
liar_test_df = p.read_table(liar_test_tsv, names=LIAR_COLUMNS)
rumor_dfs = [p.read_csv(stream) for stream in rumor_csvs]

# The FEVER response body is JSON Lines: one JSON document per line.
feverlines = list(map(json.loads, fever.text.splitlines()))
fever_df = p.DataFrame(feverlines)

In [3]:
import re
from itertools import groupby

# Keep tar members whose name contains '.json' and has no underscore anywhere in it.
pheme_json_names = [
    member for member in pheme_tar.getnames()
    if '.json' in member and '_' not in member
]


def keyfunc(text):
    """Grouping key: the path component right after a `rumour(s)/` directory.

    Falls back to the full name when the pattern does not match.
    """
    captured = re.findall(".*rumours?/(.*?)/.*", text)
    return captured[0] if captured else text


# groupby only merges adjacent items, so the names are sorted first.
grouped_members = groupby(sorted(pheme_json_names), key=keyfunc)
pheme_json_names = [list(members) for _group_key, members in grouped_members]
pheme_json_names

[['all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/annotation.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552785249420447745.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552786761534144512.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552786803884060672.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552786954656710656.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552787979224092672.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552788534269341696.json',
  'all-rnr-annotated-threads/charliehebdo-all-rnr-threads/non-rumours/552784600502915072/reactions/552790305263849472.json',
  'all-rnr-annotat

In [10]:
# chardet is imported here but not used anywhere in this cell.
import chardet

print(len(pheme_json_names))


def _load_member(member_name):
    """Read one tar member, decode it as UTF-8 and parse it as JSON."""
    raw_bytes = pheme_tar.extractfile(member_name).read()
    return json.loads(raw_bytes.decode('utf_8'))


# Only the first 10 groups are parsed; each group becomes one inner list.
pheme_jsons = [
    [_load_member(member_name) for member_name in group]
    for group in pheme_json_names[:10]
]

pheme_jsons

6425
here
here
here
here
here
here
here
here
here
here


[[{'is_rumour': 'nonrumour'},
  {'contributors': None,
   'truncated': False,
   'text': 'Now 10 dead in a shooting there today RT "@BBCDanielS: Charlie Hebdo became well known for publishing the Muhammed cartoons two years ago”',
   'in_reply_to_status_id': 552784600502915072,
   'id': 552785249420447745,
   'favorite_count': 0,
   'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
   'retweeted': False,
   'coordinates': None,
   'entities': {'symbols': [],
    'user_mentions': [{'id': 331658004,
      'indices': [42, 53],
      'id_str': '331658004',
      'screen_name': 'BBCDanielS',
      'name': 'Daniel Sandford'}],
    'hashtags': [],
    'urls': []},
   'in_reply_to_screen_name': 'BBCDanielS',
   'id_str': '552785249420447745',
   'retweet_count': 0,
   'in_reply_to_user_id': 331658004,
   'favorited': False,
   'user': {'follow_request_sent': False,
    'profile_use_background_image': True,
    'profile_text_color': '333333',
    'd

In [33]:
# Flatten every parsed record from every group in pheme_jsons into one table.
#
# The previous loop called DataFrame.append and threw away its return value;
# append never modified the frame in place, so pheme_df always stayed empty
# (and DataFrame.append no longer exists in pandas 2.x). json_normalize accepts
# a list of dicts directly, flattens nested keys into dotted column names and
# fills columns missing from a record with NaN. It returns an empty DataFrame
# for an empty list.
pheme_records = [record for group in pheme_jsons for record in group]
pheme_df = p.json_normalize(pheme_records)

pheme_df

   is_rumour
0  nonrumour
  contributors  truncated                                               text  \
0         None      False  Now 10 dead in a shooting there today RT "@BBC...   

   in_reply_to_status_id                  id  favorite_count  \
0     552784600502915072  552785249420447745               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/iphone" r...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0              BBCDanielS  ...                   300         robbylevy   

   user.notifications  user.url                 user.created_at  \
0               False      None  Thu Dec 25 05:12:43 +0000 2008   

  user.contributors_enabled              user.time_zone user.protected  \
0                     False  Central Time (US & Canada)          False   

  user.default_profile user.is_translator  
0                False              False  

[1 

[1 rows x 65 columns]
  contributors  truncated                                               text  \
0         None      False  @GabTarquini @BBCDanielS @BBCWorld Anyways, I ...   

   in_reply_to_status_id                  id  favorite_count  \
0     552792654262849537  552792748320100352               1   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0              S_Jakobsen  ...                  2129        S_Jakobsen   

   user.notifications  user.url                 user.created_at  \
0               False      None  Mon May 09 17:27:22 +0000 2011   

  user.contributors_enabled user.time_zone user.protected  \
0                     False     Copenhagen          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 65 columns]
  contribut

  contributors  truncated                        text  in_reply_to_status_id  \
0         None      False  @usmanka fuck off you cunt     552792981020086272   

                   id  favorite_count  \
0  552864675743158272               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0                 usmanka  ...                   553   pepsiplusconker   

   user.notifications  user.url                 user.created_at  \
0               False      None  Sat Jun 16 11:07:34 +0000 2012   

  user.contributors_enabled user.time_zone user.protected  \
0                     False      Amsterdam          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 64 columns]
  contributors  truncated                                               text  \

  contributors  truncated                                               text  \
0         None      False  Charlie Hebdo became well known for publishing...   

  in_reply_to_status_id                  id  favorite_count  \
0                  None  552784600502915072              41   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0                    None  ...                     0        BBCDanielS   

  user.notifications                user.url                 user.created_at  \
0              False  http://t.co/tPNR3GoVZJ  Fri Jul 08 14:32:54 +0000 2011   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         Moscow          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 66 columns]
  5527846

  contributors  truncated                                               text  \
0         None      False  @JamesDelingpole Shove your racist website up ...   

   in_reply_to_status_id                  id  favorite_count  \
0     552785391653494784  552785793375539200               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0         JamesDelingpole  ...                   180      Mr_JDTraynor   

   user.notifications                 user.url  \
0               False  https://t.co/szhDON9GcD   

                  user.created_at user.contributors_enabled user.time_zone  \
0  Sun Sep 23 23:17:26 +0000 2012                     False      Amsterdam   

  user.protected user.default_profile user.is_translator  
0          False                False              False  

[1 rows x 65 columns]
  c

  contributors  truncated                                            text  \
0         None      False  @Mr_JDTraynor @JamesDelingpole Oh the irony...   

   in_reply_to_status_id                  id  favorite_count  \
0     552790186904788992  552791519376470017               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0            Mr_JDTraynor  ...                  1488       sollygratia   

   user.notifications                user.url                 user.created_at  \
0               False  http://t.co/8ftmf9XOqk  Thu Jun 19 14:38:43 +0000 2014   

  user.contributors_enabled user.time_zone user.protected  \
0                     False           None          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 66 columns]
  contribut

  contributors  truncated                                               text  \
0         None      False  @Mr_JDTraynor What do you think an acceptable ...   

   in_reply_to_status_id                  id  favorite_count  \
0     552803015443619840  552804453477519360               1   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/android" ...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0            Mr_JDTraynor  ...                   304        mkingscott   

   user.notifications                user.url                 user.created_at  \
0               False  http://t.co/bvAHaFKBOj  Tue Jun 25 13:01:58 +0000 2013   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 66 columns]
  con

  contributors  truncated                               text  \
0         None      False  @GuidoFawkes Absolutely terrible!   

   in_reply_to_status_id                  id  favorite_count  \
0     552786116404072448  552789980666662912               0   

                                              source  retweeted coordinates  \
0  <a href="http://www.twitter.com" rel="nofollow...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0             GuidoFawkes  ...                   133    SherrinThePain   

   user.notifications  user.url                 user.created_at  \
0               False      None  Tue Sep 28 12:50:02 +0000 2010   

  user.contributors_enabled user.time_zone user.protected  \
0                     False      Amsterdam          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 64 columns]
  contributors  truncated          text  in_reply_to_status_id  \

[1 rows x 74 columns]
  contributors  truncated                                               text  \
0         None      False  @stefandevries UK press agency here. Can we se...   

   in_reply_to_status_id                  id  favorite_count  \
0     552786340958715904  552787020624691200               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0           stefandevries  ...                    14       REXrequests   

   user.notifications                user.url                 user.created_at  \
0               False  http://t.co/JK20eldZqn  Fri Mar 27 17:18:52 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False      Amsterdam          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 ro

  contributors  truncated                                               text  \
0         None      False  “@Observers: Police car with bullet holes in f...   

   in_reply_to_status_id                  id  favorite_count  \
0     552787144373460992  552790720906788864               1   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/#!/download/ipad" ...      False        None   

  in_reply_to_screen_name  ... user.screen_name  user.notifications  \
0               Observers  ...   Speaker_Bureau               False   

                 user.url                 user.created_at  \
0  http://t.co/YeRQhJ9kAv  Mon Sep 14 22:44:31 +0000 2009   

  user.contributors_enabled user.time_zone  user.protected  \
0                     False         London           False   

  user.default_profile user.is_translator  \
0                 True              False   

                             extended_entities.media  
0  [{'source_u

  contributors  truncated                                               text  \
0         None      False  Police car with bullet holes in front of Charl...   

  in_reply_to_status_id                  id  favorite_count  \
0                  None  552787144415404032              11   

                                              source  retweeted coordinates  \
0  <a href="https://about.twitter.com/products/tw...      False        None   

  in_reply_to_screen_name  ... user.screen_name  user.notifications  \
0                    None  ...       JulienPain               False   

                 user.url                 user.created_at  \
0  http://t.co/voM2SW5JHQ  Wed Oct 24 11:44:31 +0000 2007   

  user.contributors_enabled user.time_zone  user.protected  \
0                     False          Paris           False   

  user.default_profile user.is_translator  \
0                 True              False   

                             extended_entities.media  
0  [{'source_use

  contributors  truncated                                               text  \
0         None      False  @john_mcguirk @RobDoyle1 So we already know wh...   

   in_reply_to_status_id                  id  favorite_count  \
0     552787321561812992  552792788862246912               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0            john_mcguirk  ...                    39         drmcarley   

   user.notifications                user.url                 user.created_at  \
0               False  http://t.co/PXA04RxXm4  Sat Jul 06 20:45:15 +0000 2013   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 66 columns]
  con

  contributors  truncated                                               text  \
0         None      False  @hannahjames40 @drmcarley @john_mcguirk @RobDo...   

   in_reply_to_status_id                  id  favorite_count  \
0     552799077440946176  552808560665849856               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/#!/download/ipad" ...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0           hannahjames40  ...                  1366        vexedeagle   

   user.notifications  user.url                 user.created_at  \
0               False      None  Wed Oct 22 17:00:27 +0000 2014   

  user.contributors_enabled user.time_zone user.protected  \
0                     False           None          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 65 columns]
  contributors  truncated        

  contributors  truncated                                               text  \
0         None      False  @MrHarryCole @Eschertology knew NewsCorp be th...   

   in_reply_to_status_id                  id  favorite_count  \
0     552787361437057024  552790944743833600               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/iphone" r...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0             MrHarryCole  ...                 72116       mackaysuzie   

   user.notifications  user.url                 user.created_at  \
0               False      None  Wed Dec 26 22:32:33 +0000 2012   

  user.contributors_enabled user.time_zone user.protected  \
0                     False           None          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 64 columns]
  contributors  truncated        

  contributors  truncated                                               text  \
0         None      False  @Rolo_Tamasi @AllenStarr1 @MrHarryCole @GazThe...   

   in_reply_to_status_id                  id  favorite_count  \
0     552798501890170880  552806496502030336               1   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/iphone" r...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0             Rolo_Tamasi  ...                  2497        Klartext89   

   user.notifications  user.url                 user.created_at  \
0               False      None  Sat Jun 08 14:13:53 +0000 2013   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         Berlin          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 65 columns]
  contributors  truncated        

  contributors  truncated                                               text  \
0         None      False  @NegiJew no-one owes you the right to listen, ...   

   in_reply_to_status_id                  id  favorite_count  \
0     552810475738914816  552811034910916608               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/iphone" r...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0                 NegiJew  ...                 19629         lacatchat   

   user.notifications  user.url                 user.created_at  \
0               False      None  Tue Jun 16 22:27:33 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 65 columns]
  contributors  truncated        

  contributors  truncated                                               text  \
0         None      False  @NegiJew ha! You big hypocrite. As if that's w...   

   in_reply_to_status_id                  id  favorite_count  \
0     552812583166959616  552813140770324480               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com/download/iphone" r...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0                 NegiJew  ...                 19667         lacatchat   

   user.notifications  user.url                 user.created_at  \
0               False      None  Tue Jun 16 22:27:33 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 65 columns]
  contributors  truncated        

[1 rows x 65 columns]
  contributors  truncated                                               text  \
0         None      False  @Klartext89 that is one thing which, as a very...   

   in_reply_to_status_id                  id  favorite_count  \
0     552817596832415744  552819048443310080               0   

                                              source  retweeted coordinates  \
0  <a href="https://about.twitter.com/products/tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0              Klartext89  ...                  1124       Rolo_Tamasi   

   user.notifications  user.url                 user.created_at  \
0               False      None  Sat Dec 19 09:24:50 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 65 columns]
  contribut

[1 rows x 64 columns]
  contributors  truncated                                               text  \
0         None      False  @fitfeather1903 you won't find anywhere that I...   

   in_reply_to_status_id                  id  favorite_count  \
0     552825251323998208  552825562772033537               0   

                                              source  retweeted coordinates  \
0  <a href="https://about.twitter.com/products/tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0          fitfeather1903  ...                  1124       Rolo_Tamasi   

   user.notifications  user.url                 user.created_at  \
0               False      None  Sat Dec 19 09:24:50 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         London          False   

  user.default_profile user.is_translator  
0                 True              False  

[1 rows x 65 columns]
  contribut

[1 rows x 65 columns]
  contributors  truncated                                               text  \
0         None      False  @dylanllyr Ond doedd neb yn gwybod adeg y tryd...   

   in_reply_to_status_id                  id  favorite_count  \
0     552864064188456961  552864274834792448               0   

                                              source  retweeted coordinates  \
0  <a href="http://twitter.com" rel="nofollow">Tw...      False        None   

  in_reply_to_screen_name  ... user.favourites_count  user.screen_name  \
0               dylanllyr  ...                   361           cridlyn   

   user.notifications  user.url                 user.created_at  \
0               False      None  Wed Jun 10 23:28:07 +0000 2009   

  user.contributors_enabled user.time_zone user.protected  \
0                     False         Hawaii          False   

  user.default_profile user.is_translator  
0                False              False  

[1 rows x 64 columns]
  contribut

In [6]:
# Report the (rows, columns) shape of every loaded DataFrame.
# Each entry pairs the heading text to print with the frame it describes.
shape_report = [
    ('Shapes\nFake:', fake_df),
    ('\nFNC bodies train:', fnc_train_dfs[0]),
    ('\nFNC stances train:', fnc_train_dfs[1]),
    ('\nFNC bodies test:', fnc_test_dfs[0]),
    ('\nFNC stances test:', fnc_test_dfs[1]),
    ('\nLiar train:', liar_train_df),
    ('\nLiar test:', liar_test_df),
    ('\nFEVER:', fever_df),
    ('\nRumor Citation Emergent:', rumor_dfs[0]),
    ('\nRumor Citation Politifact:', rumor_dfs[1]),
    ('\nRumor Citation Snopes:', rumor_dfs[2]),
]
for heading, frame in shape_report:
    print(heading)
    print(frame.shape)

Shapes
Fake:
(12999, 20)

FNC bodies train:
(1683, 2)

FNC stances train:
(49972, 3)

FNC bodies test:
(904, 2)

FNC stances test:
(25413, 2)

Liar train:
(10240, 14)

Liar test:
(1267, 14)

FEVER:
(1174, 8)

Rumor Citation Emergent:
(2145, 15)

Rumor Citation Politifact:
(2923, 12)

Rumor Citation Snopes:
(16865, 12)
