Script that breaks down our original 'articles_and_books.json' file into many CSV files to enable better data ingestion with Apache Spark.

In [32]:
import json
import os
from random import randrange

import pandas as pd
from scipy.stats import loguniform
# Load the whole JSON dump into memory. The context manager closes the file
# handle as soon as parsing is done, and the explicit encoding keeps the
# result independent of the platform's default text encoding.
with open('articles_and_books.json', encoding='utf-8') as f:
    data = json.load(f)

# Accumulators for the tables extracted from the JSON records.
publication = []   # rows: [id, title, page_start, page_end, year, n_citation, venue, keywords]
author = set()     # (author id, name, org) tuples; filled in later in the script
venue = set()      # (raw venue name, venue type) tuples
authset = set()    # (author id, name) tuples
writes = set()     # (author id, publication id) tuples
reference = set()  # pairs of publication ids; filled in later in the script
orgs = set()       # every 'org' value seen on an author (may include None)

# The records' own 'references' field is deliberately not read here.
for record in data:
    page_start = record.get('page_start')
    page_end = record.get('page_end')
    # If either end of the page range is an empty string, drop both ends.
    if page_start == '' or page_end == '':
        page_start = page_end = None

    # Every row must carry a venue cell, even when the record has no venue;
    # otherwise the keywords list would slide into the venue column.
    venue_name = None
    ven = record.get('venue')
    if ven is not None:
        venue_name = ven.get('raw')
        ven_type = ven.get('type')
        # Venues without a type default to 'C'.
        venue.add((venue_name, 'C' if ven_type is None else ven_type))

    # Keywords are the names of the record's 'fos' entries, if any.
    fos = record.get('fos')
    keywords = [] if fos is None else [field.get('name') for field in fos]

    publication.append([
        record.get('id'), record.get('title'), page_start, page_end,
        record.get('year'), record.get('n_citation'), venue_name, keywords,
    ])

    authors = record.get('authors')
    if authors is not None:
        for auth in authors:
            # The affiliation is not stored with the author here; each 'org'
            # value is only collected into a shared pool.
            authset.add((auth.get('id'), auth.get('name')))
            writes.add((auth.get('id'), record.get('id')))
            orgs.add(auth.get('org'))
    
    
# Generate random references; the original references are ignored.
# For every publication we draw a log-uniformly distributed count (truncated
# to an integer) and add that many (citing id, cited id) pairs, where the
# citing publication is picked at random among all *other* publications.
# Because `reference` is a set, repeated picks collapse into one pair.
n_pubs = len(publication)
num_ref = loguniform.rvs(1, 50, size=n_pubs).astype(int)
if n_pubs > 1:  # with fewer than two publications nobody can cite anybody
    for i, cited in enumerate(publication):
        for _ in range(num_ref[i]):
            # Draw from the n_pubs - 1 other indices and shift past i, which
            # is uniform over every index except i and always terminates.
            cites = randrange(n_pubs - 1)
            if cites >= i:
                cites += 1
            reference.add((publication[cites][0], cited[0]))

# Give every distinct (id, name) author one organization drawn uniformly at
# random from the pool of organizations seen in the data.
authset = list(authset)
orgs = list(orgs)
for auth_id, auth_name in authset:
    picked_org = orgs[randrange(len(orgs))]
    author.add((auth_id, auth_name, picked_org))
        

# Turn each accumulator into a DataFrame. Column names are passed to the
# constructor (rather than assigned afterwards) so that an empty table still
# gets its header instead of raising a length-mismatch ValueError.
publication = pd.DataFrame(
    publication,
    columns=['id', 'title', 'page_start', 'page_end', 'year', 'citations',
             'venue', 'keywords'],
)

# Sets are unordered, so they are materialized as lists before being framed.
venue = pd.DataFrame(list(venue), columns=['name', 'type'])

author = pd.DataFrame(list(author), columns=['id', 'name', 'org'])

writes = pd.DataFrame(list(writes), columns=['author', 'publication'])

# 'references' holds the citing publication id, 'referenced' the cited one.
reference = pd.DataFrame(list(reference), columns=['references', 'referenced'])


# Write every table as a ';'-separated CSV without the DataFrame index.
# The output directory is created up front so a fresh checkout does not fail
# with an OSError on the first write.
os.makedirs('dataset', exist_ok=True)
for path, frame in (
    ('dataset/publication.csv', publication),
    ('dataset/venue.csv', venue),
    ('dataset/author.csv', author),
    ('dataset/writes.csv', writes),
    ('dataset/reference.csv', reference),
):
    frame.to_csv(path, sep=';', index=False)