In [12]:
import json

In [13]:
DATA_PATH = "../data"

In [None]:
def extract_data(data_path):
    with(open(data_path, 'r') as file):
        data = json.load(file)

    paper_data = data['abstracts-retrieval-response']

    title = paper_data['coredata'].get('dc:title', '')
    description = paper_data['coredata'].get('dc:description', '')
    coverdate = paper_data['coredata'].get('prism:coverDate', '')
    publication_name = paper_data['coredata'].get('prism:publicationName', '')
    citation_count = paper_data['coredata'].get('citedby-count', '')

    subject_areas = []
    for subject in paper_data['subject-areas']['subject-area']:
        subject_areas.append(subject['$'])


    author_names = []
    for author in paper_data['authors']['author']:
        author_names.append(author['ce:indexed-name'])

    affiliations = []

    if isinstance(paper_data['affiliation'], dict):
        # paper_data['affiliation'] = [paper_data['affiliation']['affilname']]
        affiliations.append(paper_data['affiliation']['affilname'])
    else:
        for affiliation in paper_data['affiliation']:
            affiliations.append(affiliation['affilname'])

    countries = set()

    if isinstance(paper_data['affiliation'], dict):
        country = paper_data['affiliation'].get('affiliation-country', '')
        if country:
            countries.add(country)
    else:
        for affiliation in paper_data['affiliation']:
            country = affiliation.get('affiliation-country', '')
            if country:
                countries.add(country)

    return {
        "title": title,
        "description": description,
        "coverdate": coverdate,
        "publication_name": publication_name,
        "citation_count": citation_count,
        "subject_areas": subject_areas,
        "author_names": author_names,
        "affiliations": affiliations,
        "countries": list(countries)
    }

In [15]:
extract_data('../data/2018/201800000.json')

{'title': 'Public health and international epidemiology for radiology',
 'description': '',
 'coverdate': '2018-12-31',
 'publication_name': 'Radiology in Global Health: Strategies, Implementation, and Applications',
 'citation_count': '1',
 'subject_areas': ['Medicine (all)'],
 'author_names': ['Pongpirul K.', 'Lungren M.P.'],
 'affiliations': ['Stanford University School of Medicine',
  'Chulalongkorn University',
  'Bumrungrad International Hospital',
  'Stanford Healthcare',
  'Stanford University',
  'Johns Hopkins Bloomberg School of Public Health'],
 'countries': ['United States', 'Thailand']}

In [25]:
import os

f = open('paper_data.csv', 'w')

columns = ['title', 'description', 'year', 'coverdate', 'publication_name', 'citation_count', 'subject_areas', 'author_names', 'affiliations', 'countries', 'file_name']
f.write('\t'.join(columns) + '\n')

for year in os.listdir(DATA_PATH):
    print(f"Year: {year}")
    if year != '2018':
        continue

    year_path = os.path.join(DATA_PATH, year)
    if not os.path.isdir(year_path):
        continue

    for file_name in os.listdir(year_path):
        print(f" - {file_name}")
        extract_data_path = os.path.join(year_path, file_name)
        paper_info = extract_data(extract_data_path)
        row = [
            paper_info['title'],
            paper_info['description'],
            year,
            paper_info['coverdate'],
            paper_info['publication_name'],
            str(paper_info['citation_count']),
            ', '.join(paper_info['subject_areas']),
            ', '.join(paper_info['author_names']),
            ', '.join(paper_info['affiliations']),
            ', '.join(paper_info['countries']),
            file_name
        ]
        f.write('\t'.join(row) + '\n')        

f.close()

Year: 2018
 - 201802787.json


AttributeError: 'str' object has no attribute 'keys'