In [191]:
import json
import glob
import hashlib
import requests
from collections import defaultdict
import xml.etree.ElementTree as ET
from tqdm import tqdm
from datetime import datetime

In [234]:
def get_reviewer_info(xml_root):
    review_infos = []
    for review in xml_root.iter("sub-article"):
        reviewer_info={}
        if review.attrib["article-type"] == "ref-report":
            affiliation = review.find("./front-stub/contrib-group/aff")[0].tail
            reviewer_info["reviewer_info"] = {}
            reviewer_info["reviewer_info"]["affiliation"] = affiliation
            reviewer_info["reviewer_info"]["name"] = review.find("./front-stub/contrib-group/contrib/name/given-names").text
            reviewer_info["reviewer_info"]["surname"] = review.find("./front-stub/contrib-group/contrib/name/surname").text
            try:
                date = datetime.strptime(
                    review.find("./front-stub/pub-date/day").text + "/" +
                    review.find("./front-stub/pub-date/month").text + "/" +
                    review.find("./front-stub/pub-date/year").text,
                    "%d/%m/%Y"
                )
                date = int(date.timestamp())
            except:
                print(ET.tostring(review.find("./front-stub/pub-date")))
                date = None
            reviewer_info["cdate"]= date
            rel_article = review.find("./front-stub/related-article").attrib["{http://www.w3.org/1999/xlink}href"]
            reviewer_info["about"] = rel_article
            review_infos.append((review.attrib["id"], reviewer_info))
    return review_infos

In [235]:
def get_author_info(xml_root):
    author_infos = []
    contributors = xml_root.find(".//article-meta/contrib-group")
    affs = {aff.attrib["id"]:aff[0].tail for aff in contributors.findall("aff")}
    for contrib_node in contributors.iter("contrib"):
        if contrib_node.attrib["contrib-type"] == "author":
            author_info = {}
            orcid_node = contrib_node.find("uri")
            if orcid_node is not None:
                orcid = orcid_node.text
            else:
                orcid = None
            author_info["orcid"] = orcid

            name_node=contrib_node.find("name")
            if name_node is None:
                try:
                    name_node = contrib_node.find("name-alternatives").find('name[@{http://www.w3.org/XML/1998/namespace}lang="en"]')
                except AttributeError:
                    try:
                        author_info["name"] = contrib_node.find("collab").text
                        author_info["surname"] = None
                        author_info["affiliation"] = None
                        author_info["roles"] = []
                        author_info["is_collective"] = True
                        continue
                    except AttributeError:
                        print(f"no name found: {ET.tostring(contrib_node)}")
                
            author_info["name"] = name_node.find("given-names").text
            author_info["surname"] = name_node.find("surname").text
            aff_id = contrib_node.find("xref[@ref-type='aff']").attrib["rid"]
            author_info["affiliation"] = affs[aff_id]
            author_info["roles"] = [r.text for r in contrib_node.findall("role")]
            author_info["is_collective"] = False
            author_infos.append(author_info)
    return author_infos

In [236]:
papers = defaultdict(dict)
reviews = defaultdict(dict)
for i, filen in enumerate(glob.glob("nlpeer_v0.1_nopdf/F1000/data/*/v1/*")):
    paper_id = filen.split("/")[3]
    if filen.endswith("/meta.json"):
        with open(filen) as fh:
            paper_meta_obj = json.load(fh)
        papers[paper_id].update(paper_meta_obj)

    elif filen.endswith("/paper.xml"):
        root = ET.parse(filen)
        reviewer_infos= get_reviewer_info(root)
        author_infos = get_author_info(root)
        papers[paper_id]["authors_info"] = author_infos
        for id, info in reviewer_infos:
            reviews[id].update(info)

    elif filen.endswith("/reviews.json"):
        with open(filen) as fh:
            reviews_obj = json.load(fh)
        # reviews_by_year[1] = reviews_obj
        for review in reviews_obj:
            reviews[review["rid"]].update(review)

In [237]:
def create_author_id(author_dict):
    hash_alg = hashlib.sha256()
    hash_alg.update(author_dict["name"].encode('utf-8'))
    hash_alg.update(" ".encode('utf-8'))
    hash_alg.update(author_dict["surname"].encode('utf-8'))
    hash_alg.update(" ".encode('utf-8'))
    hash_alg.update(author_dict["affiliation"].encode('utf-8'))
    return hash_alg.hexdigest()[:20]

authors = {}
for paper in papers.values():
    paper_authors = paper["authors_info"]
    for author in paper_authors:
        id = create_author_id(author)
        author["pid"] = id
        if author['orcid'] is not None:
            # check if author is present without orcid somewhere
            prev_roles = set()
            prev_papers = []
            if id in authors:
                prev_roles = set(authors[id]["roles"])
                del authors[id]
            # create the author identified by orcid
            if not author['orcid'] in authors:
                authors[author['orcid']]=author
            # update the roles
            else:
                authors[author['orcid']]["roles"] = set(authors[author['orcid']]["roles"]) | set(author["roles"]) | prev_roles
        else:
            if not id in authors:
                authors[id] = author
            else:
                authors[id]["roles"] = set(authors[id]["roles"]) | set(author["roles"])


In [238]:
reviewers = {}

for review in reviews.values():
    id = create_author_id(review["reviewer_info"])
    review["reviewer_info"]["reviewer_id"] = id
    reviewers[id] = {
        "name": review["reviewer_info"]["name"],
        "surname": review["reviewer_info"]["surname"],
        "affiliation": review["reviewer_info"]["affiliation"],
    }

In [239]:
len(reviewers)

9923

In [240]:
print(len(authors))

22588


In [241]:
print(len(authors.keys()&reviewers.keys()))

19


In [242]:
print(len(reviews))

10418


In [243]:
for r in reviewers.values():
    print(json.dumps(r, indent=3))
    break

{
   "name": "Feng",
   "surname": "Shi",
   "affiliation": "Cedars-Sinai Medical Center, Los Angeles, CA, USA"
}


In [244]:
len(papers)

4949

In [217]:
papers['6-93']

{'authors_info': [{'orcid': 'https://orcid.org/0000-0003-2018-396X',
   'name': 'Hugues',
   'surname': 'Gentillon',
   'affiliation': 'Department of Radiology and Diagnostic Imaging, Barlicki University Hospital, Medical University of Łódź, Łódź, Poland',
   'roles': set(),
   'is_collective': False,
   'pid': '7b1f41cd8e56b9817344'},
  {'orcid': None,
   'name': 'Ludomir',
   'surname': 'Stefańczyk',
   'affiliation': 'Department of Radiology and Diagnostic Imaging, Barlicki University Hospital, Medical University of Łódź, Łódź, Poland',
   'roles': [],
   'is_collective': False,
   'pid': '60b11332ef3def5a727d'},
  {'orcid': None,
   'name': 'Michał',
   'surname': 'Strzelecki',
   'affiliation': 'Institute of Electronics, The Faculty of Electrical, Electronic, Computer and Control Engineering, Technical University of Łódź, Łódź, Poland',
   'roles': [],
   'is_collective': False,
   'pid': '1a8083e54b35a2b4dbe6'},
  {'orcid': None,
   'name': 'Maria',
   'surname': 'Respondek-Liber

In [218]:
reviews_by_year = {}
key_count = defaultdict(int)
for i, filen in enumerate(glob.glob("nlpeer_v0.1_nopdf/F1000/data/*/v1/reviews.json")):
    with open(filen) as fh:
        reviews_obj = json.load(fh)
    # reviews_by_year[1] = reviews_obj
    print(json.dumps(reviews_obj, indent = 3))
    break

[
   {
      "rid": "report23366",
      "reviewer": [
         "Feng Shi"
      ],
      "report": {
         "main": "The authors proposed a sample sorting method for fetal MR images. Below are several suggestions to potentially improve the clarity of the paper. \nThe Introduction stated that the goal of this work is to further improve the MaZda software. The authors may consider enlarging its audience size by introducing how this data could benefit other researchers in the fetal research community. \nThe sample sorting seems totally manual, which may be more efficient with the help of some machine learning algorithms. \nExperiments could be added to evaluate the performance/correctness of the sample sorting process. \nSome details of the data itself could be useful for readers, such as the final data number, demographic information. The dataset 1 for downloading seem only contain 3 subjects. \nAre sufficient details of methods and materials provided to allow replication by others? \

In [18]:
key_count

defaultdict(int, {'overall': 10418})

In [54]:
API_URL="https://f1000research.com/extapi/"
article_types = [    
# "BRIEF_REPORT",
"CASE_REPORT",
"CLINICAL_PRACTICE_ARTICLE",
# "CORRESPONDENCE",
# "DATA_NOTE",
# "EDITORIAL",
"METHOD_ARTICLE",
"RESEARCH_ARTICLE" ,
"REVIEW",
# "RESEARCH_NOTE",
# "STUDY_PROTOCOL",
"SYSTEMATIC_REVIEW",
# "OPINION_ARTICLE",
# "SOFTWARE_TOOLS",
# "ANTIBODY_VALIDATION_ARTICLE",
# "RETRACTION",
]
search_query = 'search?q=R_TY:"' + '" OR R_TY:"'.join(article_types[4:5]) + '"'
print('wt=json&' + search_query)

wt=json&search?q=R_TY:"REVIEW"


In [55]:
res = requests.get(f"{API_URL}{search_query}", headers = {'Accept': 'application/json'})

In [56]:
res.status_code

200

In [57]:
result = res.json()

In [70]:
print(len(result["doi"]))
for doi in result["doi"]:
    print(doi)
    article_query = f"article/xml?wt=json&doi={doi}"
    res = requests.get(f"{API_URL}{article_query}") 
                    #    headers={'Accept': 'application/json'})
    print(res.status_code)
    root = ET.fromstring(res.text)
    for sub_article in root.iter("sub-article"):
        if sub_article.attrib["article-type"]=="reviewer-report":
            review_id = 
    break


100
10.12688/f1000research.2-62.v1
200
<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.2 20190208//EN" "http://jats.nlm.nih.gov/publishing/1.2/JATS-journalpublishing1.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML"
         xmlns:xlink="http://www.w3.org/1999/xlink"
         article-type="review-article"
         dtd-version="1.2"
         xml:lang="en">
    <front>
        <journal-meta>
            <journal-id journal-id-type="pmc">F1000Research</journal-id>
            <journal-title-group>
                <journal-title>F1000Research</journal-title>
            </journal-title-group>
            <issn pub-type="epub">2046-1402</issn>
            <publisher>
                <publisher-name>F1000 Research Limited</publisher-name>
                <publisher-loc>London, UK</publisher-loc>
            </publisher>
        </journal-meta>
        <article-meta>
            <article-id pub-id-type="doi">10.1