# Setup

In [1]:
import requests
import json
import pandas as pd
import time
import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from crossref.restful import Works
# Shared Crossref client object; later cells look up metadata with works.doi(<doi>).
works = Works()

def get_match(author, staff, min_score=75):
    """Fuzzy-match one author name against a collection of staff names.

    Parameters
    ----------
    author : str
        The name to look up.
    staff : collection of str
        Candidate names, handed to ``process.extract`` as the choices.
    min_score : int, optional
        Threshold; a candidate is kept only when its score is strictly
        greater than this value.

    Returns
    -------
    list
        The entries produced by ``process.extract`` (scored with
        ``fuzz.token_sort_ratio``) whose score -- element 1 of each entry --
        exceeds ``min_score``, in the order ``process.extract`` returned them.
    """
    candidates = process.extract(author, staff, scorer=fuzz.token_sort_ratio)
    return list(filter(lambda candidate: candidate[1] > min_score, candidates))

In [2]:
# Staff name sheet: the first three spreadsheet rows are skipped on read.
staff = pd.read_excel("MCS_Author_Names.xlsx", skiprows=3)
# Publication records come from the 'query' sheet of the workbook.
pubs = pd.read_excel("MCS_publications_20210216.xlsx", sheet_name='query')

# Staff row count before and after dropping exact duplicate rows.
print(staff.shape[0])
staff = staff.drop_duplicates()
print(staff.shape[0])

# Reshape the 'Variant_<number>' columns into a single 'Variant' column,
# indexed by (Name, Variant_Number).
staff = pd.wide_to_long(staff, stubnames='Variant', i='Name', j='Variant_Number', sep='_')

137
137


In [3]:
# Total number of publication records.
print(len(pubs))

print(pubs.columns)
# Number of records per fiscal year.
print(pubs.groupby('Fiscal Year')['Fiscal Year'].count())

# How many publications have a DOI (rows with a missing DOI are dropped)?
dois = pubs['DOI'].dropna()
print(len(dois))

# How many distinct DOIs remain after removing duplicates?
dois = dois.drop_duplicates()
print(len(dois))

431
Index(['Title', 'CI Author', 'Publication', 'Publication Date', 'DOI',
       'Modified', 'Fiscal Year', 'Publication Status', 'Name', 'Item Type',
       'Path'],
      dtype='object')
Fiscal Year
126;#FY08     4
177;#FY18    35
190;#FY07     1
191;#FY20    51
196;#FY06     5
197;#FY05     2
199;#FY00     3
206;#FY19    47
242;#FY03     1
249;#FY02     2
253;#FY21    14
62;#FY16     47
63;#FY15     16
70;#FY17     49
81;#FY11     45
85;#FY12     25
93;#FY14      9
94;#FY09     17
97;#FY13      7
99;#FY10     30
Name: Fiscal Year, dtype: int64
364
358


# Experimenting...

In [9]:
# Look up a single known DOI and list the fields present in the returned record.
w = works.doi('10.1016/j.forpol.2018.08.002')
print(w.keys())
# 'title' and 'container-title' are indexed as lists; use the first entry of each.
title = w['title'][0]
journal = w['container-title'][0]
# Strip any trailing 'Z' from the 'created' timestamp before parsing it.
date = datetime.datetime.fromisoformat(w['created']['date-time'].rstrip('Z'))
print(date.year, journal, title, sep='\n')

dict_keys(['indexed', 'reference-count', 'publisher', 'license', 'funder', 'content-domain', 'short-container-title', 'published-print', 'DOI', 'type', 'created', 'page', 'update-policy', 'source', 'is-referenced-by-count', 'title', 'prefix', 'volume', 'author', 'member', 'reference', 'container-title', 'original-title', 'language', 'link', 'deposited', 'score', 'subtitle', 'short-title', 'issued', 'references-count', 'alternative-id', 'URL', 'relation', 'ISSN', 'issn-type', 'subject', 'assertion'])
2018
Forest Policy and Economics
Geographic factors predict wild food and nonfood NTFP collection by households across four African countries


In [6]:
# Request the list of citing works for the sample DOI and show the raw response body.
citations = requests.post(
    url='https://w3id.org/oc/index/api/v1/citations/{}'.format('10.1016/j.forpol.2018.08.002')
)
print(citations.text)
# The citation-count endpoint's JSON is indexed as a list; entry 0 carries the 'count' field.
citation_count = requests.post(
    url='https://w3id.org/oc/index/api/v1/citation-count/{}'.format('10.1016/j.forpol.2018.08.002')
)
print(citation_count.json()[0]['count'])

[
    {
        "oci": "coci => 02001030701361924302723102137252423143700020300060903-02001000106361937152427252421370200010837000837000002",
        "creation": "coci => 2020-04-07",
        "cited": "coci => 10.1016/j.forpol.2018.08.002",
        "author_sc": "coci => no",
        "journal_sc": "coci => no",
        "timespan": "coci => P1Y5M",
        "citing": "coci => 10.1371/journal.pone.0230693"
    },
    {
        "oci": "coci => 0200100080036000809040109020037020002003701070609020403-02001000106361937152427252421370200010837000837000002",
        "creation": "coci => 2020-05-22",
        "cited": "coci => 10.1016/j.forpol.2018.08.002",
        "author_sc": "coci => no",
        "journal_sc": "coci => no",
        "timespan": "coci => P1Y6M",
        "citing": "coci => 10.1080/08941920.2020.1769243"
    },
    {
        "oci": "coci => 02001000903361118242812183611181010000408-02001000106361937152427252421370200010837000837000002",
        "creation": "coci => 2020-06-24",
   

# Load data from APIs

In [18]:
# Fetch the Crossref record and the citation-count response for every DOI,
# then cache the raw records in 'publications_raw.json'.
# `out` and `skips` are created here but not filled in by this cell.
out = pd.DataFrame(columns=("DOI", "Journal", "Year", "Title", "Pub_Author", "Matched_Author", "Author_Position", "Total_Authors", "Citations"))
skips = pd.DataFrame(columns=("DOI", "Reason"))
all_w = []
for doi in dois:
    w = works.doi(doi)
    if w is None:
        print('Skipping {}, not on crossref'.format(doi))
        continue
    try:
        # requests has no default timeout; without one a stalled connection
        # blocks the loop indefinitely.
        citation_resp = requests.post('https://w3id.org/oc/index/api/v1/citation-count/{}'.format(doi), timeout=30)
        # A Response is truthy only when the request succeeded (status < 400).
        w['citation_resp'] = citation_resp.json() if citation_resp else None
    except (requests.RequestException, ValueError) as err:
        # A network failure or a non-JSON body for one DOI should not discard
        # the records already collected; store "no citation data" and move on.
        print('No citation data for {}: {}'.format(doi, err))
        w['citation_resp'] = None
    all_w.append(w)
with open('publications_raw.json', 'w') as outfile:
    json.dump(all_w, outfile)

KeyboardInterrupt: 

In [20]:
# Reload the raw publication records from the JSON cache file.
with open('publications_raw.json') as f:
    all_w = json.loads(f.read())

In [38]:
# Match every publication author against the staff name variants.
# Produces:
#   pub_list  - one row per (publication, matched staff author)
#   coauthors - staff name -> set of cleaned author names they published with
#   skips     - publications / author entries that lacked the required fields
pub_columns = ("DOI", "Journal", "Year", "Title", "Pub_Author", "Matched_Author", "Author_Position", "Total_Authors", "Citations")
# Rows are collected in plain lists and turned into DataFrames once at the end,
# instead of growing the frames one .loc row at a time.
pub_rows = []
skip_rows = []
coauthors = {}
coauthor_institutions = {}
for w in all_w:
    try:
        authors = w['author']
        journal = w.get('container-title', None)
        if isinstance(journal, list) and len(journal) > 0:
            journal = journal[0]
        title = w.get('title', None)
        if isinstance(title, list) and len(title) > 0:
            title = title[0]
        # NOTE(review): Year is taken from the 'created' timestamp, which may
        # differ from the actual publication year -- confirm this is intended.
        year = datetime.datetime.fromisoformat(w['created']['date-time'].rstrip('Z')).year
    except KeyError:
        print('Skipping {}, no author information on crossref'.format(w['DOI']))
        skip_rows.append((w['DOI'], 'missing fields on crossref'))
        continue
    citation_count = w['citation_resp'][0]['count'] if w['citation_resp'] else None

    # Record each usable author's 1-based position in the original author
    # list alongside the cleaned name, so the position survives entries that
    # are skipped for lacking a given or family name.
    authors_cleaned = []
    author_positions = []
    for position, entry in enumerate(authors, start=1):
        try:
            author = '{} {}'.format(entry['given'], entry['family'])
        except KeyError:
            print('Skipping author on {}, full author information not on crossref for {}'.format(w['DOI'], entry))
            # The reason text names the incomplete author entry (not the DOI).
            skip_rows.append((w['DOI'], 'full author information not on crossref for {}'.format(entry)))
            continue
        authors_cleaned.append(author)
        author_positions.append(position)

    for position, author in zip(author_positions, authors_cleaned):
        # TODO: only the first match returned by get_match is used; improve this.
        m = get_match(author, staff['Variant'])
        if len(m) > 0:
            m = m[0]
            # First level of the first index entry whose Variant equals the match.
            staff_name = staff.index[staff['Variant'] == m[0]][0][0]
            # Use this author's own position, not a leftover loop counter.
            pub_rows.append((w['DOI'], journal, year, title, author, staff_name, position, len(authors), citation_count))
            coauthors.setdefault(staff_name, set()).update(authors_cleaned)
            # Don't list people as coauthors of themselves...
            coauthors[staff_name].discard(author)

pub_list = pd.DataFrame(pub_rows, columns=pub_columns)
skips = pd.DataFrame(skip_rows, columns=("DOI", "Reason"))

Skipping author on 10.1371/journal.pone.0121040, full author information not on crossref for {'name': 'The PLOS ONE Staff', 'sequence': 'first', 'affiliation': []}
Skipping author on 10.1073/pnas.1714977115, full author information not on crossref for {'family': 'Onrizal', 'sequence': 'additional', 'affiliation': []}
Skipping author on 10.1073/pnas.1714977115, full author information not on crossref for {'family': 'Supriyadi', 'sequence': 'additional', 'affiliation': []}
Skipping 10.1111/j.1523-1739.2009.01300.x, no author information on crossref


# Make adjacency matrix for network analysis

In [39]:
# One (staff member, coauthor) pair for every name in each staff member's coauthor set.
edges = [
    (staff_member, partner)
    for staff_member in coauthors
    for partner in coauthors[staff_member]
]
df = pd.DataFrame(edges)
# Cross-tabulate column 0 (staff member) against column 1 (coauthor) and save the result.
adj_matrix = pd.crosstab(index=df[0], columns=df[1])
adj_matrix.to_csv('MCS_publications_coauthor_matrix.csv')

In [41]:
# Label each row by author position. Assignments run in order, so 'Senior'
# (last author on a paper with more than three authors) overrides any label
# set before it.
# .loc[mask, column] writes into pub_list itself; chained indexing
# (pub_list[column][mask] = ...) may assign to a temporary copy instead.
pub_list['Authorship_Type'] = 'Other'
pub_list.loc[pub_list['Author_Position'] == 1, 'Authorship_Type'] = 'First'
pub_list.loc[pub_list['Author_Position'] == 2, 'Authorship_Type'] = 'Second'
pub_list.loc[(pub_list['Total_Authors'] > 3) & (pub_list['Author_Position'] == pub_list['Total_Authors']), 'Authorship_Type'] = 'Senior'
pub_list.to_csv('MCS_publications_author_order.csv', index=False)
skips.to_csv('MCS_publications_skipped.csv', index=False)