# Setup

In [3]:
import requests
import json
import pandas as pd
import time
import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from crossref.restful import Works
works = Works()

def get_match(author, staff, min_score=75):
    m = process.extract(author, staff,
                        scorer=fuzz.token_sort_ratio)
    m = [item for item in m if item[1] > min_score]
    return m

In [8]:
staff = pd.read_excel("MCS_Author_Names.xlsx", skiprows=3)
pubs = pd.read_excel("MCS_publications_20220401.xlsx", sheet_name='query')
print('Number of pubs {}'.format(len(pubs)))
# How many staff?
print('Number of staff before dropping dupes {}'.format(len(staff)))
staff = staff.drop_duplicates()
print('Number of staff after dropping dupes {}'.format(len(staff)))
staff = pd.wide_to_long(staff, 'Variant', i='Name', j='Variant_Number', sep='_')

Number of pubs 493
Number of staff before dropping dupes 137
Number of staff after dropping dupes 137


In [9]:
# How many pubs?
print(len(pubs))

print(pubs.columns)
print(pubs.groupby(['Fiscal Year'])['Fiscal Year'].count())

# How many missing DOIs?
dois = pubs.DOI.dropna()
print((len(dois)))
dois = dois.drop_duplicates()

# How many remaining dois?
print((len(dois)))

493
Index(['Title', 'CI Author', 'Publication', 'Publication Date', 'DOI',
       'Modified', 'Fiscal Year', 'Publication Status', 'Name',
       'Contains/Mentions Data From Indonesia',
       'Does this study have spatial outputs?', 'URL to spatial data',
       'Item Type', 'Path'],
      dtype='object')
Fiscal Year
126;#FY08     4
177;#FY18    35
190;#FY07     1
191;#FY20    52
196;#FY06     5
197;#FY05     2
199;#FY00     3
206;#FY19    51
242;#FY03     1
249;#FY02     2
253;#FY21    43
268;#FY22    27
62;#FY16     47
63;#FY15     16
70;#FY17     49
81;#FY11     45
85;#FY12     25
93;#FY14      9
94;#FY09     17
97;#FY13      7
99;#FY10     30
Name: Fiscal Year, dtype: int64
423
417


# Experimenting...

In [4]:
w = works.doi('10.1016/j.forpol.2018.08.002')
print(w.keys())

dict_keys(['indexed', 'reference-count', 'publisher', 'license', 'funder', 'content-domain', 'short-container-title', 'published-print', 'DOI', 'type', 'created', 'page', 'update-policy', 'source', 'is-referenced-by-count', 'title', 'prefix', 'volume', 'author', 'member', 'reference', 'container-title', 'original-title', 'language', 'link', 'deposited', 'score', 'subtitle', 'short-title', 'issued', 'references-count', 'alternative-id', 'URL', 'relation', 'ISSN', 'issn-type', 'subject', 'assertion'])


In [6]:
altmetrics_key = '3372dd4fc4d9473c96543743c404e0ca'
doi = '10.1016/j.forpol.2018.08.002'
resp = requests.get('https://api.altmetric.com/v1/doi/{}?key={}'.format(doi, altmetrics_key))
print(resp)
print(resp.json())

<Response [401]>


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [7]:
journal = w['container-title'][0]
title = w['title'][0]
date = datetime.datetime.fromisoformat(w['created']['date-time'].rstrip('Z'))
print(date.year)
print(w['author'])
print(title)

2018
[{'given': 'Matthew', 'family': 'Cooper', 'sequence': 'first', 'affiliation': []}, {'ORCID': 'http://orcid.org/0000-0002-6008-4918', 'authenticated-orcid': False, 'given': 'Alex', 'family': 'Zvoleff', 'sequence': 'additional', 'affiliation': []}, {'given': 'Mariano', 'family': 'Gonzalez-Roglich', 'sequence': 'additional', 'affiliation': []}, {'given': 'Felly', 'family': 'Tusiime', 'sequence': 'additional', 'affiliation': []}, {'given': 'Mark', 'family': 'Musumba', 'sequence': 'additional', 'affiliation': []}, {'given': 'Monica', 'family': 'Noon', 'sequence': 'additional', 'affiliation': []}, {'given': 'Peter', 'family': 'Alele', 'sequence': 'additional', 'affiliation': []}, {'given': 'Madeleine', 'family': 'Nyiratuza', 'sequence': 'additional', 'affiliation': []}]
Geographic factors predict wild food and nonfood NTFP collection by households across four African countries


In [8]:
import sidewall
from sidewall import dimensions

ModuleNotFoundError: No module named 'sidewall'

In [9]:
citations = requests.post('https://w3id.org/oc/index/api/v1/coauthorship/{}'.format('10.1016/j.forpol.2018.08.002'))
print(citations.text)
#citation_count = requests.post('https://w3id.org/oc/index/api/v1/citation-count/{}'.format('10.1016/j.forpol.2018.08.002'))
#print(citation_count.json()[0]['count'])

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>Unhandled Exception</title>
</head><body>
<h1>Unhandled Exception</h1>
<p>An unhandled exception was thrown by the application.</p>
</body></html>



# Load data from APIs

In [10]:
out = pd.DataFrame(columns=("DOI", "Journal", "Year", "Title", "Pub_Author", "Matched_Author", "Author_Position", "Total_Authors", "Citations"))
skips = pd.DataFrame(columns=("DOI", "Reason"))
all_w = []
for doi in dois:
    w = works.doi(doi)
    if w is None:
        print('Skipping {}, not on crossref'.format(doi))
        continue
    citation_resp = requests.post('https://w3id.org/oc/index/api/v1/citation-count/{}'.format(doi))
    if citation_resp:
        w['citation_resp'] = citation_resp.json()
    else:
        w['citation_resp'] = None
    all_w.append(w)
with open('publications_raw.json', 'w') as outfile:
    json.dump(all_w, outfile)

Skipping 10.17159/2222-3436/2016/v19n5a2, not on crossref
Skipping 10.1126/science.aau2650, not on crossref
Skipping 10.5281/zenodo.1463063, not on crossref
Skipping 10.5281/zenodo.3386441, not on crossref
Skipping https://doi.org/10.1007/s00267- 021-01446-1, not on crossref
Skipping 10.2305/IUCN.CH.2021.PARKS‐27‐1HJ.en, not on crossref
Skipping https://doi.org/10.1016/j.gloenvcha.2021.1023680959-3780, not on crossref
Skipping https://doi.org/10.1007/s13280- 021-01628-5., not on crossref


In [11]:
with open('publications_raw.json', 'r') as f:
    all_w = json.load(f)

In [12]:
pub_list = pd.DataFrame(columns=("DOI",
                                 "Journal",
                                 "Year",
                                 "Title",
                                 "Pub_Author",
                                 "Matched_Author",
                                 "Author_Position",
                                 "Total_Authors",
                                 "Citations"))
coauthors = {}
coauthor_institutions = {}
skips = pd.DataFrame(columns=("DOI", "Reason"))
for w in all_w:
    try:
        authors = w['author']
        journal = w.get('container-title', None)
        if isinstance(journal, list) and len(journal) > 0:
            journal = journal[0]
        title = w.get('title', None)
        if isinstance(title, list) and len(title) > 0:
            title = title[0]
        year = datetime.datetime.fromisoformat(w['created']['date-time'].rstrip('Z')).year
    except KeyError:
        print('Skipping {}, no author information on crossref'.format(w['DOI']))
        skips.loc[len(skips) + 1] = [w['DOI'], 'missing fields on crossref']
        continue
    if w['citation_resp']:
        citation_count = w['citation_resp'][0]['count']
    else:
        citation_count = None
    authors_cleaned = []
    for n in range(len(authors)):
        try:
            author = '{} {}'.format(authors[n]['given'], authors[n]['family'])
        except KeyError:
            print('Skipping author on {}, full author information not on crossref for {}'.format(w['DOI'], authors[n]))
            skips.loc[len(skips) + 1] = [w['DOI'], 'full author information not on crossref for {}'.format(w['DOI'], authors[n])]
            continue
        authors_cleaned.append((author, n))
    for author, position in authors_cleaned:
        #Take the first match - should be best match, so improve this
        m = get_match(author, staff['Variant'])
        if len(m) > 0:
            m = m[0]
            staff_name = staff.index[staff['Variant'] == m[0]][0][0]
            pub_list.loc[len(pub_list) + 1] = [w['DOI'], journal, year, title, author, staff_name, position + 1, len(authors), citation_count]
            if staff_name in coauthors.keys():
                if year in coauthors[staff_name].keys():
                    for a in authors_cleaned:
                        coauthors[staff_name][year].add(a)
                else:
                    coauthors[staff_name][year] = set(authors_cleaned)
            else:
                coauthors[staff_name] = {year: set(authors_cleaned)}
            # Don't list people as coauthors of themselves...
            coauthors[staff_name][year].remove((author, position))

Skipping author on 10.1371/journal.pone.0121040, full author information not on crossref for {'name': 'The PLOS ONE Staff', 'sequence': 'first', 'affiliation': []}
Skipping author on 10.1073/pnas.1714977115, full author information not on crossref for {'family': 'Onrizal', 'sequence': 'additional', 'affiliation': [{'name': 'Faculty of Forestry, Universitas Sumatera Utara, Medan 20155, Indonesia;'}]}
Skipping author on 10.1073/pnas.1714977115, full author information not on crossref for {'family': 'Supriyadi', 'sequence': 'additional', 'affiliation': [{'name': 'Faculty of Forestry, Universitas Gadjah Mada, Yogyakarta 55281, Indonesia;'}]}
Skipping 10.1111/j.1523-1739.2009.01300.x, no author information on crossref
Skipping author on 10.1016/j.biocon.2020.108849, full author information not on crossref for {'family': 'ForestPlots.net', 'sequence': 'first', 'affiliation': []}


In [13]:
pub_list['Authorship_Type'] = 'Other'
pub_list['Authorship_Type'][pub_list['Author_Position'] == 1] = 'First'
pub_list['Authorship_Type'][pub_list['Author_Position'] == 2] = 'Second'
pub_list['Authorship_Type'][(pub_list['Total_Authors'] > 3) & (pub_list['Author_Position'] == pub_list['Total_Authors'])] = 'Senior'
pub_list.to_csv('MCS_publications_author_order.csv', index=False)
skips.to_csv('MCS_publications_skipped.csv', index=False)

# Make adjacency matrix for network analysis

In [14]:
# Long format
edges = []
for author, year in coauthors.items():
    for year, coauthors_by_year in year.items():
        for coauthor in coauthors_by_year:
            edges.append((author, year, coauthor))

df = pd.DataFrame(edges, columns=('CI_Author', 'Year', 'Coauthor'))
df.to_csv('MCS_publications_coauthor_matrix_long.csv', index=False)

# Wide format
edges_wide = [(a, b) for a, bs in coauthors.items() for b in bs]
df_wide = pd.DataFrame(edges_wide)
adj_matrix = pd.crosstab(df_wide[0], df_wide[1])
adj_matrix.to_csv('MCS_publications_coauthor_matrix_wide.csv')

In [19]:
edges[1]


('Donatti, Camila', 2016, ('Jason P. Landrum', 12))