In [11]:
import pandas as pd
import os
import uuid
import requests
import requests_cache
from datetime import timedelta
from typing import Union

# Shared HTTP session used by the lookup helpers in this notebook: a
# requests_cache session created with the cache name 'paper_cache' whose
# entries expire after 24 hours.
session = requests_cache.CachedSession('paper_cache', expire_after=timedelta(hours=24))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Text mining a CrossRef DOI

In [13]:
# FCDO - http://dx.doi.org/10.13039/501100020171

# List the works Crossref associates with one ORCID iD and read the author
# list of each work.
# NOTE(review): get_doi is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one.
# def get_doi_by_orcid(orcid):
orcid = "0000-0002-3363-8620"
message = session.get(f"https://api.crossref.org/works?filter=orcid:{orcid}&select=DOI,title,prefix,published")
# Fail with the HTTP error itself rather than a confusing KeyError/JSON error below.
message.raise_for_status()
items = message.json()['message']['items']
for item in items:
    work = get_doi(item['DOI'])
    if not work:
        # get_doi returns None when the DOI is not registered with Crossref;
        # calling .json() on it would raise AttributeError.
        continue
    # Not every Crossref record carries an 'author' list.
    authors = work.json()['message'].get('author', [])

In [14]:
# Inspect the author list of the first work returned by the ORCID query.
# NOTE(review): get_doi can return None, in which case work.json() raises
# AttributeError.
work = get_doi(items[0]['DOI'])
work.json()['message']['author']

[{'ORCID': 'http://orcid.org/0000-0002-3363-8620',
  'authenticated-orcid': False,
  'given': 'Liu',
  'family': 'Yang',
  'sequence': 'first',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0002-4879-9259',
  'authenticated-orcid': False,
  'given': 'Koen',
  'family': 'van Dam',
  'sequence': 'additional',
  'affiliation': []},
 {'given': 'Lufeng',
  'family': 'Zhang',
  'sequence': 'additional',
  'affiliation': []}]

In [15]:
# Look up a single DOI; get_doi returns None when the DOI's registration
# agency is not Crossref.
work = get_doi("10.21203/rs.3.rs-478581/v1")

In [16]:
def get_doi(doi: str) -> Union["requests.Response", None]:
    """Fetch the Crossref work record for ``doi`` if Crossref registered it.

    The Crossref ``/works/{doi}/agency`` route is queried first through the
    shared cached ``session``; the work itself is only requested when the
    reported agency id is ``'crossref'``.

    Parameters
    ----------
    doi : str
        The DOI to look up, e.g. ``"10.3390/en14041209"``.

    Returns
    -------
    response or None
        The response of ``GET /works/{doi}`` (call ``.json()['message']`` on
        it for the record), or ``None`` when the agency lookup fails, its
        body cannot be interpreted, or the DOI belongs to another agency.
        A short diagnostic is printed in each ``None`` case.
    """
    agency = session.get(f"https://api.crossref.org/works/{doi}/agency")

    # An unsuccessful lookup may not carry a JSON body at all, so check the
    # status before trying to decode it.
    if agency.status_code != 200:
        print(agency.status_code)
        return None

    try:
        agency_id = agency.json()['message']['agency']['id']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected structure.
        print(agency.status_code)
        return None

    if agency_id == 'crossref':
        return session.get(f"https://api.crossref.org/works/{doi}")

    # The request itself succeeded, so the status code alone would be
    # misleading here; report the agency that owns the DOI as well.
    print(f"{agency.status_code}: DOI {doi} is registered with '{agency_id}', not crossref")
    return None

In [18]:
def get_titles(df):
    """Return ``(title, license_url)`` for the DOI held in one table row.

    Intended for ``DataFrame.apply(..., axis=1, result_type='expand')``, so
    despite its name ``df`` is a single row (anything indexable by ``'DOI'``).

    Parameters
    ----------
    df : mapping-like
        A row providing the key ``'DOI'``.

    Returns
    -------
    tuple
        ``(title, license_url)`` where ``title`` is the first entry of the
        record's ``title`` list and ``license_url`` is the URL of the first
        licence whose ``content-version`` is ``'vor'``. Either element is
        ``None`` when the record does not provide it, and ``(None, None)`` is
        returned when ``get_doi`` yields no usable response.
    """
    work = get_doi(df['DOI'])
    if not work:
        return (None, None)

    message = work.json()['message']

    # 'title' can be missing or an empty list.
    titles = message.get('title') or []
    title = titles[0] if titles else None

    # Many records carry no 'license' key at all; treat that as "no licence".
    vor_urls = [
        entry['URL']
        for entry in message.get('license') or []
        if entry.get('content-version') == 'vor'
    ]
    license_url = vor_urls[0] if vor_urls else None

    return (title, license_url)

# Add a title and licence column to data/papers.csv, one get_titles call per row.
papers = pd.read_csv('data/papers.csv')

# result_type='expand' spreads the 2-tuple returned by get_titles over the
# two target columns.
papers[['title', 'license']] = papers.apply(get_titles, axis=1, result_type='expand')

# NOTE(review): this writes back to the same file that was read above.
papers.to_csv('data/papers.csv', index=False)

In [None]:
# Fetch one Crossref work record and keep its 'message' payload.
# NOTE(review): this calls requests.get directly instead of the cached
# `session` used by the other lookups in this notebook — confirm that
# bypassing the cache is intended.
doi = "10.1016/j.jag.2022.102748"
work = requests.get(f"https://api.crossref.org/works/{doi}")

message = work.json()['message']

In [None]:
# URL of the first licence entry whose content-version is 'vor'. Falls back
# to None (instead of raising KeyError/IndexError) when the record has no
# 'license' list or no 'vor' entry in it.
vor_license = next(
    (
        entry['URL']
        for entry in message.get('license') or []
        if entry.get('content-version') == 'vor'
    ),
    None,
)
vor_license

In [None]:
# Load the authors table; the following cells read its 'uuid' and 'Orcid' columns.
authors = pd.read_csv('data/authors.csv')

In [None]:
# 'Orcid' value(s) of the author row(s) matching one uuid, as a NumPy array.
test = authors.loc[authors['uuid'] == "bed6dcc4-101a-44df-893e-f6148c96afb9", 'Orcid'].values

In [None]:
# Element-wise missing-value check over the selected array.
pd.isna(test)

array([False])

In [None]:
# Display the selected 'Orcid' value(s).
test

array(['https://orcid.org/0000-0002-3363-8620'], dtype=object)

In [None]:
# Missing-value check on the first selected value only.
# NOTE(review): test[0] raises IndexError when no row matched the uuid.
pd.isna(test[0])

False

In [None]:
from create_graph import main
from rdflib import SDO, Namespace

# Build the graph with create_graph.main(); the next cell queries it with SPARQL via g.query.
# NOTE(review): presumably an rdflib Graph — confirm against create_graph.
g = main()

In [None]:
# Find every subject that is org:memberOf something, is typed org:Organization,
# and has no schema:name.
# NOTE(review): the variable is called ?orcid, but the pattern constrains it to
# org:Organization resources; the name is kept because the cell that prints
# the results reads row.orcid.
res = g.query(
    """
PREFIX schema: <https://schema.org/>
PREFIX org: <http://www.w3.org/ns/org#>

SELECT ?orcid
WHERE {
    ?orcid org:memberOf ?organisation .
    ?orcid a org:Organization .
    FILTER NOT EXISTS {
    ?orcid schema:name ?name .
		}
    }
    """
)


In [None]:
# Print the value bound to ?orcid for every row of the query result.
for row in res:
    print(f"{row.orcid}")

http://climatecompatiblegrowth.com/id/oxford
http://climatecompatiblegrowth.com/id/kth
http://climatecompatiblegrowth.com/id/cambridge
http://climatecompatiblegrowth.com/id/loughborough
http://climatecompatiblegrowth.com/id/imperial
http://climatecompatiblegrowth.com/id/cge
http://climatecompatiblegrowth.com/id/ou
http://climatecompatiblegrowth.com/id/ucl
http://climatecompatiblegrowth.com/id/cp


# Try the `crossref_commons` Python package


In [41]:
import crossref_commons.retrieval

def get_doi_datacite(doi: str) -> Union[dict, None]:
    """Fetch the DataCite metadata attributes for ``doi``.

    Queries ``https://api.datacite.org/dois/{doi}`` through the shared cached
    ``session``.

    Parameters
    ----------
    doi : str
        The DOI to look up, e.g. ``"10.5281/zenodo.10411123"``.

    Returns
    -------
    dict or None
        The ``data.attributes`` object of the response, or ``None`` (after
        printing the HTTP status code) when the request did not succeed.
    """
    response = session.get(f"https://api.datacite.org/dois/{doi}")

    # An error response has no 'data' member, so indexing it would raise a
    # KeyError that hides the real cause; report the status instead, in the
    # same way get_doi does.
    if response.status_code != 200:
        print(response.status_code)
        return None

    return response.json()['data']['attributes']

def get_data(doi: str):
    """Return metadata for ``doi``, asking Crossref first and DataCite second.

    The DOI is passed to ``crossref_commons.retrieval.get_publication_as_json``;
    if that call raises ``ValueError`` the result of ``get_doi_datacite(doi)``
    is returned instead. Any other exception propagates to the caller.
    """
    try:
        return crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        # Crossref lookup rejected the DOI: fall back to the DataCite API.
        return get_doi_datacite(doi)

# Fetch metadata for two example DOIs through get_data and keep the results
# as zenodo_dataset and publication.
# NOTE(review): `doi` is rebound here, so later cells that read `doi` see the
# second value.
doi = "10.5281/zenodo.10411123"
zenodo_dataset = get_data(doi)

doi = "10.3390/en14041209"
publication = get_data(doi)


In [42]:
# Display the metadata returned for the Zenodo DOI.
zenodo_dataset

{'doi': '10.5281/zenodo.10411123',
 'prefix': '10.5281',
 'suffix': 'zenodo.10411123',
 'identifiers': [{'identifier': 'oai:zenodo.org:10411123',
   'identifierType': 'oai'}],
 'alternateIdentifiers': [{'alternateIdentifierType': 'oai',
   'alternateIdentifier': 'oai:zenodo.org:10411123'}],
 'creators': [{'name': 'Tan, Naomi',
   'nameType': 'Personal',
   'givenName': 'Naomi',
   'familyName': 'Tan',
   'affiliation': ['Loughborough University', 'Imperial College London'],
   'nameIdentifiers': [{'nameIdentifier': '0000-0001-7957-8451',
     'nameIdentifierScheme': 'ORCID'}]},
  {'name': 'Ambunda, Robert',
   'nameType': 'Personal',
   'givenName': 'Robert',
   'familyName': 'Ambunda',
   'affiliation': ['SLOCAT Partnership'],
   'nameIdentifiers': []},
  {'name': 'Medimorec, Nikola',
   'nameType': 'Personal',
   'givenName': 'Nikola',
   'familyName': 'Medimorec',
   'affiliation': ['SLOCAT Partnership'],
   'nameIdentifiers': [{'nameIdentifier': '0000-0003-2935-7495',
     'nameIde

In [28]:
# NOTE(review): `result` is not assigned in any cell visible here, so this
# cell depends on leftover kernel state — confirm where it comes from.
result['data'].keys()


dict_keys(['id', 'type', 'attributes', 'relationships'])

In [35]:
# Display the 'types' entry found under data -> attributes of `result`.
result['data']['attributes']['types']

{'ris': 'DATA',
 'bibtex': 'misc',
 'citeproc': 'dataset',
 'schemaOrg': 'Dataset',
 'resourceType': '',
 'resourceTypeGeneral': 'Dataset'}

In [20]:
# Display the 'author' list of `result`.
# NOTE(review): here `result` is indexed by 'author' directly, while other
# cells index it via 'data' — presumably the name was rebound to a different
# object between runs; confirm.
result['author']

[{'ORCID': 'http://orcid.org/0000-0001-7537-5470',
  'authenticated-orcid': False,
  'given': 'Ioannis',
  'family': 'Pappis',
  'sequence': 'first',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0002-1565-2752',
  'authenticated-orcid': False,
  'given': 'Andreas',
  'family': 'Sahlberg',
  'sequence': 'additional',
  'affiliation': []},
 {'given': 'Tewodros',
  'family': 'Walle',
  'sequence': 'additional',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0002-6179-927X',
  'authenticated-orcid': False,
  'given': 'Oliver',
  'family': 'Broad',
  'sequence': 'additional',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0001-8801-9686',
  'authenticated-orcid': False,
  'given': 'Elusiyan',
  'family': 'Eludoyin',
  'sequence': 'additional',
  'affiliation': []},
 {'given': 'Mark',
  'family': 'Howells',
  'sequence': 'additional',
  'affiliation': []},
 {'ORCID': 'http://orcid.org/0000-0001-9367-1791',
  'authenticated-orcid': False,
  'given': 'Will',
  'fam

In [21]:
# List the top-level keys of `result`.
print(result.keys())

dict_keys(['indexed', 'reference-count', 'publisher', 'issue', 'license', 'content-domain', 'short-container-title', 'abstract', 'DOI', 'type', 'created', 'page', 'source', 'is-referenced-by-count', 'title', 'prefix', 'volume', 'author', 'member', 'published-online', 'reference', 'container-title', 'original-title', 'language', 'link', 'deposited', 'score', 'resource', 'subtitle', 'short-title', 'issued', 'references-count', 'journal-issue', 'alternative-id', 'URL', 'relation', 'ISSN', 'issn-type', 'subject', 'published'])


In [22]:
# One-line summary: first title, publisher and volume of `result`.
print(f'{result["title"][0]} {result["publisher"]} {result["volume"]}')

Influence of Electrification Pathways in the Electricity Sector of Ethiopia—Policy Implications Linking Spatial Electrification Analysis and Medium to Long-Term Energy Planning MDPI AG 14


In [23]:
# First entry of the 'title' list of `result`.
result["title"][0]

'Influence of Electrification Pathways in the Electricity Sector of Ethiopia—Policy Implications Linking Spatial Electrification Analysis and Medium to Long-Term Energy Planning'

In [24]:
# Display the whole `result` mapping.
result

{'indexed': {'date-parts': [[2024, 2, 16]],
  'date-time': '2024-02-16T23:28:22Z',
  'timestamp': 1708126102284},
 'reference-count': 104,
 'publisher': 'MDPI AG',
 'issue': '4',
 'license': [{'start': {'date-parts': [[2021, 2, 23]],
    'date-time': '2021-02-23T00:00:00Z',
    'timestamp': 1614038400000},
   'content-version': 'vor',
   'delay-in-days': 0,
   'URL': 'https://creativecommons.org/licenses/by/4.0/'}],
 'content-domain': {'domain': [], 'crossmark-restriction': False},
 'short-container-title': ['Energies'],
 'abstract': '<jats:p>Ethiopia is a low-income country, with low electricity access (45%) and an inefficient power transmission network. The government aims to achieve universal access and become an electricity exporter in the region by 2025. This study provides an invaluable perspective on different aspects of Ethiopia’s energy transition, focusing on achieving universal access and covering the country’s electricity needs during 2015–2065. We co-developed and investig

In [25]:
# Fetch the work for the current `doi` with habanero's Crossref client.
from habanero import Crossref
cr = Crossref()
cr.works(ids = doi)

{'status': 'ok',
 'message-type': 'work',
 'message-version': '1.0.0',
 'message': {'indexed': {'date-parts': [[2024, 2, 16]],
   'date-time': '2024-02-16T23:28:22Z',
   'timestamp': 1708126102284},
  'reference-count': 104,
  'publisher': 'MDPI AG',
  'issue': '4',
  'license': [{'start': {'date-parts': [[2021, 2, 23]],
     'date-time': '2021-02-23T00:00:00Z',
     'timestamp': 1614038400000},
    'content-version': 'vor',
    'delay-in-days': 0,
    'URL': 'https://creativecommons.org/licenses/by/4.0/'}],
  'content-domain': {'domain': [], 'crossmark-restriction': False},
  'short-container-title': ['Energies'],
  'abstract': '<jats:p>Ethiopia is a low-income country, with low electricity access (45%) and an inefficient power transmission network. The government aims to achieve universal access and become an electricity exporter in the region by 2025. This study provides an invaluable perspective on different aspects of Ethiopia’s energy transition, focusing on achieving universal a

In [27]:
# Citation count for the current `doi` via habanero.counts.
from habanero import counts
counts.citation_count(doi=doi)

23