In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import re
import sys
import copy
import requests
import subprocess

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError

import doi
import ads

from crossref.restful import Works

# Read the ADS API token from the environment so it is never hard-coded here
ads.config.token = os.environ['ADS_TOKEN']

# Reference:
# https://pypi.org/project/python-doi/
# https://gist.github.com/jrsmith3/5513926
# https://github.com/fabiobatalha/crossrefapi

In [5]:
def check_pdf_file(pdf_file, verbose=True):
    """Make sure a PDF file exists and is valid.

    Parameters
    ----------
    pdf_file : str
        Path to the PDF file to check.
    verbose : bool, optional
        When True, print a short message explaining why the check failed.
        Default: True.

    Returns
    -------
    bool
        True when `pdf_file` is an existing file that `PdfFileReader` can
        open without raising `PdfReadError`; False otherwise.
    """
    # Check that the PDF file exists
    if not os.path.isfile(pdf_file):
        if verbose:
            print("# Can not find {:s}".format(pdf_file))
        return False

    try:
        # The context manager closes the handle on every path, including
        # the one where PdfFileReader rejects the file.
        with open(pdf_file, 'rb') as pdf_handle:
            PdfFileReader(pdf_handle)
        return True
    except PdfReadError:
        if verbose:
            print("# {:s} is not a valid PDF file".format(pdf_file))
        return False

def _validated_doi(doi_str):
    """Return `doi_str` when `doi.validate_doi` accepts it, otherwise None."""
    # Nothing was found, so there is nothing to validate.
    if not doi_str:
        return None
    try:
        if doi.validate_doi(doi_str) is not None:
            return doi_str
    except ValueError:
        # A ValueError from validate_doi is treated as "not a valid DOI".
        pass
    return None


def extract_doi(pdf_file, verbose=False):
    """Try to extract DOI string from the PDF file.

    Two strategies are tried in order:

    1. Search the text of the first page with `doi.find_doi_in_text`.
    2. Let `doi.pdf_to_doi` inspect the file directly.

    A candidate is only accepted when `doi.validate_doi` returns a
    non-None value for it.

    Parameters
    ----------
    pdf_file : str
        Path to the PDF file.
    verbose : bool, optional
        When True, print the DOI that was found. Failure messages are
        printed regardless of this flag. Default: False.

    Returns
    -------
    str or None
        The validated DOI string, or None when the file is missing or
        invalid, or when no valid DOI could be found.
    """
    # Problems with the file itself are always reported.
    if not check_pdf_file(pdf_file, verbose=True):
        return None

    # Strategy 1: try to manually extract from the first page.
    # The handle is closed as soon as the text has been read.
    try:
        with open(pdf_file, 'rb') as pdf_handle:
            first_page = PdfFileReader(pdf_handle).getPage(0).extractText()
    except Exception:
        # Text extraction failed (e.g. no usable first page); fall through
        # to the second strategy instead of aborting.
        first_page = ''

    # Replace the copyright sign by a space; some ApJ papers need this.
    first_page = re.sub(r'©', ' ', first_page)

    doi_use = None
    if 'doi' in first_page:
        doi_use = _validated_doi(doi.find_doi_in_text(first_page))

    # Strategy 2: directly extract from the PDF using doi.pdf_to_doi.
    if doi_use is None:
        try:
            doi_use = _validated_doi(doi.pdf_to_doi(pdf_file))
        except Exception:
            # Best effort: any failure here simply means "no DOI".
            doi_use = None

    if doi_use is None:
        print("# No valid DOI for {:s}".format(pdf_file))
    elif verbose:
        print("# DOI for {:s} is {:s}".format(pdf_file, doi_use))

    return doi_use

## Understand the metadata inside a PDF paper

### First author name and publication year based on DOI 

#### What is DOI?

* `DOI` (digital object identifier) is a ["persistent identifier or handle used to identify objects uniquely, standardized by the International Organization for Standardization (ISO)"](https://en.wikipedia.org/wiki/Digital_object_identifier)
* You can use [this website](https://www.doi.org/) to verify and resolve a DOI number

#### Goal:

* Try to extract DOI information from a downloaded paper in PDF format.
* Resolve the DOI, get the bibliographic information. 
* Get the family name of the first author and the year of publication
* Rename the PDF file based on this information

In [6]:
# Path of the PDF paper used by the following cells.
# Exactly one assignment is active; the other candidates stay commented out
# so that switching papers means moving a single '#'.
# NOTE: these are absolute paths on the author's machine.
#pdf_file = '/Users/song/Downloads/1904.07524.pdf'
#pdf_file = '/Users/song/Downloads/352-2-L6.pdf'
#pdf_file = '/Users/song/Downloads/wu2019.pdf'
#pdf_file = '/Users/song/Downloads/Bleem_2020_ApJS_247_25.pdf'
#pdf_file = '/Users/song/Downloads/galaxies-05-00033.pdf'
#pdf_file = '/Users/song/Downloads/j.1745-3933.2008.00567.x.pdf'
#pdf_file = '/Users/song/Downloads/sensors-20-02031.pdf'
#pdf_file = '/Users/song/Downloads/dong2019.pdf'
#pdf_file = '/Users/song/Downloads/stv264.pdf'
pdf_file = '/Users/song/Downloads/van_der_Wel_2014_ApJL_792_L6.pdf'

In [7]:
# Extract the DOI of the selected paper; verbose=True also prints the DOI found
doi_use = extract_doi(pdf_file, verbose=True)



# DOI for /Users/song/Downloads/van_der_Wel_2014_ApJL_792_L6.pdf is 10.1088/2041-8205/792/1/L6


In [8]:
# Resolve the DOI and ask for a BibTeX-formatted bibliography entry
doi_url = "http://dx.doi.org/" + doi_use

headers = {"accept": "text/bibliography; style=bibtex"}

# Without a timeout the request can block forever; raise_for_status() stops
# the cell on an HTTP error instead of parsing an error page below.
r = requests.get(doi_url, headers=headers, timeout=30)
r.raise_for_status()

# The response starts with ' @article{<key>, ...', so the third piece of the
# split is the citation key
re.split(' |, |{', r.text)[2]

'van_der_Wel_2014'

In [9]:
# Show the full body of the response `r` for inspection
r.text

' @article{van_der_Wel_2014, title={GEOMETRY OF STAR-FORMING GALAXIES FROM SDSS, 3D-HST, AND CANDELS}, volume={792}, ISSN={2041-8213}, url={http://dx.doi.org/10.1088/2041-8205/792/1/L6}, DOI={10.1088/2041-8205/792/1/l6}, number={1}, journal={The Astrophysical Journal}, publisher={IOP Publishing}, author={van der Wel, A. and Chang, Yu-Yen and Bell, E. F. and Holden, B. P. and Ferguson, H. C. and Giavalisco, M. and Rix, H.-W. and Skelton, R. and Whitaker, K. and Momcheva, I. and et al.}, year={2014}, month={Aug}, pages={L6}}\n'

#### Try to get journal from ADS

In [243]:
# Query ADS for the record that carries this DOI
a = list(ads.SearchQuery(q='doi:{:s}'.format(doi_use)))

# The query can come back empty; report that instead of failing on a[0]
if a:
    print(a[0].bibcode)
else:
    print("# No ADS record found for DOI {:s}".format(doi_use))

2014ApJ...792L...6V


#### Test other regular expressions for DOI

In [245]:
# Earlier candidates, kept for reference (they were overwritten before use):
#pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b'
#pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'

# Active pattern. The dot after "10" is escaped so it only matches a literal
# '.', and the suffix accepts lower-case letters too: the resolver output
# above contains the same DOI spelled '.../792/1/l6'.
pattern = r'\b(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)\b'

In [247]:
# Check the active pattern against the extracted DOI (re.match anchors at the start)
re.match(pattern, doi_use)

<re.Match object; span=(0, 26), match='10.1088/2041-8205/792/1/L6'>