In [7]:
from unidecode import unidecode as decode
import bibtexparser as bp
import numpy as np
import re
from string import ascii_lowercase
import itertools
from scholarly import scholarly as gs

from urllib import request as get

In [8]:
def remove_accents_and_hyphens(s):
    """Strip LaTeX accent markup and all hyphens from the string ``s``.

    A few braced LaTeX letters ({\\l}, {\\o}, {\\i}, {\\t}) are first swapped
    for plain ascii letters.  Every remaining backslash-introduced accent
    sequence (optionally wrapped in curly braces) is then replaced by the
    word characters it decorates, and finally every '-' is dropped.

    Returns the cleaned string.
    """
    #braced special letters -> plain ascii stand-ins (applied in this order)
    for latex, plain in (('{\\l}', 'l'),
                         ('{\\o}', 'o'),
                         ('{\\i}', 'i'),
                         ('{\\t}', 't')):
        s = s.replace(latex, plain)

    #group 1 captures the letters that the accent command applies to
    accents = r'''\{?\\[`'^"~=.uvHtcdbkr]?\s?\{?\\?(\w*)\}?'''
    unaccented = re.sub(accents, lambda found: found.group(1), s)

    #group 1 is made of word characters only, so removing hyphens afterwards
    #cannot touch the substituted letters
    return unaccented.replace('-', '')

In [9]:
def match(names, template):
    """Find a (possibly multi-word) template inside a list of name tokens.

    names: list of strings; each token is lowercased before comparison.
    template: space-separated words; compared as given (not lowercased), so
        it should already be lowercase.

    Returns the index in ``names`` where the left-most match starts, or -1
    when there is no match (including when the template has more words than
    ``names`` has tokens).
    """
    words = template.split(' ')

    #last index at which the whole template still fits; negative when the
    #template is longer than names, which makes the range below empty
    last_start = len(names) - len(words)
    for start in range(last_start + 1):
        if all(names[start + offset].lower() == word for offset, word in enumerate(words)):
            return start
    return -1

#lowercase surname particles: last_name() keeps a token found here attached to the surname
#NOTE(review): last_name() looks tokens up one word at a time, after remove_non_letters()
#has stripped periods, so 'de la' and 'st.' look unreachable from there -- confirm
prefixes = ['de', 'da', 'di', 'von', 'zu', 'van', 'du', 'des',
            'del', 'de la', 'della', 'la', 'le', 'der', 'af',
            'st.', 'saint', 'st', 'dom', 'do', 'das', 'dos', 'of', 'al',
            'el', 'dei', 'tot', 'thoe', 'aw', 'na', 'sri', 'phra',
            'si', 'shri', 'lo', 'no', 'op', 'lopes', 'gonzalez', 'vom', 'castro']

#lowercase name suffixes: rearrange() and last_name() drop any token found here
#NOTE(review): the single letters 'v' and 'x' are in this list, so a bare initial
#V or X is dropped as well -- confirm that this is intended
suffixes = ['sr.', 'jr.', 'sr', 'jr', 'senior', 'junior', 'iii', 'iv', 'v', 
            'vi', 'vii', 'viii', 'ix', 'x', 'the', 'third', 'fourth', 'fifth',
            'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'great']

In [10]:
def remove_curlies(s): #only removes *matching* curly braces
    """Remove every matched pair of curly braces from the string ``s``.

    When a pair is removed, the spaces between the two braces are removed as
    well (e.g. '{van der} Waals' becomes 'vander Waals').  Braces without a
    partner are left in place.

    The work is done iteratively, so the number of brace pairs is not limited
    by the interpreter's recursion depth.

    Returns the cleaned string.
    """
    pos = 0
    while True:
        start = s.find('{', pos)
        if start == -1:
            return s

        #walk forward until the brace opened at `start` is closed again
        depth = 1
        end = -1
        for k in range(start + 1, len(s)):
            if s[k] == '{':
                depth += 1
            elif s[k] == '}':
                depth -= 1
                if depth == 0:
                    end = k
                    break

        if end == -1:
            #no partner for this brace: leave it and look further right
            pos = start + 1
        else:
            #drop the pair and the spaces it enclosed; braces nested inside
            #now begin at or after `start`, so resume scanning from there
            #(removing a balanced pair cannot give an earlier unmatched
            #brace a partner, so nothing left of `start` needs a rescan)
            s = s[:start] + s[(start + 1):end].replace(' ', '') + s[(end + 1):]
            pos = start

In [76]:
def remove_non_letters(s):
    """Return ``s`` without punctuation, quote, brace, hyphen and backslash characters.

    Only the characters listed below are removed; digits, spaces and all
    other characters are kept.
    """
    remove_chars = [',', '.', '!', '?', "'", '"', '{', '}', '-', '\\']
    return ''.join(ch for ch in s if ch not in remove_chars)

In [77]:
def rearrange(name):
    """Put a single author name into "first [middle] last" order.

    The name is split on commas; each piece is stripped and cleaned with
    remove_non_letters(), and pieces that are listed in ``suffixes`` are
    dropped.  Two remaining pieces are read as "last, first" and swapped;
    one remaining piece is returned unchanged.

    Raises Exception when nothing but suffixes remains, or when more than
    two pieces remain.
    """
    original_name = name #kept for the error messages

    #clean every comma-separated piece, then drop the suffix pieces
    pieces = [remove_non_letters(part.strip()) for part in name.split(',')]
    pieces = [piece for piece in pieces if piece.lower() not in suffixes]

    if len(pieces) == 0:
        raise Exception(f'no non-suffix names: {original_name}')
    if len(pieces) > 2:
        raise Exception(f'too many commas: {original_name}')
    if len(pieces) == 1: #first (+ middle) + last
        return pieces[0]
    return ' '.join([pieces[1], pieces[0]]) #last, first (+ middle)

In [78]:
def last_name(names):
    """Extract the surname from a single author name string.

    The name is first normalized with rearrange(); tokens listed in
    ``suffixes`` are dropped.  The last token is taken as the surname, and
    any run of tokens listed in ``prefixes`` directly in front of it is
    glued on without spaces (e.g. 'van' + 'Beethoven').

    Returns the surname string.
    """
    #remove suffixes and non-letters
    tokens = [remove_non_letters(t) for t in rearrange(names).split(' ') if t.lower() not in suffixes]

    #walk from the last token toward the first
    collected = []
    saw_prefix = False
    idx = len(tokens) - 1
    while idx >= 0:
        token = tokens[idx]
        is_prefix = token.lower() in prefixes
        #a non-prefix token ends the surname once anything has been collected
        if not is_prefix and (saw_prefix or len(collected) > 0):
            break
        saw_prefix = saw_prefix or is_prefix
        collected.append(token)
        idx -= 1

    if saw_prefix:
        return ''.join(collected[::-1])
    return collected[0]

In [79]:
def last_names_from_str(x):
    """Return the list of surnames for a group of authors.

    x: either one string with the authors separated by ' and ', or a list
       of author name strings.  Any other type yields [''].
    """
    kind = type(x)
    if kind == str:
        people = x.split(' and ')
    elif kind == list:
        people = x
    else:
        return ['']
    return [last_name(person) for person in people]

In [80]:
def authors2key(authors, year):
    """Build a citation key from an ' and '-separated author string and a year.

    The key is made of the first 4 characters of the surname(s) plus the
    last two characters of str(year):
      - one author:   Surn + yy
      - two authors:  Surn + Surn + yy
      - three or more: Surn + 'Etal' + yy (first author only)
    """
    def key(author):
        #unicode -> closest ascii, then strip LaTeX accents/hyphens and matched braces
        cleaned = remove_curlies(remove_accents_and_hyphens(decode(author)))

        #get first 4 letters of last name
        return last_name(cleaned)[:4]

    yr_str = str(year)[-2:]

    people = authors.split(' and ')
    n_people = len(people)

    #NOTE(review): str.split always returns at least one element, so the
    #zero-author branch cannot fire for string input; an empty author string
    #ends up in the single-author branch instead
    if n_people == 0:
        raise Exception('Author information missing, no key generated')
    if n_people == 1:
        return key(people[0]) + yr_str
    if n_people == 2:
        return key(people[0]) + key(people[1]) + yr_str
    if n_people >= 3:
        return key(people[0]) + 'Etal' + yr_str
    raise Exception('Something went wrong...')

In [81]:
#relative path to the BibTeX database to check
bibfile = '../memlab.bib'
#parser options are passed straight to bibtexparser (see its documentation for their exact meaning)
parser = bp.bparser.BibTexParser(ignore_nonstandard_types=True, common_strings=True, homogenize_fields=True)

#parse the whole file once; later cells only work with the parsed result
with open(bibfile, 'r') as b:
    bibdata = bp.load(b, parser=parser)

In [82]:
#entry dictionary from bibtexparser; used below as {key: {field name: value}}
bd = bibdata.get_entry_dict()

In [83]:
#get all fields: collect every value seen for each field name across all entries
fields = {}
for entry in bd.values():
    for field, vals in entry.items():
        fields.setdefault(field, []).append(vals)

#keep only the distinct values of each field
fields = {name: list(np.unique(seen)) for name, seen in fields.items()}

In [84]:
def get_vals(bd, field):
    """Return the value of ``field`` for every entry of ``bd``, in iteration order.

    bd: dictionary whose values are dictionaries (one per entry).
    Entries that do not have the field contribute an empty string.
    """
    return [entry[field] if field in entry else '' for entry in bd.values()]

In [85]:
def same_id(a, b, ignore_special=False):
    """Tell whether two keys agree, allowing one to carry extra trailing characters.

    Returns True when the shorter of ``a`` and ``b`` is a prefix of the
    longer one (equal strings included).  With ignore_special=True, any
    pair in which either key contains a backslash is accepted outright.
    """
    if ignore_special and ('\\' in a or '\\' in b):
        return True

    #order the pair by length, then compare the shorter key to the start of the longer
    shorter, longer = (b, a) if len(a) > len(b) else (a, b)
    return shorter == longer[:len(shorter)]

In [86]:
#parallel lists, one item per entry of bd (entries lacking a field contribute '')
authors = get_vals(bd, 'author')
years = get_vals(bd, 'year')
ids = get_vals(bd, 'ID')
#titles are compared later, so matched curly braces are stripped first
titles = [remove_curlies(t) for t in get_vals(bd, 'title')]

In [87]:
#the key each entry *should* have, generated from its author list and year
gen_ids = [authors2key(a, y) for a, y in zip(authors, years)]

In [88]:
#check key integrity: every existing ID must agree with its generated key (see same_id)
tofix = []
for h, (i, g) in enumerate(zip(ids, gen_ids)):
    if not same_id(i, g, ignore_special=False): #check these over carefully...
        tofix.append(f'{h}. [{i}] should be [{g}]')

if tofix:
    print(f'Need to fix {len(tofix)} keys: \n')
    print('\n'.join(tofix))
else:
    print('Congrats!  No keys to fix.')

Congrats!  No keys to fix.


In [89]:
#check for duplicate keys: any ID that occurs more than once
unique_ids, counts = np.unique(ids, return_counts=True)
duplicate_keys = unique_ids[counts > 1]
if len(duplicate_keys) == 0:
    print('No duplicated keys!')
else:
    print('Multiple entries for the following keys:')
    print('\n'.join(duplicate_keys))

No duplicated keys!


In [90]:
def duplicate_inds(x):
    """Group the positions of repeated elements of the list ``x``.

    Returns a list with one inner list per value that occurs more than once
    in ``x``; each inner list holds the indices at which that value occurs.
    Groups follow the order in which np.unique reports the values.  The
    result is empty when nothing repeats.
    """
    values, tallies = np.unique(x, return_counts=True)
    repeated = [value for value, tally in zip(values, tallies) if tally > 1]
    return [[pos for pos, item in enumerate(x) if item == value] for value in repeated]

In [91]:
#surnames of each entry's authors, joined back into one comparable string
last_names = [' and '.join(last_names_from_str(a)) for a in authors]
#ignore_authors is not used yet; it is a placeholder for authors to leave out of the comparison
ignore_authors = []

duplicate_title_inds = duplicate_inds(titles)
duplicate_authors = duplicate_inds(last_names)

#an entry counts as duplicated when both its title and its author surnames repeat;
#indices stored here are positions within each same-title group
duplicates = []
for title_group in duplicate_title_inds:
    duplicates.extend(duplicate_inds([last_names[j] for j in title_group]))

In [93]:
#report each duplicated group, or confirm that there are none
if len(duplicates) == 0:
    print('Congrats, no duplicated authors/titles!')
for d in duplicates:
    print(f'Duplicated authors/titles found for the following keys [ind, key]: {[[i, ids[i]] for i in d]}')

Congrats, no duplicated authors/titles!


In [94]:
def get_key_suffixes(n):
    """Return the first ``n`` alphabetic key suffixes: 'a', 'b', ..., 'z', 'aa', 'ab', ...

    For n <= 1 no suffix is needed and an empty string (not a list) is
    returned.
    """
    #source: https://stackoverflow.com/questions/29351492/how-to-make-a-continuous-alphabetic-list-python-from-a-z-then-from-aa-ab-ac-e/29351603
    if n <= 1:
        return ''

    def labels():
        #all 1-letter labels, then all 2-letter labels, and so on, without end
        width = 1
        while True:
            for letters in itertools.product(ascii_lowercase, repeat=width):
                yield ''.join(letters)
            width += 1

    stream = labels()
    return [next(stream) for _ in range(n)]

In [95]:
#Check bibtex keys. Duplicates should be assigned a suffix of 'a', 'b', etc.
#If keys match aside from suffix then still allow the bibtex file to "pass"
#as long as all "matching" keys are unique and all have suffixes and the 
#suffixes span a, b, c, ..., etc. without gaps

def check_keys(ids, gen_ids):
    """Compare the existing keys with the generated ones and list the keys to rename.

    ids: existing keys, one per entry.
    gen_ids: generated base keys, parallel to ``ids``.

    Entries whose generated key is shared with other entries must use that
    base key plus the suffixes a, b, c, ... with no gaps; all other entries
    must use their generated key exactly.

    Returns a list of [current_key, suggested_key] pairs (empty when every
    key is fine).
    """
    checked = []
    bad_keys = []

    #for duplicate base keys, ensure correct suffixes
    for inds in duplicate_inds(gen_ids):
        base = gen_ids[inds[0]]
        target_keys = [base + sfx for sfx in get_key_suffixes(len(inds))]
        actual_keys = list(np.array(ids)[inds])

        #target keys nobody uses yet are handed out, in order, to the keys that do not fit
        unused = iter([t for t in target_keys if t not in actual_keys])
        for actual in actual_keys:
            checked.append(actual)
            if actual not in target_keys:
                bad_keys.append([actual, next(unused)])

    #for non-duplicate base keys, ensure *no* suffixes
    bad_keys.extend([i, g] for i, g in zip(ids, gen_ids)
                    if i not in checked and not (i == g))

    return bad_keys

In [96]:
#list every key that does not follow the base-key / suffix convention
bad_keys = check_keys(ids, gen_ids)

if len(bad_keys) == 0:
    print('No incorrect key sequences found!')
else:
    print(f'{len(bad_keys)} incorrect key sequence(s) found:\n')
    for x in bad_keys:
        print(f'{x[0]} should be renamed to {x[1]}')

No incorrect key sequences found!


In [112]:
# 1. numbers separated by n-dash with no spaces; right number larger than left number
# 2. zero or more lowercase letter(s) + sequence of digits
# 3. two combinations of letter(s) + sequence of digits:
#   - same letters at the beginning
#   - right number larger than left number
# 4. two uppercase letters, hyphen, digit, letter, ., two digits (e.g., PS-2B.16)
# 5. empty string
# 6. doi

def valid_pages(p):
    """Validate (and, for page ranges, normalize) a ``pages`` string.

    A value is accepted as a single page when it is an empty string, an
    integer, letters followed by digits (e.g. S247), a conference-style
    identifier (e.g. PS-2B.16), a doi.org address, or an arXiv-style
    identifier.  Anything else is split on hyphens and accepted as a range
    only when it has exactly two non-empty parts of the same kind with the
    right part larger than the left one (prefixed pages must also share the
    same prefix).

    Returns a 2-tuple (ok, text):
      - (True, p) for a valid single page,
      - (True, 'left--right') for a valid range,
      - (False, p) otherwise.
    """
    def valid_page(p): #single page, no hyphens
        #returns (is_valid, kind, parsed value or None)
        if len(p) == 0: #empty string
            return True, 'empty', None

        try:
            return True, 'int', int(p) #integer
        except ValueError:
            pass

        #prefix of one or more letters, followed by a sequence of digits
        r1 = re.compile(r'''(?P<prefix>[a-zA-Z]+)(?P<digits>\d+)''')
        x = r1.fullmatch(p)
        if x is not None:
            return True, 'prefixed', [x.group('prefix'), int(x.group('digits'))]

        #two uppercase letters, hyphen, digit, letter, ., two digits
        #(the period is escaped so that it only matches a literal '.')
        r2 = re.compile(r'''(?P<prefix>[A-Z]{2}-[\dA-Z]{2})\.(?P<digits>\d+)''')
        x = r2.fullmatch(p)
        if x is not None:
            return True, 'conference', [x.group('prefix'), int(x.group('digits'))]

        #doi address
        r3 = re.compile(r'''doi\.org/(?P<doi>[a-z\d\-\./]+)''')
        if r3.fullmatch(p) is not None:
            return True, 'doi', None

        #arXiv section
        r4 = re.compile(r'''((?P<subject>[a-z]{2,})/)?(?P<article>[\d]+(v\d)?)''')
        if r4.fullmatch(p) is not None:
            return True, 'arxiv', None

        return False, 'invalid', None

    valid, kind, val = valid_page(p)
    if valid: #"single" page
        return True, p

    #page range: split by hyphen ('-' and '--' both work, empty parts are dropped)
    ps = [x.strip() for x in p.split('-') if len(x.strip()) > 0]
    if len(ps) != 2:
        return False, p

    valid1, kind1, val1 = valid_page(ps[0])
    valid2, kind2, val2 = valid_page(ps[1])
    if (not (valid1 and valid2)) or (not (kind1 == kind2)):
        return False, p

    if kind1 == 'int':
        in_order = val1 < val2
    elif kind1 in ['prefixed', 'conference']:
        #same prefix on both sides, and the numbers must increase
        in_order = (val1[0] == val2[0]) and (val1[1] < val2[1])
    else:
        #dois and arxiv sections can't be specified as ranges
        in_order = False

    if in_order:
        return True, '--'.join(ps)
    return False, p

In [132]:
def scholar_lookup(pub, force_n_authors=3):
    """Look a publication up on google scholar and return its details.

    pub: entry dictionary; must provide 'title' and 'author'.
    force_n_authors (default: 3): when the scholar result and the bibtex
        entry list a different number of authors, the scholar result must
        list at least this many authors to be considered.  This works
        around long author lists being returned only in part.

    Returns None if there are ambiguities (no result, more than one result,
    a different title, different leading author surnames) or if anything
    fails along the way; otherwise returns the 'bib' dictionary of the
    single matching result (the filled-in version when that can be fetched).
    """
    try:
        query = pub['title'] + ' AND ' + pub['author']
    except (KeyError, TypeError):
        return None

    try:
        x = gs.search_pubs(query=query)
        #two results are enough to know whether the match is unique; this avoids
        #consuming the whole result stream (presumably extra requests -- confirm)
        results = list(itertools.islice(x, 2))
    except Exception:
        return None

    if len(results) != 1:
        return None
    found = results[0]

    if not ('bib' in found.keys() and
            type(found['bib']) == dict and
            'title' in found['bib'].keys() and
            'author' in found['bib'].keys()):
        return None

    if remove_non_letters(pub['title'].lower()) != remove_non_letters(found['bib']['title'].lower()):
        return None

    #check authors; names that cannot be parsed count as "no reliable match"
    try:
        bibtex_last_names = last_names_from_str(pub['author'])
        scholar_last_names = last_names_from_str(found['bib']['author'])
    except Exception:
        return None

    if len(scholar_last_names) != len(bibtex_last_names):
        if len(scholar_last_names) < force_n_authors:
            return None

    #compare the surnames pairwise over the length of the shorter list
    n = min(len(scholar_last_names), len(bibtex_last_names))
    if bibtex_last_names[:n] != scholar_last_names[:n]:
        return None

    try:
        return gs.fill(found)['bib']
    except Exception:
        #fall back to the unfilled search result
        return found['bib']
In [119]:
#page numbering
pages = get_vals(bd, 'pages')

#first check for invalid pages/entries
bad_pages = [(ids[i], valid_pages(p)) for i, p in enumerate(pages) if not valid_pages(p)[0]]

if len(bad_pages) > 0:
    print(f'{len(bad_pages)} page numbering problems found (*s can be fixed by converting to lowercase):\n')
    for b in bad_pages:
        #see if converting to lowercase fixes the issue
        recheck = valid_pages(b[1][1].lower())
        if recheck[0]:
            suffix = ' (*)'
        else:
            suffix = ''
        print(f'{b[0]}: {b[1][1]}{suffix}')
else:
    print('No incorrect page numberings found!')

192 page numbering problems found (*s can be fixed by converting to lowercase):

MadrEtal17: 1706.06083
CarlWagn18: 1801.01944
UnglGrac12: 422-30
SchaMoor07: 554-551
ChaoEtal95: 157-68
LaxpGros10: N17-20
GiveOlto95: 269-76
JaggEtal04: 104--14
SchaSing00: 135--68
vanAEtal06: 1185-94
Mill10: 6477-9
BartEtal11b: 17562-7
MortEtal13: 2407--2402
Gold95: 477-85
SohaHass98a: 869-82
HashEtal12: 141-8
LegaEtal00: 1635--42
GiveOlto90: 849-55
FrieGold94: 2775-88
McNaShel03: 333--35
BullEtal90: 609--19
EdwaEtal09: 90954--2008
BurgHitc05: 535-41
KahnAgui12: 2294-9
EkstEtal07: 606--17
CraiEtal94: 864--864
SiapWils98: 1123-8
delaEtal07: 802-6
EeckFree90: 238--44
DellEtal00: 8410-6
TehaHump96: 719-32
MachEtal06: S247-S258.
OkunEtal07: 310--14
GothEtal01: 7284-92
DrucAgui09: 2269-80
WhitWang83: 1054--57
FiorEtal03: 1898-902
EdelEtal90: 145-55
WolbBuch05: 3333-40
PikeEtal00: 205--13
FiorEtal13: 4693-709
NakaEtal04: 269--80
CentEtal01: 1071--77
GothEtal96: 8027-40
WilsMcNa93: 1055-8
JacoEtal06: 978--87
Ha

In [133]:
debug = True

#attempt to auto-correct using google scholar; in debug mode only the first 5 problem entries are looked up
lookup_targets = bad_pages[:5] if debug else bad_pages
info = [scholar_lookup(bd[b[0]]) for b in lookup_targets]

In [138]:
#NOTE(review): scholar_lookup also returns None when an exception is swallowed inside it,
#so an all-None result is not by itself evidence of a timeout
info #FIXME: google scholar seems to be timing out...

[None, None, None, None, None]

In [32]:
#dump every distinct pages value for manual inspection (the output is long)
print('\n'.join(np.unique(pages)))


016006
026010
026013
026109
046017
046028
051917
056103
1
1 - 15
1 - 16
1 - 28
1 - 48
1 - 57
1 - 6
1 - 9
1 -- 16
1 -- 305
1 -- 37
1--10
1--11
1--12
1--13
1--14
1--15
1--16
1--17
1--18
1--19
1--20
1--21
1--23
1--25
1--26
1--27
1--3
1--32
1--34
1--37
1--39
1--4
1--46
1--54
1--58
1--6
1--60
1--7
1--8
1--9
1--97
1-10
1-11
1-12
1-13
1-15
1-16
1-17
1-18
1-21
1-24
1-25
1-26
1-3
1-300
1-32
1-38
1-43
1-48
1-52
1-6
1-67
1-7
1-8
1-9
1-90
1-91
1-97
10
10--12
10-13
100--105
100--108
100--109
100-107
1001-1012
1001-1021
1002--1016
1002-1014
1003--1014
1003-1018
1004-1005
1006--1020
1007-1012
101--107
101--117
101--128
101-106
101-108
1011--1018
10120
1014--1018
1015--1022
1017--1025
1017--1044
10186-10190
102 - 111
102--106
102-105
102-111
10203--10214
1021--1057
1023--1024
1023-1035
10232
1024--1027
1024-1031
1024-1039
10240--10245
1025--1034
1025--1044
1026--1031
1026--1034
1026--1037
103 - 107
103 - 112
103--107
103--109
103--117
103--118
103--119
103-105
103-107
103-109
103-128
103-189
1030--10

In [33]:
#field names that are not worth keeping; keep_fields is every known field not listed here
#NOTE: 'bdsk-url-1' appears twice below, which is harmless for a membership test
remove_fields = ['date-modified', 'date-added',
                 'note', 'bdsk-url-1', 'pst', 'pmid', 'pmc',
                 'mesh', 'keyword', 'journal-full', 'abstract',
                 'mendeley-groups', 'file', 'bdsk-url-1', 'bdsk-url-2', 'eprint',
                 'arxivid', 'archiveprefix', 'ty', 'm3', 'l3',
                 'howpublished', 'lccn', 'read', 'annote', 'owner',
                 'timestamp', 'pii', 'zb', 'z9', 'z8' ,'times-cited',
                 'publication-type', 'isi', 'language', 'stat', 'so',
                 'sb', 'rf', 'pubm', 'pt', 'pl', 'phst', 'own', 'mhda',
                 'jid', 'ip', 'edat', 'dcom', 'da', 'au', 'aid', 'affiliation',
                 'lr', 'gr', 'jt', 'local-url', 'dep', 'mh', 'cin', 'ci',
                 'comment', 'con', 'card', 'oto', 'ot', 'unique-id',
                 'subject-category', 'number-of-cited-references', 'rating',
                 'rn', 'oid', 'issn', 'isbn', 'doc-delivery-number']

In [34]:
#every field name found in the database that is not listed in remove_fields
keep_fields = [k for k in fields.keys() if k not in remove_fields]

In [35]:
#show the field names that survive the filter
keep_fields

['year',
 'volume',
 'title',
 'pages',
 'number',
 'journal',
 'author',
 'ENTRYTYPE',
 'ID',
 'booktitle',
 'publisher',
 'editor',
 'school',
 'chapter',
 'address',
 'month',
 'organization',
 'doi',
 'url',
 'date',
 'series',
 'bdsk-file-1',
 'type',
 'institution',
 'edition',
 'location']

To do:
- ~~handle first-author surnames that start with lowercase characters~~
- ~~better handling of curly braces to preserve name ordering~~
- ~~better handling of escape characters~~
- ~~ensure bibtex file compiles and gets parsed correctly~~
- ~~Check duplicate titles/authors (fail if detected)~~
- ~~Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.~~
- ~~If keys match aside from suffix then still allow the bibtex file to "pass" as long as all "matching" keys are unique *and* all have suffixes *and* the suffixes span a, b, c, ..., etc. without gaps~~
- Correct page numberings:
  - Change hyphens to en dashes
  - Change em dashes to en dashes
  - Remove spaces in page ranges
  - Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass
- remove everything except for "keep_fields"
  - Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker
- change all urls, dois, and page numbers to lowercase

Clean up:
- correct all "strings" to full journal names
- correct all journal abbreviations to full journal names
- Ensure no periods in journal names
- Ensure no periods at the ends of affiliations or titles
- Check for compressed initials (AA --> A A; A.A. --> A A; etc.)

Bonus:
- Use scholarly to verify information.  However, this is currently unreliable (server seems to hang frequently) and too slow to be viable