In [141]:
from unidecode import unidecode as decode
import bibtexparser as bp
import numpy as np
import re
from string import ascii_lowercase
import itertools

from urllib import request as get

In [70]:
def remove_accents_and_hyphens(s):
    replace = {'{\\l}': 'l',
               '{\\o}': 'o',
               '{\\i}': 'i',
               '{\\t}': 't'}
    for key, val in replace.items():
        s = s.replace(key, val)
    
    accents = r'''\{?\\[`'^"~=.uvHtcdbkr]?\s?\{?\\?(\w*)\}?'''
    accented_chars = [x for x in re.finditer(accents, s)]
    
    s_list = [c for c in s]
    hyphen = '-'
    for a in accented_chars:
        next_len = a.end() - a.start()
        s_list[a.start()] = a.group(1)
        s_list[(a.start() + 1):a.end()] = hyphen * (next_len - 1)
    
    return ''.join([c for c in s_list if c != hyphen])

In [3]:
def match(names, template):
    #check if list of strings names contains a match to the potentially multi-word
    #template.  return the position of the start of the left-most match
    template = template.split(' ')
    for i, x in enumerate(names[:len(names) - len(template) + 1]):
        found_match = False
        for j, t in enumerate(template):
            if names[i + j].lower() != t:
                found_match = False
                break
            else:
                found_match = True
        if found_match:
            return i
    return -1

prefixes = ['de', 'da', 'di', 'von', 'zu', 'van', 'du', 'des',
            'del', 'de la', 'della', 'la', 'le', 'der', 'af',
            'st.', 'saint', 'st', 'dom', 'do', 'das', 'dos', 'of', 'al',
            'el', 'dei', 'tot', 'thoe', 'aw', 'na', 'sri', 'phra',
            'si', 'shri', 'lo', 'no', 'op', 'lopes', 'gonzalez', 'vom', 'castro']

suffixes = ['sr.', 'jr.', 'sr', 'jr', 'senior', 'junior', 'iii', 'iv', 'v', 
            'vi', 'vii', 'viii', 'ix', 'x', 'the', 'third', 'fourth', 'fifth',
            'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'great']

In [83]:
def remove_curlies(s): #only removes *matching* curly braces
    for i, c in enumerate(s):    
        if c == '{':
            j = i
            
            curly_count = 1
            while (j < len(s) - 1):
                j += 1
                if s[j] == '{':
                    curly_count += 1

                if s[j] == '}':
                    curly_count -= 1

                if curly_count == 0:
                    break

            if curly_count == 0:
                return remove_curlies(s[:i] + ''.join(s[(i+1):j].split(' ')) + s[(j+1):])
    return s

In [84]:
def remove_non_letters(s):
    remove_chars = [',', '.', '!', '?', "'", '"', '{', '}', '-']    
    for c in remove_chars:
        s = s.replace(c, '')
    return s

In [85]:
def rearrange(name):
    original_name = name

    #remove suffixes and convert to list
    name = [remove_non_letters(x.strip()) for x in name.split(',')]
    sxs = [n for n in name if n.lower() in suffixes]
    name = [n for n in name if n.lower() not in suffixes]

    if len(name) == 2: #last, first (+ middle)
        x = ' '.join([name[1], name[0]])
    elif len(name) == 1: #first (+ middle) + last
        x = name[0]
    elif len(name) == 0:
        raise Exception(f'no non-suffix names: {original_name}')
    elif len(name) > 2:
        raise Exception(f'too many commas: {original_name}')
    return x

In [74]:
def last_name(names):
    #remove suffixes and non-letters
    names = [remove_non_letters(n) for n in rearrange(names).split(' ') if not (n.lower() in suffixes)]

    #start at the end and move backward
    x = []
    found_prefix = False
    for n in reversed(names):
        if n.lower() in prefixes:
            found_prefix = True
        elif found_prefix or len(x) > 0:
            break
        x.append(n)
    if found_prefix:
        return ''.join(reversed(x))
    else:
        return x[0]

In [75]:
def authors2key(authors, year):
    def key(author):
        #convert accented unicode characters to closest ascii equivalent
        author = decode(author)

        #re-arrange author name to FIRST [MIDDLE] LAST [SUFFIX]
        author = remove_accents_and_hyphens(author)
        author = remove_curlies(author)

        #get first 4 letters of last name
        return last_name(author)[:4]
    
    yr_str = str(year)[-2:]
    
    authors = authors.split(' and ')
    if len(authors) == 0:
        raise Exception('Author information missing, no key generated')
    elif len(authors) == 1:
        return key(authors[0]) + yr_str
    elif len(authors) == 2:
        return key(authors[0]) + key(authors[1]) + yr_str
    elif len(authors) >= 3:
        return key(authors[0]) + 'Etal' + yr_str
    else:
        raise Exception('Something went wrong...')

In [116]:
bibfile = '../memlab.bib'
parser = bp.bparser.BibTexParser(ignore_nonstandard_types=True, common_strings=True, homogenize_fields=True)

with open(bibfile, 'r') as b:
    bibdata = bp.load(b, parser=parser)

Entry type webpage not standard. Not considered.
Entry type webpage not standard. Not considered.


In [117]:
bd = bibdata.get_entry_dict()

In [118]:
#get all fields
fields = {}
for k in bd.keys():
    next_entry = bd[k]
    for field, vals in next_entry.items():
        if not (field in fields.keys()):
            fields[field] = [vals]
        else:
            fields[field].append(vals)

for k in fields.keys():
    fields[k] = list(np.unique(fields[k]))

In [119]:
def get_vals(bd, field):
    def safe_get(item, field):
        if field in item.keys():
            return item[field]
        else:
            return ''
    
    return [safe_get(i, field) for k, i in bd.items()]

In [120]:
def same_id(a, b, ignore_special=False):
    if ignore_special and (('\\' in a) or ('\\' in b)):
        return True
    
    if len(a) > len(b):
        return same_id(b, a)
    elif a == b:
        return True
    else:
        return a == b[:len(a)]

In [121]:
authors = get_vals(bd, 'author')
years = get_vals(bd, 'year')
ids = get_vals(bd, 'ID')

In [122]:
gen_ids = [authors2key(a, y) for a, y in zip(authors, years)]

In [123]:
#check keys
tofix = [f'{h}. [{i}] should be [{g}]' for i, g, h in zip(ids, gen_ids, range(0, len(ids))) if not same_id(i, g, ignore_special=False)] #check these over carefully...
if len(tofix) == 0:
    print('Congrats!  No keys to fix.')
else:
    print(f'Need to fix {len(tofix)} keys: \n')
    print('\n'.join(tofix))

Congrats!  No keys to fix.


In [124]:
def duplicate_inds(x):
    # for the list x, return a new list containing 0 or more
    # lists of the indices of matching (non-unique) elements
    y = []
    unique_vals, counts = np.unique(x, return_counts=True)
    for v in [v for i, v in enumerate(unique_vals) if counts[i] > 1]:
        y.append([i for i, j in enumerate(x) if j == v])
    return y

In [125]:
duplicates = []
duplicate_title_inds = duplicate_inds(titles)

last_names = [' and '.join([last_name(a) for a in authors[i].split(' and ') if not (a in ignore_authors)]) for i in range(len(authors))]
duplicate_authors = duplicate_inds(last_names)

for i in duplicate_title_inds:
    duplicate_author_inds = duplicate_inds([last_names[j] for j in i])
    for a in duplicate_author_inds:
        duplicates.append(a)

In [126]:
if len(duplicates) > 0:
    for d in duplicates:
        print(f'Duplicate IDs [ind, key]: {[[i, ids[i]] for i in d]}')
else:
    print('Congrats, no duplicates!')

Congrats, no duplicates!


In [165]:
def get_key_suffixes(n):
    #source: https://stackoverflow.com/questions/29351492/how-to-make-a-continuous-alphabetic-list-python-from-a-z-then-from-aa-ab-ac-e/29351603
    if n <= 1:
        return ''
    
    def generate_id():
        i = 1
        while True:
            for s in itertools.product(ascii_lowercase, repeat=i):
                yield ''.join(s)
            i += 1
    
    gen = generate_id()
    
    def helper():
        for s in gen:
            return s
    
    return [helper() for i in range(n)]

In [176]:
list(np.array(ids)[duplicate_inds(gen_ids)[0]])

['Adey67', 'Adey67b']

In [180]:
#Check bibtex keys. Duplicates should be assigned a suffix of 'a', 'b', etc.
#If keys match aside from suffix then still allow the bibtex file to "pass"
#as long as all "matching" keys are unique and all have suffixes and the 
#suffixes span a, b, c, ..., etc. without gaps

def check_keys(ids, gen_ids):
    checked = []
    bad_keys = []
    
    #for duplicate base keys, ensure correct suffixes
    same_base = duplicate_inds(gen_ids)
    for inds in same_base:
        next_base = gen_ids[inds[0]]
        target_keys = [next_base + x for x in get_key_suffixes(len(inds))]
        actual_keys = list(np.array(ids)[inds])
        actual_keys.sort()
        
        for a, t in zip(actual_keys, target_keys):
            checked.append(a)
            if not (a == t): #FIXME: the check isn't quite right-- if a is *somewhere* in target_keys, it could still be OK
                bad_keys.append([a, t]) #instead, figure out which keys are present and/or duplicated and/or missing.
                                #if there's an extra actual key (not in target_keys), rename it to one of the "free"
                                #target keys, leaving all other actual keys unchanged
        
    #for non-duplicate base keys, ensure *no* suffixes
    for i, g in zip(ids, gen_ids):
        if i not in checked:
            if not (i == g):
                bad_keys.append([i, g])
    
    return bad_keys

In [183]:
bad_keys = check_keys(ids, gen_ids)

if len(bad_keys) > 0:
    print(f'{len(bad_keys)} incorrect keys found:\n')
    for x in bad_keys:
        print(f'{x[0]} should be renamed to {x[1]}')
else:
    print('No incorrect keys found!')
    


250 incorrect keys found:

Adey67 should be renamed to Adey67a
AndeEtal04 should be renamed to AndeEtal04a
AndeEtal98 should be renamed to AndeEtal98a
AndeEtal99 should be renamed to AndeEtal99a
AndeEtal99a should be renamed to AndeEtal99b
AndeEtal99b should be renamed to AndeEtal99c
Bala98 should be renamed to Bala98a
BarrEtal12 should be renamed to BarrEtal12a
BarrEtal12a should be renamed to BarrEtal12b
BarrEtal12b should be renamed to BarrEtal12c
BartEtal04 should be renamed to BartEtal04a
BartEtal04a should be renamed to BartEtal04b
BartEtal04b should be renamed to BartEtal04c
BartEtal11 should be renamed to BartEtal11a
BartEtal11a should be renamed to BartEtal11b
BernEtal01 should be renamed to BernEtal01a
BernEtal01a should be renamed to BernEtal01b
Blan86 should be renamed to Blan86a
Blan86a should be renamed to Blan86b
BrowEtal07 should be renamed to BrowEtal07a
BrunEtal02 should be renamed to BrunEtal02a
BrunEtal02a should be renamed to BrunEtal02b
BrunEtal08 should be rename

In [21]:
remove_fields = ['date-modified', 'date-added',
                 'note', 'bdsk-url-1', 'pst', 'pmid', 'pmc',
                 'mesh', 'keyword', 'journal-full', 'abstract',
                 'mendeley-groups', 'file', 'bdsk-url-2', 'eprint',
                 'arxivid', 'archiveprefix', 'ty', 'm3', 'l3',
                 'howpublished', 'lccn', 'read', 'annote', 'owner',
                 'timestamp', 'pii', 'zb', 'z9', 'z8' ,'times-cited',
                 'publication-type', 'isi', 'language', 'stat', 'so',
                 'sb', 'rf', 'pubm', 'pt', 'pl', 'phst', 'own', 'mhda',
                 'jid', 'ip', 'edat', 'dcom', 'da', 'au', 'aid', 'affiliation',
                 'lr', 'gr', 'jt', 'local-url', 'dep', 'mh', 'cin', 'ci',
                 'comment', 'con', 'card', 'oto', 'ot', 'unique-id',
                 'subject-category', 'number-of-cited-references', 'rating',
                 'rn', 'oid', 'issn', 'isbn', 'doc-delivery-number']

In [22]:
keep_fields = [k for k in fields.keys() if k not in remove_fields]

In [23]:
keep_fields

['year',
 'volume',
 'title',
 'pages',
 'number',
 'journal',
 'author',
 'ENTRYTYPE',
 'ID',
 'booktitle',
 'publisher',
 'editor',
 'school',
 'chapter',
 'address',
 'month',
 'organization',
 'doi',
 'url',
 'date',
 'series',
 'bdsk-file-1',
 'type',
 'institution',
 'edition',
 'location']

To do:
- ~~handle first-author surnames that start with lowercase characters~~
- ~~better handling of curly braces to preserve name ordering~~
- ~~better handling of escape characters~~
- ensure bibtex file compiles and gets parsed correctly
- ~~Check duplicate titles/authors (fail if detected)~~
- Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.
- If keys match aside from suffix then still allow the bibtex file to "pass" as long as all "matching" keys are unique *and* all have suffixes *and* the suffixes span a, b, c, ..., etc. without gaps
- Correct page numberings:
  - Change hyphens to n-dashes
  - Change m-dashes to n-dashes
  - Remove spaces in page ranges
  - Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass
- remove everything except for "keep_fields"
  - Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker
- change all urls, dois, and page numbers to lowercase

Clean up:
- correct all "strings" to full journal names
- correct all journal abbreviations to full journal names
- Ensure no periods in journal names
- Ensure no periods at the ends of affiliations or titles
- Check for compressed initials (AA --> A A; A.A. --> A A; etc.)

Bonus:
- Use scholarly to verify information.  However, this is currently unreliable (server seems to hang frequently) and too slow to be viable