In [1]:
from unidecode import unidecode as decode
import bibtexparser as bp
import numpy as np
import pandas as pd
import re
from string import ascii_lowercase
import itertools

In [2]:
def remove_accents_and_hyphens(s):
    replace = {'{\\l}': 'l',
               '{\\o}': 'o',
               '{\\i}': 'i',
               '{\\t}': 't'}
    for key, val in replace.items():
        s = s.replace(key, val)
    
    accents = r'''\{?\\[`'^"~=.uvHtcdbkr]?\s?\{?\\?(\w*)\}?'''
    accented_chars = [x for x in re.finditer(accents, s)]
    
    s_list = [c for c in s]
    hyphen = '-'
    for a in accented_chars:
        next_len = a.end() - a.start()
        s_list[a.start()] = a.group(1)
        s_list[(a.start() + 1):a.end()] = hyphen * (next_len - 1)
    
    return ''.join([c for c in s_list if c != hyphen])

In [3]:
def match(names, template):
    #check if list of strings names contains a match to the potentially multi-word
    #template.  return the position of the start of the left-most match
    template = template.split(' ')
    for i, x in enumerate(names[:len(names) - len(template) + 1]):
        found_match = False
        for j, t in enumerate(template):
            if names[i + j].lower() != t:
                found_match = False
                break
            else:
                found_match = True
        if found_match:
            return i
    return -1

prefixes = ['de', 'da', 'di', 'von', 'zu', 'van', 'du', 'des',
            'del', 'de la', 'della', 'la', 'le', 'der', 'af',
            'st.', 'saint', 'st', 'dom', 'do', 'das', 'dos', 'of', 'al',
            'el', 'dei', 'tot', 'thoe', 'aw', 'na', 'sri', 'phra',
            'si', 'shri', 'lo', 'no', 'op', 'lopes', 'gonzalez', 'vom', 'castro']

suffixes = ['sr.', 'jr.', 'sr', 'jr', 'senior', 'junior', 'iii', 'iv', 'v', 
            'vi', 'vii', 'viii', 'ix', 'x', 'the', 'third', 'fourth', 'fifth',
            'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'great']

In [4]:
def remove_curlies(s): #only removes *matching* curly braces
    for i, c in enumerate(s):    
        if c == '{':
            j = i
            
            curly_count = 1
            while (j < len(s) - 1):
                j += 1
                if s[j] == '{':
                    curly_count += 1

                if s[j] == '}':
                    curly_count -= 1

                if curly_count == 0:
                    break

            if curly_count == 0:
                return remove_curlies(s[:i] + ''.join(s[(i+1):j].split(' ')) + s[(j+1):])
    return s

In [5]:
def remove_non_letters(s):
    remove_chars = [',', '.', '!', '?', "'", '"', '{', '}', '-', '\\']    
    for c in remove_chars:
        s = s.replace(c, '')
    return s

In [6]:
def rearrange(name, preserve_non_letters=False):
    original_name = name

    #remove suffixes and convert to list
    if preserve_non_letters:
        name = [x.strip() for x in name.split(',')]
    else:
        name = [remove_non_letters(x.strip()) for x in name.split(',')]
    sxs = [n for n in name if n.lower() in suffixes]
    name = [n for n in name if n.lower() not in suffixes]

    if len(name) == 2: #last, first (+ middle)
        if preserve_non_letters and len(name[0].split(' ')) > 1:
            if not ((len(name[0]) >= 2) and (name[0][0] == '{') and (name[0][-1] == '}')):
                name[0] = '{' + name[0] + '}'
        x = ' '.join([name[1], name[0]])
    elif len(name) == 1: #first (+ middle) + last
        x = name[0]
    elif len(name) == 0:
        raise Exception(f'no non-suffix names: {original_name}')
    elif len(name) > 2:
        raise Exception(f'too many commas: {original_name}')
    return x

In [7]:
def last_name(names):
    #remove suffixes and non-letters
    names = [remove_non_letters(n) for n in rearrange(names).split(' ') if not (n.lower() in suffixes)]

    #start at the end and move backward
    x = []
    found_prefix = False
    for n in reversed(names):
        if n.lower() in prefixes:
            found_prefix = True
        elif found_prefix or len(x) > 0:
            break
        x.append(n)
    if found_prefix:
        return ''.join(reversed(x))
    else:
        return x[0]

In [8]:
def last_names_from_str(x):
    #pass in a single string (and-separated) or list of authors and get back a list of last names
    if type(x) == str:
        return [last_name(n) for n in x.split(' and ')]
    elif type(x) == list:
        return [last_name(n) for n in x]
    else:
        return ['']

In [9]:
def authors2key(authors, year):
    def key(author):
        #convert accented unicode characters to closest ascii equivalent
        author = decode(author)

        #re-arrange author name to FIRST [MIDDLE] LAST [SUFFIX]
        author = remove_accents_and_hyphens(author)
        author = remove_curlies(author)

        #get first 4 letters of last name
        return last_name(author)[:4]
    
    yr_str = str(year)[-2:]
    
    authors = authors.split(' and ')
    if len(authors) == 0:
        raise Exception('Author information missing, no key generated')
    elif len(authors) == 1:
        return key(authors[0]) + yr_str
    elif len(authors) == 2:
        return key(authors[0]) + key(authors[1]) + yr_str
    elif len(authors) >= 3:
        return key(authors[0]) + 'Etal' + yr_str
    else:
        raise Exception('Something went wrong...')

In [10]:
bibfile = '../memlab.bib'
parser = bp.bparser.BibTexParser(ignore_nonstandard_types=True, common_strings=True, homogenize_fields=True)

with open(bibfile, 'r') as b:
    bibdata = bp.load(b, parser=parser)

In [11]:
bd = bibdata.get_entry_dict()

In [12]:
#get all fields
fields = {}
for k in bd.keys():
    next_entry = bd[k]
    for field, vals in next_entry.items():
        if not (field in fields.keys()):
            fields[field] = [vals]
        else:
            fields[field].append(vals)

for k in fields.keys():
    fields[k] = list(np.unique(fields[k]))

In [13]:
def get_vals(bd, field):
    def safe_get(item, field):
        if field in item.keys():
            return item[field]
        else:
            return ''
    
    return [safe_get(i, field) for k, i in bd.items()]

In [14]:
def same_id(a, b, ignore_special=False):
    if ignore_special and (('\\' in a) or ('\\' in b)):
        return True
    
    if len(a) > len(b):
        return same_id(b, a)
    elif a == b:
        return True
    else:
        return a == b[:len(a)]

In [15]:
authors = get_vals(bd, 'author')
years = get_vals(bd, 'year')
ids = get_vals(bd, 'ID')
titles = [remove_curlies(t) for t in get_vals(bd, 'title')]

In [16]:
gen_ids = [authors2key(a, y) for a, y in zip(authors, years)]

In [17]:
#check key intergrity
tofix = [f'{h}. [{i}] should be [{g}]' for i, g, h in zip(ids, gen_ids, range(0, len(ids))) if not same_id(i, g, ignore_special=False)] #check these over carefully...
if len(tofix) == 0:
    print('Congrats!  No keys to fix.')
else:
    print(f'Need to fix {len(tofix)} keys: \n')
    print('\n'.join(tofix))

Congrats!  No keys to fix.


In [18]:
#check for duplicate keys
unique_ids, counts = np.unique(ids, return_counts=True)
duplicate_keys = unique_ids[np.where(counts > 1)[0]]
if len(duplicate_keys) > 0:
    print('Multiple entries for the following keys:')
    print('\n'.join(duplicate_keys))
else:
    print('No duplicated keys!')

No duplicated keys!


In [19]:
def duplicate_inds(x):
    # for the list x, return a new list containing 0 or more
    # lists of the indices of matching (non-unique) elements
    y = []
    unique_vals, counts = np.unique(x, return_counts=True)
    for v in [v for i, v in enumerate(unique_vals) if counts[i] > 1]:
        y.append([i for i, j in enumerate(x) if j == v])
    return y

In [20]:
duplicates = []
duplicate_title_inds = duplicate_inds(titles)
ignore_authors = []

last_names = [' and '.join(last_names_from_str(a)) for a in authors]
#last_names = [' and '.join([last_name(a) for a in authors[i].split(' and ') if not (a in ignore_authors)]) for i in range(len(authors))]
duplicate_authors = duplicate_inds(last_names)

for i in duplicate_title_inds:
    duplicate_author_inds = duplicate_inds([last_names[j] for j in i])
    for a in duplicate_author_inds:
        duplicates.append(a)

In [21]:
if len(duplicates) > 0:
    for d in duplicates:
        print(f'Duplicated authors/titles found for the following keys [ind, key]: {[[i, ids[i]] for i in d]}')
else:
    print('Congrats, no duplicated authors/titles!')

Congrats, no duplicated authors/titles!


In [22]:
def get_key_suffixes(n):
    #source: https://stackoverflow.com/questions/29351492/how-to-make-a-continuous-alphabetic-list-python-from-a-z-then-from-aa-ab-ac-e/29351603
    if n <= 1:
        return ''
    
    def generate_id():
        i = 1
        while True:
            for s in itertools.product(ascii_lowercase, repeat=i):
                yield ''.join(s)
            i += 1
    
    gen = generate_id()
    
    def helper():
        for s in gen:
            return s
    
    return [helper() for i in range(n)]

In [23]:
#Check bibtex keys. Duplicates should be assigned a suffix of 'a', 'b', etc.
#If keys match aside from suffix then still allow the bibtex file to "pass"
#as long as all "matching" keys are unique and all have suffixes and the 
#suffixes span a, b, c, ..., etc. without gaps

def check_keys(ids, gen_ids):
    checked = []
    bad_keys = []
    
    #for duplicate base keys, ensure correct suffixes
    same_base = duplicate_inds(gen_ids)
    for inds in same_base:
        next_base = gen_ids[inds[0]]
        target_keys = [next_base + x for x in get_key_suffixes(len(inds))]
        actual_keys = list(np.array(ids)[inds])
        
        correct_keys = [a for a in actual_keys if a in target_keys]
        missing_keys = [t for t in target_keys if t not in actual_keys]
        i = 0
        
        for a in actual_keys:
            checked.append(a)
            if a not in target_keys:
                bad_keys.append([a, missing_keys[i]])
                i += 1                
        
    #for non-duplicate base keys, ensure *no* suffixes
    for i, g in zip(ids, gen_ids):
        if i not in checked:
            if not (i == g):
                bad_keys.append([i, g])
    
    return bad_keys

In [24]:
bad_keys = check_keys(ids, gen_ids)

if len(bad_keys) > 0:
    print(f'{len(bad_keys)} incorrect key sequence(s) found:\n')
    for x in bad_keys:
        print(f'{x[0]} should be renamed to {x[1]}')
else:
    print('No incorrect key sequences found!')

3 incorrect key sequence(s) found:

LohnEtal11b should be renamed to LohnEtal11
NelsEtal98a should be renamed to NelsEtal98
Nair92b should be renamed to Nair92


In [25]:
# 1. numbers separated by n-dash with no spaces; right number larger than left number
# 2. zero or more lowercase letter(s) + sequence of digits
# 3. two combinations of letter(s) + sequence of digits:
#   - same letters at the beginning
#   - right number larger than left number
# 4. two uppercase letters, hypthen, digit, letter, ., two digits (e.g., PS-2B.16)
# 5. empty string
# 6. doi
def valid_page(p): #single page, no hyphens
    if len(p) == 0: #empty string
        return True, 'empty', None

    try:
        v = int(p) #integer
        return True, 'int', v
    except:
        pass

    #prefix of one or more letters, followed by a sequence of digits
    r1 = re.compile(r'''(?P<prefix>[a-zA-Z]+)(?P<digits>\d+)''')
    x = r1.fullmatch(p)
    if not (x is None):
        return True, 'prefixed', [x.group('prefix'), int(x.group('digits'))]

    #two uppercase letters, hyphen, digit, letter, ., two digits
    r2 = re.compile(r'''(?P<prefix>[A-Z]{2}-[\dA-Z]{2}).(?P<digits>\d+)''')
    x = r2.fullmatch(p)
    if not (x is None):
        return True, 'conference', [x.group('prefix'), int(x.group('digits'))]

    #doi address
    r3 = re.compile(r'''doi\.org/(?P<doi>[a-z\d\-\./]+)''')
    x = r3.fullmatch(p)
    if not (x is None):
        return True, 'doi', None

    #arXiv section
    r4 = re.compile(r'''((?P<subject>[a-z]{2,})/)?(?P<article>[\d\.]+(v[\d]+)?)''')
    x = r4.fullmatch(p)
    if not (x is None):
        return True, 'arxiv', None
    
    #roman numeral
    def mixed_case(s):
        return not ((s == s.lower()) or (s == s.upper()))
    
    def roman2int(s):
        #source: https://www.w3resource.com/python-exercises/class-exercises/python-class-exercise-2.php
        vals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
        x = 0
        for i in range(len(s)):
            if i > 0 and vals[s[i]] > vals[s[i - 1]]:
                x += vals[s[i]] - 2 * vals[s[i - 1]]
            else:
                x += vals[s[i]]
        return x
    
    #source: https://www.geeksforgeeks.org/validating-roman-numerals-using-regular-expression/
    r5 = re.compile(r'''^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$''')
    x = r5.fullmatch(p.upper())
    if not ((x is None) or mixed_case(p)):
        return True, 'roman', roman2int(p.upper())

    return False, 'invalid', None


def valid_pages(p):
    valid, kind, val = valid_page(p)
    if valid: #"single" page
        return True, 'empty', p
    else: #page range
        #split by hyphen
        ps = [x.strip() for x in p.split('-') if len(x.strip()) > 0]
        if len(ps) == 2:
            if ps[0] == ps[1]:
                return False, [p, ps[0]]
            
            valid1, kind1, val1 = valid_page(ps[0])
            valid2, kind2, val2 = valid_page(ps[1])
            
            if (not (valid1 and valid2)) or (not (kind1 == kind2)):
                if (kind1 == 'prefixed') and (kind2 == 'int'):
                    return False, [p, '--'.join([val1[0] + str(val1[1]), val1[0] + str(val2)])]
                return False, p
            
            if kind1 in ['int', 'roman']:
                if val1 < val2:
                    return True, '--'.join(ps)
                elif kind1 == 'int':
                    #attempt to autocorrect
                    p1 = str(val1)
                    p2 = str(val2)                    
                    if len(p2) < len(p1):
                        return False, [p, '--'.join([p1, p1[:-len(p2)] + p2])]
                    return False, [p, '--'.join(ps)]
            elif kind1 in ['prefixed', 'conference']:
                if (val1[0] == val2[0]) and (val1[1] < val2[1]):
                    return True, '--'.join(ps)
                else:
                    return False, [p, '--'.join(ps)]
            #dois and arxiv sections can't be specified as ranges
        return False, [p, '--'.join(ps)]

In [26]:
#page numbering
pages = get_vals(bd, 'pages')

#first check for invalid pages/entries
bad_pages = [(ids[i], valid_pages(p)) for i, p in enumerate(pages) if not valid_pages(p)[0]]

if len(bad_pages) > 0:
    print(f'{len(bad_pages)} page numbering problems found (*s can be fixed automatically):\n')
    for b in bad_pages:
        recheck = valid_pages(b[1][1][1])        
        if recheck[0]:
            suffix = f' (*)'
        else:
            suffix = ''
        print(f'{b[0]}: {b[1][1]}{suffix}')
else:
    print('No incorrect page numberings found!')

143 page numbering problems found (*s can be fixed automatically):

JacoEtal06: ['978--87', '978--987'] (*)
HardEtal84: ['185-99', '185--199'] (*)
LeonShad99: ['415-25', '415--425'] (*)
UndeFreu68: ['50--4', '50--54'] (*)
DeBaVoge10: ['2145-65', '2145--2165'] (*)
Helm02: ['439--53', '439--453'] (*)
YoneEtal02: ['1236-41', '1236--1241'] (*)
JoEtal13: ['8159-71', '8159--8171'] (*)
VanSEtal07: ['205-14', '205--214'] (*)
DAleEtal03: ['603--15', '603--615'] (*)
KotcEtal96: ['530-40', '530--540'] (*)
MaccBuck04: ['1625-32', '1625--1632'] (*)
FurlEtal07: ['300-10', '300--310'] (*)
RazEtal00: ['8559--71', '8559--8571'] (*)
LarsSmit12: ['567-76', '567--576'] (*)
WeinEtal10: ['3349-65', '3349--3365'] (*)
BartEtal10: ['1412-5', '1412--1415'] (*)
McGe32: ['352-70', '352--370'] (*)
Will01: ['283-6', '283--286'] (*)
AlexCrut90: ['266--71', '266--271'] (*)
AbboBlum96: ['406-16', '406--416'] (*)
Buzs97: ['508-15', '508--515'] (*)
SohaHass98b: ['171-93', '171--193'] (*)
FrinEtal06a: ['417-21', '417--42

In [27]:
journals = get_vals(bd, 'journal')

In [28]:
uncaps = ['and', 'or', 'of', 'the', 'de', 'in', 'on', 'for', 'with']
replace = {' \\& ': ' and ', ' {\\&} ': ' and ', '{ieee}': 'ieee', '{plos}': 'plos'}
force_caps = ['arXiv', 'bioRxiv', 'PsyArXiv', 'PLoS', 'BMC', 'EEG', 'CNS', 'fMRI',
              'COSYNE', 'eLife', 'IEEE', 'ACM', 'i-Perception', 'NeuroImage', 'NeuroReport',
              'NMR', 'OpenAI', 'USA', 'NIPS', 'NeurIPS', 'AISTATS', 'ICML', 'A', 'B', 'C', 'D', 'E', 'I', 'II', 'III', 'Xplore']

In [29]:
journal_key = pd.read_excel('journal_key.xls', header=0, index_col='orig')
journal_key = journal_key.to_dict()['corrected']

In [30]:
def format_journal_name(n):
    if (n.lower() in journal_key.keys()) and (type(journal_key[n.lower()]) == str):
        n = journal_key[n.lower()]
    else:
        n = n.lower()
    
    words = n.split(' ')
    #next line isn't working...
    #words = ['-'.join([format_journal_name(x) for x in w.split('-')]) if len(w.split('-')) > 1 else w for w in words] #deal with hyphens
    
    for i, w in enumerate(words):
        words[i] = w.capitalize()
        
        #deal with hyphens
        if len(w.split('-')) > 1:
            words[i] = '-'.join(format_journal_name(c) for c in w.split('-'))
        
        if (i > 0) and (w.lower() in uncaps):
            words[i] = words[i].lower()
        
        if words[i].lower() in replace.keys():
            words[i] = replace[words[i].lower()]
        
        correct_caps = [f for f in force_caps if f.lower() == words[i].lower()]
        if len(correct_caps) >= 1:
            words[i] = correct_caps[-1]
    return ' '.join(words)

In [31]:
correct_journals = [format_journal_name(j) for j in journals]

In [32]:
journal_corrections = [(i, j, c) for i, j, c in zip(ids, journals, correct_journals) if j != c]

In [33]:
if len(journal_corrections) > 0:
    print(f'{len(journal_corrections)} journal corrections will be made:\n')
    for j in journal_corrections:
        print(f'{j[0]}: {j[1]} --> {j[2]}')
else:
    print('No incorrect journal entries found!')

1796 journal corrections will be made:

BeatEtal16: {Trends in Cognitive Sciences} --> Trends in Cognitive Sciences
LeeEtal20: {bioRxiv} --> bioRxiv
TianEtal20: Nature {N}euroscience --> Nature Neuroscience
ArzyScha19: {Trends in Cognitive Sciences} --> Trends in Cognitive Sciences
GautEtal18: {Cerebral Cortex} --> Cerebral Cortex
KnigEich13: Nature {N}euroscience --> Nature Neuroscience
GautEtal12: {The Journal of Neuroscience} --> The Journal of Neuroscience
LeeEtal19: {bioRxiv} --> bioRxiv
FentEtal08: {The Journal of Neuroscience} --> The Journal of Neuroscience
DevlEtal18: {arXiv} --> arXiv
WietKiel19: {arXiv} --> arXiv
ConnEtal18: {arXiv} --> arXiv
PeteEtal18: {arXiv} --> arXiv
MikoEtal13b: Proceedings of the NAACL-HLT --> Proceedings of the National Association for Computational Linguistics
Schr26: The Physical Review --> Physical Review
SchaEtal07: {Nature Reviews Neuroscience} --> Nature Reviews Neuroscience
HassEtal07b: {Proceedings of the National Academy of Science USA} --> 

HalgEtal77c: Brain Res. --> Brain Research
Schu86: Journal of Neurophsyiology --> Journal of Neurophysiology
SchuEtal93: Journal of Neuroscience --> The Journal of Neuroscience
BaylEtal79: Journal of Physiology (London) --> Journal of Physiology
Jahn74: Journal of Experimental Psychology --> Journal of Experimental Psychology: General
GreeUnde50: J Exp Psychol --> Journal of Experimental Psychology: General
ShepTegh61: Journal of Experimental Psychology --> Journal of Experimental Psychology: General
BahrPhel87: Journal of Experimental Pyschology: Learning, Memory, and Cognition --> Journal of Experimental Psychology: Learning, Memory, and Cognition
Lock69: Journal of Experimental Psychology --> Journal of Experimental Psychology: General
HintEtal98: Memory \& Cognition --> Memory and Cognition
DoshEtal89: Learning, Memory --> Learning and Memory
Fish79: Memory \& Cognition --> Memory and Cognition
NobeShif01: Journal of Experimental Psychology: Learning Memory, and Cognition --> Journ

In [34]:
#rearrange author name (first middle last suffix)
#get rid of (any number of) clumped initials:
# AA --> A A
# A.A. --> A A
# A.A --> A A
# AA. --> A A
# ...
# AAA --> A A A
def reformat_author(author):
    if len(author.split(' and ')) > 1:
        return ' and '.join([reformat_author(a) for a in author.split(' and ')])
    
    try:
        author = rearrange(author, preserve_non_letters=True)
    except:
        pass
    
    unclumped = []
    names = author.split(' ')
    
    for n in names:
        #remove periods
        n = n.replace('.', '')
        if (n.lower() not in suffixes) and (n == n.upper()): #suffixes don't seem to be handled correctly...
            if n.find('-') >= 0:
                n = '-'.join([reformat_author(c) for c in n.split('-')])
            else:
                for c in list(n):
                    unclumped.append(c)
                continue
        unclumped.append(n)
    
    return ' '.join(unclumped)

In [36]:
corrected_authors = [reformat_author(a) for a in authors]

In [37]:
author_corrections = [(i, a, c) for i, a, c in zip(ids, authors, corrected_authors) if a != c]
if len(author_corrections) > 0:
    print(f'{len(author_corrections)} author corrections will be made:\n')
    for a in author_corrections:
        print(f'{a[0]}: {a[1]} --> {a[2]}')
else:
    print('No incorrect authors found!')

5208 author corrections will be made:

IyyeEtal15: M Iyyer and V Manjunatha and J Boyd-Graber and H {Daum\'{e} III} --> M Iyyer and V Manjunatha and J Boyd-Graber and H {Daum\'{e} I I I }
Gros88: Grossberg, S. --> S Grossberg
Fris00: KJ Friston --> K J Friston
MedfCrit10: Medford, N. and Critchley, H. D. --> N Medford and H D Critchley
KellEtal07b: Kelly, S. and Lloyd, D. and Nurmikko, T. and Roberts, N. --> S Kelly and D Lloyd and T Nurmikko and N Roberts
JackWool18: Jackson, J. B. and Woolgar, A. --> J B Jackson and A Woolgar
CohnRang17: Cohn-Sheely, B I and Ranganath, C --> B I Cohn-Sheely and C Ranganath
EngeEtal10: A K Engel and K Friston and J A S Kelso and P K\"{o}nig and I Kov\'{a}cs and A {MacDonald III} and E K Miller and W A Phillips and S M Silverstein and C Tallon-Baudry and J Triesch and P Uhlhaas --> A K Engel and K Friston and J A S Kelso and P K\"{o}nig and I Kov\'{a}cs and A {MacDonald I I I } and E K Miller and W A Phillips and S M Silverstein and C Tallon-Baudry and

LaukEtal13: Laukka, Erika J. and L{\"{o}}vd{\'{e}}n, Martin and Herlitz, Agneta and Karlsson, Sari and Ferencz, Beata and Pantzar, Alexandra and Keller, Lina and Graff, Caroline and Fratiglioni, Laura and B{\"{a}}ckman, Lars --> Erika J Laukka and Martin L{\"{o}}vd{\'{e}}n and Agneta Herlitz and Sari Karlsson and Beata Ferencz and Alexandra Pantzar and Lina Keller and Caroline Graff and Laura Fratiglioni and Lars B{\"{a}}ckman
LautEtal10: Lautenschlager, Nicola T and Cox, Kay and Kurz, Alexander F --> Nicola T Lautenschlager and Kay Cox and Alexander F Kurz
LiuEtal17: Liu, Fengqin and Sulpizio, Simone and Kornpetpanee, Suchada and Job, Remo --> Fengqin Liu and Simone Sulpizio and Suchada Kornpetpanee and Remo Job
LucaEtal15: Lucas, Samuel JE and Cotter, James D and Brassard, Patrice and Bailey, Damian M --> Samuel J E Lucas and James D Cotter and Patrice Brassard and Damian M Bailey
MandEtal11: Mandel, Abigail L and Ozdener, Hakan and Utermohlen, Virginia --> Abigail L Mandel and Hakan

UlbeEtal04: Ulbert, I. and Heit, G. and Madsen, J. R. and Karmos, G. and Halgren, E. --> I Ulbert and G Heit and J R Madsen and G Karmos and E Halgren
GrosPear08: Grossberg, S. and Pearson, L.R. --> S Grossberg and L R Pearson
WingStin00: Wingfield,A. and Stine-Morrow, E. A. --> A Wingfield and E A Stine-Morrow
WoerEtal03: Woermann, FG and Jokeit, H. and Luerding, R. and Freitag, H. and Schulz, R. and Guertler, S. and Okujava, M. and Wolf, P. and Tuxhorn, I. and Ebner, A. --> F G Woermann and H Jokeit and R Luerding and H Freitag and R Schulz and S Guertler and M Okujava and P Wolf and I Tuxhorn and A Ebner
Ande76: Anderson, J. R. --> J R Anderson
LucaEtal08: Lucas, T. H. and Dodrill, C. B. and Ojemann, G. A. --> T H Lucas and C B Dodrill and G A Ojemann
McCaEtal95: McCarthy, G. and Nobre, A. C. and Bentin, S. and Spencer, D. D. --> G McCarthy and A C Nobre and S Bentin and D D Spencer
BresMeno10: Bressler, S.L. and Menon, V. --> S L Bressler and V Menon
DoesEtal08: Doesburg, S. M. and

MandEtal89: Mandler, G. and G. R. Goethals and C. M. Kelley and B. R. Stephens --> G Mandler and G R Goethals and C M Kelley and B R Stephens
AlvaSqui94: P. Alvarez and L. R. Squire --> P Alvarez and L R Squire
Buzs98: Buzs\'aki, G. --> G Buzs\'aki
NadeMosc97: L. Nadel and M. Moscovitch --> L Nadel and M Moscovitch
ChunPhel99: Chun, M.M. and Phelps, E.A. --> M M Chun and E A Phelps
SchaEtal11: Schacter, D. L. and Guerin, S. A. and St. Jacques, P. L. --> D L Schacter and S A Guerin and P L {St Jacques}
BudsPric05: Budson, A. E. and Price, B. H. --> A E Budson and B H Price
HalgEtal91: Halgren, E. and Stapleton, J. and Domalski, P. and Swartz, B. E. and Delgado-Escueta, A. V. and Walsh, G. O. and Mandelkern, M. and Blahd, W. and Ropchan, J. --> E Halgren and J Stapleton and P Domalski and B E Swartz and A V Delgado-Escueta and G O Walsh and M Mandelkern and W Blahd and J Ropchan
DulaEtal06: Dulay, M.F. and York, M.K. and Soety, E.M. and Hamilton, W.J. and Mizrahi, E.M. and Goldsmith, I.L

Rang06: Ranganath, C. --> C Ranganath
Badd03: Baddeley, A. --> A Baddeley
RangEtal05: Ranganath, C. and Cohen, M. X. and Brozinsky, C. J. --> C Ranganath and M X Cohen and C J Brozinsky
LustEtal01: Lustig, C and May, C P and Hasher, L --> C Lustig and C P May and L Hasher
ConwEtal05: A. R. A. Conway and Kane, M. J. and Bunting, M. F. and Hambrick, D. Z. and Wilhelm, O. and Engle, R. W. --> A R A Conway and M J Kane and M F Bunting and D Z Hambrick and O Wilhelm and R W Engle
SreeEtal11: Sreenivasan, Kartik K and Sambhara, Deepak and Jha, Amishi P --> Kartik K Sreenivasan and Deepak Sambhara and Amishi P Jha
AxmaEtal09: Axmacher, N. and Elger, C. E. and Fell, J. --> N Axmacher and C E Elger and J Fell
PellMoor00: Pelleg, D. and Moore, A. --> D Pelleg and A Moore
Mand91: Mandler, G. --> G Mandler
WageEtal04a: Wagenmakers, Eric-Jan and Ratcliff, Roger and Gomez, Pablo and Iverson, Geoffrey J --> Eric-Jan Wagenmakers and Roger Ratcliff and Pablo Gomez and Geoffrey J Iverson
YiEtal06: DJ Yi

In [38]:
remove_fields = ['date-modified', 'date-added',
                 'note', 'bdsk-url-1', 'pst', 'pmid', 'pmc',
                 'mesh', 'keyword', 'journal-full', 'abstract',
                 'mendeley-groups', 'file', 'bdsk-url-1', 'bdsk-url-2', 'eprint',
                 'arxivid', 'archiveprefix', 'ty', 'm3', 'l3',
                 'howpublished', 'lccn', 'read', 'annote', 'owner',
                 'timestamp', 'pii', 'zb', 'z9', 'z8' ,'times-cited',
                 'publication-type', 'isi', 'language', 'stat', 'so',
                 'sb', 'rf', 'pubm', 'pt', 'pl', 'phst', 'own', 'mhda',
                 'jid', 'ip', 'edat', 'dcom', 'da', 'au', 'aid', 'affiliation',
                 'lr', 'gr', 'jt', 'local-url', 'dep', 'mh', 'cin', 'ci',
                 'comment', 'con', 'card', 'oto', 'ot', 'unique-id',
                 'subject-category', 'number-of-cited-references', 'rating',
                 'rn', 'oid', 'issn', 'isbn', 'doc-delivery-number']

In [39]:
keep_fields = [k for k in fields.keys() if k not in remove_fields]
if 'force' not in keep_fields:
    keep_fields.append('force')

In [40]:
keep_fields

['year',
 'volume',
 'title',
 'pages',
 'number',
 'journal',
 'author',
 'ENTRYTYPE',
 'ID',
 'booktitle',
 'publisher',
 'editor',
 'school',
 'chapter',
 'address',
 'month',
 'organization',
 'doi',
 'url',
 'date',
 'series',
 'bdsk-file-1',
 'type',
 'institution',
 'edition',
 'location',
 'force']

To do:
- ~~handle first-author surnames that start with lowercase characters~~
- ~~better handling of curly braces to preserve name ordering~~
- ~~better handling of escape characters~~
- ~~ensure bibtex file compiles and gets parsed correctly~~
- ~~Check duplicate titles/authors (fail if detected)~~
- ~~Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.~~
- ~~If keys match aside from suffix then still allow the bibtex file to "pass" as long as all "matching" keys are unique *and* all have suffixes *and* the suffixes span a, b, c, ..., etc. without gaps~~
~~- Correct page numberings:~~
~~  - Change hyphens to n-dashes~~
~~  - Change m-dashes to n-dashes~~
~~  - Remove spaces in page ranges~~
~~  - Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass~~
- remove everything except for "keep_fields"
  - Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker
- ~~change all urls, dois, and page numbers to lowercase~~

Clean up:
- ~~correct all "strings" to full journal names~~
- ~~correct capitalization of journal names~~
- ~~correct all journal abbreviations to full journal names~~
- ~~Ensure no periods in journal names~~
- ~~Ensure no periods at the ends of affiliations or titles~~
- Check for compressed initials (AA --> A A; A.A. --> A A; etc.)

Bonus:
- Use scholarly to verify information.  However, this is currently unreliable (server seems to hang frequently) and too slow to be viable