In [2]:
from unidecode import unidecode as decode
import bibtexparser as bp
import numpy as np
import re

from urllib import request as get

In [3]:
def remove_accents_and_hyphens(s):
  """Strip LaTeX accent markup and literal hyphens from a string.

  Two passes are made over ``s``:

  1. A handful of braced LaTeX letters are swapped for a plain ASCII letter.
  2. Every remaining backslash sequence matched by the accent pattern is
     collapsed to the word characters it wraps (which may be empty).

  Because '-' is used internally as the "delete me" filler, every hyphen that
  was already present in ``s`` is removed from the result as well.

  Parameters:
    s: the text to clean (a str).

  Returns:
    The cleaned str.
  """
  # Braced LaTeX letters that have a direct ASCII stand-in.
  simple_swaps = (('{\\l}', 'l'),
                  ('{\\o}', 'o'),
                  ('{\\i}', 'i'),
                  ('{\\t}', 't'))
  for latex, plain in simple_swaps:
    s = s.replace(latex, plain)

  # optional '{', a backslash, optional accent command, optional space,
  # optional '{', optional backslash, the wrapped letters, optional '}'
  accents = r'''\{?\\[`'^"~=.uvHtcdbkr]?\s?\{?\\?(\w*)\}?'''

  filler = '-'
  chars = list(s)
  for hit in re.finditer(accents, s):
    first, stop = hit.span()
    # Keep the wrapped letters in the first slot of the match and mark every
    # other slot for deletion, so positions of later matches stay valid.
    chars[first] = hit.group(1)
    for pos in range(first + 1, stop):
      chars[pos] = filler

  return ''.join(c for c in chars if c != filler)

In [4]:
def match(names, template):
  """Find the left-most occurrence of a (possibly multi-word) template.

  ``template`` is split on single spaces and compared, word by word, against
  consecutive items of ``names``.  Items of ``names`` are lowercased before
  the comparison; the template words are compared as given, so the template
  should already be lowercase.

  Parameters:
    names: list of str to search in.
    template: str of one or more space-separated words to look for.

  Returns:
    The index in ``names`` where the left-most match starts, or -1 when there
    is no match (including when the template has more words than ``names``
    has items).
  """
  words = template.split(' ')

  # range() is empty when the template is longer than names.  The previous
  # slice-based bound went negative in that case and could index past the
  # end of names.
  last_start = len(names) - len(words)
  for start in range(last_start + 1):
    if all(names[start + offset].lower() == word
           for offset, word in enumerate(words)):
      return start
  return -1

# Lowercase name particles that are treated as part of a surname.
# last_name() lowercases each name token and, when it is in this list, glues
# it onto the surname (e.g. 'van' + 'Beethoven').  Multi-word entries such as
# 'de la' cannot equal a single token; they are only usable through match().
# NOTE(review): a few entries ('lopes', 'gonzalez', 'castro') look like whole
# surnames, presumably listed so compound surnames stay together -- confirm.
prefixes = ['de', 'da', 'di', 'von', 'zu', 'van', 'du', 'des',
            'del', 'de la', 'della', 'la', 'le', 'der', 'af',
            'st.', 'saint', 'st', 'dom', 'do', 'das', 'dos', 'of', 'al',
            'el', 'dei', 'tot', 'thoe', 'aw', 'na', 'sri', 'phra',
            'si', 'shri', 'lo', 'no', 'op', 'lopes', 'gonzalez', 'vom', 'castro']

# Lowercase generational/honorific name endings.  rearrange() drops
# comma-separated parts, and last_name() drops space-separated tokens, whose
# lowercase form appears in this list.
suffixes = ['sr.', 'jr.', 'sr', 'jr', 'senior', 'junior', 'iii', 'iv', 'v', 
            'vi', 'vii', 'viii', 'ix', 'x', 'the', 'third', 'fourth', 'fifth',
            'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'great']

In [5]:
def _find_matched_braces(s):
  """Return (open_index, close_index) of the left-most '{' that has a
  matching '}', or None when no '{' in ``s`` is ever closed."""
  start = s.find('{')
  while start != -1:
    depth = 1
    for end in range(start + 1, len(s)):
      if s[end] == '{':
        depth += 1
      elif s[end] == '}':
        depth -= 1
        if depth == 0:
          return start, end
    # This '{' is never closed; try the next one.
    start = s.find('{', start + 1)
  return None


def remove_curlies(s): #only removes *matching* curly braces
  """Remove every matched pair of curly braces from ``s``.

  For each matched pair, the two brace characters are dropped and all space
  characters between them are deleted, so a braced multi-word group collapses
  into a single token.  Unmatched '{' or '}' characters are left untouched.

  The work is done in a loop rather than by recursing once per brace pair, so
  inputs with very many pairs no longer risk a RecursionError.

  Parameters:
    s: the str to process.

  Returns:
    The str with all matched braces (and the spaces they enclosed) removed.
  """
  while True:
    span = _find_matched_braces(s)
    if span is None:
      return s
    opening, closing = span
    inner = s[(opening + 1):closing].replace(' ', '')
    s = s[:opening] + inner + s[(closing + 1):]

In [6]:
def remove_non_letters(s):
  """Return ``s`` with a fixed set of punctuation characters deleted.

  The characters removed are: comma, period, exclamation mark, question mark,
  single quote, double quote, both curly braces and the hyphen.  Everything
  else (including spaces and digits) is kept.
  """
  unwanted = frozenset(',.!?\'"{}-')
  return ''.join(ch for ch in s if ch not in unwanted)

In [7]:
def rearrange(name):
  """Put a single author name into "first [middle] last" order.

  The name is split on commas; each part is stripped, cleaned with
  remove_non_letters(), and discarded if its lowercase form is in
  ``suffixes``.  What remains decides the result:

    * one part   -> returned as is (already "first ... last"),
    * two parts  -> treated as "last, first ..." and swapped.

  Parameters:
    name: one author's name (a str).

  Returns:
    The rearranged name as a str, with suffix parts removed.

  Raises:
    Exception: if no part survives suffix removal, or if more than two
      parts remain.
  """
  original_name = name

  #split on commas, clean each part, and leave the suffixes out
  parts = []
  for piece in name.split(','):
    cleaned = remove_non_letters(piece.strip())
    if cleaned.lower() not in suffixes:
      parts.append(cleaned)

  count = len(parts)
  if count == 0:
    raise Exception(f'no non-suffix names: {original_name}')
  if count > 2:
    raise Exception(f'too many commas: {original_name}')
  if count == 1: #first (+ middle) + last
    return parts[0]

  #last, first (+ middle)
  return ' '.join([parts[1], parts[0]])

In [8]:
def last_name(names):
  """Extract the surname from a single author name.

  The name is first normalised with rearrange() and split on spaces; tokens
  whose lowercase form is in ``suffixes`` are dropped and the rest are cleaned
  with remove_non_letters().  Walking from the final token backwards, the
  final token is always taken, and so is every token directly before it whose
  lowercase form is in ``prefixes``.

  Parameters:
    names: one author's name (a str), in any form rearrange() accepts.

  Returns:
    The final token alone when no prefix was seen; otherwise the collected
    tokens joined, in their original order, with no separator.

  Raises:
    IndexError: if no token is left after the suffixes are dropped.
    Exception: propagated from rearrange().
  """
  #drop suffix tokens, clean the rest
  tokens = []
  for tok in rearrange(names).split(' '):
    if tok.lower() not in suffixes:
      tokens.append(remove_non_letters(tok))

  #walk from the end toward the front
  collected = []
  saw_prefix = False
  for tok in tokens[::-1]:
    is_prefix = tok.lower() in prefixes
    if not is_prefix and (saw_prefix or collected):
      #first ordinary token before the surname: stop here
      break
    saw_prefix = saw_prefix or is_prefix
    collected.append(tok)

  if not saw_prefix:
    return collected[0]
  return ''.join(collected[::-1])

In [9]:
def authors2key(authors, year):
  """Build a citation key from an author string and a year.

  ``authors`` is split on the literal ' and '.  Each author contributes the
  first four characters of their surname (see the nested helper), and the key
  ends with the last two characters of ``str(year)``:

    * one author       -> Surn + yy
    * two authors      -> Sur1 + Sur2 + yy
    * three or more    -> Sur1 + 'Etal' + yy

  Parameters:
    authors: str of author names separated by ' and '.
    year: the publication year; anything str() accepts.

  Returns:
    The generated key as a str.

  Raises:
    Exception: propagated from last_name()/rearrange() for names they reject.
  """
  def short_surname(author):
    #convert accented unicode characters to closest ascii equivalent
    ascii_name = decode(author)

    #strip LaTeX accent markup/hyphens, then matched curly braces
    without_accents = remove_accents_and_hyphens(ascii_name)
    without_braces = remove_curlies(without_accents)

    #get first 4 letters of last name
    return last_name(without_braces)[:4]

  yr_str = str(year)[-2:]

  author_list = authors.split(' and ')
  n_authors = len(author_list)

  #note: str.split with a separator always returns at least one item, so an
  #empty author string arrives here as [''] and takes the one-author path
  if n_authors == 0:
    raise Exception('Author information missing, no key generated')
  if n_authors == 1:
    return short_surname(author_list[0]) + yr_str
  if n_authors == 2:
    return short_surname(author_list[0]) + short_surname(author_list[1]) + yr_str
  if n_authors >= 3:
    return short_surname(author_list[0]) + 'Etal' + yr_str
  raise Exception('Something went wrong...')

In [14]:
# BibTeX database to check, relative to this notebook.
bibfile = '../memlab.bib'

# Parser options are spelled out one per line; see the bibtexparser
# documentation for what each flag does.  With ignore_nonstandard_types=True
# the loader reports non-standard entry types as "Not considered".
parser = bp.bparser.BibTexParser(
    ignore_nonstandard_types=True,
    common_strings=True,
    homogenize_fields=True,
)

with open(bibfile, 'r') as bib_handle:
    bibdata = bp.load(bib_handle, parser=parser)

Entry type webpage not standard. Not considered.
Entry type webpage not standard. Not considered.


In [15]:
# Entry lookup used by the rest of the notebook: later cells iterate its
# keys/values and treat each value as one entry's field -> value mapping.
bd = bibdata.get_entry_dict()

In [16]:
#get all fields
# Map every field name that occurs in any entry to the list of values seen
# for it, in the order the entries are visited.
fields = {}
for entry in bd.values():
  for field_name, field_value in entry.items():
    fields.setdefault(field_name, []).append(field_value)

# Collapse each list to its sorted distinct values.
fields = {field_name: list(np.unique(seen)) for field_name, seen in fields.items()}

In [17]:
def get_vals(bd, field):
  """Collect one field from every entry, in entry order.

  Parameters:
    bd: mapping whose values are per-entry mappings of field name -> value.
    field: the field name to read from each entry.

  Returns:
    A list with one item per entry: the entry's value for ``field``, or the
    empty string '' when the entry does not have that field.
  """
  return [entry.get(field, '') for entry in bd.values()]

In [18]:
def same_id(a, b, ignore_special=False):
  """Decide whether two keys count as the same key.

  Two keys match when the shorter one is a prefix of the longer one (equal
  keys therefore match, and an empty key matches anything).

  Parameters:
    a, b: the two keys (str) to compare; order does not matter.
    ignore_special: when True, any pair in which either key contains a
      backslash is reported as matching without further comparison.

  Returns:
    True if the keys match under the rule above, else False.
  """
  if ignore_special and ('\\' in a or '\\' in b):
    return True

  shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
  return longer.startswith(shorter)

In [19]:
# Parallel lists, one item per entry ('' where an entry lacks the field).
authors, years, ids = (get_vals(bd, field_name)
                       for field_name in ('author', 'year', 'ID'))

In [20]:
# Key each entry *should* have, derived from its author string and year.
gen_ids = list(map(authors2key, authors, years))

In [21]:
#check keys
# h = entry position, i = key found in the file, g = generated key.
# An entry is reported when same_id() says the two keys do not match.
tofix = [
  f'{h}. [{i}] should be [{g}]'
  for h, (i, g) in enumerate(zip(ids, gen_ids))
  if not same_id(i, g, ignore_special=False)
] #check these over carefully...

if tofix:
  print(f'Need to fix {len(tofix)} keys: \n')
  print('\n'.join(tofix))
else:
  print('Congrats!  No keys to fix.')

Congrats!  No keys to fix.


In [22]:
#check duplicates: same title + author
titles = get_vals(bd, 'title')
unique_titles, title_counts = np.unique(titles, return_counts=True)

In [23]:
# Author strings (exactly as they appear between ' and ' separators) to leave
# out when building each entry's surname signature.
ignore_authors = []

# One signature per entry: the surnames of its authors joined by ' and '.
last_names = [
  ' and '.join([last_name(a) for a in entry_authors.split(' and ') if not (a in ignore_authors)])
  for entry_authors in authors
]

# Each item of `duplicates` is a list of entry positions that share BOTH a
# title and a surname signature.
duplicates = []
repeated_titles = [t for t, n in zip(unique_titles, title_counts) if n > 1]
for t in repeated_titles:
  # positions of all entries carrying this title
  inds = [i for i, x in enumerate(titles) if x == t]
  author_names, author_counts = np.unique([last_names[i] for i in inds], return_counts=True)
  for a, n in zip(author_names, author_counts):
    if n > 1:
      # Only look among the entries with this title.  Scanning every entry
      # here would also pull in unrelated papers by the same authors.
      duplicates.append([i for i in inds if last_names[i] == a])

In [24]:
# Show the groups of entry positions flagged as duplicates (empty list = none).
duplicates

[]

In [25]:
# Report each duplicate group as [position, key] pairs so the offending
# entries can be located in the .bib file.
for d in duplicates:
  print(f'Duplicate IDs: {[[i, ids[i]] for i in d]}')

In [26]:
# Field names that are not wanted in the cleaned bibliography.  Any field
# name found in the parsed entries that is listed here is left out of
# keep_fields.
remove_fields = ['date-modified', 'date-added',
                 'note', 'bdsk-url-1', 'pst', 'pmid', 'pmc',
                 'mesh', 'keyword', 'journal-full', 'abstract',
                 'mendeley-groups', 'file', 'bdsk-url-2', 'eprint',
                 'arxivid', 'archiveprefix', 'ty', 'm3', 'l3',
                 'howpublished', 'lccn', 'read', 'annote', 'owner',
                 'timestamp', 'pii', 'zb', 'z9', 'z8' ,'times-cited',
                 'publication-type', 'isi', 'language', 'stat', 'so',
                 'sb', 'rf', 'pubm', 'pt', 'pl', 'phst', 'own', 'mhda',
                 'jid', 'ip', 'edat', 'dcom', 'da', 'au', 'aid', 'affiliation',
                 'lr', 'gr', 'jt', 'local-url', 'dep', 'mh', 'cin', 'ci',
                 'comment', 'con', 'card', 'oto', 'ot', 'unique-id',
                 'subject-category', 'number-of-cited-references', 'rating',
                 'rn', 'oid', 'issn', 'isbn', 'doc-delivery-number']

In [27]:
# Every field name seen in the parsed entries that is not on the removal
# list, in the order the field names were first encountered.
keep_fields = [k for k in fields.keys() if k not in remove_fields]

In [28]:
# Show the surviving field names for a manual sanity check.
keep_fields

['year',
 'volume',
 'title',
 'pages',
 'number',
 'journal',
 'author',
 'ENTRYTYPE',
 'ID',
 'booktitle',
 'publisher',
 'editor',
 'school',
 'chapter',
 'address',
 'month',
 'organization',
 'doi',
 'url',
 'date',
 'series',
 'bdsk-file-1',
 'type',
 'institution',
 'edition',
 'location']

To do:
- handle first-author surnames that start with lowercase characters
- better handling of curly braces to preserve name ordering
- better handling of escape characters
- ensure bibtex file compiles and gets parsed correctly
- Check duplicate titles/authors (fail if detected)
- Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.
- If several keys are identical apart from their suffix, still allow the bibtex file to "pass" as long as all of those "matching" keys are unique *and* every one of them has a suffix *and* the suffixes run a, b, c, ... without gaps
- Correct page numberings:
  - Change hyphens to en dashes
  - Change em dashes to en dashes
  - Remove spaces in page ranges
  - Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass
- remove everything except for "keep_fields"
  - Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker
- change all urls, dois, and page numbers to lowercase

Clean up:
- correct all "strings" to full journal names
- correct all journal abbreviations to full journal names
- Ensure no periods in journal names
- Ensure no periods at the ends of affiliations or titles
- Check for compressed initials (AA --> A A; A.A. --> A A; etc.)

Bonus:
- Use `scholarly` to verify entry information.  This is currently unreliable (the server seems to hang frequently) and too slow to be viable