In [1]:
from unidecode import unidecode as decode
import bibtexparser as bp
import numpy as np
import pandas as pd
import re
from string import ascii_lowercase
import itertools

In [16]:
from helpers import load_bibliography, get_vals, remove_curlies, authors2key, same_id, check_entries, last_names_from_str, find_duplicates, check_key_suffixes, generate_correct_pages, valid_pages, format_journal_name, reformat_author, journal_key, publisher_key

In [3]:
# Load the bibliography file (path is relative to this notebook) into `bd`.
# Later cells call `bd.keys()` and pass `bd` to every checker.
bd = load_bibliography('../memlab.bib')

In [4]:
#template = load_bibliography('https://raw.githubusercontent.com/ContextLab/CDL-bibliography/master/memlab.bib')

In [5]:
# Snapshot of the keys of the loaded bibliography, as a plain list.
bib_keys = list(bd.keys())
# Disabled along with the commented-out `template` load in the previous cell.
#template_keys = list(template.keys())

In [19]:
# Pull one sequence of values per BibTeX field out of `bd`. The calls run in the
# same order as the names appear; only titles are post-processed (remove_curlies).
ids, authors, years = (
    get_vals(bd, field) for field in ('ID', 'author', 'year')
)
titles = get_vals(bd, 'title', proc=remove_curlies)
pages, journals, book_titles, publishers, editors = (
    get_vals(bd, field)
    for field in ('pages', 'journal', 'booktitle', 'publisher', 'editor')
)

Check for duplicate keys

In [7]:
# Look for repeated keys and for entries that share authors/titles; the helper prints
# its findings (see the recorded output).
# NOTE(review): the structure of the two returned collections is defined in
# helpers.find_duplicates — confirm before relying on them downstream.
duplicate_keys, redundant_keys = find_duplicates(ids, authors, titles)

Checking for duplicated keys and entries...
No keys with multiple entries were found.
No entries with duplicated authors/titles were found.


In [8]:
# `fix_dict` collects, per field name, whatever check_entries returns for that field.
# The first check compares each entry's ID with the key built by authors2key from its
# authors and year, using same_id as the comparison.
fix_dict = {
    'ID': check_entries(
        'ID',
        bd,
        [authors2key(author, year) for author, year in zip(authors, years)],
        same=same_id,
    )
}

running check: ID...
no IDs to fix!


Check for correct key suffixes

In [9]:
# Presumably the keys each entry should have once a/b/c suffixes are made consistent
# (the recorded output shows e.g. "LohnEtal11b" -> "LohnEtal11") — confirm in helpers.
target_keys = check_key_suffixes(bd)
# NOTE: this appends to the list stored under fix_dict['ID'] by the previous ID check,
# so re-running this cell on its own adds the same results a second time. Re-run the
# cell that creates `fix_dict` first.
fix_dict['ID'].extend(check_entries('ID', bd, target_keys))

running check: ID...
3 errors detected:
LohnEtal11b: 	ID "LohnEtal11b" should be "LohnEtal11"
NelsEtal98a: 	ID "NelsEtal98a" should be "NelsEtal98"
Nair92b: 	ID "Nair92b" should be "Nair92"


Check page numbering

In [10]:
# `target_pages` feeds the pages check further down; `unfixable` is iterated as
# (id, pages) pairs by the next cell to report entries that need manual attention.
target_pages, unfixable = generate_correct_pages(bd)

In [11]:
# Report the page entries that generate_correct_pages could not resolve, one
# "id: pages" line per entry, or confirm that there are none.
if len(unfixable) == 0:
    print('No ambiguous page numbers were found.')
else:
    print('The following page numbers are ambiguous or incorrect: \n')
    print('\n'.join(f'{entry_id}: {page_str}' for entry_id, page_str in unfixable))

No ambiguous page numbers were found.


In [12]:
# Compare every entry's 'pages' field with its target value and keep what
# check_entries returns under fix_dict['pages'].
fix_dict['pages'] = check_entries('pages', bd, target_pages)

running check: pages...
2441 errors detected:
MantEtal07: 	pages "13170-13175" should be "13170--13175"
Dobr70: 	pages "458-486" should be "458--486"
ByliEtal15: 	pages "165-178" should be "165--178"
vanKEtal13: 	pages "2352-2359" should be "2352--2359"
SereBoyn07: 	pages "301-312" should be "301--312"
LiuManc11: 	pages "26-33" should be "26--33"
Leml75: 	pages "235 -- 238" should be "235--238"
DroiWear15: 	pages "77 -- 82" should be "77--82"
Wear15: 	pages "165 -- 171" should be "165--171"
Howa18: 	pages "124-136" should be "124--136"
TigaEtal16: 	pages "5663-5671" should be "5663--5671"
TompEtal15: 	pages "7326-7331" should be "7326--7331"
NazaReza13: 	pages "54-61" should be "54--61"
DiazEtal19: 	pages "1-12" should be "1--12"
KitaEtal17: 	pages "73-78" should be "73--78"
TambDava13: 	pages "19591-19596" should be "19591--19596"
TambEtal10: 	pages "280-290" should be "280--290"
PlonEtal00: 	pages "1211-1216" should be "1211--1216"
HumpEtal07: 	pages "924-932" should be "924--932"
Oc

Kok01: 	pages "557-577" should be "557--577"
RubiWenz96: 	pages "734-760" should be "734--760"
BoldEtal04: 	pages "353-361" should be "353--361"
NelsEtal97: 	pages "785-796" should be "785--796"
DawEtal02: 	pages "603-616" should be "603--616"
GradEtal09: 	pages "354-359" should be "354--359"
BlanEtal08b: 	pages "41-56" should be "41--56"
BassEtal10: 	pages "1344-1352" should be "1344--1352"
Wall07: 	pages "31-56" should be "31--56"
LeeEste77: 	pages "395-418" should be "395--418"
RaaiShif81b: 	pages "403-415" should be "403--415"
Rund78: 	pages "91-101" should be "91--101"
John72a: 	pages "125-159" should be "125--159"
ReitRuet80: 	pages "554-581" should be "554--581"
StusEtal94: 	pages "355--355" should be "355"
John72b: 	pages "300-307" should be "300--307"
TlauWils96: 	pages "647-664" should be "647--664"
RiekBayl00: 	pages "181-186" should be "181--186"
Free06b: 	pages "572-589" should be "572--589"
GlanGree07: 	pages "365-371" should be "365--371"
SaneDono93: 	pages "4470 - 4474"

PittMyun02: 	pages "421-425" should be "421--425"
GoddBadd80: 	pages "99-104" should be "99--104"
FrieEtal03: 	pages "123-4" should be "123--124"
YooEtal12: 	pages "846 - 852" should be "846--852"
KensScha99: 	pages "399-415" should be "399--415"
LaufEtal06: 	pages "1408-1418" should be "1408--1418"
PeloEtal98: 	pages "205-218" should be "205--218"
SommLewi99: 	pages "83-108" should be "83--108"
WataEtal12: 	pages "858-873" should be "858--873"
ColeEtal01: 	pages "173-189" should be "173--189"
McClEtal95: 	pages "419--57" should be "419--457"
PellFare99: 	pages "647-53" should be "647--653"
NelsSchr92: 	pages "237-260" should be "237--260"
GardEtal88: 	pages "687-693" should be "687--693"
ArndRede02: 	pages "830-42" should be "830--842"
Sumb63: 	pages "443-450" should be "443--450"
HeatEtal06: 	pages "826-838" should be "826--838"
MacLKamp96: 	pages "132-142" should be "132--142"
BaddEtal75: 	pages "575-589" should be "575--589"
Unde72b: 	pages "276-283" should be "276--283"
HulmEtal97

Check journal names

In [13]:
# Compare every 'journal' value with the output of format_journal_name for that value.
fix_dict['journal'] = check_entries('journal', bd, [format_journal_name(j) for j in journals])

running check: journal...
1977 errors detected:
BeatEtal16: 	journal "{Trends in Cognitive Sciences}" should be "Trends in Cognitive Sciences"
TianEtal20: 	journal "Nature {N}euroscience" should be "Nature Neuroscience"
ArzyScha19: 	journal "{Trends in Cognitive Sciences}" should be "Trends in Cognitive Sciences"
GautEtal18: 	journal "{Cerebral Cortex}" should be "Cerebral Cortex"
KnigEich13: 	journal "Nature {N}euroscience" should be "Nature Neuroscience"
GautEtal12: 	journal "{The Journal of Neuroscience}" should be "The Journal of Neuroscience"
FentEtal08: 	journal "{The Journal of Neuroscience}" should be "The Journal of Neuroscience"
MikoEtal13b: 	journal "Proceedings of the NAACL-HLT" should be "Proceedings of the National Association for Computational Linguistics"
DeerEtal90: 	journal "Journal of the American Society for Information Science" should be "Journal of the {American} Society for Information Science"
Schr26: 	journal "The Physical Review" should be "Physical Review"
Sc

Check book titles

In [14]:
# Book titles are run through the same format_journal_name normaliser as journal names.
fix_dict['booktitle'] = check_entries('booktitle', bd, [format_journal_name(b) for b in book_titles])

running check: booktitle...
180 errors detected:
PennEtal14: 	booktitle "Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing" should be "Proceedings of the Conference on Empirical Methods in Natural Language Processing"
IyyeEtal15: 	booktitle "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing" should be "Proceedings of the Association for Computational Linguistics"
ViswEtal17: 	booktitle "31st Conference on Neural Information Processing Systems" should be "Advances in Neural Information Processing Systems"
SzpuTulv11: 	booktitle "Predictions in the brain: using our past to generate a future" should be "Predictions in the Brain: Using Our Past to Generate {a} Future"
Tulv02b: 	booktitle "Principles of frontal lobe function" should be "Principles of Frontal Lobe Function"
BobrNorm75: 	booktitle "Representation and understanding" should be "Rep

Check publishers

In [17]:
# Publishers reuse format_journal_name, with publisher_key passed as its second argument.
fix_dict['publisher'] = check_entries('publisher', bd, [format_journal_name(p, publisher_key) for p in publishers])

running check: publisher...
811 errors detected:
SzpuTulv11: 	publisher "Oxford University Press" should be "{Oxford} {University} Press"
Tulv02b: 	publisher "Oxford University Press" should be "{Oxford} {University} Press"
Mann20: 	publisher "Oxford University Press" should be "{Oxford} {University} Press"
Nune95: 	publisher "Oxford University Press" should be "{Oxford} {University} Press"
Rund71: 	publisher "American Psychological Association" should be "{American} Psychological Association"
PescEtal09: 	publisher "Elsevier Ltd" should be "Elsevier"
GajeFalk15: 	publisher "Elsevier Inc." should be "Elsevier"
ScudEtal14: 	publisher "Elsevier Inc." should be "Elsevier"
SkriEtal14: 	publisher "Elsevier Inc." should be "Elsevier"
BuddEtal10: 	publisher "Elsevier Ltd" should be "Elsevier"
SaadEtal14b: 	publisher "Elsevier Inc." should be "Elsevier"
NiemEtal13: 	publisher "Elsevier Ltd" should be "Elsevier"
McMoEtal11: 	publisher "Elsevier Inc." should be "Elsevier"
CookBeav13: 	publisher 

Check author names

In [None]:
# Compare every 'author' value with its reformat_author output.
fix_dict['author'] = check_entries('author', bd, [reformat_author(a) for a in authors])

Check editor names

In [20]:
# Compare every 'editor' value with its reformat_author output (same normaliser as authors).
fix_dict['editor'] = check_entries('editor', bd, [reformat_author(e) for e in editors])

running check: editor...
189 errors detected:
Kaha17: 	editor "J. H. Byrne" should be "J H Byrne"
ChenEtal15: 	editor "{C. Cortes and N. D. Lawrence and D. D. Lee and M. Sugiyama and R. Garnett}" should be "{ C Cortes and N D Lawrence and D D Lee and M Sugiyama and R Garnett}"
Frie06: 	editor "{C. Chen and W. H\"ardle and A Unwin}" should be "{ C Chen and W H\"ardle and A Unwin}"
Tulv07: 	editor "Gluck, M.A. and Anderson, J.R. and Kosslyn, S. M." should be "M A Gluck and JR Anderson and S M Kosslyn"
Heal14: 	editor "S. K., Whitbourne" should be "Whitbourne { S K }"
OReiEtal99: 	editor "Miyake, A. and Shah, P." should be "A Miyake and P Shah"
RehmNaus90: 	editor "R. E. Ingram" should be "R E Ingram"
Bowe67: 	editor "K. W. Spence and J. T. Spence" should be "K W Spence and J T Spence"
Mont98: 	editor "Egenhofer, Max J. and Golledge, Reginald G." should be "Max J Egenhofer and Reginald G Golledge"
BjorBjor92: 	editor "Healy, A. F.and Kosslyn, S. M." should be "Healy, A Fand Kosslyn, S M"


In [None]:
def reformat_author(author):
    if len(author.split(' and ')) > 1:
        return ' and '.join([reformat_author(a) for a in author.split(' and ')])
    
    try:
        author = rearrange(author, preserve_non_letters=True)
    except:
        pass
    
    unclumped = []
    names = author.split(' ')
    
    for n in names:
        #remove periods
        n = n.replace('.', '')
        if (remove_non_letters(n.lower()) not in suffixes) and (n == n.upper()):
            if n.find('-') >= 0:
                n = '-'.join([reformat_author(c) for c in n.split('-')])
            else:
                for c in list(n):
                    unclumped.append(c)
                continue
        unclumped.append(n)
    
    return ' '.join(unclumped)

In [21]:
# Example editor string: the ChenEtal15 value flagged by the editor check above.
e = '{C. Cortes and N. D. Lawrence and D. D. Lee and M. Sugiyama and R. Garnett}'

In [22]:
# Display the reformatted version of the example string.
reformat_author(e)

'{ C Cortes and N D Lawrence and D D Lee and M Sugiyama and R Garnett}'

In [None]:
def get_fields(bd):
    """Map every field name used in the bibliography to its distinct values.

    Parameters
    ----------
    bd : mapping
        Indexed by key; each entry must provide ``.items()`` yielding
        (field name, value) pairs.

    Returns
    -------
    dict
        Field name -> list of that field's unique values as produced by
        ``np.unique`` (sorted). Field names appear in first-seen order.
    """
    values_by_field = {}
    for entry_key in bd.keys():
        for name, value in bd[entry_key].items():
            values_by_field.setdefault(name, []).append(value)

    # Collapse repeated values once every entry has been visited.
    return {
        name: list(np.unique(values))
        for name, values in values_by_field.items()
    }

In [None]:
# Distinct values of every field across the whole bibliography, keyed by field name.
fields = get_fields(bd)

In [None]:
# Display the field names that occur anywhere in the bibliography.
fields.keys()

In [None]:
# Print every distinct publisher name, lower-cased, one per line.
print(*np.unique([name.lower() for name in fields['publisher']]), sep='\n')

In [None]:
# Field names excluded when `keep_fields` is built from the fields found in the
# bibliography. Each name is listed exactly once ('bdsk-url-1' used to appear twice).
remove_fields = [
    'date-modified', 'date-added', 'note', 'bdsk-url-1', 'pst', 'pmid', 'pmc',
    'mesh', 'keyword', 'journal-full', 'abstract', 'mendeley-groups', 'file',
    'bdsk-url-2', 'eprint', 'arxivid', 'archiveprefix', 'ty', 'm3', 'l3',
    'howpublished', 'lccn', 'read', 'annote', 'owner', 'timestamp', 'pii',
    'zb', 'z9', 'z8', 'times-cited', 'publication-type', 'isi', 'language',
    'stat', 'so', 'sb', 'rf', 'pubm', 'pt', 'pl', 'phst', 'own', 'mhda',
    'jid', 'ip', 'edat', 'dcom', 'da', 'au', 'aid', 'affiliation',
    'lr', 'gr', 'jt', 'local-url', 'dep', 'mh', 'cin', 'ci',
    'comment', 'con', 'card', 'oto', 'ot', 'unique-id',
    'subject-category', 'number-of-cited-references', 'rating',
    'rn', 'oid', 'issn', 'isbn', 'doc-delivery-number',
]

In [None]:
# Every field found in the bibliography that is not listed in `remove_fields`,
# in first-seen order.
keep_fields = [name for name in fields if name not in remove_fields]
# 'force' is always retained, even when no entry currently uses it.
if 'force' not in keep_fields:
    keep_fields += ['force']

In [None]:
# Display the final list of fields to retain.
keep_fields

To do:
- ~~handle first-author surnames that start with lowercase characters~~
- ~~better handling of curly braces to preserve name ordering~~
- ~~better handling of escape characters~~
- ~~ensure bibtex file compiles and gets parsed correctly~~
- ~~Check duplicate titles/authors (fail if detected)~~
- ~~Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.~~
- ~~If keys match aside from suffix then still allow the bibtex file to "pass" as long as all "matching" keys are unique *and* all have suffixes *and* the suffixes span a, b, c, ..., etc. without gaps~~
- ~~Correct page numberings:~~
  - ~~Change hyphens to en dashes~~
  - ~~Change em dashes to en dashes~~
  - ~~Remove spaces in page ranges~~
  - ~~Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass~~
- remove everything except for "keep_fields"
  - Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker
- ~~change all urls, dois, and page numbers to lowercase~~

Clean up:
- ~~correct all "strings" to full journal names~~
- ~~correct capitalization of journal names~~
- ~~correct all journal abbreviations to full journal names~~
- ~~Ensure no periods in journal names~~
- ~~Ensure no periods at the ends of affiliations or titles~~
- ~~Check for compressed initials (AA --> A A; A.A. --> A A; etc.)~~

Bonus:
- Use scholarly to verify information.  However, this is currently unreliable (server seems to hang frequently) and too slow to be viable