In [1]:
from helpers import load_bibliography, read, get_vals, remove_curlies, authors2key, same_id, check_entries, find_duplicates, check_key_suffixes, generate_correct_pages, format_journal_name, reformat_author, journal_key, publisher_key, address_codes, address_key, polish_database

In [2]:
#bd = load_bibliography('../memlab.bib')
bd = load_bibliography('cleaned.bib')

In [3]:
#template = load_bibliography('https://raw.githubusercontent.com/ContextLab/CDL-bibliography/master/memlab.bib')

In [4]:
bib_keys = list(bd.entries_dict.keys())
#template_keys = list(template.keys())

In [5]:
ids = get_vals(bd, 'ID')
authors = get_vals(bd, 'author')
years = get_vals(bd, 'year')
titles = get_vals(bd, 'title', proc=remove_curlies)
pages = get_vals(bd, 'pages')
journals = get_vals(bd, 'journal')
book_titles = get_vals(bd, 'booktitle')
publishers = get_vals(bd, 'publisher')
editors = get_vals(bd, 'editor')
addresses = get_vals(bd, 'address')

Check for duplicate keys

In [6]:
duplicate_keys, redundant_keys = find_duplicates(ids, authors, titles)

Checking for duplicated keys and entries...
No keys with multiple entries were found.
No entries with duplicated authors/titles were found.


In [7]:
fix_dict = {}
fix_dict['ID'] = check_entries('ID', bd, [authors2key(a, y) for a, y in zip(authors, years)], same=same_id);

running check: ID...
3 errors detected:
BakeEtal08: 	ID "BakeEtal08" should be "dBakEtal08"
LeeEtal02: 	ID "LeeEtal02" should be "TingEtal02"
Caja91: 	ID "Caja91" should be "yCaj91"


Check for correct key suffixes

In [8]:
target_keys = check_key_suffixes(bd)
fix_dict['ID'].extend(check_entries('ID', bd, target_keys))

running check: ID...
3 errors detected:
BakeEtal08: 	ID "BakeEtal08" should be "dBakEtal08"
LeeEtal02: 	ID "LeeEtal02" should be "TingEtal02"
Caja91: 	ID "Caja91" should be "yCaj91"


Check page numbering

In [9]:
target_pages, unfixable = generate_correct_pages(bd)

In [10]:
if len(unfixable) > 0:
    print(f'The following page numbers are ambiguous or incorrect: \n')
    print('\n'.join([f'{i}: {p}' for i, p in unfixable]))
else:
    print('No ambiguous page numbers were found.')

No ambiguous page numbers were found.


In [11]:
fix_dict['pages'] = check_entries('pages', bd, target_pages)

running check: pages...
no pagess to fix!


Check journal names

In [12]:
fix_dict['journal'] = check_entries('journal', bd, [format_journal_name(j) for j in journals])

running check: journal...
no journals to fix!


Check book titles

In [13]:
fix_dict['booktitle'] = check_entries('booktitle', bd, [format_journal_name(b) for b in book_titles])

running check: booktitle...
no booktitles to fix!


Check publishers

In [14]:
fix_dict['publisher'] = check_entries('publisher', bd, [format_journal_name(p, key=publisher_key) for p in publishers])

running check: publisher...
no publishers to fix!


Check author names

In [15]:
fix_dict['author'] = check_entries('author', bd, [reformat_author(a) for a in authors])

running check: author...
no authors to fix!


Check editor names

In [16]:
fix_dict['editor'] = check_entries('editor', bd, [reformat_author(e) for e in editors])

running check: editor...
no editors to fix!


Check addresses

In [17]:
fix_dict['address'] = check_entries('address', bd, [format_journal_name(a, key=address_key, force_caps=address_codes) for a in addresses])

running check: address...
no addresss to fix!


In [18]:
polished_bd = polish_database(bd, fix_dict, autofix=True)

removing extra fields...
processing BeatEtal16...
processing LeeEtal20...
processing ConsEtal16...
processing BellEtal20...
processing BellEtal18...
processing GagnEtal20...
processing TianEtal20...
processing ArzyScha19...
processing GautEtal18...
processing GautvanW16...
processing ArzyEtal09...
processing PanzEtal10...
processing KnigEich13...
processing CaruEtal18...
processing vanWEtal07...
processing GautEtal12...
processing ViemWake91...
processing GrieEtal20...
processing LeeEtal19...
processing RichEtal14b...
processing FentEtal08...
processing KhreEtal18...
processing DevlEtal18...
processing WietKiel19...
processing ConnEtal18...
processing KiroEtal15...
processing PeteEtal18...
processing PennEtal14...
processing MikoEtal13b...
processing MnihHint09...
processing BrowEtal92...
processing BengEtal03...
processing YousHame17...
processing IyyeEtal15...
processing ViswEtal17...
processing DeerEtal90...
processing SzpuTulv11...
processing Tulv02b...
processing WheeEtal97...
pro

processing ShutEtal83...
processing GattDePa11...
processing MoraEtal07...
processing McMoEtal11...
processing GilmEtal79...
processing KivlGran06...
processing CookBeav13...
processing EdwaCast13...
processing BuchEtal13...
processing SteeEtal97...
processing EdwaCast15...
processing LjunEtal97...
processing CalvEtal97...
processing BishEtal06...
processing NateEtal06...
processing DamEtal16...
processing TurGEtal17...
processing LiuEtal16...
processing ShapEtal10...
processing AnasEtal17...
processing Arch11b...
processing Lojo10...
processing TaruEtal13...
processing ZagaEtal13a...
processing ZagaEtal13b...
processing FernEtal13...
processing SchmEtal03...
processing HogeEtal99...
processing HogeEtal08...
processing MileHard98...
processing HameChid08...
processing SaadEtal14a...
processing SalaEtal15...
processing SaadEtal15...
processing CechEtal14...
processing CianEtal01...
processing CianEtal00...
processing CahiAlki03...
processing McNeRadv15...
processing ColeTomp08...
proces

processing GrunEtal99...
processing Skan98...
processing WestEtal82...
processing MannEich06...
processing Scho06...
processing WilsWilk97...
processing RimoKodi92...
processing HanDobb08...
processing BhatEtal06...
processing WardEtal10...
processing GrenWard12...
processing BhatEtal08...
processing UnswBrew09...
processing GardTulv80...
processing NambEtal00...
processing KitaEtal76...
processing LeutMizu99...
processing DobbEtal02...
processing GrubGosc04...
processing MeckEtal99...
processing ArbuFran00...
processing NosoKant06...
processing Smit02a...
processing Noso88...
processing Noso92...
processing LohnEtal14...
processing Webe88...
processing SingEtal10...
processing BostEtal91...
processing PoeEtal00...
processing BarrEtal07a...
processing Gloo90...
processing Post69...
processing MullPilz00...
processing BirnEtal04...
processing Wood38...
processing MullSchu94...
processing Jaco87...
processing HowaKaha00...
processing JacoEtal12...
processing BergEtal06...
processing Stro

processing CornEtal08...
processing HuhEtal90...
processing HalgEtal78a...
processing CameEtal01...
processing EkstEtal05...
processing LegaEtal11b...
processing KingEtal02...
processing BurkEtal14...
processing CraiJenn92...
processing ZackEtal00...
processing Badd90...
processing AtkiShif68...
processing FellEtal01b...
processing Gree92...
processing SedeEtal11...
processing RutiEtal10...
processing Murd74...
processing MillEtal12b...
processing RizzEtal06...
processing MarcEtal08...
processing KoroEtal05...
processing ChamEtal03...
processing BodiEtal05...
processing HamiEtal02...
processing Warr19...
processing RuggYone03...
processing FostEtal13...
processing SummEtal11...
processing MaguEtal99...
processing WangSpel02...
processing ZaghEtal09...
processing BindEtal00...
processing ElgeEtal97...
processing KahaEtal99a...
processing CaplEtal03...
processing AstuEtal02...
processing CampEtal10...
processing CampEtal14...
processing RoedEtal82...
processing RoedChal89...
processing B

processing VoytEtal10a...
processing KochUllm85...
processing Ship46...
processing GuilEtal99...
processing Tulv70...
processing StorEtal07...
processing BurgGruz00...
processing BurgGruz97...
processing PetzHaub04...
processing ZhouEtal04...
processing ShifCook78...
processing Murd72...
processing UsheEtal08...
processing LoesWaug67...
processing Murd63a...
processing KinsWood75...
processing Wick65...
processing BotvPlau06...
processing Hens98a...
processing Crow67b...
processing AverCori61...
processing LewaFarr08...
processing NosoEtal11...
processing Loes67...
processing BjorHeal74...
processing FarrMcLa07...
processing PeteEtal61...
processing PetePete59...
processing Murd63b...
processing Raym69...
processing Shif75...
processing PostPhil65...
processing Pete66b...
processing SekuEtal06...
processing DingEtal00...
processing KemeChil72...
processing GreeSwet66...
processing Pico93...
processing YoneEtal96...
processing Rest65...
processing LachEtal08...
processing CabeEtal02...


processing Macr75...
processing RiedEtal88...
processing PesaEtal02...
processing LouiWils01...
processing RaymEtal92...
processing Daub92...
processing NeraBilk05...
processing RoedKarp06a...
processing BaloNeel80...
processing CummEtal03...
processing SzpuEtal08...
processing Slam69...
processing PennEtal08...
processing Lami99...
processing ChenGall84...
processing ChanKaha97...
processing McDaFish91...
processing Noso91...
processing HockCris96a...
processing BenjEtal12...
processing HockCris96b...
processing Youn62...
processing PoulEtal12...
processing Spea27...
processing Ruth00...
processing HeimEtal97...
processing SchuEtal83...
processing GalvEtal05...
processing MataEtal07...
processing AngeEtal10...
processing MagrEtal12...
processing Hock84...
processing WarrEtal71...
processing DemoEtal92...
processing AlvaEtal94...
processing Kais60...
processing EckaYoun36...
processing Yate66...
processing Glaz28...
processing BasaEtal87...
processing OverBeck09...
processing NaveEtal0

Exception: unexpected value for key BakeEtal08[ID]: expected "BakeEtal08" but found "dBakEtal08"

In [None]:
def writebib(fname, biblist, order, indent='\t'):
    def entry2str(e):
        s = '@' + e['ENTRYTYPE'] + '{' + e['ID'] + ','
        at_least_one = False
        for k in order:
            if (k not in ['ENTRYTYPE', 'ID']) and (k in e.keys()):
                s += '\n' + indent + k.capitalize() + ' = {' + e[k] + '},'
                at_least_one = True
        if at_least_one:
             s = s[:-1] #remove last comma

        return s + '}' + '\n'
    
    bibtex_str = '\n'.join([entry2str(b) for b in biblist])
    print(bibtex_str, file=open(fname, 'w'))

In [None]:
keep_fields = read('keep_fields.txt')
keep_fields.sort()

writebib('cleaned.bib', polished_bd, keep_fields)

To do:
- ~~handle first-author surnames that start with lowercase characters~~
- ~~better handling of curly braces to preserve name ordering~~
- ~~better handling of escape characters~~
- ~~ensure bibtex file compiles and gets parsed correctly~~
- ~~Check duplicate titles/authors (fail if detected)~~
- ~~Check bibtex keys.  Duplicates should be assigned a suffix of 'a', 'b', etc.~~
- ~~If keys match aside from suffix then still allow the bibtex file to "pass" as long as all "matching" keys are unique *and* all have suffixes *and* the suffixes span a, b, c, ..., etc. without gaps~~
~~- Correct page numberings:~~
~~  - Change hyphens to n-dashes~~
~~  - Change m-dashes to n-dashes~~
~~  - Remove spaces in page ranges~~
~~  - Print a warning for strangely formatted pages: leading 0, non-digits, very large numbers (greater than 10K?) but allow bibtex file to pass~~
- remove everything except for "keep_fields"
  - ~~Also add a "force" field that, when present, causes the checker to automatically pass that entry (if set to True); this will be used as a workaround for special cases not handled by the parser/checker~~
- ~~change all urls, dois, and page numbers to lowercase~~

Clean up:
- ~~correct all "strings" to full journal names~~
- ~~correct capitalization of journal names~~
- ~~correct all journal abbreviations to full journal names~~
- ~~Ensure no periods in journal names~~
- ~~Ensure no periods at the ends of affiliations or titles~~
- ~~Check for compressed initials (AA --> A A; A.A. --> A A; etc.)~~

Bonus:
- Use scholarly to verify information.  However, this is currently unreliable (server seems to hang frequently) and too slow to be viable