In [None]:
import pandas as pd
import numpy  as np
import re

In [None]:
def getregex():
    """Build the compiled pattern used to pull entries out of a .bib file.

    Original author's note (translated): this is the beginning of a simple
    parser for bib files; it ignores bibliographic records that are not
    included in ``fieldList``.

    The pattern is assembled from string fragments:

    * group 1 captures ``@<type>`` and group 2 captures ``{<bib-key>``,
      which must be followed by a comma;
    * after that comes one optional capturing group per name in the
      returned tuple (``'type'`` and ``'bib-key'`` included), each of the
      form ``<name> = {...}`` followed by a mandatory comma;
    * the entry must end with a closing ``}``.

    ``re.DOTALL`` is not set, so the ``{...}`` part of a field has to sit
    on a single line. Matching is case-insensitive.

    Returns
    -------
    tuple
        ``(pattern, fieldtup)`` where ``pattern`` is the compiled regex and
        ``fieldtup`` is a tuple of names: ``'type'``, ``'bib-key'`` and then
        all standard and non-standard field names sorted case-insensitively.
        The pattern has ``2 + len(fieldtup)`` capturing groups.
    """
    # Entry header fragments: '(@\S+)' + '(\{\S+)' + ',\s*'
    biginRegex = '(@'
    typeRegex = r'\S+)(\{'
    endEnRegex = r',\s*'
    bibKeyRegex = r'\S+)' + endEnRegex

    stFields = ['address','annote','author'      ,'booktitle'   ,'chapter','crossref',
                'edition','editor','howpublished','institution' ,'journal','key',
                'month'  ,'note'  ,'number'      ,'organization','pages'  ,'publisher',
                'school' ,'series','title'       ,'volume' ,'year'
               ]

    nonStFields = ['affiliation','abstract','doi'     ,'eid'     ,'contents','copyright',
                   'ISBN'       ,'ISSN'    ,'keywords','language','location','LCCN',
                   'mrnumber'   ,'price'   ,'size'    ,'URL'     ,'groups'
                  ]

    fieldList = ['type', 'bib-key'] + sorted(stFields + nonStFields, key=str.lower)
    fieldtup = tuple(fieldList)

    def rentr(name):
        # Optional '<name> = {...},' chunk; only the '<name> = {...}' part is captured.
        return '(?:(' + name + r'\s*\=\s*' + r'\{.*\})' + r',\s*)?'

    ListRegex = [rentr(i) for i in fieldtup]

    # The closing fragment is a raw string: '\}' and '\s' are not valid
    # escapes in a normal string literal and trigger a warning at compile time.
    fullEntryRegex = (biginRegex + typeRegex + bibKeyRegex
                      + '(?:' + '|'.join(ListRegex) + r')+\}\s*')

    return re.compile(fullEntryRegex, flags=re.IGNORECASE), fieldtup

In [None]:
def makeFrame(fname):
    """Read a .bib file and return its matched entries as a DataFrame.

    Parameters
    ----------
    fname : str or path-like
        Path of the .bib file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per entry matched by the pattern from ``getregex()``.
        Columns are the names returned by ``getregex()``; a field with an
        empty ``{}`` value is stored as NaN, as is any field that the entry
        does not have.

    Notes
    -----
    The entry pattern is case-insensitive, so a file may spell a field as
    ``Author`` or ``TITLE``. Values are stored under the canonical column
    name from ``getregex()`` regardless of the spelling used in the file.
    """
    with open(fname, "r", encoding='utf-8') as f:
        filetext = f.read()

    regex, columnsTuple = getregex()
    entries = regex.findall(filetext)

    # The raw text is no longer needed once the entries are extracted.
    del filetext, regex

    typeregex = re.compile(r'@(\S+)')
    bibkeyregex = re.compile(r'\{(\S+)')
    fieldregex = re.compile(r'(\S+)\s*\=\s*\{(.*)\}') #\1 - column; \2 - value

    # Lower-cased field name -> column name as declared by getregex().
    canonical = {name.lower(): name for name in columnsTuple}
    extra_columns = []

    # Collect plain dicts and build the frame once; assigning cell by cell
    # with .loc re-allocates the frame on every new row.
    records = []
    for entry in entries:
        record = {
            'type': typeregex.findall(entry[0])[0],
            'bib-key': bibkeyregex.findall(entry[1])[0],
        }
        for chunk in entry[2:]:
            if not chunk:
                continue
            found = fieldregex.search(chunk)
            if found is None:
                continue
            name, value = found.group(1), found.group(2)
            column = canonical.get(name.lower())
            if column is None:
                # Not expected, since the chunks come from groups built from
                # columnsTuple; keep the raw name so no value is dropped.
                column = name
                if column not in extra_columns:
                    extra_columns.append(column)
            record[column] = np.nan if value == '' else value
        records.append(record)

    return pd.DataFrame(records, columns=list(columnsTuple) + extra_columns)

In [None]:
def cleandata(frame):
    """Return a cleaned subset of the parsed bibliography.

    The input frame is not modified. Steps:

    1. drop columns and rows that are entirely missing;
    2. keep only ``booktitle``, ``title``, ``author``, ``keywords``, ``doi``
       and ``type`` (absent columns are added filled with NaN);
    3. drop rows missing ``title``, ``author`` or ``keywords``;
    4. drop duplicate rows and renumber the index from 0;
    5. ``author``: remove ``{`` and ``}``, turn a whitespace-delimited
       ``and`` into ``;``, collapse runs of whitespace to one space;
    6. ``keywords``: remove ``$...$`` spans, strip whitespace around ``;``,
       collapse repeated ``;``, collapse runs of whitespace to one space.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame of bibliography entries, e.g. the result of ``makeFrame``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the six columns listed above.
    """
    usefulcolumns = ['booktitle', 'title', 'author', 'keywords', 'doi', 'type']

    usefuldata = (
        frame
        .dropna(axis=1, how='all')
        .dropna(how='all')
        .reindex(columns=usefulcolumns)
        .dropna(subset=['title', 'author', 'keywords'])
        .drop_duplicates(ignore_index=True)
    )

    # Nothing left to clean; also avoids calling .str on a column that holds
    # no strings at all (e.g. a required column absent from the input).
    if usefuldata.empty:
        return usefuldata

    # All patterns below are regular expressions, so regex=True is passed
    # explicitly: pandas >= 2.0 treats the pattern as a literal by default.
    author_rules = (
        (r'\{', ''),
        (r'\}', ''),
        (r'\sand\s', ';'),
        (r'\s+', ' '),
    )
    keyword_rules = (
        (r'\$[^\$]*\$', ''),
        (r'\s*;\s*', ';'),
        (r';+', ';'),
        (r'\s+', ' '),
    )

    for column, rules in (('author', author_rules), ('keywords', keyword_rules)):
        cleaned = usefuldata[column]
        for pattern, replacement in rules:
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)
        usefuldata[column] = cleaned

    return usefuldata

In [None]:
def countKeys(keyfields):
    """Count keyword phrases and their word sub-sequences.

    Each item of ``keyfields`` is lower-cased, has ``-`` replaced by a
    space and is split on ``;`` into phrases. A phrase is then split on
    single spaces:

    * a one-token phrase is counted as is;
    * a longer phrase contributes every contiguous run of its tokens
      (``toks[j:k]``), joined by spaces and stripped of surrounding
      whitespace; empty results are skipped.

    Parameters
    ----------
    keyfields : iterable of str
        Keyword strings with phrases separated by ``;``.

    Returns
    -------
    pandas.DataFrame
        Indexed by phrase, with columns ``'number'`` (occurrence count) and
        ``'key length'`` (number of space-separated parts of the phrase).
    """
    # Stage 1: normalise each field and break it into phrases.
    phrases = []
    for field in keyfields:
        normalised = field.lower().replace('-', ' ')
        phrases.extend(normalised.split(';'))

    # Stage 2: expand each phrase into the strings that get counted.
    collected = []
    for phrase in phrases:
        words = phrase.split(' ')
        total = len(words)
        if total == 1:
            collected.append(words[0])
            continue
        for start in range(total):
            for stop in range(1, total + 1):
                # Slices with stop <= start are empty and fall out below.
                candidate = ' '.join(words[start:stop]).strip()
                if candidate:
                    collected.append(candidate)

    # Stage 3: tally, dropping the empty string left by empty phrases.
    tally = pd.Series(collected)
    tally = tally[tally != '']
    table = tally.value_counts().rename('number').to_frame()
    table['key length'] = [len(label.split(' ')) for label in table.index]
    return table

In [None]:
# Parse the .bib file, clean it and keep only the 'keywords' column.
# The intermediate frames are never bound to a name, so nothing has to be
# deleted afterwards.
keyfields = cleandata(makeFrame('.input_data/all_article.bib'))['keywords']

In [None]:
# Count keyword phrases, order them by phrase, keep only multi-part phrases
# that occur more than four times, and write the result to CSV.
keys = countKeys(keyfields).sort_index()
keys = keys.loc[(keys['key length'] > 1) & (keys['number'] > 4)]
keys.to_csv('.results/keycounter(key length more than 1 and number more than 4 and sorted keys).csv')