In [1]:
import pandas as pd
import numpy  as np
import re

In [2]:
def getregex():
    '''Начало простого парсера для bib-файлов.
    Игнорирует библиографические записи, невходящие в fieldList'''
    
    biginRegex = '(@'
    typeRegex = r'\S+)(\{'
    enRegex = r'\{(.*)\}'
    endEnRegex = r',\s*'
    bibKeyRegex = r'\S+)' + endEnRegex
    
    stFields = ['address','annote','author'      ,'booktitle'   ,'chapter','crossref',
                'edition','editor','howpublished','institution' ,'journal','key',
                'month'  ,'note'  ,'number'      ,'organization','pages'  ,'publisher',
                'school' ,'series','title'       ,'volume' ,'year'
               ]
    
    nonStFields = ['affiliation','abstract','doi'     ,'eid'     ,'contents','copyright',
                   'ISBN'       ,'ISSN'    ,'keywords','language','location','LCCN',
                   'mrnumber'   ,'price'   ,'size'    ,'URL'     ,'groups'
                  ]
    
    fieldList = ['type', 'bib-key'] + sorted(stFields + nonStFields, key=str.lower)
    
    fieldtup = tuple(fieldList)
    rentr = lambda i: '(?:(' + i + r'\s*\=\s*' + r'\{.*\})' + r',\s*)?'
    ListRegex = [rentr(i) for i in fieldtup]
    fullEntryRegex = biginRegex + typeRegex + bibKeyRegex + '(?:' + '|'.join(ListRegex) + ')+\}\s*'
    
    return re.compile(fullEntryRegex, flags=re.IGNORECASE), fieldtup

In [3]:
def makeFrame(fname):
    with open(fname,"r",encoding='utf-8') as f:
            filetext=f.read()
    regex,columnsTuple=getregex()
    enries=regex.findall(filetext)
    
    del filetext,regex

    data=pd.DataFrame(columns=columnsTuple)

    typeregex   = re.compile(r'@(\S+)')
    bibkeyregex = re.compile(r'\{(\S+)')
    fieldregex  = re.compile(r'(\S+)\s*\=\s*\{(.*)\}') #\1 - column; \2 - value

    number=-1
    for i in enries:
        number+=1
        data.loc[number,'type']=typeregex.findall(i[0])[0]
        data.loc[number,'bib-key']=bibkeyregex.findall(i[1])[0]
        for j in i[2:]:
            if len(j)>0:
                field=fieldregex.findall(j)
                if field[0][1] == '':
                    data.loc[number,field[0][0]] = np.nan
                else:
                    data.loc[number,field[0][0]] = field[0][1]
    return data

In [4]:
def cleandata(frame):
    data = frame.copy()
    data.dropna(axis=1, how='all', inplace=True)
    data.dropna(how='all', inplace=True)
    
    usefulcolumns = ['booktitle', 'title', 'author', 'keywords', 'doi', 'type']
    usefuldata = data.reindex(columns=usefulcolumns)
    usefuldata.dropna(subset=['title', 'author', 'keywords'], inplace=True)
    usefuldata.drop_duplicates(ignore_index=True, inplace=True)
    
    del data
    
    usefuldata['author'] = usefuldata['author'].str.replace(r'\{','')
    usefuldata['author'] = usefuldata['author'].str.replace(r'\}','')
    usefuldata['author'] = usefuldata['author'].str.replace(r'\sand\s',';')
    usefuldata['author'] = usefuldata['author'].str.replace(r'\s+',' ')
    
    usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\$[^\$]*\$','')
    usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\s*;\s*',';')
    usefuldata['keywords'] = usefuldata['keywords'].str.replace(r';+',';')
    usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\s+',' ')

    return usefuldata

In [5]:
def makecolmatrix(kwlist, data):
    usefuldata = data.copy()
    
    autorstable = usefuldata.reindex(columns=['author', 'keywords', 'doi'])
    
    del usefuldata
    
    kwlist = [i.replace(' ', '.') for i in kwlist]
    kwregex = '(?:' + '|'.join(kwlist) + ')'
    selauthors = autorstable[autorstable['keywords'].str.contains(kwregex)].reindex(columns=['doi','author'])
    
    del autorstable

    aframe = pd.DataFrame(columns=['doi','author'])
    for i in selauthors.index:
        alist = selauthors.loc[i,'author'].split(';')
        dlist = [selauthors.loc[i,'doi']] * len(alist)
        dframe = pd.DataFrame({'doi':dlist, 'author':alist}, columns=['doi','author'])
        aframe = aframe.append(dframe, ignore_index=True)
    
    del dframe
    
    aframe.drop_duplicates(ignore_index=True, inplace=True)
    mcauthors = aframe['author'].value_counts()
    doiarr = aframe['doi'].unique()
    
    adjacency_matrix = pd.DataFrame(np.zeros((len(mcauthors.index), len(mcauthors.index)), dtype=np.int32),
                                    index=mcauthors.index, columns = mcauthors.index)
    for i in doiarr:
        coauthors = aframe[aframe['doi'].str.contains(i)]
        cl = list(coauthors.loc[:,'author'])
        if len(cl) > 1:
            for j in cl[1:]:
                adjacency_matrix.loc[cl[0], j] += 1
                adjacency_matrix.loc[j, cl[0]] += 1
    
    for i in range(1, adjacency_matrix.shape[0]):
        adjacency_matrix.iloc[i, range(i)] = 0
        
    return mcauthors, adjacency_matrix

In [6]:
def make_adjacency_list(adjacency_matrix):
    adjacency_list = adjacency_matrix.stack()
    adjacency_list = adjacency_list.sort_values(ascending=False)
    
    l1 = [i[0] for i in adjacency_list.index]
    l2 = [i[1] for i in adjacency_list.index]
    
    l3 = list(adjacency_list)
    
    indelen = len(l1)
    
    del adjacency_list
    
    newd = {'author':l1, 'coauthor':l2, 'nuber of articles':l3}
    
    del l1,l2,l3
    
    adjacency_list = pd.DataFrame(newd, columns=['author', 'coauthor', 'nuber of articles'],
                                  index=np.arange(1, indelen+1, dtype=np.int32))
    del newd
    
    adjacency_list = adjacency_list[adjacency_list['nuber of articles'] > 0]
    
    return adjacency_list.set_index(['author'])

In [7]:
data = makeFrame('.input_data/all_article.bib')

In [8]:
usefuldata = cleandata(data)

del data

  usefuldata['author'] = usefuldata['author'].str.replace(r'\{','')
  usefuldata['author'] = usefuldata['author'].str.replace(r'\}','')
  usefuldata['author'] = usefuldata['author'].str.replace(r'\sand\s',';')
  usefuldata['author'] = usefuldata['author'].str.replace(r'\s+',' ')
  usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\$[^\$]*\$','')
  usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\s*;\s*',';')
  usefuldata['keywords'] = usefuldata['keywords'].str.replace(r';+',';')
  usefuldata['keywords'] = usefuldata['keywords'].str.replace(r'\s+',' ')


In [9]:
kwlist = list()
with open('.input_data/a_keys.txt', 'r') as f:
    for line in f:
        kwlist.append(line.strip())

mcauthors, adjacency_matrix = makecolmatrix(kwlist, usefuldata)

del usefuldata

In [10]:
# Пары, которые публиковались вместе более 8 раз
bounds = adjacency_matrix.copy()
bounds[adjacency_matrix < 8] = 0
boundslist = make_adjacency_list(bounds)
boundslist.to_csv('.results/strong_bounds_list.csv')