# Which citation styles do we have in the real data?
Dominika Tkaczyk

2.10.2018

In [1]:
import sys
# Add the parent directory to the module search path. Presumably that is where the
# project-local `data_utils` and `features` modules imported further down in this
# cell live (TODO confirm against the repository layout).
sys.path.append('..')

%matplotlib inline

import warnings
# Install a blanket 'ignore' filter so that warnings are suppressed from here on,
# presumably to keep the cell outputs uncluttered.
# NOTE(review): this also hides deprecation and model-fitting warnings; consider
# narrowing the filter to specific categories.
warnings.simplefilter('ignore')

import json
import pandas as pd

from data_utils import add_noise, clean_data, read_ref_strings_data, remove_technical_parts
from features import get_features, select_features_chi2
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


Read the training data:

In [2]:
# Load the labelled reference strings used for training. The preview below shows
# one row per string with its DOI, the formatted string and the citation style label.
dataset = read_ref_strings_data('../data/ref_strings/')

# Report the number of rows, then preview the first few of them.
print('Dataset size: {}'.format(len(dataset)))
dataset.head()

Dataset size: 85000


Unnamed: 0,doi,string,style
0,10.1016/s0002-9394(14)70125-4,"[1]LEE, S.-H. and TSENG, S.C.G. 1997. Amniotic...",acm-sig-proceedings
1,10.1016/0920-9964(95)95073-i,"[1]Scheffer, R. et al. 1995. History of premor...",acm-sig-proceedings
2,10.1075/cilt.97.22vek,"[1]Vekerdi, J. 1993. 4. Word formation in Gips...",acm-sig-proceedings
3,10.1080/19761597.2013.810947,"[1]Kang, J. et al. 2013. Determinants of succe...",acm-sig-proceedings
4,10.1016/0378-1119(79)90090-8,"[1]Wickens, M.P. et al. 1979. Restriction map ...",acm-sig-proceedings


Preprocess the data:

In [3]:
# Clean the raw frame first (the recorded run dropped from 85000 to 82834 rows here),
# then rewrite every reference string in two passes: `remove_technical_parts` over
# all rows, followed by `add_noise` over all rows.
# NOTE(review): `dataset` is overwritten in place, so re-running this cell applies
# the preprocessing twice; `add_noise` is presumably randomised and no seed is set
# in this cell -- confirm reproducibility.
dataset = clean_data(dataset)
dataset['string'] = (
    dataset['string']
    .apply(remove_technical_parts)
    .apply(add_noise)
)
print('Dataset size: {}'.format(len(dataset)))

Dataset size: 82834


Train the model:

In [4]:
# Build the training feature matrix from the preprocessed strings. The fitted
# vectorizer and TF-IDF transformer are kept so that the real-data strings can be
# mapped into the same feature space later on.
# Settings: n-grams of length 2 to 4 and 5000 features chosen by `select_features_chi2`
# (what the n-grams are made of is decided inside `get_features`).
count_vectorizer, tfidf_transformer, features = get_features(
    dataset['string'],
    nfeatures=5000,
    feature_selector=select_features_chi2,
    ngrams=(2, 4),
)

# Fit the style classifier; `random_state=0` pins the estimator's random state.
model = LogisticRegression(random_state=0)
model.fit(features, dataset['style'])

Read a sample of real records:

In [5]:
# Load the sample of real records; the records themselves sit under the top-level
# 'sample' key of the JSON document.
# The encoding is given explicitly because JSON text is UTF-8 while `open` would
# otherwise fall back to the platform default, which can fail on non-ASCII
# reference strings. `json.load` parses straight from the file handle.
with open('../data/samples/sample-10000.json', 'r', encoding='utf-8') as sample_file:
    data = json.load(sample_file)['sample']

Next, I iterate over all unstructured reference strings found in the records (skipping strings shorter than 11 characters) and predict the style for each of them:

In [6]:
# Parallel accumulators, one entry per classified reference string: the text,
# the predicted style label and the highest class probability for that prediction.
strings = []
styles = []
probabilities = []

for record in data:
    # Records without a 'reference' key contribute nothing.
    for reference in record.get('reference', []):
        # Only references that carry a raw, unparsed string can be classified.
        if 'unstructured' not in reference:
            continue
        ref_string = reference['unstructured']
        # Strings shorter than 11 characters are skipped.
        if len(ref_string) < 11:
            continue

        # Map the single string into the training feature space by reusing the
        # fitted vectorizer and TF-IDF transformer.
        # NOTE(review): each string is vectorised on its own and the model is
        # evaluated twice per string (predict + predict_proba); batching would be
        # faster but needs checking against `get_features`.
        _, _, test_features = get_features([ref_string], count_vectorizer=count_vectorizer,
                                           tfidf_transformer=tfidf_transformer)
        prediction = model.predict(test_features)
        probabilities.append(max(model.predict_proba(test_features)[0]))
        strings.append(ref_string)
        styles.append(prediction[0])

# One row per classified string, paired with its predicted style.
existing_styles = pd.DataFrame({'string': strings, 'style': styles})

The distribution of the predicted styles, sorted from the most to the least frequent:

In [7]:
# Summarise how often each style was predicted:
#   counts   - number of reference strings assigned to the style
#   fraction - share of all classified strings
# The denominator is the length of `existing_styles` itself, so this cell depends
# only on the frame it summarises and not on a side list built in another cell.
# Rows are ordered from the most to the least frequent style, with a fresh 0..n-1 index.
styles_distr = (
    existing_styles
    .groupby(['style'])
    .size()
    .reset_index(name='counts')
    .assign(fraction=lambda per_style: per_style['counts'] / len(existing_styles))
    .sort_values(by='counts', ascending=False)
    .reset_index(drop=True)
)
styles_distr

Unnamed: 0,style,counts,fraction
0,springer-basic-author-date,6166,0.283039
1,apa,3198,0.146798
2,vancouver,2758,0.126601
3,springer-lecture-notes-in-computer-science,2224,0.102089
4,american-institute-of-physics,1148,0.052697
5,harvard3,1131,0.051916
6,bmc-bioinformatics,1088,0.049943
7,acm-sig-proceedings,962,0.044159
8,elsevier-with-titles,702,0.032224
9,american-chemical-society-with-titles,697,0.031994
