In [2]:
import pandas as pd
from itertools import chain
from functools import reduce
import numpy as np
import logging
import json
import glob
from tqdm import tqdm
import re

In [6]:
df = pd.read_pickle('../../data/intermediate_files/dataset.p')

In [7]:
df.columns

Index(['title', 'abstract', 'affiliation', 'aggregationType', 'authkeywords',
       'authorgroup', 'authors', 'chemicals', 'citedby_count', 'coverDate',
       'description', 'doi', 'funding', 'funding_text', 'citedby_link',
       'contributor_group', 'language', 'publicationName', 'references',
       'subject_areas', 'issueIdentifier', 'volume', 'eid'],
      dtype='object')

In [8]:
def clean_publicationName(text):
    text = text.lower()
    text = re.sub('(?<=studies in history and philosophy of science).*','',text)
    text = re.sub('(?<=isis).*','',text)
    text = re.sub('(?<=synthese).*','',text)
    text = re.sub('(?<=scientometrics).*','',text)
    text = re.sub('.*(?=british journal for the history of science)','',text)
    text = re.sub('science(,)? technology (&|and) society','science, technology and society',text)
    text = re.sub('science(,)? technology(,)? (&|and) human values','science, technology and human values',text)
    text = re.sub('science & education','science and education',text)
    return text
    

In [9]:
df.publicationName = df.publicationName.apply(lambda x: clean_publicationName(x))

In [10]:
df.publicationName.value_counts()

synthese                                        5674
scientometrics                                  5136
research policy                                 3221
science and public policy                       1707
studies in history and philosophy of science    1434
science and education                           1078
social studies of science                       1069
public understanding of science                  977
isis                                             932
journal of informetrics                          876
science, technology and human values             757
british journal for the history of science       742
research evaluation                              666
science, technology and society                  453
minerva                                          391
science and technology studies                   111
Name: publicationName, dtype: int64

In [11]:
def normalize_publicationName(argument): 
    switcher = { 
        'Science, Technology & Human Values': "Science Technology and Human Values", 
        'Science, Technology, & Human Values': "Science Technology and Human Values", 
        'Science, technology & human values': "Science Technology and Human Values",
        'Scientometrics: An International Journal for all Quantitative Aspects of the Science of Science, Communication in Science and Science Policy': "Scientometrics",
        'Computers and Education' : 'Research Policy',
        'Public understanding of science (Bristol, England)':'Public Understanding of Science',
        
        'Social studies of science': 'Social Studies of Science'
    } 
    return switcher.get(argument,argument) 

In [12]:
df.publicationName = df.publicationName.apply(normalize_publicationName)

filter articles incorrectly downloaded

In [14]:
df = df[~df['publicationName'].isin(['lecture notes in computer science (including subseries lecture notes in artificial intelligence and lecture notes in bioinformatics)',
                                     'Automation in Construction',
                              'science advances']) ]


#### Distribution of citatiosn

In [21]:
def most_cited(series):

    return set(df.title[df.citedby_count == max(series)].values)

In [22]:
df.groupby(['publicationName']).citedby_count.agg(['count', 'mean',np.sum, 'max', most_cited]).sort_values(by ='mean')

Unnamed: 0_level_0,count,mean,sum,max,most_cited
publicationName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
science and technology studies,111,5.288288,587,39,"{Science, technology and innovation policies i..."
"science, technology and society",453,5.735099,2598,238,{Public research and industrial innovations in...
synthese,5674,9.602749,54486,910,{Fuzzy logic and approximate reasoning - In me...
studies in history and philosophy of science,1434,10.693863,15335,288,"{What is structural realism?, Diversity and ne..."
isis,932,10.70279,9975,415,{Knowledge in transit.}
british journal for the history of science,742,11.045822,8196,132,"{Probability, explanation, and information, Ma..."
science and education,1078,11.358071,12244,298,{The knowledge creation metaphor - An emergent...
research evaluation,666,13.148649,8757,223,"{What do we measure by co-authorships?, Stages..."
science and public policy,1707,13.265378,22644,462,{Responsible research and innovation: From sci...
minerva,391,16.511509,6456,624,{Technologies of humility: Citizen participati...


### affiliation

In [15]:
affiliation = list(filter(None, df['affiliation'])) 
affiliation = [item for sublist in affiliation for item in sublist]

affiliation_df = pd.DataFrame(affiliation, columns=['id', 'name','city', 'country'])
affiliation_df.drop_duplicates(['id', 'name','city', 'country'],inplace = True)
affiliation_df.to_json('../../data/affiliations.json')
affiliation_df.sample()

Unnamed: 0,id,name,city,country
19810,60028088,Kagoshima University,Kagoshima,Japan


#### authors

In [16]:
authors = list(filter(None, df['authors'])) 
authors = [item for sublist in authors for item in sublist]
authors
authors_df = pd.DataFrame(authors, columns=['auid', 'indexed_name','surname', 'given_name','affiliation_id'])
authors_df.drop_duplicates(['auid', 'indexed_name','surname', 'given_name'],inplace = True)
authors_df.to_json('../../data/authors.json')
authors_df.sample()


Unnamed: 0,auid,indexed_name,surname,given_name,affiliation_id
20798,56668320800,Bates J.,Bates,Jo,[60001881]


#### references

In [17]:
def add_eid(ref_list, eid):
    if ref_list is not None:
        #ref_list = ref_list.tolist()[0]
        ref_list_extended = list(map(lambda x: x + [eid],ref_list))
    else:
        ref_list_extended = None
    return ref_list_extended

In [30]:
df["references"] = df.apply(lambda x: add_eid(x.references,x.eid), axis=1)

In [31]:
columns = ['position', 'eid_of_ref', 'doi', 'title', 'authors', 'authors_auid', 'authors_affiliationid', 'sourcetitle', 'publicationyear', 'volume', 'issue', 'first', 'last', 'citedbycount', 'type', 'text', 'fulltext','eid']

references = [x for x in df.references.values if x]
references = list(chain.from_iterable(references))

In [32]:
references_df = pd.DataFrame(references, columns=columns)

In [33]:
references_df.sample()

Unnamed: 0,position,eid_of_ref,doi,title,authors,authors_auid,authors_affiliationid,sourcetitle,publicationyear,volume,issue,first,last,citedbycount,type,text,fulltext,eid
1020043,29,84930701779,,The triviality of presentism,"Meyer, U.",,,New Papers on the Present,2012,,,,,,,"R. Ciuni, K. Miller, and G. Torrengo (Eds.), P...","Meyer, U. (2012). The triviality of presentism...",2-s2.0-84905270654


In [34]:
references_df.to_json('../../data/references.json')

In [35]:
df.to_pickle('../../data/intermediate_files/dataset.p')