In [1]:
import os
import datetime
import json
import pandas as pd
import re
from collections import Counter

In [2]:
#folder holding one JSON response file per author
path = "Files/Authors"
filePath = path + "/" #prefix that file names are appended to when reading
#os.listdir returns entries in arbitrary order; sorting keeps the row order
#derived from these files identical on every run and platform
authorFiles = sorted(os.listdir(path))

#NOTE(review): unpickling executes code stored in the file - only load pickles produced by this project
#NOTE(review): this reads from "Files/Dataframes" while the results are later written to
#"Files/DataFrames"; the two only coincide on a case-insensitive file system - confirm the folder name
df_presenters = pd.read_pickle("Files/Dataframes/Presenters.pkl")
authorID = pd.read_pickle("Files/Dictionary/authorID.pkl")

In [3]:
def getName(data): #get name from json
    """Return the display name of the first entry in data['results'].

    Only the first result is consulted; names of any further results are
    ignored. Raises IndexError when data['results'] is empty.
    """
    firstResult = data['results'][0]
    return firstResult['display_name']

In [4]:
def getMaxPaperCount(data): #get maximum number of published papers
    """Return the sum of 'works_count' over every entry in data['results'].

    The total spans all returned results, so it is presumably an upper bound
    for a single researcher when several results match the same name.
    Returns 0 when there are no results.
    """
    return sum(result['works_count'] for result in data['results'])

In [5]:
def getFields(data): #get field
    """Pick a single field label for the author file.

    The concept names ('display_name' of every 'x_concepts' entry) are
    collected across all results, in file order.

    Returns:
        'Physics' if any collected concept is 'Physics', otherwise the first
        concept found in the file, or 'No fields' when no result lists any
        concept (including when there are no results at all).
    """
    #look at every result, not just the first one, so an author whose first
    #result has no concepts is still classified from the remaining results
    all_fields = [
        concept['display_name']
        for result in data['results']
        for concept in (result.get('x_concepts') or []) #missing/empty concept list contributes nothing
    ]
    if not all_fields:
        return 'No fields'
    if 'Physics' in all_fields: #Physics wins over whatever is listed first
        return 'Physics'
    return all_fields[0]

In [6]:
def getFirstYear(data): #get first year a paper was published for the researcher
    """Return the earliest year in which any result reports published works.

    Every 'counts_by_year' entry of every result is scanned; a year counts
    only when its 'works_count' is greater than 0.

    Returns:
        The smallest such year as an int, or the string
        'No published papers' when no year qualifies.
    """
    activeYears = [
        yearly['year']
        for result in data['results']
        for yearly in (result['counts_by_year'] or []) #results without yearly counts are skipped
        if yearly['works_count'] > 0
    ]
    return int(min(activeYears)) if activeYears else 'No published papers'

In [7]:
def readFile(file, path): #function to read file
    """Load and return the JSON content of the file at path + file.

    Args:
        file: file name, appended directly to path.
        path: directory prefix; must already end with a path separator
            because the two parts are joined by plain concatenation.

    Returns:
        The parsed JSON document.
    """
    #read as UTF-8 explicitly: without it open() falls back to the locale
    #encoding, which can mis-decode non-ASCII author names on some platforms
    with open(path + file, encoding='utf-8') as f:
        return json.load(f)

In [8]:
def getAuthorInfo(files, path, authorDict): #get info about authors
    """Build one summary record per usable author file.

    Each name in files is loaded with readFile(name, path). A file is skipped
    when it is named '.DS_Store', when its JSON contains an 'error' key, or
    when data['meta']['count'] is not greater than 0.

    Args:
        files: iterable of file names; the first run of digits in a name is
            used as the file id.
        path: prefix handed to readFile.
        authorDict: mapping from file id to the originally searched name.

    Returns:
        A dict keyed 0, 1, 2, ... in processing order. Each value is a dict
        with the keys FileID, OriginalName, API_Name, MaxPaperCount, Field,
        FirstPaperYear and AuthorCount.
    """
    records = {}
    for fileName in files:
        if fileName == '.DS_Store': #macOS folder metadata, not an author file
            continue
        payload = readFile(fileName, path)
        if 'error' in payload.keys(): #file holds an error instead of results
            continue
        fileIndex = int(re.findall(r'\d+', fileName)[0]) #first run of digits in the file name
        originalName = authorDict[fileIndex]
        nAuthors = payload['meta']['count']
        if not nAuthors > 0: #no researcher found for this name
            continue
        #the next free key equals the number of records stored so far
        records[len(records)] = {
            'FileID': fileIndex,
            'OriginalName': originalName,
            'API_Name': getName(payload),
            'MaxPaperCount': getMaxPaperCount(payload),
            'Field': getFields(payload),
            'FirstPaperYear': getFirstYear(payload),
            'AuthorCount': nAuthors,
        }

    return records

In [9]:
def getFirstName(df):
    """Choose a first name for every row from OriginalName and API_Name.

    Both names are split on whitespace and their first tokens compared with
    max(). Rows whose OriginalName has no tokens are left out; when only
    API_Name has no tokens, the first token of OriginalName is used.

    Args:
        df: DataFrame with string columns 'OriginalName' and 'API_Name'.

    Returns:
        A dict mapping the row's index label to the chosen first name.
    """
    firstName_dict = {}
    #pair each index label with the values of its own row; using the label as
    #a position (.iloc[label]) is only correct for a default 0..n-1 index
    for label, originalName, apiName in zip(df.index, df['OriginalName'], df['API_Name']):
        originalParts = originalName.split()
        apiParts = apiName.split()
        if not originalParts: #no original name to take a first name from
            continue
        if apiParts:
            #NOTE(review): max() on strings is a lexicographic comparison, not
            #"the longer name" - confirm this is the intended choice
            firstName = max(originalParts[0], apiParts[0])
        else:
            firstName = originalParts[0]
        firstName_dict[label] = firstName

    return firstName_dict

In [10]:
def cumulativePaperCount(confYear, fileID, path):
    """Count the works recorded for years strictly before a conference year.

    The author file '<fileID>.txt' is loaded with readFile and the
    'works_count' of every 'counts_by_year' entry, across all results, is
    summed when its 'year' is earlier than confYear.

    Args:
        confYear: conference year; anything int() accepts (2015, '2015', 2015.0).
        fileID: numeric id that forms the file name.
        path: prefix handed to readFile.

    Returns:
        The summed works count as an int (0 when nothing qualifies).

    Raises:
        ValueError: if confYear is NaN or not a number.
    """
    #compare plain integer years: same ordering as parsing both sides into
    #datetimes, but no strptime call per entry, and float years such as
    #2015.0 (what pandas yields for a column containing NaN) are accepted
    conferenceYear = int(confYear)
    data = readFile(str(fileID) + '.txt', path)
    publishCount = 0
    for result in data['results']:
        for yearly in result['counts_by_year']:
            if int(yearly['year']) < conferenceYear:
                publishCount += yearly['works_count']

    return publishCount

#### Add additional information

In [11]:
#dict with one summary record per author file that returned at least one researcher
authorInfo = getAuthorInfo(files=authorFiles, path=filePath, authorDict=authorID)

In [12]:
#turn authorInfo into a DataFrame: one row per record, the dict keys become the index
df_authors = pd.DataFrame.from_dict(data=authorInfo, orient="index")

In [13]:
#attach the presenter rows to each author by matching the searched name;
#the default inner join keeps only names present in both frames
authors = pd.merge(
    left=df_authors,
    right=df_presenters,
    left_on=["OriginalName"],
    right_on=["Name"],
)

In [14]:
#index label -> chosen first name for every merged row
firstNameDict = getFirstName(df=authors)

In [15]:
#convert firstNameDict into a one-column DataFrame indexed like authors
firstName_df = pd.DataFrame.from_dict(firstNameDict, orient="index")
firstName_df = firstName_df.rename(columns={0: "FirstName"}) #the single value column is created as 0

In [16]:
#add the FirstName column to authors by joining on the index - authors after using API
apiAuthors = pd.merge(
    left=authors,
    right=firstName_df,
    left_index=True,
    right_index=True,
)

In [22]:
#normalise Year and FirstPaperYear to numeric years; entries that cannot be parsed
#as a year (such as the 'No published papers' marker) become missing values
for yearColumn in ("Year", "FirstPaperYear"):
    apiAuthors[yearColumn] = pd.to_datetime(apiAuthors[yearColumn], format = '%Y', errors='coerce').dt.year

In [24]:
#publishedAuthors = apiAuthors[apiAuthors.FirstPaperYear != 'No published papers'] #authors published at some point

In [25]:
#publishedAuthors['Year'] = pd.to_datetime(publishedAuthors.Year, format = '%Y').dt.year
#publishedAuthors['FirstPaperYear'] = pd.to_datetime(publishedAuthors.FirstPaperYear, format = '%Y').dt.year

In [26]:
#add seniority: years between the first recorded paper and the conference year
apiAuthors["Seniority"] = apiAuthors["Year"] - apiAuthors["FirstPaperYear"]

In [27]:
#per row: number of works recorded before that row's conference year
#(cumulativePaperCount loads the author's file on every call, so this cell is slow)
apiAuthors["CumulativePaperCount"] = apiAuthors.apply(
    lambda row: cumulativePaperCount(row["Year"], row["FileID"], filePath),
    axis=1,
)

In [31]:
#add productivity: papers before the conference year per year of seniority
apiAuthors["Productivity"] = apiAuthors["CumulativePaperCount"] / apiAuthors["Seniority"]

In [32]:
#select only authors where first name does not contain "." (i.e. is not an initial)
#match the dot literally: '\.' in a non-raw string is an invalid escape sequence
#that newer Python versions warn about, and no regular expression is needed here
apiAuthors_realNames = apiAuthors[~apiAuthors.FirstName.str.contains('.', regex=False)]

In [34]:
#save authors: the full table and the subset without initials as first name
for frameToSave, targetFile in (
    (apiAuthors, "Files/DataFrames/Authors_afterAPI.pkl"),
    (apiAuthors_realNames, "Files/DataFrames/apiAuthors_realNames.pkl"),
):
    frameToSave.to_pickle(targetFile)

In [38]:
#number of rows (one per author/presentation pair, not unique people) left after the filter
realNameRowCount = len(apiAuthors_realNames)
print("After API, when using names without dot:", realNameRowCount, "authors")

After API, when using names without dot: 1695315 authors


In [41]:
#number of distinct presenter names among the remaining rows
len(apiAuthors_realNames["Name"].unique())

183272