In [1]:
import os
import pandas as pd
import json
import datetime
from tqdm import tqdm
import pickle
from collections import Counter

In [2]:
#load the pickled researcher database; the with-block closes the file handle even if unpickling fails
#NOTE: pickle.load can execute arbitrary code - only load files that come from a trusted source
with open('Files/Dictionary/researcherDB.pkl', "rb") as file_to_read:
    researcher_db = pickle.load(file_to_read)

In [3]:
#folders with the raw API results: researchers that were found / researchers that could not be found
researcherPath = 'Files/Researchers'
errorPath = 'Files/ErrorResearchers'

#names of the files stored in each folder
researcherFiles = os.listdir(researcherPath)
errorFiles = os.listdir(errorPath)

#conference data, restricted to the columns that are used in this notebook
df_researchers = pd.read_pickle("Files/Dataframes/researchers_allFrom2005.pkl")[[
    'Year',
    'Name',
    'Institution',
    'AuthorIndex',
    'Title',
    'Division',
    'SessionIndex',
    'SessionType',
    'LowerCaseName',
]]

In [4]:
def readFile(file, path): #function to read file
    """Load one JSON file and return its decoded content.

    file: file name; it is appended directly to ``path``.
    path: directory prefix; the two strings are concatenated as-is, so it must
        already end with a path separator.

    Returns the object produced by ``json.load``.
    """
    #JSON is normally UTF-8 encoded, so do not depend on the platform's default encoding
    with open(path + file, encoding='utf-8') as f:
        return json.load(f)

In [5]:
def getName(data): #get name from json
    """Return the 'name' of the first entry in data['File']['data']."""
    firstAuthor = data['File']['data'][0] #only the first returned author is used
    return firstAuthor['name']

In [6]:
def getAliases(data): #get aliases from json
    """Return the 'aliases' value of the first entry in data['File']['data']."""
    firstAuthor = data['File']['data'][0] #only the first returned author is used
    return firstAuthor['aliases']

In [7]:
def findFirstName(name): #get first name from name
    """Return the first whitespace-separated token of ``name`` with all dots removed.

    Returns an empty string when ``name`` is None, empty, or has no token left
    once the dots are removed (instead of raising IndexError).
    """
    if not name:
        return ''
    tokens = name.replace('.', '').split()
    return tokens[0] if tokens else ''

In [8]:
def getFirstName(searchName, API_name, aliasList): #pick the most informative first name
    """Choose a first name for a researcher, preferring names longer than one letter.

    Order of preference:
      1. first name of ``searchName`` if it is longer than one letter
      2. first name of ``API_name`` if it is longer than one letter
      3. first name of the first alias whose first name is longer than one letter
      4. otherwise the (short) first name of ``searchName``

    aliasList may be None or empty; it is then skipped.
    """
    firstName = findFirstName(searchName)
    if len(firstName) > 1:
        return firstName

    firstName_API = findFirstName(API_name)
    if len(firstName_API) > 1:
        return firstName_API

    for alias in aliasList or []:
        firstName_alias = findFirstName(alias)
        if len(firstName_alias) > 1: #first alias with a real first name wins
            return firstName_alias

    #no candidate is longer than one letter: fall back to the search name in every case
    #(previously the API first name was returned when there were no aliases)
    return firstName

In [9]:
def getPublicationData(data): #get dict with number of publications for each year
    """Count the entries of data['papers'] per 'year' value.

    Returns a plain dict {year: number of papers}; a year of None is counted
    under the key None. Keys appear in order of first occurrence.
    """
    yearTally = Counter(paper['year'] for paper in data['papers'])
    return dict(yearTally)

In [10]:
def getAPI_field(data): #get field from API
    """Return the most frequent 'fieldsOfStudy' entry over data['papers'].

    Returns (field, count): the most occurring field and how often it occurs.
    When no paper lists any field, returns ('Unknown', n) where n is the number
    of papers whose 'fieldsOfStudy' is empty or None.
    """
    fieldTally = Counter()
    papersWithoutField = 0
    for paper in data['papers']:
        paperFields = paper['fieldsOfStudy']
        if paperFields:
            fieldTally.update(paperFields)
        else:
            papersWithoutField += 1

    if not fieldTally:
        return 'Unknown', papersWithoutField

    assignedField, fieldCount = fieldTally.most_common(1)[0] #most occurring field and its count
    return assignedField, fieldCount

In [11]:
def _mostCommonField(categories): #helper: most frequent category and its count
    """Return (category, count) for the most frequent entry, or ('Unknown', 0) for an empty list."""
    if not categories:
        return 'Unknown', 0
    return Counter(categories).most_common(1)[0]


def getFieldOfStudy_both(data): #get both external and model
    """Return the most frequent field per source over data['papers'].

    Each paper's 's2FieldsOfStudy' entries are split by their 'source':
    'external' and 's2-fos-model'. Any other source prints "No field".

    Returns (externalField, modelField, externalCount, modelCount); a source
    without any entry yields 'Unknown' with a count of 0.
    """
    externalFields = []
    modelFields = []
    for paper in data['papers']:
        #a missing (None) list is treated as "no fields" instead of raising TypeError
        for f in paper['s2FieldsOfStudy'] or []:
            if f['source'] == 'external':
                externalFields.append(f['category'])
            elif f['source'] == 's2-fos-model':
                modelFields.append(f['category'])
            else:
                print("No field")

    assignedExternalField, externalFieldCount = _mostCommonField(externalFields)
    assignedModelField, modelCount = _mostCommonField(modelFields)

    return assignedExternalField, assignedModelField, externalFieldCount, modelCount

In [12]:
def getAuthorInfo(files, path): #get info about authors
    """Collect researcher info, error entries and yearly publication counts from the result files.

    files: file names to read ('.DS_Store' is skipped).
    path: handed to readFile together with each file name.

    Returns (researcher_dict, error, publicationData):
      researcher_dict - key -> dict with names, counts, fields and first paper year
      error - key -> {'SearchName': ...} for entries without papers, or whose papers only have year None
      publicationData - key -> {year: number of papers} for every entry that has papers
    """
    error = {}
    researcher_dict = {}
    publicationData = {}
    for file in tqdm(files):
        if file == '.DS_Store':
            continue
        data = readFile(file, path)
        for key, researcher_data in data.items():
            searchName = researcher_data['SearchName'] #name searched on
            authorData = researcher_data['File']['data'][0] #only data from the first returned author
            if not authorData['papers']: #no data on papers
                error[key] = {'SearchName': searchName}
                continue

            API_name = getName(researcher_data) #name returned
            aliases = getAliases(researcher_data)
            firstName = getFirstName(searchName, API_name, aliases)
            dataCount = researcher_data['File']['total'] #stored as 'AuthorCount'
            yearCounts = getPublicationData(authorData)
            publicationData[key] = yearCounts
            if len(yearCounts) == 1 and yearCounts.get(None): #the only year is None
                error[key] = {'SearchName': searchName}
                continue

            maxPaperCount = sum(yearCounts.values())
            API_field, API_fieldCount = getAPI_field(authorData)
            field_external, field_model, fieldCount_external, fieldCount_model = getFieldOfStudy_both(authorData)
            firstYear = min(year for year in yearCounts if year) #earliest year, ignoring None
            researcher_dict[key] = {
                'SearchName': searchName,
                'API_Name': API_name,
                'FirstName': firstName,
                'AuthorCount': dataCount,
                'MaxPaperCount': maxPaperCount,
                'API_Field': API_field,
                'API_FieldCount': API_fieldCount,
                'ExternalField': field_external,
                'ModelField': field_model,
                'ExternalFieldCount': fieldCount_external,
                'ModelFieldCount': fieldCount_model,
                'FirstPaperYear': firstYear,
            }

    return researcher_dict, error, publicationData

In [13]:
def getErrorResearchers(files, path): #get searchName and ID of researchers that could not be found
    """Read every file (except '.DS_Store') and return {key: {'SearchName': value}} for each key/value pair found in it."""
    error = {}
    for file in tqdm(files):
        if file == '.DS_Store':
            continue
        for key, searchName in readFile(file, path).items():
            error[key] = {'SearchName': searchName}
    return error

In [14]:
#old - delete later
#def getFirstName(df, searchNameCol, API_nameCol):
#    firstName_dict = {}
#    for i in df.index:
#        ID = df.iloc[i]['ID']
#        name1 = df[searchNameCol].iloc[i].split()
#        name2 = df[API_nameCol].iloc[i].split()
#        if name1: #if there is any originalName
#            firstName = max(name1[0], name2[0], key = len) #take longest first name of the 2
#            firstName_dict[ID] = firstName
        
#    return firstName_dict

In [15]:
def getCumulativePaperCount(confYear, ID, publicationDict):
    """Sum the paper counts of all years strictly before ``confYear``.

    confYear: year to compare against; papers from that year itself are not counted.
    ID: researcher ID; it is converted with str() to look up ``publicationDict``.
    publicationDict: {str(ID): {year: number of papers}}; a year key of None is ignored.
    """
    yearCounts = publicationDict[str(ID)]
    return sum(
        count
        for year, count in yearCounts.items()
        if year is not None and year < confYear
    )

In [16]:
def getFirstName_errorResearchers(df, searchNameCol):
    """Map each value of df['ID'] to the first word of the name in ``searchNameCol``.

    df: DataFrame with an 'ID' column and the column named by ``searchNameCol``.
    searchNameCol: name of the column holding the searched name (a string per row).

    Returns {ID: first name}. If an ID occurs on several rows, the last row wins.
    A name without any word maps to an empty string.
    """
    firstName_dict = {}
    #pair the two columns row by row; using index labels as positions (df.iloc[label])
    #only works when the index is the default 0..n-1 range
    for ID, searchName in zip(df['ID'], df[searchNameCol]):
        nameParts = searchName.split()
        firstName_dict[ID] = nameParts[0] if nameParts else '' #avoid IndexError on an empty name

    return firstName_dict

#### Add additional information

In [17]:
#parse every researcher file: researcher info, entries without usable data, and yearly publication counts
researcherDict, errorDict, publicationData = getAuthorInfo(
    researcherFiles,
    f"{researcherPath}/", #readFile concatenates path + file name, so the trailing slash is needed
)

100%|█████████████████████████████████████████| 187/187 [01:03<00:00,  2.95it/s]


In [18]:
#create df from researcherDict: one row per dict key, with the key stored as integer column 'ID'
researcher_df = (
    pd.DataFrame.from_dict(researcherDict, orient='index')
    .reset_index()
    .rename(columns={'index': 'ID'})
    .astype({'ID': int})
)

In [19]:
#add the conference information to the API information by matching the searched name with the lower-case name
researchers = researcher_df.merge(
    df_researchers,
    how='inner',
    left_on='SearchName',
    right_on='LowerCaseName',
)

In [20]:
#delete
#firstNameDict = getFirstName(researcher_df, 'SearchName', 'API_Name')

In [21]:
#make firstNameDict to df
#firstName_df = pd.DataFrame.from_dict(firstNameDict, orient = 'index').reset_index().rename(columns = {'index': 'ID', 0: 'FirstName'})

In [22]:
#delete
#add first name
#apiResearchers = pd.merge(researchers, firstName_df,  on = ['ID'])

In [23]:
#add seniority: conference year minus the year of the first paper
researchers['Seniority'] = researchers['Year'].sub(researchers['FirstPaperYear'])

In [24]:
#for every row, count the papers published before that row's conference year
researchers['CumulativePaperCount'] = researchers.apply(
    lambda row: getCumulativePaperCount(row.Year, row.ID, publicationData),
    axis=1,
)

In [25]:
#add productivity (cumulative paper count / years in the field)
#NOTE: rows with Seniority 0 get inf (or NaN when the paper count is 0 as well)
researchers['Productivity'] = researchers['CumulativePaperCount'].div(researchers['Seniority'])

In [26]:
#number of rows whose assigned first name is exactly one character long
print("Number of researchers with first name on one letter:", (researchers['FirstName'].str.len() == 1).sum())

Number of researchers with first name on one letter: 9954


In [27]:
#keep only the rows whose first name is longer than one letter
researchers_realNames = researchers.loc[researchers['FirstName'].str.len() > 1]

In [28]:
#save researchers with all current columns
#NOTE(review): this folder is spelled 'DataFrames' while the input is read from 'Files/Dataframes' - confirm which spelling exists on a case-sensitive file system
researchers_realNames.to_pickle(path="Files/DataFrames/apiResearchers_allColumns.pkl")

In [29]:
#same researchers without the 'API_Name' and 'LowerCaseName' columns
researchers_nameColDel = researchers_realNames.drop(columns=['API_Name', 'LowerCaseName'])

In [30]:
#save the researchers again, now without the two dropped name columns
#NOTE(review): this folder is spelled 'DataFrames' while the input is read from 'Files/Dataframes' - confirm which spelling exists on a case-sensitive file system
researchers_nameColDel.to_pickle(path="Files/DataFrames/apiResearchers_fewNameCol.pkl")

### Researchers not found

In [31]:
#read the files of the researchers that could not be found (key -> searched name)
errorResearchers = getErrorResearchers(
    errorFiles,
    f"{errorPath}/", #readFile concatenates path + file name, so the trailing slash is needed
)

100%|███████████████████████████████████████| 186/186 [00:00<00:00, 5615.47it/s]


In [32]:
#create one df for the researchers that were not found ...
errorResearchers_df = (
    pd.DataFrame.from_dict(errorResearchers, orient='index')
    .reset_index()
    .rename(columns={'index': 'ID'})
)
#... and one for the researchers that getAuthorInfo put in its error dict
researchers_noData = (
    pd.DataFrame.from_dict(errorDict, orient='index')
    .reset_index()
    .rename(columns={'index': 'ID'})
)

In [33]:
#stack the two dataframes (not found / no data) and store the ID as int
researchers_notFound_df = (
    pd.concat([errorResearchers_df, researchers_noData])
    .astype({'ID': int})
)

In [34]:
#add the already known conference information to the researchers that were not found
researchers_notFound = researchers_notFound_df.merge(
    df_researchers,
    how='inner',
    left_on='SearchName',
    right_on='LowerCaseName',
)

In [35]:
#create dict ID -> first name (first word of the searched name)
firstNameDict_errorResearchers = getFirstName_errorResearchers(researchers_notFound, 'SearchName')

#turn the dict into a df with the columns 'ID' and 'FirstName'
firstName_errorResearchers_df = (
    pd.DataFrame.from_dict(firstNameDict_errorResearchers, orient='index')
    .reset_index()
    .rename(columns={'index': 'ID', 0: 'FirstName'})
)

In [36]:
#add the first name to the df of researchers that were not found
apiResearchers_notFound = researchers_notFound.merge(firstName_errorResearchers_df, on=['ID'])

In [37]:
#delete
#select only researchers where first name does not contain "."
#researchers_realNames = apiResearchers[~apiResearchers.FirstName.str.contains('\.')]

In [38]:
#summary: the first two numbers count unique IDs, the last one counts rows
print("Number of researchers with no data/not found:", apiResearchers_notFound['ID'].nunique(dropna=False))
print("Number of researchers found:", researchers['ID'].nunique(dropna=False))
print("Number of researchers removed due to not knowing their first name:", (researchers['FirstName'].str.len() <= 1).sum())

Number of researchers with no data/not found: 13563
Number of researchers found: 170902
Number of researchers removed due to not knowing their first name: 9954


In [39]:
#save researchers not found
#NOTE(review): this saves researchers_notFound, which lacks the 'FirstName' column added to apiResearchers_notFound - confirm this is the intended frame
#NOTE(review): this folder is spelled 'DataFrames' while the input is read from 'Files/Dataframes' - confirm which spelling exists on a case-sensitive file system
researchers_notFound.to_pickle(path="Files/DataFrames/researcherNotFound.pkl")