In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import pickle
import numpy as np

In [2]:
#load data
errorResearchers = pd.read_pickle("Files/DataFrames/researcherNotFound.pkl")
researcher_db = pickle.load(open('Files/Dictionary/researcherDB.pkl', "rb"))

In [3]:
## errorResearchers names without departmnet/university/group/team
errorResearchers_ = errorResearchers[~errorResearchers.SearchName.str.contains('department|dept.|university|group|team|collaboration|institute|institution|academy|center|research|material|physics|engineering')]

In [4]:
#drop first index, which we can see is an error
errorResearchers_df = errorResearchers_.drop(0).reset_index(drop = True)

In [5]:
#save errorResearchers_df
errorResearchers_df.to_pickle("Files/Dataframes/errorResearchers_filtered.pkl")

In [6]:
removedName = len(errorResearchers.SearchName.unique()) - len(errorResearchers_df.SearchName.unique())
namesLeft = len(errorResearchers_df.SearchName.unique())

In [7]:
print("Number of unique names removed:", removedName)
print("Number of unique names left:", namesLeft)
print("Share of researchers removed by deleting name we believe are not names:", removedName/len(researcher_db)) 
print("Share of researchers not found by API, that is not removed:", namesLeft/len(researcher_db))

Number of unique names removed: 2613
Number of unique names left: 10397
Share of researchers removed by deleting name we believe are not names: 0.013894132347858454
Share of researchers not found by API, that is not removed: 0.05528407731369473


### Estimate gender

In [8]:
def assignGender(df, NameCol): #from GenderEstimation_theis.ipynb
    index = 0
    nameDict = {}
    genderEstimation = pd.read_csv("Files/wgnd_2_0_name-gender-code.csv") #genderEstimation file
    genderEstimation['Name'] = genderEstimation.name.str.replace(r'"|\'', '', regex=True).str.lower() #remove " and ' and make to lower case
    
    usNames = genderEstimation[genderEstimation.code == 'US'] #some names can't not be found, but can with another code
    otherNames = genderEstimation[~genderEstimation.Name.isin(usNames.Name)] #names not in US

    uniqueNamesUS = df[df[NameCol].isin(usNames.Name)][NameCol].unique() #df with names from presenters, that is also in genderEstimation
    uniqueNamesOther = df[df[NameCol].isin(otherNames.Name)][NameCol].unique()
    
    for n in tqdm(uniqueNamesUS): #loop through us names that is also in the presenter df
        nameTable = usNames[usNames.Name == n] #table with the name 
        genderIndex = nameTable.wgt.argmax() #index of gender with highest score/probability
        gender = nameTable.iloc[genderIndex].gender #gender that is most likely
        nameDict[index] = {'firstName': n, 'Gender': gender} #assign gender to the name in a dict
        index += 1
    
    for m in tqdm(uniqueNamesOther): #loop through other names that is also in the presenter df
        nameTable = otherNames[otherNames.Name == m].groupby('gender').mean(numeric_only = True) #table with the name - mean of wgt
        genderIndex = nameTable.wgt.argmax() #index of gender with higest average probability
        gender = nameTable.iloc[genderIndex].name
        nameDict[index] = {'firstName': m, 'Gender': gender} #assign gender to the name in a dict
        index += 1
    
    return nameDict

In [9]:
def getFirstName(name):
    firstName = name.split()[0]
    
    return firstName

In [10]:
errorResearchers_df['FirstName'] = errorResearchers_df.apply(lambda x: getFirstName(x.SearchName), axis = 1)

In [11]:
genderDict = assignGender(errorResearchers_df,'FirstName')

100%|██████████████████████████████████████| 2849/2849 [00:15<00:00, 182.43it/s]
100%|█████████████████████████████████████████| 933/933 [02:22<00:00,  6.55it/s]


In [12]:
#dataframe witht the gender and name
genderAssign = pd.DataFrame.from_dict(genderDict, orient = 'index')

Now we are merging genderAssign and errorResearchers_df. This will automatically add the gender to names to the researchers. 

In [13]:
#merge error researcher df with gender df
errorResearchers_wGender_df = errorResearchers_df.merge(genderAssign, how = 'left', left_on = 'FirstName', right_on = 'firstName').drop('firstName', axis = 1)

In [14]:
#remove names with gender assigned to NaN
errorResearchers_wGender = errorResearchers_wGender_df[~errorResearchers_wGender_df.Gender.isna()]

In [15]:
noGenderEstimation = len(errorResearchers_wGender_df[errorResearchers_wGender_df.Gender.isna()].SearchName.unique())
unknownGender = len(errorResearchers_wGender[errorResearchers_wGender.Gender == '?'].SearchName.unique())
genderAssigned = len(errorResearchers_wGender[(errorResearchers_wGender.Gender == 'F') | (errorResearchers_wGender.Gender == 'M')].SearchName.unique()) 
deletedOrUnestimated = len(errorResearchers.SearchName.unique()) - (unknownGender + genderAssigned)

In [16]:
print("Number of unique names with no gender estimated(assigned to NaN):", noGenderEstimation)
print("Number of unique names assigned to ? as gender:", unknownGender)
print("Number of unique names assigned to gender", genderAssigned)

print("Number of researchers, that were either deleted or not possible to estimate gender of:", deletedOrUnestimated)
print("Share of researchers that were either deleted or not possible to estimate gender of:", (deletedOrUnestimated)/len(researcher_db))
print("Share of researchers with estimated gender:", (unknownGender + genderAssigned)/len(researcher_db))

Number of unique names with no gender estimated(assigned to NaN): 2227
Number of unique names assigned to ? as gender: 194
Number of unique names assigned to gender 7976
Number of researchers, that were either deleted or not possible to estimate gender of: 4840
Share of researchers that were either deleted or not possible to estimate gender of: 0.025735782841039
Share of researchers with estimated gender: 0.043442426820514184


In [17]:
errorResearchers_wGender.to_pickle("Files/Dataframes/ResearchersNotFound_wGender.pkl") 