In [1]:
import pandas as pd
import re
import random
from tqdm import tqdm
import numpy as np

from libs.RaceDistribution import RaceDistribution
from libs.LastNamesInference import LastNamesInference

In [2]:
us_papers = pd.read_csv('/data/datasets//WOS/US/US_papers.txt')

In [3]:
first_authors = us_papers[us_papers.ordre==1].copy().reset_index(drop=True)
lni = LastNamesInference(names = first_authors.nom)

imputing by the mean: 100%|██████████| 4129691/4129691 [00:19<00:00, 211828.55it/s]


In [4]:
first_authors.nom.unique().shape

(918458,)

In [5]:
known_names = lni.lastnames_dict.keys()

In [6]:
def clean_prenom(prenom):
    prenom = re.sub(r"\b(?:[A-Z]\.)+(?!\w)",'',prenom)
    prenom = re.sub(' +', ' ',prenom)
    prenom = prenom.strip()
    return prenom.lower()

In [7]:
given_names = [clean_prenom(x) for x in first_authors.Prenom]

In [8]:
all_names = [lni.clean_nom(x) for x in first_authors.nom]

In [9]:
known_names_wos =  [i for i in all_names if i in known_names]
unknown_names_wos =  [i for i in all_names if i not in known_names]

In [10]:
len(unknown_names_wos)

774381

In [11]:
len(unknown_names_wos)/len(all_names)

0.18751548239323476

In [12]:
def infer_race(us_papers):
    first_authors = us_papers[us_papers.ordre==1].copy().reset_index(drop=True)
    #first_authors = first_authors[['id_art','Prenom', 'nom']]
    lni = LastNamesInference(names = first_authors.nom)
    tqdm.pandas(desc="inferring race from lastnames")
    lastname_race_dist = first_authors.progress_apply(lambda x: lni.get_name_dist(lastname=x.nom), axis=1)
    first_authors[lni.prob_order] = pd.DataFrame(lastname_race_dist.to_list())
    first_authors = first_authors[['id_art','white', 'hispanic', 'black', 'asian']]
    us_papers_race = us_papers.merge(first_authors, on ='id_art')

    return us_papers_race

In [13]:
us_papers.sample()

Unnamed: 0,cluster_ID,Annee_Bibliographique,yfp,id_art,Prenom,nom,ordre,nb_auteur,EDiscipline,ESpecialite,cit_rel_all_IAC,ordre_auteur,Province,disc_origin,spec_origin,count_origin,gender,cit_all_IAC
14427861,42690115,2015,2006,54196225,Rupal S.,Bhatt-RS,2,3,Clinical Medicine,Cancer,,2,MA,Clinical Medicine,Cancer,,F,


In [14]:
us_papers.Annee_Bibliographique.value_counts()

2018    2299205
2017    2214362
2016    2205826
2015    2117084
2014    2047594
2013    1983888
2012    1902626
2011    1705189
2010    1536059
2009    1455984
2008    1403497
2019     424019
Name: Annee_Bibliographique, dtype: int64

In [15]:
authors = us_papers[['cluster_ID', 'id_art', 'Prenom', 'nom','ordre']]

In [16]:
first_authors = authors.loc[authors.ordre == 1]

In [17]:
first_authors[['cluster_ID', 'Prenom', 'nom']].nunique()

cluster_ID    1609107
Prenom         255430
nom            918458
dtype: int64

In [18]:
def clean_prenom(prenom):
    prenom = re.sub(r"\b(?:[A-Z]\.)+(?!\w)",'',prenom)
    prenom = re.sub(' +', ' ',prenom)
    prenom = prenom.strip()
    return prenom.lower()

In [19]:
def clean_nom(nom):
    nom = re.sub(r"-[A-Z]+",'',nom)
    nom = re.sub(' +', ' ',nom)
    nom = nom.strip()
    return nom.lower()

In [20]:
first_authors['Prenom'] = first_authors.Prenom.apply(lambda x: clean_prenom(x))
first_authors['nom'] = first_authors.nom.apply(lambda x: clean_nom(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_authors['Prenom'] = first_authors.Prenom.apply(lambda x: clean_prenom(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_authors['nom'] = first_authors.nom.apply(lambda x: clean_nom(x))


In [21]:
#Select first name
prenom_list = first_authors.groupby(['cluster_ID'])['Prenom'].apply(lambda x: x.values).reset_index()
prenom_list['Prenom'] = prenom_list.Prenom.apply(lambda x: list(filter(None,x)))
prenom_list['Prenom_alt'] = prenom_list.Prenom.apply(lambda x: random.sample(x,1) if len(x)>0 else [''])
prenom_list = prenom_list.explode('Prenom_alt')

In [22]:
first_authors = first_authors.merge(prenom_list[['cluster_ID','Prenom_alt']], how='left', on='cluster_ID')

There's still room for improvement, but I will use this list

In [23]:
#get unique authors list
unique_authors = first_authors.groupby(['cluster_ID', 'Prenom_alt', 'nom']).size().reset_index().rename(columns={0:'count'})

unique_authors.drop('count',axis=1, inplace=True)
unique_authors.columns = ['cluster_ID', 'Prenom','nom']

# I remove duplicates. With the names and lastnames from the cluster this should not be necessary
unique_authors = unique_authors.drop_duplicates('cluster_ID').reset_index(drop=True)

unique_authors = unique_authors.fillna('')

In [25]:
# # exp 1 (sd)
# rd1 = RaceDistribution(exp=1,mode = 'sd')
# authors_dist_1 = unique_authors.progress_apply(lambda x: rd1.get_names_dist(x.Prenom, x.nom), axis=1)
# new_cols = ['exp1_'+ name for name in rd1.prob_order]
# unique_authors[new_cols] = pd.DataFrame(authors_dist_1.to_list())

In [26]:
# exp 2 (var) normalized given names
rd2_norm = RaceDistribution(exp=2,mode = 'sd',normalized_firstnames=True)
authors_dist_2_norm = unique_authors.progress_apply(lambda x: rd2_norm.get_names_dist(x.Prenom, x.nom), axis=1)
new_cols = ['exp2_norm_'+ name for name in rd2_norm.prob_order]
unique_authors[new_cols] = pd.DataFrame(authors_dist_2_norm.to_list())

100%|██████████| 1609107/1609107 [03:52<00:00, 6931.80it/s]


In [27]:
# exp 2 (var) unnormalized given names
rd2 = RaceDistribution(exp=2,mode = 'sd',normalized_firstnames=False)
authors_dist_2 = unique_authors.progress_apply(lambda x: rd2.get_names_dist(x.Prenom, x.nom), axis=1)
new_cols = ['exp2_notnorm_'+ name for name in rd2_norm.prob_order]
unique_authors[new_cols] = pd.DataFrame(authors_dist_2.to_list())

100%|██████████| 1609107/1609107 [03:50<00:00, 6990.19it/s]


In [28]:
# Only lastname
rd_lastname = RaceDistribution(mode = 'lastname')
authors_dist_lastname = unique_authors.progress_apply(lambda x: rd_lastname.get_names_dist(x.Prenom, x.nom), axis=1)
new_cols = ['lastname_notnorm_'+ name for name in rd2_norm.prob_order]
unique_authors[new_cols] = pd.DataFrame(authors_dist_lastname.to_list())

100%|██████████| 1609107/1609107 [00:55<00:00, 28915.89it/s]


In [29]:
# Only first name
rd_name = RaceDistribution(mode = 'name', normalized_firstnames= False)
authors_dist_name = unique_authors.progress_apply(lambda x: rd_name.get_names_dist(x.Prenom, x.nom), axis=1)
new_cols = ['name_notnorm_'+ name for name in rd2_norm.prob_order]
unique_authors[new_cols] = pd.DataFrame(authors_dist_name.to_list())

100%|██████████| 1609107/1609107 [00:55<00:00, 28930.36it/s]


In [30]:
# Only first name normalized
rd_name_norm = RaceDistribution(mode = 'name', normalized_firstnames=True)
authors_dist_name_norm = unique_authors.progress_apply(lambda x: rd_name_norm.get_names_dist(x.Prenom, x.nom), axis=1)
new_cols = ['name_norm_'+ name for name in rd2_norm.prob_order]
unique_authors[new_cols] = pd.DataFrame(authors_dist_name_norm.to_list())

100%|██████████| 1609107/1609107 [00:56<00:00, 28632.67it/s]


In [31]:
unique_authors

Unnamed: 0,cluster_ID,Prenom,nom,exp2_norm_white,exp2_norm_hispanic,exp2_norm_black,exp2_norm_asian,exp2_norm_other,exp2_notnorm_white,exp2_notnorm_hispanic,...,name_notnorm_white,name_notnorm_hispanic,name_notnorm_black,name_notnorm_asian,name_notnorm_other,name_norm_white,name_norm_hispanic,name_norm_black,name_norm_asian,name_norm_other
0,12,zeke,barger,0.894304,0.029825,0.033911,0.015908,0.026053,0.853548,0.032123,...,0.51422,0.08226,0.11541,0.28117,0.00694,0.336222,0.162606,0.274602,0.182239,0.044330
1,50,mia,divecha,0.178327,0.025214,0.103187,0.605588,0.087684,0.317416,0.024534,...,0.63399,0.06536,0.22222,0.06536,0.01307,0.345928,0.107817,0.441234,0.035352,0.069669
2,57,adam,faeth,0.927500,0.034439,0.009835,0.004287,0.023939,0.958658,0.018114,...,0.96148,0.01989,0.00635,0.01016,0.00212,0.893983,0.055911,0.021485,0.009364,0.019257
3,185,anna,pidgeon,0.825879,0.072110,0.049371,0.016753,0.035886,0.873732,0.042063,...,0.84762,0.07439,0.02211,0.05378,0.00210,0.690918,0.183321,0.065584,0.043455,0.016723
4,245,david,gochis,0.883263,0.081120,0.019563,0.009155,0.006899,0.925783,0.054305,...,0.92808,0.03223,0.01393,0.02393,0.00183,0.830267,0.087169,0.045349,0.021221,0.015994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609102,55572228,victoria,corum,0.777930,0.072631,0.092094,0.017055,0.040291,0.841290,0.049468,...,0.83515,0.08424,0.03740,0.03704,0.00617,0.631293,0.192511,0.102878,0.027754,0.045563
1609103,55572236,matthew,chadourne,0.847614,0.061196,0.041846,0.028875,0.020469,0.890408,0.044431,...,0.97350,0.01019,0.00679,0.00783,0.00169,0.924248,0.029248,0.023459,0.007369,0.015675
1609104,55572238,ekua,bentil,0.068283,0.053151,0.796380,0.015884,0.066302,0.154833,0.052108,...,0.51422,0.08226,0.11541,0.28117,0.00694,0.336222,0.162606,0.274602,0.182239,0.044330
1609105,55572281,ewen,kingsmith,0.562383,0.067266,0.265761,0.030603,0.073986,0.570747,0.061630,...,0.51422,0.08226,0.11541,0.28117,0.00694,0.336222,0.162606,0.274602,0.182239,0.044330


In [32]:
unique_authors.to_csv('../data/unique_authors.csv', index=False)

# Disciplines

In [24]:
us_papers_race = infer_race(us_papers)

imputing by the mean: 100%|██████████| 4129691/4129691 [00:19<00:00, 215023.81it/s]
inferring race from lastnames: 100%|██████████| 4129691/4129691 [01:07<00:00, 61409.54it/s]


In [25]:
us_papers_race.loc[:,['white','hispanic','black','asian']].mean()

white       0.571391
hispanic    0.055431
black       0.071673
asian       0.301505
dtype: float64

In [26]:
us_papers_race.columns

Index(['cluster_ID', 'Annee_Bibliographique', 'yfp', 'id_art', 'Prenom', 'nom',
       'ordre', 'nb_auteur', 'EDiscipline', 'ESpecialite', 'cit_rel_all_IAC',
       'ordre_auteur', 'Province', 'disc_origin', 'spec_origin',
       'count_origin', 'gender', 'cit_all_IAC', 'white', 'hispanic', 'black',
       'asian'],
      dtype='object')

In [27]:
us_papers_race = us_papers_race[us_papers_race.ordre==1].copy().reset_index(drop=True)

In [28]:
us_papers_race.loc[us_papers_race.gender == 'f','gender'] = 'F'
us_papers_race.loc[us_papers_race.gender == 'm','gender'] = 'M'

In [29]:
us_papers_race = us_papers_race[us_papers_race.gender.isin(['F','M'])].copy().reset_index(drop=True)

In [30]:
# I exclude 'Social Studies of Medicine' because there are too few obs
us_papers_race = us_papers_race[us_papers_race.ESpecialite != 'Social Studies of Medicine']

In [31]:
discipline_gender_agg = us_papers_race.groupby(['EDiscipline','ESpecialite','gender'])['white', 'hispanic', 'black', 'asian'].agg({'count','mean'})

discipline_gender_agg = discipline_gender_agg.stack(level=0)

#discipline_gender_agg.columns = ['EDiscipline', 'ESpecialite', 'gender', 'group', 'count', 'mean']

  discipline_gender_agg = us_papers_race.groupby(['EDiscipline','ESpecialite','gender'])['white', 'hispanic', 'black', 'asian'].agg({'count','mean'})


In [32]:
discipline_gender_agg.index.names = ['EDiscipline', 'ESpecialite', 'gender', 'group']

In [33]:
discipline_gender_agg['freq'] = discipline_gender_agg['count']*discipline_gender_agg['mean']

discipline_gender_agg = discipline_gender_agg.drop(['count','mean'],1)

In [34]:
discipline_gender_agg['joint_prob'] = discipline_gender_agg.freq/ discipline_gender_agg.freq.sum()

In [35]:
discipline_gender_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,freq,joint_prob
EDiscipline,ESpecialite,gender,group,Unnamed: 4_level_1,Unnamed: 5_level_1
Arts,Fine Arts & Architecture,F,asian,304.131391,0.000094
Arts,Fine Arts & Architecture,F,black,258.206450,0.000080
Arts,Fine Arts & Architecture,F,hispanic,152.297873,0.000047
Arts,Fine Arts & Architecture,F,white,1960.364286,0.000608
Arts,Fine Arts & Architecture,M,asian,279.548700,0.000087
...,...,...,...,...,...
Social Sciences,Sociology,F,white,5858.704683,0.001816
Social Sciences,Sociology,M,asian,1034.921559,0.000321
Social Sciences,Sociology,M,black,973.499659,0.000302
Social Sciences,Sociology,M,hispanic,651.249744,0.000202


Average citations by group & specialite

In [61]:
df = us_papers_race[['EDiscipline','ESpecialite','gender','cit_rel_all_IAC','cit_all_IAC','white', 'hispanic', 'black', 'asian']].copy()

In [62]:
df = df.dropna(subset = ["cit_all_IAC"], inplace=False).reset_index(drop=True)

In [63]:
df = pd.melt(df, id_vars=['EDiscipline','ESpecialite','gender','cit_all_IAC','cit_rel_all_IAC'], value_vars=['white', 'hispanic', 'black', 'asian'],
       var_name='race', value_name='weights')

In [64]:
def weighted_median(data, weights, quantile=.5):
     # Check the data
    if not isinstance(data, np.matrix):
        data = np.asarray(data)
    if not isinstance(weights, np.matrix):
        weights = np.asarray(weights)
    nd = data.ndim
    if nd != 1:
        raise TypeError("data must be a one dimensional array")
    ndw = weights.ndim
    if ndw != 1:
        raise TypeError("weights must be a one dimensional array")
    if data.shape != weights.shape:
        raise TypeError("the length of data and weights must be the same")
    if ((quantile > 1.) or (quantile < 0.)):
        raise ValueError("quantile must have a value between 0. and 1.")
    # Sort the data
    ind_sorted = np.argsort(data)
    sorted_data = data[ind_sorted]
    sorted_weights = weights[ind_sorted]
    # Compute the auxiliary arrays
    Sn = np.cumsum(sorted_weights)
    # TODO: Check that the weights do not sum zero
    #assert Sn != 0, "The sum of the weights must not be zero"
    Pn = (Sn-0.5*sorted_weights)/Sn[-1]
    # Get the value of the weighted median
    return np.interp(quantile, Pn, sorted_data)

In [65]:
def grouped_wmedian(group):
    data = group['cit_all_IAC']
    weights = group['weights']
    median = weighted_median(data, weights, quantile=.5)
    return median

In [66]:
citations_agg = df.groupby(['EDiscipline','ESpecialite','gender', 'race']).apply(grouped_wmedian)

In [67]:
discipline_gender_agg['median_citations'] = citations_agg

In [68]:
np.all(discipline_gender_agg.median_citations.isnull())

False

In [71]:
def grouped_wmean(group, citation_type='cit_all_IAC'):
    values = group[citation_type]
    weights = group['weights']
    mean = np.average(values, weights=weights)
    return mean

In [72]:
mean_citations  = df[df.weights!=0].groupby(['EDiscipline','ESpecialite','gender', 'race']).apply(grouped_wmean)

In [73]:
mean_citations

EDiscipline      ESpecialite               gender  race    
Arts             Fine Arts & Architecture  F       asian       0.882907
                                                   black       0.527940
                                                   hispanic    0.575744
                                                   white       0.505146
                                           M       asian       1.030127
                                                                 ...   
Social Sciences  Sociology                 F       white       9.026525
                                           M       asian       8.603166
                                                   black       8.677733
                                                   hispanic    8.339761
                                                   white       9.745542
Length: 1136, dtype: float64

In [74]:
mean_rel_citations  = df[df.weights!=0].groupby(['EDiscipline','ESpecialite','gender', 'race']).apply(lambda x: grouped_wmean(x,'cit_rel_all_IAC'))

In [75]:
mean_rel_citations

EDiscipline      ESpecialite               gender  race    
Arts             Fine Arts & Architecture  F       asian       1.399342
                                                   black       0.932253
                                                   hispanic    0.899143
                                                   white       0.917584
                                           M       asian       1.664927
                                                                 ...   
Social Sciences  Sociology                 F       white       1.444623
                                           M       asian       1.287778
                                                   black       1.363124
                                                   hispanic    1.342150
                                                   white       1.485567
Length: 1136, dtype: float64

In [76]:
discipline_gender_agg['mean_citations'] = mean_citations
discipline_gender_agg['mean_rel_citations'] = mean_rel_citations

In [77]:
discipline_gender_agg.to_csv('../results/discipline_gender_agg.csv')

## Leiden Fields

In [57]:
subfields = pd.read_csv('/data/WOS/US/subfields.txt', delimiter='\t')
titles = pd.read_csv('/data/WOS/US/titles.txt', delimiter='\t')

In [58]:
us_papers_race2 = us_papers_race.merge(titles, how='inner', left_on='id_art', right_on='id_Art')

In [59]:
us_papers_race2 = us_papers_race2.merge(subfields, how='inner', left_on='ITEMID', right_on='ut')

In [60]:
subfields_race = us_papers_race2.groupby('cluster_id1')['white','hispanic','black','asian'].mean().reset_index()

  subfields_race = us_papers_race2.groupby('cluster_id1')['white','hispanic','black','asian'].mean().reset_index()


In [61]:
subfields_race.to_csv('../results/leiden_clusters.csv')