## Expected distribution on topics, given race and gender distribution of authors. 

In [1]:
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import pickle
# import pickle5 as pickle
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from libs.LastNamesInference import LastNamesInference

In [3]:
def save(x, file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    with open(file_name, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [4]:
df_socsci = pd.read_csv('/data/datasets//WOS/US/text_clean_socsci.txt')
us_papers = pd.read_pickle('/data/datasets//WOS/US/US_papers.p')
df_health = pd.read_csv('/data/datasets/WOS/US/text_clean_health.txt')

In [5]:
doc_dist_socsci = restore('/data/datasets/WOS/US/lda_socsci_dist.p')
doc_dist_health = restore('/data/datasets/WOS/US/lda_health_dist.p')

In [6]:
socsci_papers = us_papers.loc[(us_papers.id_art.isin(df_socsci.id_art)),]
first_authors_socsci = socsci_papers[socsci_papers.ordre==1].copy()

doc_dist_socsci,_ = doc_dist_socsci.set_index('id_art').align(first_authors_socsci.set_index('id_art'),axis=0, join='right')
doc_dist_socsci = doc_dist_socsci.reset_index()

In [7]:
health_papers = us_papers.loc[(us_papers.id_art.isin(df_health.id_art)),]
first_authors_health = health_papers[health_papers.ordre==1].copy()

doc_dist_health,_ = doc_dist_health.set_index('id_art').align(first_authors_health.set_index('id_art'),axis=0, join='right')
doc_dist_health = doc_dist_health.reset_index()

In [8]:
address_clean = pd.read_pickle('/data/datasets/WOS/US/address_clean.p')

In [9]:
df = us_papers.merge(address_clean, how='inner', left_on=['cluster_ID','id_art'], right_on=['cluster_id', 'id_art'])

IOStream.flush timed out


In [10]:
def split_by_gender(df_race):
    df_race = df_race.copy()
    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'

    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'
    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0).sort_index()
    df_race = df_race.join(race_dist_MF,how='inner' )
    return df_race


In [11]:
def infer_race(us_papers,df_socsci, authors='first'):
    socsci_papers = us_papers.loc[(us_papers.id_art.isin(df_socsci.id_art)),]
    first_authors = socsci_papers[socsci_papers.ordre==1].copy().reset_index(drop=True)
    #first_authors = first_authors[['id_art','Prenom', 'nom']]
    lni = LastNamesInference(names = first_authors.nom)
    tqdm.pandas(desc="inferring race from lastnames")
    lastname_race_dist = first_authors.progress_apply(lambda x: lni.get_name_dist(lastname=x.nom), axis=1)
    first_authors[lni.prob_order] = pd.DataFrame(lastname_race_dist.to_list())
    #first_authors = first_authors[['id_art','white', 'hispanic', 'black', 'asian']]
    df_socsci_race = df_socsci.merge(first_authors, on ='id_art')

    return df_socsci_race

In [12]:
df_socsci_race = infer_race(us_papers,df_socsci)

imputing by the mean: 100% 238652/238652 [00:01<00:00, 230658.24it/s]


inferring race from lastnames:   0%|          | 0/238652 [00:00<?, ?it/s]

In [13]:
df_health_race = infer_race(us_papers,df_health)

imputing by the mean: 100% 123472/123472 [00:00<00:00, 249744.10it/s]


inferring race from lastnames:   0%|          | 0/123472 [00:00<?, ?it/s]

In [14]:
university_groups = ['r1', 'control', 'obereg', 'hbcu', 'hsi', 'msi', 'womens', 'selindex',
                    'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']
race_groups = ['white','hispanic','black','asian']
race_gender_groups = ['white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F','hispanic_F', 'black_F', 'asian_F']

In [15]:
def project_lda_topics(df_race,doc_dist):
    
    race_dist = df_race.filter(regex=('white|hispanic|black|asian'))
    topics_by_group = race_dist.T @ doc_dist 
    
    topics_by_group = topics_by_group.astype(np.float128)
    
    joint_prob = topics_by_group/topics_by_group.to_numpy().sum()
    marginal_by_topic = joint_prob.div(joint_prob.sum(axis=0), axis=1)
    marginal_by_group = joint_prob.div(joint_prob.sum(axis=1), axis=0)
#    dist_diff_topic = marginal_by_topic.subtract(joint_prob.sum(axis=1), axis=0)  # with the substraction, this gives "how many percentual points (more/less) than 
                                                                                    # expected they talk about this topic
    dist_diff_topic = marginal_by_topic.div(joint_prob.sum(axis=1), axis=0) -1   # with the ratio, this gives "how much % (more/less) than expected
                                                                                    # they talk about this topic
    joint_prob = joint_prob.T.rename_axis('topic').reset_index()
    marginal_by_topic = marginal_by_topic.T.rename_axis('topic').reset_index()
    marginal_by_group = marginal_by_group.T.rename_axis('topic').reset_index()
    dist_diff_topic = dist_diff_topic.T.rename_axis('topic').reset_index()
    
    # I start the topics in 1, so they are equal to the LDAVIZ!!!!
    joint_prob.topic += 1
    marginal_by_topic.topic += 1
    marginal_by_group.topic += 1
    dist_diff_topic.topic += 1    
    
    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

In [16]:
def intersection_rg(df_race):
    
    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'
    
    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'

    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0)
    return race_dist_MF

In [17]:
def intersect_by_gender(df_race, doc_dist):
    
    #race_dist = race_dist.merge(metadata_papers, how='left', left_index=True, right_on='id_art')
    #race_dist = first_or_so_authors(race_dist)

    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'
    
    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'

    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0)
    doc_dist_MF = np.concatenate((doc_dist[boolean_mask_M],doc_dist[boolean_mask_F]))

    joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic = project_lda_topics(race_dist_MF,doc_dist_MF)

    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

In [18]:
def get_rg_by_institution(df):
    prop_df_agg = pd.DataFrame()
    for ug in university_groups:
        _ = df.groupby(ug)[race_gender_groups].mean().reset_index()
        prop_df_agg = prop_df_agg.append(_)
    prop_df_agg = prop_df_agg.melt(id_vars=race_gender_groups,var_name='carnegie_group',value_name='carnegie_tag').dropna().reset_index(drop=True)
    return prop_df_agg

In [19]:
def get_expected_joint_prob(prop_df_agg,df_race, doc_dist,topics_col):
    joint_prob_gender, marginal_by_topic_gender, marginal_by_group_gender, dist_diff_topic_gender = intersect_by_gender(df_race, doc_dist.drop(columns='id_art'))

    expected_topics_dist_carnegie = prop_df_agg[race_gender_groups] @ joint_prob_gender[race_gender_groups].T
    expected_topics_dist_carnegie.columns = topics_col
    expected_dist = prop_df_agg.join(expected_topics_dist_carnegie).drop(columns=race_gender_groups)
    return expected_dist

In [20]:
def get_groups_sizes(df):
    groups_size = pd.DataFrame()
    for carnegie_group in university_groups:
        group_n = df.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
        groups_size = groups_size.append(group_n)
    groups_size = groups_size.melt(id_vars='n',var_name='carnegie_group',value_name='carnegie_tag').dropna().reset_index(drop=True)
    return groups_size

In [21]:
def marginal_by_topic_on_institutions(expected_dist,groups_size):
    # normalization so that for each topic we get the expected distribution between institutions (the marginal by topic)
    # Expand the joint prob to get the expected number of papers per institution group and topic. 
    #And get the marginal by topic over different institution groups of the same categorization
    expected_dist = expected_dist.melt(id_vars=['carnegie_group','carnegie_tag'],var_name='topic',value_name='expected_dist')
    expected_dist = expected_dist.merge(groups_size, on =['carnegie_group','carnegie_tag'])
    expected_dist['total_exp_papers'] = expected_dist.expected_dist* expected_dist.n

    carnegie_group_total = expected_dist.groupby(['carnegie_group','topic']).total_exp_papers.sum().reset_index().rename(columns={'total_exp_papers':'sum_group'})
    expected_dist = expected_dist.merge(carnegie_group_total, on = ['carnegie_group','topic'])
    expected_dist['expected_dist'] = expected_dist.total_exp_papers/expected_dist.sum_group
    expected_dist = expected_dist.drop(columns=['n','sum_group','total_exp_papers'])
    return expected_dist

## Social Sciences

In [22]:
socsci_papers = df_socsci_race.rename(columns={'cluster_ID':'cluster_id'}).merge(address_clean, on= ['id_art','cluster_id'])
socsci_papers[race_gender_groups] = intersection_rg(socsci_papers)
socsci_papers = socsci_papers.dropna(subset=race_gender_groups)


In [23]:
prop_df_agg = get_rg_by_institution(socsci_papers)
expected_dist= get_expected_joint_prob(prop_df_agg, df_socsci_race,doc_dist_socsci,['topic_'+ str(x) for x in range(1,301)])
groups_size = get_groups_sizes(socsci_papers)

expected_dist = marginal_by_topic_on_institutions(expected_dist,groups_size)

  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append

In [24]:
prop_df_agg

Unnamed: 0,white_M,hispanic_M,black_M,asian_M,white_F,hispanic_F,black_F,asian_F,carnegie_group,carnegie_tag
0,0.432051,0.035301,0.05424,0.097387,0.265898,0.02429,0.034888,0.055944,r1,R1
1,0.414148,0.032676,0.054675,0.090878,0.286855,0.024553,0.03695,0.059265,r1,not R1
2,0.357124,0.044534,0.062678,0.002331,0.366831,0.037158,0.026581,0.102763,control,private_fp
3,0.458442,0.037372,0.054889,0.08686,0.257868,0.022111,0.031769,0.050689,control,private_nfp
4,0.413554,0.033447,0.054069,0.100337,0.276573,0.025419,0.037061,0.05954,control,public
5,0.421586,0.040513,0.052223,0.092854,0.265725,0.029022,0.033908,0.064169,obereg,Far_West
6,0.427873,0.030555,0.050217,0.10032,0.273278,0.023639,0.033894,0.060225,obereg,Great_Lakes
7,0.420552,0.035603,0.050331,0.100343,0.272129,0.024245,0.033129,0.063667,obereg,Mid_East
8,0.460652,0.036469,0.054074,0.08021,0.264845,0.023131,0.034063,0.046557,obereg,New_England
9,0.383476,0.210907,0.070497,0.059609,0.148793,0.0906,0.014199,0.021919,obereg,Outlying_areas


In [25]:
expected_dist['topic'] = expected_dist.topic.str.lstrip('topic_').astype(int)

In [26]:
expected_dist

Unnamed: 0,carnegie_group,carnegie_tag,topic,expected_dist
0,r1,R1,1,0.780823
1,r1,not R1,1,0.219177
2,r1,R1,2,0.780899
3,r1,not R1,2,0.219101
4,r1,R1,3,0.780812
...,...,...,...,...
16795,avg_citations_Q10,"(1.59, 1.66)",300,0.095481
16796,avg_citations_Q10,"(1.66, 1.81)",300,0.080923
16797,avg_citations_Q10,"(1.82, 1.91)",300,0.065933
16798,avg_citations_Q10,"(1.91, 2.15)",300,0.078035


In [27]:
expected_dist.to_csv('../../results/institutions_topics/institutions_expected_dist_topics_socsci.csv',index=False)

## Health

In [28]:
health_papers = df_health_race.rename(columns={'cluster_ID':'cluster_id'}).merge(address_clean, on= ['id_art','cluster_id'])
health_papers[race_gender_groups] = intersection_rg(health_papers)
health_papers = health_papers.dropna(subset=race_gender_groups)


In [29]:
prop_df_agg = get_rg_by_institution(health_papers)
expected_dist= get_expected_joint_prob(prop_df_agg, df_health_race,doc_dist_health,['topic_'+ str(x) for x in range(1,201)])
groups_size = get_groups_sizes(health_papers)

expected_dist = marginal_by_topic_on_institutions(expected_dist,groups_size)

  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  prop_df_agg = prop_df_agg.append(_)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append(group_n)
  groups_size = groups_size.append

In [30]:
expected_dist['topic'] = expected_dist.topic.str.lstrip('topic_').astype(int)

In [31]:
expected_dist.to_csv('../../results/institutions_topics/institutions_expected_dist_topics_health.csv',index=False)