- Top 10 topics by RG
- Distribution of those topics by institutions flags

Institutions profile:

2. R1 vs other
3. public, private control
4. Region
5. HBCU
7. HSI
8. MSI
9. Womens' college
10. Selectivity index

US Newsreport ranking: Top 10, top 100, other


In [1]:
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import pickle
# import pickle5 as pickle
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from libs.LastNamesInference import LastNamesInference

In [3]:
def save(x, file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    with open(file_name, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [4]:
df_socsci = pd.read_csv('/data/datasets//WOS/US/text_clean_socsci.txt')
us_papers = pd.read_pickle('/data/datasets//WOS/US/US_papers.p')
df_health = pd.read_csv('/data/datasets/WOS/US/text_clean_health.txt')

In [5]:
doc_dist_socsci = restore('/data/datasets/WOS/US/lda_socsci_dist.p')
doc_dist_health = restore('/data/datasets/WOS/US/lda_health_dist.p')

In [6]:
socsci_papers = us_papers.loc[(us_papers.id_art.isin(df_socsci.id_art)),]
first_authors_socsci = socsci_papers[socsci_papers.ordre==1].copy()

doc_dist_socsci,_ = doc_dist_socsci.set_index('id_art').align(first_authors_socsci.set_index('id_art'),axis=0, join='right')
doc_dist_socsci = doc_dist_socsci.reset_index()

In [7]:
health_papers = us_papers.loc[(us_papers.id_art.isin(df_health.id_art)),]
first_authors_health = health_papers[health_papers.ordre==1].copy()

doc_dist_health,_ = doc_dist_health.set_index('id_art').align(first_authors_health.set_index('id_art'),axis=0, join='right')
doc_dist_health = doc_dist_health.reset_index()

In [8]:
address_clean = pd.read_pickle('/data/datasets/WOS/US/address_clean.p')

In [9]:
df = us_papers.merge(address_clean, how='inner', left_on=['cluster_ID','id_art'], right_on=['cluster_id', 'id_art'])

In [10]:
def split_by_gender(df_race):
    df_race = df_race.copy()
    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'

    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'
    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0).sort_index()
    df_race = df_race.join(race_dist_MF,how='inner' )
    return df_race


In [11]:
def infer_race(us_papers,df_socsci, authors='first'):
    socsci_papers = us_papers.loc[(us_papers.id_art.isin(df_socsci.id_art)),]
    first_authors = socsci_papers[socsci_papers.ordre==1].copy().reset_index(drop=True)
    #first_authors = first_authors[['id_art','Prenom', 'nom']]
    lni = LastNamesInference(names = first_authors.nom)
    tqdm.pandas(desc="inferring race from lastnames")
    lastname_race_dist = first_authors.progress_apply(lambda x: lni.get_name_dist(lastname=x.nom), axis=1)
    first_authors[lni.prob_order] = pd.DataFrame(lastname_race_dist.to_list())
    #first_authors = first_authors[['id_art','white', 'hispanic', 'black', 'asian']]
    df_socsci_race = df_socsci.merge(first_authors, on ='id_art')

    return df_socsci_race

In [12]:
df_socsci_race = infer_race(us_papers,df_socsci)

imputing by the mean: 100% 238652/238652 [00:00<00:00, 264688.69it/s]


inferring race from lastnames:   0%|          | 0/238652 [00:00<?, ?it/s]

In [13]:
df_health_race = infer_race(us_papers,df_health)

imputing by the mean: 100% 123472/123472 [00:00<00:00, 260675.26it/s]


inferring race from lastnames:   0%|          | 0/123472 [00:00<?, ?it/s]

In [14]:
def project_lda_topics(df_race,doc_dist):
    
    race_dist = df_race.filter(regex=('white|hispanic|black|asian'))
    topics_by_group = race_dist.T @ doc_dist 
    
    topics_by_group = topics_by_group.astype(np.float128)
    
    joint_prob = topics_by_group/topics_by_group.to_numpy().sum()
    marginal_by_topic = joint_prob.div(joint_prob.sum(axis=0), axis=1)
    marginal_by_group = joint_prob.div(joint_prob.sum(axis=1), axis=0)
#    dist_diff_topic = marginal_by_topic.subtract(joint_prob.sum(axis=1), axis=0)  # with the substraction, this gives "how many percentual points (more/less) than 
                                                                                    # expected they talk about this topic
    dist_diff_topic = marginal_by_topic.div(joint_prob.sum(axis=1), axis=0) -1   # with the ratio, this gives "how much % (more/less) than expected
                                                                                    # they talk about this topic
    joint_prob = joint_prob.T.rename_axis('topic').reset_index()
    marginal_by_topic = marginal_by_topic.T.rename_axis('topic').reset_index()
    marginal_by_group = marginal_by_group.T.rename_axis('topic').reset_index()
    dist_diff_topic = dist_diff_topic.T.rename_axis('topic').reset_index()
    
    # I start the topics in 1, so they are equal to the LDAVIZ!!!!
    joint_prob.topic += 1
    marginal_by_topic.topic += 1
    marginal_by_group.topic += 1
    dist_diff_topic.topic += 1    
    
    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

In [15]:
def intersect_by_gender(df_race, doc_dist):
    
    #race_dist = race_dist.merge(metadata_papers, how='left', left_index=True, right_on='id_art')
    #race_dist = first_or_so_authors(race_dist)

    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'
    
    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'

    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0)
    doc_dist_MF = np.concatenate((doc_dist[boolean_mask_M],doc_dist[boolean_mask_F]))

    joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic = project_lda_topics(race_dist_MF,doc_dist_MF)

    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

In [16]:
def group_stats(group, res = 'dist_diff_topic_gender'):
    df_race_group = group[['gender']+ race_groups].copy()
    doc_dist_group = group[topics_col].copy()
    
    joint_prob_gender, marginal_by_topic_gender, marginal_by_group_gender, dist_diff_topic_gender = intersect_by_gender(df_race_group, doc_dist_group)
    
    results_dict = {'joint_prob_gender':joint_prob_gender, 'marginal_by_topic_gender':marginal_by_topic_gender, 
                    'marginal_by_group_gender':marginal_by_group_gender, 'dist_diff_topic_gender':dist_diff_topic_gender}
    
    return results_dict[res]

In [17]:
def group_dist_diff(group):
    return group_stats(group, 'dist_diff_topic_gender')

In [18]:
def group_joint_prop(group):
    return group_stats(group, 'joint_prob_gender')

In [19]:
def get_marginal_by_topic_gender(group):
    return group_stats(group, 'marginal_by_topic_gender')

In [20]:
def get_marginal_by_group_gender(group):
    return group_stats(group, 'marginal_by_group_gender')

In [21]:
university_groups = ['r1', 'control', 'obereg', 'hbcu', 'hsi', 'msi', 'womens', 'selindex',
                    'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']
race_groups = ['white','hispanic','black','asian']
race_gender_groups = ['white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F','hispanic_F', 'black_F', 'asian_F']

# Social sciences

In [22]:
# profile_similarity = pd.DataFrame(cosine_similarity(carnegie_topic_profile.iloc[: , 1:],group_topic_profile), columns=group_topic_profile.index, index =carnegie_topic_profile.iloc[: , 1:].index ) 

# profile_similarity.insert(0, column='group',value=carnegie_topic_profile.iloc[: , :1].values)

# profile_similarity.to_csv('../../results/institutions/carnegie_groups_profile_sim_sosci.csv', index_label= 'group_tag')

## Topics by rg on institutions 

In [23]:
doc_dist_socsci.shape

(238652, 301)

In [24]:
df_socsci_race.shape

(238652, 23)

In [25]:
df_race = df_socsci_race

In [26]:
doc_dist =doc_dist_socsci

In [27]:
joint_prob_gender, marginal_by_topic_gender, marginal_by_group_gender, dist_diff_topic_gender = intersect_by_gender(df_socsci_race, doc_dist_socsci)

# dist_diff_topic for each Carnegie group

In [28]:
topics_col = ['topic_'+ str(x) for x in range(1,301)]

In [29]:
df_merge = df_socsci_race.merge(address_clean, left_on=['id_art','cluster_ID'], right_on= ['id_art','cluster_id'])
df_merge= df_merge.merge(doc_dist_socsci,on='id_art')


In [30]:
groups_dist_diff = pd.DataFrame()
for carnegie_group in university_groups:
    agg_res = df_merge.groupby(carnegie_group).apply(group_dist_diff).reset_index(level=carnegie_group).reset_index(drop=True)
    groups_dist_diff = groups_dist_diff.append(agg_res)


  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)
  groups_dist_diff = groups_dist_diff.append(agg_res)


In [31]:
groups_dist_diff = groups_dist_diff.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [32]:
groups_dist_diff.to_csv('../../results/institutions_topics/rg_carnegie_topic_dist.csv',index=False)

## Carnegie groups proportion in topics

In [33]:
university_groups_prop = pd.DataFrame()
for carnegie_group in university_groups:
    df_group = df_merge.groupby(carnegie_group)[topics_col].sum()
    university_groups_prop_topic = df_group.div(df_group.sum(axis=0), axis=1).reset_index(level=carnegie_group)
    university_groups_prop = university_groups_prop.append(university_groups_prop_topic)


  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_groups_prop.append(university_groups_prop_topic)
  university_groups_prop = university_group

In [34]:
university_groups_prop = university_groups_prop.melt(id_vars=topics_col,var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [35]:
university_groups_prop.to_csv('../../results/institutions_topics/topic_prop_by_university_groups.csv', index=False)

# Howard Harvard effect


harvard: 166027
Howard: 131520

In [36]:
hh_df = df_merge[df_merge.carnegie_id.isin(['166027','131520'])]

In [37]:
hh_df.carnegie_name

62        Harvard University
80        Harvard University
101       Harvard University
203       Harvard University
204       Harvard University
                 ...        
206738    Harvard University
206883    Harvard University
207239    Harvard University
207244    Harvard University
207319    Harvard University
Name: carnegie_name, Length: 3599, dtype: object

In [38]:
df_hh = hh_df.groupby('carnegie_name')[topics_col].sum()

In [39]:
hh_topic_prop = df_hh.div(df_merge[topics_col].sum(), axis=1).reset_index()

In [40]:
hh_topic_prop.to_csv('../../results/institutions_topics/hh_topic_prop.csv', index=False)

## Topic proportion by race, gender and institution

Note: We change the joint probability to the margial probability by topic, on race, gender and institution group, on EACH institution classification. 

In this way, we have the answer to 'which proportion of the papers in topic X are from [race] [gender] and [institution type within a specific classification]

In [41]:
# groups_prop = pd.DataFrame()
# for carnegie_group in university_groups:
#     agg_res = df_merge.groupby(carnegie_group).apply(marginal_by_topic_gender).reset_index(level=carnegie_group).reset_index(drop=True)
#     groups_prop = groups_prop.append(agg_res)


# topics_rg_institutions_prop = groups_prop.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
#        'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [42]:
marginal_by_group_gender_socsci = pd.DataFrame()
for carnegie_group in tqdm(university_groups):
    group_dist = df_merge.groupby(carnegie_group).apply(get_marginal_by_group_gender).reset_index(level=carnegie_group).reset_index(drop=True)
    # group_n = df_merge.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
    # group_dist = group_dist.merge(group_n, on = carnegie_group)
    # group_dist[race_gender_groups] = group_dist[race_gender_groups].multiply(group_dist.n,axis=0)/group_n.n.sum()
    marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)


  0%|          | 0/15 [00:00<?, ?it/s]

  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.append(group_dist)
  marginal_by_group_gender_socsci = margina

In [43]:
marginal_by_group_gender_socsci = marginal_by_group_gender_socsci.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [44]:
marginal_by_group_gender_socsci.to_csv('../../results/institutions_topics/marginal_by_group_gender_socsci.csv', index=False)

In [45]:
marginal_by_topic_gender_socsci = pd.DataFrame()
for carnegie_group in tqdm(university_groups):
    group_dist = df_merge.groupby(carnegie_group).apply(get_marginal_by_topic_gender).reset_index(level=carnegie_group).reset_index(drop=True)
    # group_n = df_merge.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
    # group_dist = group_dist.merge(group_n, on = carnegie_group)
    # group_dist[race_gender_groups] = group_dist[race_gender_groups].multiply(group_dist.n,axis=0)/group_n.n.sum()
    marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)


  0%|          | 0/15 [00:00<?, ?it/s]

  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.append(group_dist)
  marginal_by_topic_gender_socsci = margina

In [46]:
marginal_by_topic_gender_socsci = marginal_by_topic_gender_socsci.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [47]:
marginal_by_topic_gender_socsci.to_csv('../../results/institutions_topics/marginal_by_topic_gender_socsci.csv', index=False)

In [48]:
marginal_by_topic_gender_socsci[marginal_by_topic_gender_socsci.carnegie_tag=='top_10']

Unnamed: 0,topic,white_M,hispanic_M,black_M,asian_M,white_F,hispanic_F,black_F,asian_F,carnegie_group,carnegie_tag
142800,1,0.290296,0.025831,0.034135,0.044330,0.452329,0.034022,0.059236,0.059821,usnr_rank_cat,top_10
142801,2,0.500576,0.043301,0.053817,0.093292,0.210019,0.018688,0.024670,0.055636,usnr_rank_cat,top_10
142802,3,0.458450,0.036911,0.050724,0.094212,0.255823,0.021745,0.027893,0.054241,usnr_rank_cat,top_10
142803,4,0.531305,0.049091,0.064924,0.092866,0.179512,0.012389,0.023843,0.046069,usnr_rank_cat,top_10
142804,5,0.506408,0.045949,0.054823,0.104826,0.208693,0.017039,0.024046,0.038217,usnr_rank_cat,top_10
...,...,...,...,...,...,...,...,...,...,...,...
143095,296,0.509969,0.043898,0.057284,0.108290,0.190035,0.016479,0.024179,0.049867,usnr_rank_cat,top_10
143096,297,0.517268,0.041742,0.062165,0.115103,0.168970,0.012091,0.019562,0.063098,usnr_rank_cat,top_10
143097,298,0.446272,0.036973,0.051569,0.125083,0.233238,0.017598,0.026641,0.062626,usnr_rank_cat,top_10
143098,299,0.471132,0.039686,0.052314,0.087459,0.251095,0.024037,0.030815,0.043461,usnr_rank_cat,top_10


# Health

## Topics by rg on institutions 

In [49]:
doc_dist_health.shape

(123472, 201)

In [50]:
df_health_race.shape

(123472, 23)

In [51]:
df_race = df_health_race

In [52]:
doc_dist = doc_dist_health

In [53]:
joint_prob_gender, marginal_by_topic_gender, marginal_by_group_gender, dist_diff_topic_gender = intersect_by_gender(df_health_race, doc_dist_health)

# dist_diff_topic for each Carnegie group

In [54]:
university_groups = ['r1', 'control', 'obereg', 'hbcu', 'hsi', 'msi', 'womens', 'selindex',
                    'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']
topics_col = ['topic_'+ str(x) for x in range(1,201)]
race_groups = ['white','hispanic','black','asian']


In [55]:
df_merge_health = df_health_race.merge(address_clean, left_on=['id_art','cluster_ID'], right_on= ['id_art','cluster_id'])
df_merge_health= df_merge_health.merge(doc_dist_health,on='id_art')


In [56]:
groups_dist_diff_health = pd.DataFrame()
for carnegie_group in university_groups:
    agg_res = df_merge_health.groupby(carnegie_group).apply(group_dist_diff).reset_index(level=carnegie_group).reset_index(drop=True)
    groups_dist_diff_health = groups_dist_diff_health.append(agg_res)


  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_health.append(agg_res)
  groups_dist_diff_health = groups_dist_diff_hea

In [57]:
groups_dist_diff_health = groups_dist_diff_health.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [58]:
groups_dist_diff_health.to_csv('../../results/institutions_topics/rg_carnegie_topic_dist_health.csv',index=False)

## Carnegie groups proportion in topics

In [59]:
university_groups_prop_health = pd.DataFrame()
for carnegie_group in university_groups:
    df_group = df_merge_health.groupby(carnegie_group)[topics_col].sum()
    university_groups_prop_topic = df_group.div(df_group.sum(axis=0), axis=1).reset_index(level=carnegie_group)
    university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)


  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_prop_topic)
  university_groups_prop_health = university_groups_prop_health.append(university_groups_pr

In [60]:
university_groups_prop_health = university_groups_prop_health.melt(id_vars=topics_col,var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [61]:
university_groups_prop_health.to_csv('../../results/institutions_topics/topic_prop_by_university_groups_health.csv', index=False)

## Topic proportion by race, gender and institution

In [62]:
# groups_prop_health = pd.DataFrame()
# for carnegie_group in university_groups:
#     agg_res = df_merge_health.groupby(carnegie_group).apply(marginal_by_topic_gender).reset_index(level=carnegie_group).reset_index(drop=True)
#     groups_prop_health = groups_prop_health.append(agg_res)


# topics_rg_institutions_prop_health = groups_prop_health.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
#        'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [63]:
marginal_by_group_gender_health = pd.DataFrame()
for carnegie_group in tqdm(university_groups):
    group_dist = df_merge_health.groupby(carnegie_group).apply(get_marginal_by_group_gender).reset_index(level=carnegie_group).reset_index(drop=True)
    # group_n = df_merge.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
    # group_dist = group_dist.merge(group_n, on = carnegie_group)
    # group_dist[race_gender_groups] = group_dist[race_gender_groups].multiply(group_dist.n,axis=0)/group_n.n.sum()
    marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)


  0%|          | 0/15 [00:00<?, ?it/s]

  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = marginal_by_group_gender_health.append(group_dist)
  marginal_by_group_gender_health = margina

In [64]:
marginal_by_group_gender_health = marginal_by_group_gender_health.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [65]:
marginal_by_group_gender_health.to_csv('../../results/institutions_topics/marginal_by_group_gender_health.csv', index=False)

In [66]:
marginal_by_topic_gender_health = pd.DataFrame()
for carnegie_group in tqdm(university_groups):
    group_dist = df_merge_health.groupby(carnegie_group).apply(get_marginal_by_topic_gender).reset_index(level=carnegie_group).reset_index(drop=True)
    # group_n = df_merge.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
    # group_dist = group_dist.merge(group_n, on = carnegie_group)
    # group_dist[race_gender_groups] = group_dist[race_gender_groups].multiply(group_dist.n,axis=0)/group_n.n.sum()
    marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)


  0%|          | 0/15 [00:00<?, ?it/s]

  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = marginal_by_topic_gender_health.append(group_dist)
  marginal_by_topic_gender_health = margina

In [67]:
marginal_by_topic_gender_health = marginal_by_topic_gender_health.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
       'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

In [68]:
marginal_by_topic_gender_health.to_csv('../../results/institutions_topics/marginal_by_topic_gender_health.csv', index=False)

In [69]:
# groups_join_prop_health = pd.DataFrame()
# for carnegie_group in tqdm(university_groups):
#     group_dist = df_merge_health.groupby(carnegie_group).apply(get_marginal_by_topic_gender).reset_index(level=carnegie_group).reset_index(drop=True)
#     group_n = df_merge.groupby(carnegie_group).id_art.count().reset_index(level=carnegie_group).reset_index(drop=True).rename(columns={'id_art':'n'})
#     group_dist = group_dist.merge(group_n, on = carnegie_group)
#     group_dist[race_gender_groups] = group_dist[race_gender_groups].multiply(group_dist.n,axis=0)/group_n.n.sum()
#     groups_join_prop_health = groups_join_prop_health.append(group_dist)


# groups_join_prop_health = groups_join_prop_health.melt(['topic', 'white_M', 'hispanic_M', 'black_M', 'asian_M', 'white_F',
#        'hispanic_F', 'black_F', 'asian_F'],var_name='carnegie_group',value_name='carnegie_tag').dropna()

# groups_join_prop_health.to_csv('../../results/institutions_topics/topics_rg_institutions_prop_health.csv', index=False)