- topical profile of institutions
- topical profile of race&gender group
- cosine similarity between institutions and race and gender


Institutions profile:

2. R1 vs other
3. public, private control
4. Region
5. HBCU
7. HSI
8. MSI
9. Womens' college
10. Selectivity index


In [1]:
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import pickle
# import pickle5 as pickle
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from libs.LastNamesInference import LastNamesInference

In [3]:
df_socsci = pd.read_csv('/data/datasets//WOS/US/text_clean_socsci.txt')
us_papers = pd.read_pickle('/data/datasets//WOS/US/US_papers.p')
df_health = pd.read_csv('/data/datasets/WOS/US/text_clean_health.txt')

In [4]:
def save(x, file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    with open(file_name, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [5]:
doc_dist_socsci = restore('/data/datasets/WOS/US/lda_socsci_dist.p')
doc_dist_health = restore('/data/datasets/WOS/US/lda_health_dist.p')

In [6]:
address_clean = pd.read_pickle('/data/datasets/WOS/US/address_clean.p')

In [7]:
df = us_papers.merge(address_clean, how='inner', left_on=['cluster_ID','id_art'], right_on=['cluster_id', 'id_art'])

In [8]:
def split_by_gender(df_race):
    df_race = df_race.copy()
    df_race['gender'] = df_race.gender.str.upper()

    boolean_mask_M = df_race.gender == 'M'
    boolean_mask_F = df_race.gender == 'F'

    race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
    race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

    race_dist_M.columns = race_dist_M.columns + '_M'
    race_dist_F.columns = race_dist_F.columns + '_F'
    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0).sort_index()
    df_race = df_race.join(race_dist_MF,how='inner' )
    return df_race


In [9]:
def infer_race(us_papers, df=None):
    papers = us_papers.drop_duplicates('cluster_ID')
    lni = LastNamesInference(names = papers.nom)
    if df is None:
        df = us_papers.copy()
    tqdm.pandas(desc="inferring race from lastnames")
    lastname_race_dist = df.progress_apply(lambda x: lni.get_name_dist(lastname=x.nom), axis=1)
    df[lni.prob_order] = pd.DataFrame(lastname_race_dist.to_list(), index=df.index)
    df = split_by_gender(df)
    return df

In [10]:
def transform_data(df,dataset= 'socsci',n_components=300):
    
    lda_model = restore( '../../models/lda_model_{}_k{}.p'.format(dataset,n_components))
    vectorizer = restore( '../../models/vectorizer_{}.p'.format(dataset))

    texts = df.text_clean.values
    data_vectorized = vectorizer.transform(texts)
    doc_dist = lda_model.transform(data_vectorized)
    return doc_dist

In [11]:
df_race = infer_race(df)

imputing by the mean: 100% 3441264/3441264 [00:11<00:00, 291815.00it/s]


inferring race from lastnames:   0%|          | 0/16033434 [00:00<?, ?it/s]

In [12]:
df_race.columns

Index(['cluster_ID', 'Annee_Bibliographique', 'yfp', 'id_art', 'Prenom', 'nom',
       'ordre', 'nb_auteur', 'EDiscipline', 'ESpecialite', 'cit_rel_all_IAC',
       'ordre_auteur', 'Province', 'disc_origin', 'spec_origin',
       'count_origin', 'gender', 'cit_all_IAC', 'cluster_id', 'grid_id',
       'carnegie_id', 'carnegie_name', 'r1', 'control', 'obereg', 'hbcu',
       'tribal', 'hsi', 'msi', 'womens', 'selindex', 'n_papers_on_WOS',
       'ncitable_docs', 'Top10CR', 'Top5CR', 'Top1CR', 'Top10CR_Q', 'Top5CR_Q',
       'Top1CR_Q', 'npapers', 'impact', 'avg_citations', 'npapers_Q',
       'impact_Q', 'avg_citations_Q', 'usnr_rank', 'usnr_rank_cat',
       'avg_citations_Q10', 'white', 'hispanic', 'black', 'asian', 'white_M',
       'hispanic_M', 'black_M', 'asian_M', 'white_F', 'hispanic_F', 'black_F',
       'asian_F'],
      dtype='object')

# Social sciences

In [13]:
metadata_df = df_race[df_race.ordre==1]

metadata_df = df_socsci.merge(metadata_df,how='left', on= 'id_art')

In [14]:
metadata_df = metadata_df.merge(doc_dist_socsci, how='left',on='id_art').dropna()

In [15]:
topics_col = ['topic_'+ str(x) for x in range(1,301)]
groups = ['white_M','hispanic_M', 'black_M', 'asian_M', 'white_F', 'hispanic_F', 'black_F','asian_F']
carnegie_groups = ['r1', 'control', 'obereg', 'hbcu', 'hsi', 'msi', 'womens', 'selindex',
                  'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']

## topical profile by Carnegie groups

- Topical profile: Is the average distribution by topics given a Carnegie group

- Topical profile norm: Is the ratio between the topical profile of a group and the average distribution by topics (topics_size). ~It shows the topics where a group is publishing relatively more/less than average~ it is a ver size-dependent measure!

In [16]:
carnegie_topic_profile = pd.DataFrame()
for grp in carnegie_groups:
    topic_profile_grp = metadata_df.groupby(grp)[topics_col].mean()
    topic_profile_grp.insert(0, 'group', grp)
    carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)

  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_topic_profile = carnegie_topic_profile.append(topic_profile_grp)
  carnegie_t

In [17]:
topics_size = metadata_df[topics_col].mean()

In [18]:
#carnegie_topic_profile_norm = carnegie_topic_profile.iloc[: , 1:].divide(topics_size,axis=1)

## topical profile by race and gender

In [19]:
topics_by_group = metadata_df[groups].T @ metadata_df[topics_col]

group_topic_profile = topics_by_group.div(topics_by_group.sum(axis=1), axis=0)

In [20]:
group_topic_profile_norm = group_topic_profile.divide(topics_size,axis=1)

In [21]:
profile_similarity = pd.DataFrame(cosine_similarity(carnegie_topic_profile.iloc[: , 1:],group_topic_profile), columns=group_topic_profile.index, index =carnegie_topic_profile.iloc[: , 1:].index ) 

In [22]:
# normalize by on row level
# profile_similarity = profile_similarity.div(profile_similarity.sum(axis=1),axis=0)

In [23]:
# profile_similarity.insert(0, column='group',value=carnegie_topic_profile.iloc[: , :1].values)

In [24]:
profile_similarity.to_csv('../../results/institutions/carnegie_groups_profile_sim_sosci.csv', index_label= 'group_tag')

### institutional level similarity

In [25]:
institutions_profile = metadata_df.groupby('carnegie_id')[topics_col].mean()

In [26]:
institutions_profile_norm = institutions_profile.divide(topics_size,axis=1)

In [27]:
institutions_grp_similarity = pd.DataFrame(cosine_similarity(institutions_profile_norm,group_topic_profile_norm), columns=group_topic_profile_norm.index, index =institutions_profile_norm.index ) 

In [28]:
# normalize by on row level
institutions_grp_similarity = institutions_grp_similarity.div(institutions_grp_similarity.sum(axis=1),axis=0)

In [29]:
universities_carnegie = address_clean[['carnegie_id', 'carnegie_name', 'r1','control', 'obereg', 'hbcu', 'tribal', 'hsi', 'msi', 'womens','selindex',
                                      'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']].drop_duplicates('carnegie_id')

In [30]:
#some unis have multiple names, we unify
universities_carnegie2 = address_clean[['carnegie_id', 'carnegie_name', 'r1','control', 'obereg', 'hbcu', 'tribal', 'hsi', 'msi', 'womens','selindex',
                                       'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']].drop_duplicates()

universities_carnegie2 = universities_carnegie2.merge(universities_carnegie, indicator=True, how='outer')

universities_carnegie2 = universities_carnegie2[universities_carnegie2.carnegie_id.isin(universities_carnegie2[universities_carnegie2._merge=='left_only'].carnegie_id)]

shortest_names = universities_carnegie2.groupby('carnegie_id').carnegie_name.apply(lambda x:  min(x, key=len))
largest_names = universities_carnegie2.groupby('carnegie_id').carnegie_name.apply(lambda x:  max(x, key=len))

name_map = pd.DataFrame([shortest_names,largest_names]).T

name_map.columns =['short_name', 'large_name']

name_map = name_map.set_index('large_name')

universities_carnegie.loc[universities_carnegie.carnegie_name.isin(name_map.index),'carnegie_name'] = universities_carnegie.loc[universities_carnegie.carnegie_name.isin(name_map.index),'carnegie_name'].map(name_map.short_name)

In [31]:
institutions_grp_similarity = institutions_grp_similarity.merge(universities_carnegie, left_index=True, right_on='carnegie_id')

In [32]:
institutions_grp_similarity.to_csv('../../results/institutions/institutions_grp_similarity.csv', index=False)

# Health

In [33]:
metadata_df_health = df_race[df_race.ordre==1]
metadata_df_health = df_health.merge(metadata_df_health,how='left', on= 'id_art')

In [34]:
metadata_df_health = metadata_df_health.merge(doc_dist_health, how='left',on='id_art').dropna()

In [35]:
topics_col_health = ['topic_'+ str(x) for x in range(1,201)]
groups = ['white_M','hispanic_M', 'black_M', 'asian_M', 'white_F', 'hispanic_F', 'black_F','asian_F']
carnegie_groups = ['r1', 'control', 'obereg', 'hbcu', 'hsi', 'msi', 'womens', 'selindex',
                  'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']

## topical profile by Carnegie groups


In [36]:
carnegie_topic_profile_health = pd.DataFrame()
for grp in carnegie_groups:
    topic_profile_grp = metadata_df_health.groupby(grp)[topics_col_health].mean()
    topic_profile_grp.insert(0, 'group', grp)
    carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)

  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie_topic_profile_health = carnegie_topic_profile_health.append(topic_profile_grp)
  carnegie

In [37]:
topics_size_health = metadata_df_health[topics_col_health].mean()

## topical profile by race and gender

In [38]:
topics_by_group_health = metadata_df_health[groups].T @ metadata_df_health[topics_col_health]

group_topic_profile_health = topics_by_group_health.div(topics_by_group_health.sum(axis=1), axis=0)

In [39]:
group_topic_profile_norm_health = group_topic_profile_health.divide(topics_size_health,axis=1)

In [40]:
profile_similarity_health = pd.DataFrame(cosine_similarity(carnegie_topic_profile_health.iloc[: , 1:],group_topic_profile_health), columns=group_topic_profile_health.index, index =carnegie_topic_profile_health.iloc[: , 1:].index ) 

In [41]:
profile_similarity_health['group']=carnegie_topic_profile_health.group

In [42]:
profile_similarity_health.to_csv('../../results/institutions/carnegie_groups_profile_sim_health.csv', index_label= 'group_tag')

### institutional level similarity

In [43]:
institutions_profile_health = metadata_df_health.groupby('carnegie_id')[topics_col_health].mean()

In [44]:
institutions_profile_norm_health = institutions_profile_health.divide(topics_size_health,axis=1)

In [45]:
institutions_grp_similarity_health = pd.DataFrame(cosine_similarity(institutions_profile_norm_health,group_topic_profile_norm_health), columns=group_topic_profile_norm_health.index, index =institutions_profile_norm_health.index ) 

In [46]:
# normalize by on row level
institutions_grp_similarity_health = institutions_grp_similarity_health.div(institutions_grp_similarity_health.sum(axis=1),axis=0)

In [47]:
universities_carnegie = address_clean[['carnegie_id', 'carnegie_name', 'r1','control', 'obereg', 'hbcu', 'tribal', 'hsi', 'msi', 'womens','selindex',
                                      'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']].drop_duplicates('carnegie_id')

In [48]:
#some unis have multiple names, we unify
universities_carnegie2 = address_clean[['carnegie_id', 'carnegie_name', 'r1','control', 'obereg', 'hbcu', 'tribal', 'hsi', 'msi', 'womens','selindex',
                                       'usnr_rank_cat','Top10CR_Q','Top5CR_Q','Top1CR_Q','impact_Q','avg_citations_Q','avg_citations_Q10']].drop_duplicates()

universities_carnegie2 = universities_carnegie2.merge(universities_carnegie, indicator=True, how='outer')

universities_carnegie2 = universities_carnegie2[universities_carnegie2.carnegie_id.isin(universities_carnegie2[universities_carnegie2._merge=='left_only'].carnegie_id)]

shortest_names = universities_carnegie2.groupby('carnegie_id').carnegie_name.apply(lambda x:  min(x, key=len))
largest_names = universities_carnegie2.groupby('carnegie_id').carnegie_name.apply(lambda x:  max(x, key=len))

name_map = pd.DataFrame([shortest_names,largest_names]).T

name_map.columns =['short_name', 'large_name']

name_map = name_map.set_index('large_name')

universities_carnegie.loc[universities_carnegie.carnegie_name.isin(name_map.index),'carnegie_name'] = universities_carnegie.loc[universities_carnegie.carnegie_name.isin(name_map.index),'carnegie_name'].map(name_map.short_name)

In [49]:
institutions_grp_similarity_health = institutions_grp_similarity_health.merge(universities_carnegie, left_index=True, right_on='carnegie_id')

In [50]:
institutions_grp_similarity_health.to_csv('../../results/institutions/institutions_grp_similarity_health.csv', index=False)