In [1]:
import string
import pandas as pd
import pyLDAvis.gensim
import pyLDAvis.sklearn
import re
import pickle
from tqdm.notebook import tqdm
import unicodedata2
import numpy as np
import random
import os

from libs.lda_wrapper import LDA_wrapper

#from libs.RaceDistribution import RaceDistribution
from libs.LastNamesInference import LastNamesInference

In [2]:
import warnings
# Hide DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Register tqdm's pandas integration; `progress_apply` is used later in `infer_race`.
tqdm.pandas()

  and should_run_async(code)


## Load data

In [3]:
#df_full = pd.read_csv('/data/WOS/US/text_clean.txt')
# Cleaned texts of the social-science papers; the `id_art` and `text_clean` columns are used below.
df_socsci = pd.read_csv('/data/datasets//WOS/US/text_clean_socsci.txt')
# Author-level records for US papers; `infer_race` reads the `id_art`, `ordre` and `nom` columns.
us_papers = pd.read_csv('/data/datasets/WOS/US/US_papers.txt')

In [4]:
df_socsci.id_art.to_csv('../data/id_art_socsci.txt',index=False)

In [5]:
# # remove previous run

# os.unlink('../results/lda_model_socsci_k300.p')
# os.unlink('../results/lda_model_health_k200.p')

## utils

In [6]:
def save(x, file_name):
    """Serialize ``x`` to ``file_name`` with pickle.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is written with the highest pickle protocol available.
    """
    with open(file_name, mode='wb') as out_file:
        pickle.dump(x, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    """Load and return the object pickled in ``file_name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project (e.g. by ``save``).
    """
    with open(file_name, mode='rb') as in_file:
        return pickle.load(in_file)

In [7]:
def infer_race(us_papers,df_socsci, authors='first'):
    """Attach last-name-based race probabilities of the ``ordre == 1`` author to each paper.

    Parameters
    ----------
    us_papers : DataFrame of author records; the columns ``id_art``, ``ordre``
        and ``nom`` are read here.
    df_socsci : DataFrame of papers with an ``id_art`` column (despite the name,
        it is also called with the health corpus).
    authors : currently unused; only authors with ``ordre == 1`` are considered.

    Returns
    -------
    DataFrame: ``df_socsci`` inner-merged on ``id_art`` with the selected author
    rows, which carry one extra column per entry of ``lni.prob_order``.
    """
    # Restrict the author table to papers that are part of the given corpus.
    in_corpus = us_papers.id_art.isin(df_socsci.id_art)
    corpus_authors = us_papers.loc[(in_corpus),]

    # Keep authors with ordre == 1 (presumably the first author -- confirm against the data dictionary).
    # The index is reset so that the positional frame built below lines up row by row.
    is_first = corpus_authors.ordre==1
    first_authors = corpus_authors[is_first].copy().reset_index(drop=True)

    lni = LastNamesInference(names = first_authors.nom)
    tqdm.pandas(desc="inferring race from lastnames")
    per_author_dist = first_authors.progress_apply(
        lambda row: lni.get_name_dist(lastname=row.nom), axis=1
    )
    # One new column per entry of lni.prob_order, filled from the per-author results.
    first_authors[lni.prob_order] = pd.DataFrame(per_author_dist.to_list())

    return df_socsci.merge(first_authors, on ='id_art')

In [8]:
def fit_model(df,dataset= 'full',n_batches=100,max_iter = 10,n_components=100):
    """Fit an LDA model on ``df['text_clean']`` or reload it from the on-disk cache.

    The cache key is the model file ``../results/lda_model_{dataset}_k{n_components}.p``:
    when it does not exist the model is fitted and everything is pickled,
    otherwise the pickled artifacts are loaded.

    Parameters
    ----------
    df : DataFrame with a ``text_clean`` column.
    dataset : label used in the cache file names.
    n_batches : forwarded to ``LDA_wrapper``.
    max_iter, n_components : forwarded to ``LDA_wrapper.lda``.

    Returns
    -------
    (lda_model, data_vectorized, vectorizer)

    Notes
    -----
    The vectorized data used to be cached at ``tmp/data_vectorized.p`` for every
    dataset, so fitting a second dataset overwrote the first one's matrix and a
    cached re-run returned the wrong data. The cache file is now dataset specific;
    when it is missing (e.g. caches written by the old code) the matrix is rebuilt
    with the cached vectorizer instead of reading the shared legacy file.
    """
    model_path = '../results/lda_model_{}_k{}.p'.format(dataset,n_components)
    vectorizer_path = '../results/vectorizer_{}.p'.format(dataset)
    data_path = 'tmp/data_vectorized_{}.p'.format(dataset)

    LDA = LDA_wrapper(n_batches=n_batches)
    if not os.path.exists(model_path):
        texts = df['text_clean'].values
        lda_model,data_vectorized,vectorizer = LDA.lda(data=texts,max_iter = max_iter,n_components=n_components)

        # Create the output folders first so a long fit is not lost to a missing directory.
        for path in (model_path, vectorizer_path, data_path):
            os.makedirs(os.path.dirname(path), exist_ok=True)
        save(lda_model, model_path)
        save(vectorizer, vectorizer_path)
        save(data_vectorized, data_path)
    else:
        lda_model = restore(model_path)
        vectorizer = restore(vectorizer_path)
        if os.path.exists(data_path):
            data_vectorized = restore(data_path)
        else:
            # No dataset-specific cache yet: rebuild the document-term matrix from the
            # cached vectorizer (same call as in `transform_data`) and cache it.
            # Assumes LDA.lda vectorizes with this same fitted vectorizer -- TODO confirm.
            data_vectorized = vectorizer.transform(df['text_clean'].values)
            os.makedirs(os.path.dirname(data_path), exist_ok=True)
            save(data_vectorized, data_path)
    return lda_model,data_vectorized,vectorizer

In [9]:
def transform_data(df,lda_model,vectorizer,dataset= 'socsci',n_batches=500,n_components=100,method='mmds'):
    '''
    Compute the document-topic distribution of ``df.text_clean`` and build the pyLDAvis view.

    The texts are vectorized with ``vectorizer`` and projected with ``lda_model``;
    the pyLDAvis visualization is saved to
    ``../results/lda_tsne_{dataset}_k{n_components}.html`` (the file name says
    "tsne" regardless of ``method``).

    method: mmds, tsne, PcoA
    n_batches: not used by this function.

    Returns (vis, doc_dist): the prepared pyLDAvis object and the output of
    ``lda_model.transform``.
    '''
    doc_term = vectorizer.transform(df.text_clean.values)
    doc_dist = lda_model.transform(doc_term)

    pyLDAvis.enable_notebook()
    html_path = '../results/lda_tsne_{}_k{}.html'.format(dataset,n_components)
    # sort_topics=False keeps pyLDAvis topic numbers in model order.
    vis = pyLDAvis.sklearn.prepare(lda_model, doc_term, vectorizer, mds=method,sort_topics=False )
    pyLDAvis.save_html(vis, html_path)

    return vis, doc_dist

In [10]:
def project_lda_topics(df_race,doc_dist):
    """Project per-document group probabilities onto per-document topic weights.

    Parameters
    ----------
    df_race : DataFrame with one row per document; every column whose name
        contains ``white``, ``hispanic``, ``black`` or ``asian`` is used as a group
        weight (this also matches suffixed names such as ``white_M``).
    doc_dist : array with one row per document (same order as ``df_race``) and
        one column per topic.

    Returns
    -------
    (joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic)
        Four DataFrames with one row per topic, a 1-based ``topic`` column and
        one column per group:

        * ``joint_prob``: group-by-topic mass divided by the grand total.
        * ``marginal_by_topic``: ``joint_prob`` normalized within each topic
          (group shares of a topic sum to 1).
        * ``marginal_by_group``: ``joint_prob`` normalized within each group
          (topic shares of a group sum to 1).
        * ``dist_diff_topic``: ``marginal_by_topic`` divided by the group's overall
          share, minus 1, i.e. how much (in relative terms) more or less than
          expected the group is represented in the topic.
    """
    race_dist = df_race.filter(regex=('white|hispanic|black|asian'))
    topics_by_group = race_dist.T @ doc_dist

    # Extended precision. np.longdouble is used instead of np.float128 because the
    # latter does not exist on every platform (e.g. Windows); where float128
    # exists the two names refer to the same type.
    topics_by_group = topics_by_group.astype(np.longdouble)

    joint_prob = topics_by_group/topics_by_group.to_numpy().sum()
    topic_totals = joint_prob.sum(axis=0)   # total mass of each topic
    group_totals = joint_prob.sum(axis=1)   # overall share of each group

    marginal_by_topic = joint_prob.div(topic_totals, axis=1)
    marginal_by_group = joint_prob.div(group_totals, axis=0)
    # Alternative (subtraction): marginal_by_topic.subtract(group_totals, axis=0) gives
    # "how many percentage points more/less than expected they talk about this topic".
    # With the ratio used here, the result is "how much % more/less than expected
    # they talk about this topic".
    dist_diff_topic = marginal_by_topic.div(group_totals, axis=0) -1

    def _one_row_per_topic(table):
        # Topics become rows; ids start at 1 so they match the pyLDAvis numbering.
        out = table.T.rename_axis('topic').reset_index()
        out['topic'] += 1
        return out

    joint_prob = _one_row_per_topic(joint_prob)
    marginal_by_topic = _one_row_per_topic(marginal_by_topic)
    marginal_by_group = _one_row_per_topic(marginal_by_group)
    dist_diff_topic = _one_row_per_topic(dist_diff_topic)

    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

In [11]:
def intersect_by_gender(df_race, doc_dist):
    """Run ``project_lda_topics`` on race-by-gender groups.

    Documents whose ``gender`` (case-insensitive) is ``M`` or ``F`` are kept; all
    other documents are dropped. The four race columns are renamed with a
    ``_M`` / ``_F`` suffix, so a document contributes only to the groups of its
    own gender (the columns of the other gender are filled with 0).

    Parameters
    ----------
    df_race : DataFrame with ``gender``, ``white``, ``hispanic``, ``black`` and
        ``asian`` columns, one row per document.
    doc_dist : array of topic weights with one row per row of ``df_race``.

    Returns
    -------
    The four tables returned by ``project_lda_topics``, with the eight
    race-by-gender groups as columns.

    Notes
    -----
    The gender values are normalized in a local Series; the caller's DataFrame
    is no longer modified in place.
    """
    race_cols = ['white','hispanic','black','asian']

    gender = df_race.gender.str.upper()
    # Plain boolean arrays: missing genders compare as False and the same mask can
    # index both the DataFrame rows and the rows of `doc_dist`.
    mask_M = gender.eq('M').fillna(False).to_numpy(dtype=bool)
    mask_F = gender.eq('F').fillna(False).to_numpy(dtype=bool)

    race_dist_M = df_race.loc[mask_M, race_cols].add_suffix('_M')
    race_dist_F = df_race.loc[mask_F, race_cols].add_suffix('_F')

    # Stack M rows then F rows; the topic weights are stacked in the same order.
    race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0)
    doc_dist_MF = np.concatenate((doc_dist[mask_M],doc_dist[mask_F]))

    joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic = project_lda_topics(race_dist_MF,doc_dist_MF)

    return joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic

## race by paper

In [12]:
df_socsci_race = infer_race(us_papers,df_socsci)

imputing by the mean: 100%|██████████| 238652/238652 [00:01<00:00, 228027.73it/s]


HBox(children=(HTML(value='inferring race from lastnames'), FloatProgress(value=0.0, max=238652.0), HTML(value…




# Fit and transform models

In [13]:
#lda_model,data_vectorized,vectorizer = fit_model(df_socsci,dataset= 'socsci',n_batches=100,max_iter = 10,n_components=200)

In [14]:
#lda_model = restore('../results/lda_model_socsci_k200.p')
#data_vectorized = restore('tmp/data_vectorized.p')
#vectorizer = restore('../results/vectorizer_socsci.p')


In [15]:
lda_model,data_vectorized,vectorizer = fit_model(df_socsci,dataset= 'socsci',n_batches=100,max_iter = 10,n_components=300)



In [16]:
lda_model

LatentDirichletAllocation(learning_method='online', n_components=300, n_jobs=-1,
                          random_state=1234, verbose=1)

In [17]:
# we need to adjust everything below!

In [18]:
p, doc_dist_socsci_300 = transform_data(df_socsci_race,lda_model,vectorizer,dataset= 'socsci',n_batches=100,n_components=300, method='mmds')

In [19]:
p

## top words by topic

In [20]:
LDA =LDA_wrapper()

In [21]:
words_by_topic = LDA.topic_keyowrd_matrix(lda_model,vectorizer)

In [22]:
def top_words(topic, n=5):
    """Return the ``n`` highest-valued labels of ``topic`` as a comma-separated string.

    Parameters
    ----------
    topic : pandas Series whose index holds the words (strings) and whose values
        are the word weights for one topic.
    n : number of words to return (default 5, the value that was hard-coded before).

    Returns
    -------
    str: the words ordered from highest to lowest weight, joined with ``', '``.
    """
    # Positions of the n largest values, largest first. The index is addressed
    # positionally on purpose: `topic[int_array]` on a string-labelled Series relies
    # on a deprecated positional fallback of Series.__getitem__.
    order = topic.values.argsort()[:-n - 1:-1]
    return ', '.join(topic.index[order].to_list())

In [23]:
tws = words_by_topic.apply(lambda row: top_words(row),axis=1)

In [24]:
top_words_df = pd.DataFrame(tws.values, columns=['top_words'])

# 1-based topic ids (same numbering as the other exported tables). They are derived
# from the number of rows instead of a hard-coded 300, so the cell keeps working
# when the number of topics changes.
top_words_df['topic'] = range(1, len(top_words_df) + 1)

In [25]:
top_words_df

Unnamed: 0,top_words,topic
0,"women, violence, gender, sexual, men",1
1,"local, sources, industry, matter, levels",2
2,"biomass, policy preferences, arms, new perspec...",3
3,"points, random, percentage, family business, p...",4
4,"change, policies, constraints, inventory, vari...",5
...,...,...
295,"data, scale, using, size, large",296
296,"product, consumer, consumers, brand, products",297
297,"medical, clinical, patients, patient, art",298
298,"opportunity, respect, repression, hidden, qual...",299


In [26]:
top_words_df.to_csv('../results/top_words_300.csv', index=False)

# projection of topics

In [27]:
# joint_prob, marginal_by_topic, marginal_by_group, dist_diff_topic = project_lda_topics(df_socsci_race,doc_dist_socsci_300)

In [28]:
# joint_prob.to_csv('../results/joint_prob_300.csv', index=False)
# marginal_by_topic.to_csv('../results/marginal_by_topic_300.csv', index=False)
# marginal_by_group.to_csv('../results/marginal_by_group_300.csv', index=False)
# dist_diff_topic.to_csv('../results/dist_diff_topic_300.csv', index=False)

In [29]:
joint_prob_gender, marginal_by_topic_gender, marginal_by_group_gender, dist_diff_topic_gender = intersect_by_gender(df_socsci_race, doc_dist_socsci_300)

In [30]:
joint_prob_gender.to_csv('../results/joint_prob_gender_300.csv',index=False)
marginal_by_topic_gender.to_csv('../results/marginal_by_topic_gender_300.csv',index=False)
marginal_by_group_gender.to_csv('../results/marginal_by_group_gender_300.csv',index=False)
dist_diff_topic_gender.to_csv('../results/dist_diff_topic_gender_300.csv',index=False)

# topics_by_group_prob_MF.to_csv('../results/topics_by_group_prob_gender_200.csv', index=False) 
# topics_by_group_ratio_MF.to_csv('../results/topics_by_group_ratio_gender_200.csv', index=False)


In [31]:
# The topic proportions are computed over all documents, including articles without
# gender, which is why they differ from the gender-based tables.
topic_proportion = pd.DataFrame(
    {'proportion': doc_dist_socsci_300.sum(axis=0)/np.sum(doc_dist_socsci_300)}
)

# 1-based topic ids, consistent with the other exported tables.
topic_proportion['topic'] = topic_proportion.index +1

In [32]:
topic_proportion.to_csv('../results/topic_proportion_300.csv',index=False)

## Health

In [33]:
df_health = pd.read_csv('/data/datasets/WOS/US/text_clean_health.txt')


In [34]:
df_health_race = infer_race(us_papers,df_health)

imputing by the mean: 100%|██████████| 123472/123472 [00:00<00:00, 222495.65it/s]


HBox(children=(HTML(value='inferring race from lastnames'), FloatProgress(value=0.0, max=123472.0), HTML(value…




In [35]:
lda_model_health,data_vec_health,vectorizer_health = fit_model(df_health,dataset= 'health',n_batches=50,max_iter = 10,n_components=200)



In [36]:
# we need to adjust everything below!

In [37]:
p, doc_dist_health_200 = transform_data(df_health_race,lda_model_health,vectorizer_health,dataset= 'health',n_batches=100,n_components=200, method='mmds')

In [38]:
p

### Manual exploration (MSM topic?)

In [40]:
titles = pd.read_csv('/data/datasets/WOS/US/titles.txt', delimiter='\t')

In [41]:
# Ten health papers with the highest weight on column 147 of the document-topic
# matrix (0-based; this is topic 148 in the 1-based exported tables).
# The former `[[23]]` column selection was dropped: only the sorted index is used,
# so it had no effect on the result and merely required a column 23 to exist.
top_doc_idx = pd.DataFrame(doc_dist_health_200).sort_values(by=[147],ascending=False).index[0:10]
df_health_race.loc[top_doc_idx,:]

Unnamed: 0,id_art,text_clean,cluster_ID,Annee_Bibliographique,yfp,Prenom,nom,ordre,nb_auteur,EDiscipline,...,Province,disc_origin,spec_origin,count_origin,gender,cit_all_IAC,white,hispanic,black,asian
28757,47125499,research demonstrated importance engage work f...,50381778,2011,2004,Karren,Kowalski-K,1,1,Health,...,CO,Clinical Medicine,Obstetrics & Gynecology,,F,,0.973842,0.018757,0.002839,0.004563
51298,51554675,three prereading children name target le...,10762584,2013,2013,Yusuke,Hayashi-Y,1,3,Health,...,KS,Health,Rehabilitation,United States,M,0.0,0.087118,0.032262,0.0,0.880621
492,29703816,research findings studies using event related ...,44495870,2008,1980,Victoria J.,Molfese-VJ,1,8,Health,...,KY,Psychology,Developmental & Child Psychology,,F,0.0,0.9268,0.0732,0.0,0.0
89476,59231892,third stage infective larva dracunculus medine...,44428802,2016,1980,Mark L.,Eberhard-ML,1,6,Health,...,GA,Biomedical Research,Parasitology,,M,9.0,0.965208,0.026834,0.002653,0.005306
503,29707904,context community health centers chcs critical...,38295818,2008,1992,Michael E.,Samuels-ME,1,4,Health,...,KY,Health,Health Policy & Services,,M,4.0,0.404184,0.03258,0.555637,0.007599
96838,60373284,gop aca repeal replace plan dead least attenti...,46215961,2017,1986,Timothy Stoltzfus,Jost-TS,1,1,Health,...,VA,Professional Fields,Law,,M,,0.952255,0.032202,0.001219,0.014323
60957,53223278,ideal myoelectric prosthetic hand ability cont...,14419218,2014,2013,Jacob L.,Segil-JL,1,2,Health,...,CO,Health,Speech-Language Pathology and Audiology,United States,M,8.0,0.707402,0.059228,0.097167,0.136203
101999,61235536,less known advice keeping patients safe six th...,45960926,2017,2017,Nancy,OConnor-N,1,1,Health,...,OR,Health,Nursing,United States,F,,0.944247,0.026554,0.021569,0.00763
12257,32577792,optical mapping tool used cardiac electrophysi...,40875942,2009,2002,Mina,Attin-M,1,2,Health,...,IL,Clinical Medicine,Cardiovascular System,,F,5.0,0.707402,0.059228,0.097167,0.136203
499,29704857,research conducted summer camp siblings childr...,21886906,2008,1997,Wendy,Packman-W,1,7,Health,...,CA,Psychology,Developmental & Child Psychology,,F,20.0,1.0,0.0,0.0,0.0


In [42]:
titles[titles.id_Art==63801123].titre.values

array(['At odds with the truth'], dtype=object)

## top words by topic health

In [43]:
LDA =LDA_wrapper()

In [44]:
words_by_topic_health = LDA.topic_keyowrd_matrix(lda_model_health,vectorizer_health)

In [45]:
tws_health = words_by_topic_health.apply(lambda row: top_words(row),axis=1)

In [46]:
top_words_health_df = pd.DataFrame(tws_health.values, columns=['top_words'])

# 1-based topic ids (same numbering as the other exported tables). They are derived
# from the number of rows instead of a hard-coded 200, so the cell keeps working
# when the number of topics changes.
top_words_health_df['topic'] = range(1, len(top_words_health_df) + 1)

In [47]:
top_words_health_df.iloc[30:33,:]

Unnamed: 0,top_words,topic
30,"scale, scores, measures, validity, reliability",31
31,"completing, mental disorders, endorsed, white ...",32
32,"visual, case study, techniques, skill, special",33


In [48]:
top_words_health_df

Unnamed: 0,top_words,topic
0,"development, process, new, outcomes, support",1
1,"states, united, united states, new, york",2
2,"management, diabetes, type, self management, t...",3
3,"rural, urban, areas, observation, vulnerability",4
4,"adolescents, youth, alcohol, adolescent, drinking",5
...,...,...
195,"nurses, nursing, work, practice, care",196
196,"hospital, safety, hospitals, quality, patient",197
197,"obesity, overweight, protocol, weight, obese",198
198,"therapy, stroke, dose, events, event",199


In [49]:
top_words_health_df.to_csv('../results/top_words_health_200.csv', index=False)

## Projection of topics (health)

In [50]:
joint_prob_gender_health, marginal_by_topic_gender_health, marginal_by_group_gender_health, dist_diff_topic_gender_health = intersect_by_gender(df_health_race, doc_dist_health_200)

In [51]:
joint_prob_gender_health.to_csv('../results/joint_prob_gender_health_200.csv',index=False)
marginal_by_topic_gender_health.to_csv('../results/marginal_by_topic_gender_health_200.csv',index=False)
marginal_by_group_gender_health.to_csv('../results/marginal_by_group_gender_health_200.csv',index=False)
dist_diff_topic_gender_health.to_csv('../results/dist_diff_topic_gender_health_200.csv',index=False)


In [52]:
# Share of the total topic weight held by each health topic, over all documents.
topic_proportion_health = pd.DataFrame(
    {'proportion': doc_dist_health_200.sum(axis=0)/np.sum(doc_dist_health_200)}
)

# 1-based topic ids, consistent with the other exported tables.
topic_proportion_health['topic'] = topic_proportion_health.index +1

In [53]:
topic_proportion_health.to_csv('../results/topic_proportion_health_200.csv',index=False)