In [1]:
#!pip install statsmodels

In [2]:
%config Completer.use_jedi = False

In [3]:
import pandas as pd
import pickle
#import pickle5 as pickle
from tqdm.notebook import tqdm
import numpy as np
import statsmodels.api as sm
from libs.LastNamesInference import LastNamesInference

import glob
import os

In [4]:
def save(x, file_name):
    with open(file_name, 'wb') as handle:
        pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

def restore(file_name):
    with open(file_name, 'rb') as handle:
        x = pickle.load(handle)
    return x

In [5]:
us_papers = pd.read_pickle('/data/datasets//WOS/US/US_papers.p')

In [6]:
cit_2_iac_df = pd.read_csv('/data/datasets/WOS/US/us_papers_citations.csv', delimiter='\t')

In [7]:
jif = pd.read_csv('/data/datasets/WOS/US/us_papers_jif.csv', delimiter='\t')
jif = jif[~jif.FI_2.isnull()]

In [8]:
address_clean = pd.read_pickle('/data/datasets/WOS/US/address_clean.p')

## Fit LDA models and save doc dist

In [9]:
def transform_data(df,lda_model,vectorizer):
    lda_model.n_jobs = 1
    texts = df.text_clean.values
    data_vectorized = vectorizer.transform(texts)
    doc_dist = lda_model.transform(data_vectorized)
    doc_dist_df = pd.DataFrame(doc_dist,columns=['topic_{}'.format(x) for x in range(1,201)])
    doc_dist_df = pd.merge(df['id_art'],doc_dist_df,left_index=True,right_index=True)
    return doc_dist_df


In [10]:
#!mkdir '/data/datasets/WOS/US/lda_fields/'

In [11]:
# def get_doc_dist_all_fields(text_path ="/data/datasets/WOS/US/text/"):
    
#     filenames =  glob.glob(text_path + '*.txt')
#     disciplines = [os.path.basename(x).replace(r'text_clean_', '').replace('.txt','') for x in filenames]  
#     for discipline in tqdm(disciplines):
#         lda_model_ = restore('../../../race/results/lda_fields/lda_model_{}.p'.format(discipline))
#         vectorizer_ = restore('../../../race/results/lda_fields/vectorizer_{}.p'.format(discipline))
#         df_ = pd.read_csv("/data/datasets/WOS/US/text/text_clean_{}.txt".format(discipline))
#         doc_dist_df = transform_data(df_,lda_model_,vectorizer_)
#         doc_dist_df.to_pickle('/data/datasets/WOS/US/lda_fields/doc_dist_{}'.format(discipline))

In [12]:
# get_doc_dist_all_fields(text_path ="/data/datasets/WOS/US/text/")

## Clean dataset

In [13]:
class PrepareDataOLS:
    
    def __init__(self,us_papers,address_clean,jif): #cit_2_iac_df
        df_papers = us_papers[us_papers.ordre==1]
        #df_papers = df_papers.merge(cit_2_iac_df[['ID_Art','Cit_2_iac']], left_on='id_art',right_on='ID_Art').drop('ID_Art',1)
        df_papers = df_papers.merge(jif, left_on='id_art', right_on='ID_Art').drop("ID_Art", 1)
        df_papers = df_papers.merge(address_clean, left_on=['id_art','cluster_ID'], right_on= ['id_art','cluster_id'])
        self.df_papers = df_papers
        self.race_gender_groups= ["white_M", "hispanic_M", "black_M", "asian_M", "white_F", "hispanic_F", "black_F", "asian_F"]
        text_path ="/data/datasets/WOS/US/text/"
        self.disciplines = [os.path.basename(x).replace(r'text_clean_', '').replace('.txt','') for x in glob.glob(text_path + '*.txt')] + ['socsci','health']
        
    def split_by_gender(self,df_race):
        df_race = df_race.copy()
        df_race['gender'] = df_race.gender.str.upper()

        boolean_mask_M = df_race.gender == 'M'
        boolean_mask_F = df_race.gender == 'F'

        race_dist_M = df_race.loc[boolean_mask_M,['white','hispanic','black','asian']]
        race_dist_F = df_race.loc[boolean_mask_F,['white','hispanic','black','asian']]

        race_dist_M.columns = race_dist_M.columns + '_M'
        race_dist_F.columns = race_dist_F.columns + '_F'
        race_dist_MF = pd.concat([race_dist_M,race_dist_F]).fillna(0).sort_index()
        df_race = df_race.join(race_dist_MF,how='inner' )
        return df_race
    
    def infer_race(self,normalized_df):
        authors = normalized_df.drop_duplicates('cluster_ID')
        lni = LastNamesInference(names = authors.nom)
        df = normalized_df.copy()
        tqdm.pandas(desc="inferring race from lastnames")
        lastname_race_dist = df.progress_apply(lambda x: lni.get_name_dist(lastname=x.nom), axis=1)
        df[lni.prob_order] = pd.DataFrame(lastname_race_dist.to_list(), index=df.index)
        df = self.split_by_gender(df)
        return df
 
    def prepare_dataset(self,normalized_df):
        df_race = self.infer_race(normalized_df)
        
        df_race = df_race[df_race.selindex!='not_indexed']
        
        df_race['gender'] = df_race.gender.replace({'M':0,'F':1,
                                     'UNK':np.nan,
                                     'UNI':np.nan,
                                     'uni':np.nan,
                                     'INI':np.nan,
                                     'f':1,
                                     'm':0
                                     })

        df_race['career_age'] = df_race.Annee_Bibliographique - df_race.yfp
        df_race = df_race.drop_duplicates()
        df_race = df_race[~df_race.white_F.isnull()]
        return df_race

    def normalize_by_topic(self,df_papers,discipline):
        if discipline in (['socsci','health']):
            doc_dist_df = pd.read_pickle('/data/datasets/WOS/US/lda_{}_dist.p'.format(discipline))
        else:
            doc_dist_df = pd.read_pickle('/data/datasets/WOS/US/lda_fields/doc_dist_{}'.format(discipline))        
        df = df_papers.merge(doc_dist_df, on ='id_art').drop_duplicates('id_art').copy()

        for dep in ['FI_2','cit_all_IAC']:
            df_tmp=df.copy()
            _ = pd.DataFrame()
            for year in tqdm(df_tmp.Annee_Bibliographique.unique()):
                df_year = df_tmp[df_tmp.Annee_Bibliographique == year].copy().reset_index(drop=True) 
                avg_y_topic = (df_year.filter(like='topic').T@df_year[dep])/df_year.filter(like='topic').sum()
                expected_y = df_year.filter(like='topic')@avg_y_topic
                df_year['norm_{}'.format(dep)] = df_year[dep]/expected_y
                _ = _.append(df_year)
            df = df.merge(_[['id_art','norm_{}'.format(dep)]], how='outer', on='id_art')
        return df
    
    def normalize_disciplines(self):
        normalized_df = pd.DataFrame()
        for discipline in tqdm(self.disciplines):
            _ = self.normalize_by_topic(self.df_papers,discipline)
            normalized_df = normalized_df.append(_)
        return normalized_df
    
    def main(self):
        
        normalized_df = self.normalize_disciplines().reset_index(drop=True)
        df_merge = self.prepare_dataset(normalized_df)
        return df_merge


In [14]:
#prepare_data = PrepareDataOLS(us_papers,address_clean,cit_2_iac_df,jif)
prepare_data = PrepareDataOLS(us_papers,address_clean,jif)

  df_papers = df_papers.merge(jif, left_on='id_art', right_on='ID_Art').drop("ID_Art", 1)


In [15]:
df_merge = prepare_data.main()

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)


  0%|          | 0/11 [00:00<?, ?it/s]

  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  _ = _.append(df_year)
  normalized_df = normalized_df.append(_)
imputing by the mean: 100% 646716/646716 [00:02<00:00, 264665.10it/s]


inferring race from lastnames:   0%|          | 0/1250500 [00:00<?, ?it/s]

In [16]:
df_merge[['Annee_Bibliographique','cit_all_IAC','norm_cit_all_IAC','FI_2','norm_FI_2','FIR_2','cit_rel_all_IAC']]

Unnamed: 0,Annee_Bibliographique,cit_all_IAC,norm_cit_all_IAC,FI_2,norm_FI_2,FIR_2,cit_rel_all_IAC
1,2016,9.0,1.018068,3.129,0.975018,0.997,1.006
2,2010,52.0,1.595845,1.222,0.437601,0.702,2.519
3,2008,62.0,1.407677,3.333,1.012697,1.139,1.729
4,2010,37.0,1.028914,3.875,1.167059,1.559,1.550
5,2016,9.0,1.053697,1.887,0.639668,1.119,1.759
...,...,...,...,...,...,...,...
1250492,2015,6.0,0.884398,1.058,0.723933,0.664,0.886
1250494,2015,8.0,1.178308,0.811,0.559207,0.509,1.182
1250497,2015,8.0,1.113019,2.221,1.447992,1.413,1.221
1250498,2014,4.0,0.449347,0.427,0.294682,0.475,0.710


In [17]:
df_merge.hsi.value_counts(normalize=True)

not HSI    0.943506
HSI        0.056494
Name: hsi, dtype: float64

In [18]:
df_merge.avg_citations_Q10.value_counts()

(0.1, 1.24)     120995
(1.24, 1.34)    103332
(1.54, 1.59)    103223
(1.34, 1.43)    102170
(1.59, 1.66)     92540
(2.17, 4.07)     92114
(1.66, 1.81)     89933
(1.91, 2.15)     86097
(1.43, 1.54)     80966
(1.82, 1.91)     76146
Name: avg_citations_Q10, dtype: int64

In [19]:
df_merge.to_pickle('../../data/df_ols.p')