## Legacy code

New goals:

1. Valuative classification (proportion of papers in the top 1%/5% most cited).
2. Perceptual classification: US news report. 

In [1]:
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import numpy as np
from pandas._libs.lib import is_integer


from libs.LastNamesInference import LastNamesInference

In [3]:
def split_by_gender(df_race):
    """Add gender-specific copies of the race probability columns.

    Works on a copy of ``df_race``. The ``gender`` column is upper-cased, and
    for every row whose gender is ``'M'`` or ``'F'`` the ``white``,
    ``hispanic``, ``black`` and ``asian`` columns are duplicated under names
    suffixed ``_M`` / ``_F``. The suffixed columns belonging to the other
    gender are filled with 0.

    The suffixed columns are joined back with an inner join on the index, so
    rows whose gender is neither ``'M'`` nor ``'F'`` are dropped.

    Parameters
    ----------
    df_race : pandas.DataFrame
        Must contain ``gender`` (strings) and the four race columns.

    Returns
    -------
    pandas.DataFrame
        The copied frame plus the eight ``<race>_<gender>`` columns.
    """
    race_cols = ['white', 'hispanic', 'black', 'asian']

    frame = df_race.copy()
    frame['gender'] = frame.gender.str.upper()

    per_gender = []
    for code in ('M', 'F'):
        subset = frame.loc[frame.gender == code, race_cols]
        subset.columns = subset.columns + ('_' + code)
        per_gender.append(subset)

    # Rows of one gender have no values for the other gender's columns -> 0.
    stacked = pd.concat(per_gender).fillna(0).sort_index()
    return frame.join(stacked, how='inner')


In [4]:
def infer_race(us_papers, df=None):
    """Attach last-name based race distributions and their per-gender split.

    A ``LastNamesInference`` object is built from the ``nom`` column of
    ``us_papers`` after dropping duplicate ``cluster_ID`` rows. For every row
    of ``df`` the distribution returned by ``get_name_dist`` for that row's
    ``nom`` is stored under the columns named by ``lni.prob_order``, and the
    frame is then passed through ``split_by_gender``.

    Parameters
    ----------
    us_papers : pandas.DataFrame
        Needs ``cluster_ID`` and ``nom`` columns (plus whatever
        ``split_by_gender`` requires when ``df`` is None).
    df : pandas.DataFrame, optional
        Frame to annotate. When omitted, a copy of ``us_papers`` is used.
        NOTE: a frame passed explicitly receives the new columns in place
        before ``split_by_gender`` copies it.

    Returns
    -------
    pandas.DataFrame
        Output of ``split_by_gender`` on the annotated frame.
    """
    unique_authors = us_papers.drop_duplicates('cluster_ID')
    lni = LastNamesInference(names=unique_authors.nom)

    if df is None:
        df = us_papers.copy()

    # Registers DataFrame.progress_apply with a labelled progress bar.
    tqdm.pandas(desc="inferring race from lastnames")
    per_row_dist = df.progress_apply(
        lambda row: lni.get_name_dist(lastname=row.nom),
        axis=1,
    )
    # presumably lni.prob_order lists the race columns split_by_gender expects — verify against LastNamesInference
    df[lni.prob_order] = pd.DataFrame(per_row_dist.to_list(), index=df.index)
    return split_by_gender(df)

In [5]:
def weighted_qcut(values, weights, q, **kwargs):
    """Return weighted quantile cuts from a given series, ``values``.

    Observations are ordered by ``values``; the cumulative share of
    ``weights`` in that order is then cut at the requested quantile edges, so
    each bin holds roughly the same total weight rather than the same number
    of observations.

    Parameters
    ----------
    values : pandas.Series
        Variable to bin.
    weights : pandas.Series
        Weights, positionally aligned with ``values``.
    q : int or sequence of float
        Number of equal-weight bins, or explicit edges on the [0, 1]
        cumulative-weight scale.
    **kwargs
        Forwarded to ``pandas.cut`` (e.g. ``labels=False``).

    Returns
    -------
    pandas.Series
        Bin assignment per observation, sorted by index.
    """
    # Public pandas API; avoids depending on the private pandas._libs.lib helper.
    if pd.api.types.is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    # Weights reordered by ascending value, then accumulated.
    order = weights.iloc[values.argsort()].cumsum()
    # Normalise by the total weight so the cut operates on cumulative shares.
    bins = pd.cut(order / order.iloc[-1], quantiles, **kwargs)
    return bins.sort_index()

In [6]:
def weighted_qcut_labels(df, var):
    """Build '(min, max)' text labels for the bins stored in ``<var>_Q``.

    For each distinct value of the ``<var>_Q`` column, the minimum and maximum
    of ``var`` within that bin are rounded to two decimals and formatted as
    ``'(min, max)'``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var`` and ``'<var>_Q'``.
    var : str
        Name of the binned variable.

    Returns
    -------
    pandas.Series
        One label per row of ``df``, aligned with ``df``'s index.
    """
    bin_col = '{}_Q'.format(var)

    def _rounded_low(x):
        return round(min(x), ndigits=2)

    def _rounded_high(x):
        return round(max(x), ndigits=2)

    # The second column level holds the aggregator names; drop it and rename.
    lims = df.groupby(bin_col).agg({var: [_rounded_low, _rounded_high]}).droplevel(axis=1, level=1)
    lims.columns = ['min', 'max']
    lims['label'] = lims.apply(
        lambda row: '({}, {})'.format(row['min'], row['max']),
        axis=1,
    )
    return df[bin_col].map(lims['label'])

In [7]:
# Load the US papers table from a cached pickle. The two commented lines show
# how that pickle was produced from the original text export.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# us_papers = pd.read_csv('/data/datasets//WOS/US/US_papers.txt')
# us_papers.to_pickle('/data/datasets//WOS/US/US_papers.p')
us_papers = pd.read_pickle('/data/datasets//WOS/US/US_papers.p')

In [8]:
# Tab-separated citation indicators per article.
us_papers_citations = pd.read_csv(
    '/data/datasets/WOS/US/us_papers_citations.csv',
    sep='\t',
)

In [9]:
# Use the lower-case article key so it matches the `id_art` column used for merging below.
us_papers_citations = us_papers_citations.rename({'ID_Art': 'id_art'}, axis='columns')

In [10]:
# Institution-level impact table, read from a project-relative Excel workbook.
# The commented lines below are earlier loading variants kept for reference.
institutions_impact = pd.read_excel('../../data/impact.xlsx')
#institutions_impact = pd.read_excel('/data/datasets/WOS/US/institutions_impact.xlsx', names = ['institution','country', 'npapers','avg_citations'])
#address = pd.read_csv('/data/datasets/WOS/US/Adresses_race.txt', delimiter='\t')

In [11]:
# Keep only rows whose carnegie_id can be parsed as a number.
# .copy() makes the result an independent frame, so the column assignment in
# the next cell does not raise SettingWithCopyWarning.
institutions_impact = institutions_impact[
    pd.to_numeric(institutions_impact['carnegie_id'], errors='coerce').notnull()
].copy()

In [12]:
# Store carnegie_id as text so it can be compared with the string ids used elsewhere.
# NOTE(review): if the column were float-typed the strings would end in '.0' — confirm the Excel dtype.
institutions_impact = institutions_impact.astype({'carnegie_id': str})

In [13]:
# Hand-coded institution list: ids are parsed as int first, then stored as text.
handcoding_df = pd.read_excel(
    '../../data/institutions_20220329.xlsx',
    sheet_name='Sheet1',
    converters={'index': int, 'carnegie_id': int},
).astype({'carnegie_id': 'str'})

# Manually reviewed flags; real_unitid is read directly as text.
handcoding_flags = pd.read_excel(
    '../../handcoding/carnegie_flags.xlsx',
    sheet_name='to_clean',
    converters={'real_unitid': str},
)

In [14]:
# Crosswalk between institution name and carnegie_id, taken from the hand-coded list.
crosswalk_wos_carnegie = handcoding_df.loc[:, ['name', 'carnegie_id']]

In [15]:
# Extra name -> carnegie_id pairs from the flags sheet, renamed to the crosswalk's column names.
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive name would be
# safer, but the following cell reads `_`, so the name is kept here.
_ = (
    handcoding_flags[['name_wos', 'real_unitid']]
    .dropna()
    .drop_duplicates()
    .rename(columns={'name_wos': 'name', 'real_unitid': 'carnegie_id'})
)

In [16]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat with default
# arguments stacks the two crosswalks the same way (each frame keeps its index labels).
# NOTE: re-running this cell stacks the flag rows again; re-run the cell that creates
# crosswalk_wos_carnegie first.
crosswalk_wos_carnegie = pd.concat([crosswalk_wos_carnegie, _])

  crosswalk_wos_carnegie = crosswalk_wos_carnegie.append(_)


In [17]:
# Address-level records; the cells below use its id_art and carnegie_id columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
address_clean= pd.read_pickle('/data/datasets/WOS/US/address_clean_carnegie.p')

### Proportion of articles in the top x% most cited

In [18]:
# One row per (article, institution) pair: citation indicators joined to the
# distinct article/carnegie_id combinations (inner join on id_art).
df_topcr = pd.merge(
    us_papers_citations,
    address_clean[['id_art', 'carnegie_id']].drop_duplicates(),
    on='id_art',
)

In [19]:
# Per institution: number of rows (articles) and number of articles flagged as
# top 10% / 5% / 1% most cited.
# String aggregator names use pandas' built-in group reductions; passing the Python
# builtins `sum`/`len` is slower and triggers a FutureWarning in pandas >= 2.1.
# 'size' counts every row in the group, exactly like len().
agg_topcr = df_topcr.groupby('carnegie_id').agg(
    {'id_art': 'size', 'Top10CR': 'sum', 'Top5CR': 'sum', 'Top1CR': 'sum'}
)

In [20]:
# Turn the top-cited counts into shares of each institution's article count.
# NOTE: not idempotent — running this cell twice divides twice.
agg_topcr[['Top10CR', 'Top5CR', 'Top1CR']] = (
    agg_topcr[['Top10CR', 'Top5CR', 'Top1CR']].div(agg_topcr['id_art'], axis=0)
)

In [21]:
# After aggregation the id_art column holds the per-institution document count.
agg_topcr = agg_topcr.rename({'id_art': 'ncitable_docs'}, axis='columns')

In [22]:
# Weighted quantile bins, so that the size of institutions (ncitable_docs) is taken
# into account. q=3 gives three bins (codes 0-2), i.e. terciles rather than quartiles.
# An unweighted alternative tried earlier was pd.qcut(agg_topcr.<col>, 4).
agg_topcr = agg_topcr.assign(**{
    share_col + '_Q': weighted_qcut(
        agg_topcr[share_col], agg_topcr['ncitable_docs'], 3, labels=False
    )
    for share_col in ('Top10CR', 'Top5CR', 'Top1CR')
})

In [23]:
# Replace the integer bin codes with readable '(min, max)' range labels.
agg_topcr = agg_topcr.assign(**{
    share_col + '_Q': weighted_qcut_labels(agg_topcr, share_col)
    for share_col in ('Top10CR', 'Top5CR', 'Top1CR')
})

In [24]:
# Number of institutions per Top5CR bin (bins balance documents, not institutions).
agg_topcr['Top5CR_Q'].value_counts()

(0.0, 0.09)     540
(0.09, 0.11)     80
(0.11, 0.33)     65
Name: Top5CR_Q, dtype: int64

In [25]:
# Attach the institution-level top-cited shares and bins to every address row.
# Default inner join: addresses whose carnegie_id is absent from agg_topcr are dropped.
# NOTE: re-running this cell merges the same columns again (pandas adds _x/_y suffixes).
address_clean = pd.merge(address_clean, agg_topcr, on='carnegie_id')

## impact

In [26]:
# impact_df = crosswalk_wos_carnegie.merge(institutions_impact, left_on='name',right_on='institution')\
# [['carnegie_id','npapers','avg_citations']].copy()

# impact_df = impact_df[impact_df.carnegie_id!='nan']

# impact_df['impact'] = impact_df.npapers*impact_df.avg_citations

# impact_df = impact_df.groupby('carnegie_id').agg({'npapers':sum, 'impact':sum})

# impact_df['avg_citations'] = impact_df.impact/impact_df.npapers # I reconstruct the average citations as the weighted average

In [27]:
# Work on an independent copy so institutions_impact stays untouched.
impact_df = institutions_impact.copy(deep=True)

In [28]:
# Restrict to institutions that actually occur in the address table.
# .copy() detaches the filtered rows so the column assignments that follow do not
# raise SettingWithCopyWarning.
impact_df = impact_df[impact_df.carnegie_id.isin(address_clean.carnegie_id.unique())].copy()

In [29]:
# Weighted quantile bins, so that the size of institutions (npapers) is taken into
# account. q=3 gives three bins, i.e. terciles rather than quartiles.
impact_df = impact_df.assign(**{
    var + '_Q': weighted_qcut(impact_df[var], impact_df['npapers'], 3, labels=False)
    for var in ('npapers', 'impact', 'avg_citations')
})

# Replace the integer bin codes with readable '(min, max)' range labels.
impact_df = impact_df.assign(**{
    var + '_Q': weighted_qcut_labels(impact_df, var)
    for var in ('npapers', 'impact', 'avg_citations')
})

In [30]:
# Number of institutions per npapers bin.
impact_df['npapers_Q'].value_counts()

(5, 64564)          619
(65003, 145486)      44
(146327, 445620)     22
Name: npapers_Q, dtype: int64

In [31]:
# impact_df['npapers_Q'] = pd.qcut(impact_df.npapers, 4)
# impact_df['impact_Q'] = pd.qcut(impact_df.impact, 4)
# impact_df['avg_citations_Q'] = pd.qcut(impact_df.avg_citations, 4)

In [32]:
# Left join keeps every address row; institutions without impact data get NaN.
# NOTE: re-running this cell merges the same columns again (pandas adds _x/_y suffixes).
address_clean = pd.merge(address_clean, impact_df, on='carnegie_id', how='left')

In [33]:
# Address rows per npapers bin.
address_clean['npapers_Q'].value_counts()

(146327, 445620)    5528158
(65003, 145486)     5444355
(5, 64564)          5051713
Name: npapers_Q, dtype: int64

In [34]:
# Address rows per avg_citations bin.
address_clean['avg_citations_Q'].value_counts()

(1.77, 4.07)    6093028
(1.48, 1.74)    5355699
(0.1, 1.47)     4575499
Name: avg_citations_Q, dtype: int64

In [35]:
# Same check as the previous cell (address rows per avg_citations bin).
address_clean['avg_citations_Q'].value_counts()

(1.77, 4.07)    6093028
(1.48, 1.74)    5355699
(0.1, 1.47)     4575499
Name: avg_citations_Q, dtype: int64

In [36]:
# Sanity check: unrounded upper bound of the middle avg_citations bin.
# The label string is data-dependent and must match the labels generated above.
address_clean.loc[address_clean['avg_citations_Q'].eq('(1.48, 1.74)'), 'avg_citations'].max()

1.73778974087

In [37]:
# Sanity check: unrounded lower bound of the top avg_citations bin.
# The label string is data-dependent and must match the labels generated above.
address_clean.loc[address_clean['avg_citations_Q'].eq('(1.77, 4.07)'), 'avg_citations'].min()

1.77047619047

In [38]:
# Persist the enriched address table. The target file name differs from the
# address_clean_carnegie.p input loaded earlier, so that input is not overwritten.
address_clean.to_pickle('/data/datasets/WOS/US/address_clean.p')