In [1]:
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import numpy as np
from pandas._libs.lib import is_integer

In [2]:
def weighted_qcut(values, weights, q, **kwargs):
    """Bin rows into quantiles of cumulative weight instead of row count.

    Rows are ranked by ``values``. Each row is then located by the share of
    the total weight accumulated up to and including it, and that share is
    cut at the requested quantile edges.

    Parameters
    ----------
    values : pandas.Series or array-like
        Variable to rank by. Missing entries are excluded from the ranking
        (their weight is not accumulated) and receive a missing bin.
    weights : pandas.Series
        Weight of each row, positionally aligned with ``values``.
    q : int or sequence of float
        Number of equal-weight quantiles, or explicit edges within [0, 1].
    **kwargs
        Forwarded unchanged to ``pandas.cut`` (e.g. ``labels=False``,
        ``retbins=True``).

    Returns
    -------
    Whatever ``pandas.cut`` returns for a Series input; normally a Series of
    bins in the row order of the inputs, carrying the index and name of
    ``weights``.

    Notes
    -----
    ``pandas.cut`` bins are left-open by default, so a row whose cumulative
    share is exactly 0 (leading zero weights) gets no bin unless
    ``include_lowest=True`` is passed.
    """
    # Public counterpart of pandas._libs.lib.is_integer.
    edges = np.linspace(0, 1, q + 1) if pd.api.types.is_integer(q) else q

    # Rank only rows that actually have a value. Series.argsort() does not
    # return usable positions when NA is present, which would pair rows with
    # the wrong weights. A stable sort keeps tied values in input order.
    present = np.flatnonzero(~np.asarray(pd.isna(values)))
    ranked = present[np.argsort(np.asarray(values)[present], kind='stable')]

    # Cumulative weight share, written back at each row's original position
    # so the result needs no re-sorting; unranked rows stay NaN.
    share = pd.Series(np.nan, index=weights.index, name=weights.name, dtype=float)
    if len(ranked):
        running = weights.iloc[ranked].cumsum()
        share.iloc[ranked] = (running / running.iloc[-1]).to_numpy(
            dtype=float, na_value=np.nan
        )
    return pd.cut(share, edges, **kwargs)

In [3]:
def weighted_qcut_labels(df, var, q):
    """Label each row with the observed range of ``var`` inside its quantile bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var`` and the bin column ``'{var}_Q{q}'``.
    var : str
        Name of the column whose range describes each bin.
    q : int
        Quantile count used only to build the bin column name.

    Returns
    -------
    pandas.Series
        For every row of ``df``, the string ``'(min, max)'`` of its bin, with
        both bounds rounded to 2 decimals. Rows whose bin is missing get a
        missing label.
    """
    bin_col = '{}_Q{}'.format(var, q)

    # The groupby min/max skip NaN. The builtin min()/max() are
    # order-dependent when a NaN is present (a leading NaN wins every
    # comparison), which turned the whole label into '(nan, nan)'.
    bounds = df.groupby(bin_col)[var].agg(['min', 'max'])

    labels = pd.Series(
        [
            '({}, {})'.format(round(lo, ndigits=2), round(hi, ndigits=2))
            for lo, hi in zip(bounds['min'], bounds['max'])
        ],
        index=bounds.index,
    )
    return df[bin_col].map(labels)

In [4]:
# Table that later receives the avg_citations_Q10 column through a merge on
# carnegie_id; it is loaded from a fixed absolute path.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
address_clean = pd.read_pickle('/data/datasets/WOS/US/address_clean.p')

In [5]:
# Impact table read from Excel. Later cells use its carnegie_id,
# avg_citations and npapers columns.
institutions_impact = pd.read_excel('../../data/impact.xlsx')

In [6]:
# Keep only rows whose carnegie_id parses as a number; anything that fails to
# parse is coerced to NaN and filtered out.
institutions_impact = institutions_impact.loc[
    lambda frame: pd.to_numeric(frame['carnegie_id'], errors='coerce').notna()
]

In [7]:
# Store the ids as strings. `assign` builds a new frame instead of writing a
# column into the filtered selection produced by the previous cell, which is
# what triggers pandas' SettingWithCopyWarning.
# NOTE(review): if read_excel parsed the ids as floats, astype(str) yields
# values such as '123.0' — confirm they match the id format in address_clean.
institutions_impact = institutions_impact.assign(
    carnegie_id=institutions_impact['carnegie_id'].astype(str)
)

In [8]:
# Work on an independent (deep) copy so institutions_impact stays as loaded.
impact_df = institutions_impact.copy(deep=True)

In [9]:
# Restrict to institutions whose carnegie_id occurs in address_clean.
# The explicit copy makes impact_df an independent frame, so the column
# assignment in the following cell does not write into a filtered selection
# (SettingWithCopyWarning).
impact_df = impact_df[
    impact_df['carnegie_id'].isin(address_clean['carnegie_id'].unique())
].copy()

In [10]:
# Weighted deciles of avg_citations: rows are weighted by npapers so that the
# cut points sit at each tenth of the cumulative npapers share, which takes
# the size of institutions into account.
impact_df['avg_citations_Q10'] = weighted_qcut(
    values=impact_df['avg_citations'],
    weights=impact_df['npapers'],
    q=10,
    labels=False,
)
# Replace the numeric bin codes with '(min, max)' range labels.
impact_df['avg_citations_Q10'] = weighted_qcut_labels(df=impact_df, var='avg_citations', q=10)

# Percentiles were not used: they create 10 groups with a single institution
# and 90 groups with fewer than 10 institutions.
# impact_df['avg_citations_Q100'] = weighted_qcut(impact_df['avg_citations'], impact_df['npapers'], 100, labels=False)
# impact_df['avg_citations_Q100'] = weighted_qcut_labels(impact_df, 'avg_citations',100)

In [11]:
#impact_df.avg_citations_Q100.value_counts().values

In [12]:
# Keep just the merge key and the decile label.
impact_df = impact_df.loc[:, ['carnegie_id', 'avg_citations_Q10']]

In [13]:
# Left-join the impact columns onto the address table by carnegie_id.
# This notebook saves address_clean back to the same pickle it was loaded
# from, so on a re-run the frame already carries the impact columns; merging
# again would create suffixed duplicates (avg_citations_Q10_x / _y). Any stale
# copy of the incoming columns is therefore dropped first.
address_clean = (
    address_clean
    .drop(columns=impact_df.columns.difference(['carnegie_id']), errors='ignore')
    .merge(impact_df, on='carnegie_id', how='left')
)

In [14]:
# Write the enriched table back to the pickle it was originally loaded from,
# replacing that file's previous contents.
pd.to_pickle(address_clean, '/data/datasets/WOS/US/address_clean.p')