# Comparison Statistics between Cohorts

## Python Setup

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
import glob
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100

## Load In Data

In [None]:
# Read the five core input tables from ../data.
# low_memory=False makes pandas parse each file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
funded_grnts = pd.read_csv('../data/funded_grants.csv', low_memory=False)
grnts = pd.read_csv('../data/researcher_grants.csv', low_memory=False)
pubs = pd.read_csv('../data/researcher_publications.csv', low_memory=False)
rsrs = pd.read_csv('../data/researcher_info.csv', low_memory=False)
propensity_scores = pd.read_csv('../data/rsr_weights.csv', low_memory=False)

In [None]:
# Topic lookup tables: one file per topic scheme (cancer type, CSO, RCDC),
# separately for grants and for publications.
grnts_ct = pd.read_csv('../data/topic_lookups/grants_cancer_type.csv')
grnts_cso = pd.read_csv('../data/topic_lookups/grants_cso.csv')
grnts_rcdc = pd.read_csv('../data/topic_lookups/grants_rcdc.csv')
pubs_ct = pd.read_csv('../data/topic_lookups/publications_cancer_type.csv')
pubs_cso = pd.read_csv('../data/topic_lookups/publications_cso.csv')
pubs_rcdc = pd.read_csv('../data/topic_lookups/publications_rcdc.csv')

# CSO code lookup; joined to the publication CSO rows on 'cso_name' in the
# topic-distribution section.
cso_lookup = pd.read_csv('../data/cso_codes/cso_lookup.csv')

In [None]:
# Print the (rows, columns) shape of the subset of each table that is missing
# a key field, to see how many rows the filtering step will drop.
print(funded_grnts[funded_grnts['rsr_id'].isnull()].shape)
print(grnts[grnts['rsr_id'].isnull()].shape)
print(pubs[pubs['pub_id'].isnull()].shape)
print(pubs[pubs['date'].isnull()].shape)

In [None]:
# Discard rows lacking the identifiers (or, for publications, the date) that
# the later merges rely on, then renumber each index from zero.
funded_grnts = funded_grnts.dropna(subset=['rsr_id', 'grant_id']).reset_index(drop=True)
grnts = grnts.dropna(subset=['rsr_id', 'grant_id']).reset_index(drop=True)
pubs = pubs.dropna(subset=['pub_id', 'date']).reset_index(drop=True)

In [None]:
# Alphabetically ordered names of the funding agencies in the funded grants.
funders = sorted(funded_grnts['funder_name'].unique())
print(funders)

## Cleaning Data

In [None]:
# Convert the publication counts to numeric dtypes.
# 'citations' uses the default error handling, so a non-numeric value raises;
# 'nb_authors' uses errors='coerce', so a non-numeric value becomes NaN.
pubs['citations'] = pd.to_numeric(pubs['citations'])
pubs['nb_authors'] = pd.to_numeric(pubs['nb_authors'], errors='coerce')

In [None]:
# Funded grants: parse the dates, then derive the grant duration expressed in
# 365-day years and the calendar year in which the grant starts.
funded_grnts['start_date'] = pd.to_datetime(funded_grnts['start_date'])
funded_grnts['end_date'] = pd.to_datetime(funded_grnts['end_date'])
funded_duration = funded_grnts['end_date'] - funded_grnts['start_date']
funded_grnts['funding_len'] = funded_duration / timedelta(days=365)
funded_grnts['start_year'] = pd.DatetimeIndex(funded_grnts['start_date']).year

# Same derivation for the full set of researcher grants.
grnts['start_date'] = pd.to_datetime(grnts['start_date'])
grnts['end_date'] = pd.to_datetime(grnts['end_date'])
grnts_duration = grnts['end_date'] - grnts['start_date']
grnts['funding_len'] = grnts_duration / timedelta(days=365)
grnts['start_year'] = pd.DatetimeIndex(grnts['start_date']).year

In [None]:
# Team size of a grant = number of distinct researchers attached to it.
# A count of zero is recorded as missing rather than as a real team size.
funded_grnts['nb_rsrs'] = (funded_grnts.groupby('grant_id')['rsr_id']
                           .transform('nunique')
                           .replace(0, np.nan))
grnts['nb_rsrs'] = (grnts.groupby('grant_id')['rsr_id']
                    .transform('nunique')
                    .replace(0, np.nan))

In [None]:
# For each topic lookup: drop exact duplicate rows, order by the id column,
# and add a column holding the number of topic rows that share that id.

# RCDC rows per grant.
grnts_rcdc = grnts_rcdc.drop_duplicates().sort_values('grant_id').reset_index(drop=True)
grnts_rcdc['nb_rcdc'] = grnts_rcdc.groupby('grant_id')['grant_id'].transform('count')

# CSO rows per grant.
grnts_cso = grnts_cso.drop_duplicates().sort_values('grant_id').reset_index(drop=True)
grnts_cso['nb_cso'] = grnts_cso.groupby('grant_id')['grant_id'].transform('count')

# RCDC rows per publication.
pubs_rcdc = pubs_rcdc.drop_duplicates().sort_values('pub_id').reset_index(drop=True)
pubs_rcdc['nb_rcdc'] = pubs_rcdc.groupby('pub_id')['pub_id'].transform('count')

# CSO rows per publication.
pubs_cso = pubs_cso.drop_duplicates().sort_values('pub_id').reset_index(drop=True)
pubs_cso['nb_cso'] = pubs_cso.groupby('pub_id')['pub_id'].transform('count')

### Create Lookup Tables

In [None]:
# Distinct (funder, grant) pairs.
funded_grnts_funder = funded_grnts[['funder_name', 'grant_id']].drop_duplicates().reset_index(drop=True)
grnts_funder = grnts[['funder_name', 'grant_id']].drop_duplicates().reset_index(drop=True)

# Distinct (researcher, grant) pairs.
funded_grnts_rsr = funded_grnts[['rsr_id', 'grant_id']].drop_duplicates().reset_index(drop=True)
grnts_rsr = grnts[['rsr_id', 'grant_id']].drop_duplicates().reset_index(drop=True)

# Researcher-level columns of the publication table, kept together with pub_id
# so each publication can be linked to its researchers and their locations.
pub_rsr_cols = ['rsr_id', 'rsr_country', 'rsr_city', 'rsr_affiliation', 'rsr_affiliation_id']
pubs_rsr = pubs[pub_rsr_cols+['pub_id']].drop_duplicates().reset_index(drop=True)

In [None]:
# Grant-level attributes of the funded grants: every column except the
# researcher/funder identifiers and names, with fully identical rows collapsed.
to_remove = ('rsr_id', 'funder_name', 'first_name', 'last_name')
cols = [col for col in funded_grnts.columns if col not in to_remove]
funded_grnts_info = funded_grnts.loc[:, cols].drop_duplicates().reset_index(drop=True)

In [None]:
# Grant-level attributes of all researcher grants, keeping the first row
# found for each grant_id.
to_remove = ('rsr_id', 'funder_name')
cols = [col for col in grnts.columns if col not in to_remove]
grnts_info = grnts.loc[:, cols].drop_duplicates(subset='grant_id').reset_index(drop=True)

In [None]:
# Publication-level attributes (everything that is not a researcher column),
# keeping the first row found for each pub_id.
cols = [col for col in pubs.columns if col not in pub_rsr_cols]
pubs_info = pubs.loc[:, cols].drop_duplicates(subset='pub_id').reset_index(drop=True)

### Merging on Prior and Subsequent Grants and Publications

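I keep only grants and publications dated within five calendar years before or after the start year of the funded grant. The start year itself is counted as part of the "pre" period.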

In [None]:
start_year = 2007
end_year = 2012

# One row per (funded-grant start year, calendar year) pair in the window of
# five years either side of the start year. Calendar years up to and
# including the start year are labelled "pre"; later years are "post".
col1 = []
col2 = []
col3 = []
for grant_year in range(start_year, end_year + 1):
    for award_year in range(grant_year - 5, grant_year + 6):
        col1.append(grant_year)
        col2.append("post" if award_year > grant_year else "pre")
        col3.append(award_year)
merge_key = pd.DataFrame({'start_year': col1, 'status': col2, 'year': col3})

In [None]:
# Expand every distinct funded (funder, grant, researcher) record into one row
# per calendar year of its pre/post window, keyed on the grant's start year.
temp = (funded_grnts[['funder_name', 'grant_id', 'start_year', 'start_date', 'rsr_id']]
        .drop_duplicates()
        .merge(merge_key, how='left', on='start_year'))

In [None]:
# Join the researcher's grants that start in each window year. The joined
# grant's id arrives as 'grant_id_2' because of the suffix on the clash.
grnt_cols = ['rsr_id', 'grant_id', 'start_year']
rsr_grants_by_year = grnts[grnt_cols].drop_duplicates().rename(columns={'start_year': 'year'})
grnts_mrg = temp.merge(rsr_grants_by_year, how='left',
                       on=['rsr_id', 'year'], suffixes=('', '_2'))

In [None]:
# Join the researcher's publications that fall in each window year.
pub_cols = ['rsr_id', 'pub_id', 'year']
rsr_pubs_by_year = pubs[pub_cols].drop_duplicates()
pubs_mrg = temp.merge(rsr_pubs_by_year, how='left',
                      on=['rsr_id', 'year'], suffixes=('', '_2'))

## Store Results in Dictionary

In [None]:
# Collector for the summary tables produced by the sections below.
results = {}

## Create Statistics Functions

In [None]:
def grant_groupby(grnts_mrg, yearly=False):
    """Summarise the researcher's other grants around each funded grant.

    Parameters
    ----------
    grnts_mrg : pandas.DataFrame
        Funded grant/researcher/window-year rows joined to the researcher's
        grants. Must contain ``grant_id``, ``rsr_id``, ``status``, ``year``
        and ``grant_id_2`` (the joined grant).
    yearly : bool, default False
        When true, ``year`` is added to the grouping keys so that one row is
        produced per window year instead of one per pre/post period.

    Returns
    -------
    pandas.DataFrame
        ``funded_grnts_funder`` left-joined on ``grant_id`` to the grouped
        statistics ``nb_grnts``, ``fund_amt``, ``avg_fund_len`` and
        ``avg_team_size``.
    """
    # Work on a suffixed copy of the grant attributes. The module-level
    # grnts_info is never modified, so a failure in the merge below cannot
    # leave it with renamed columns.
    grnts_info_2 = grnts_info.add_suffix('_2').rename(columns={'start_year_2': 'year'})
    df = pd.merge(grnts_mrg, grnts_info_2, how='left', on=['grant_id_2', 'year'])

    cols = ['grant_id', 'rsr_id', 'status']
    if yearly:
        cols.append('year')

    grouped = df.groupby(cols)
    grnts_stats = pd.DataFrame({'nb_grnts': grouped['grant_id_2'].nunique()
                                , 'fund_amt': grouped['funding_amount_2'].mean()
                                , 'avg_fund_len': grouped['funding_len_2'].mean()
                                , 'avg_team_size': grouped['nb_rsrs_2'].mean()
                               }).reset_index()

    # Left join from the funder lookup so every funded grant appears in the
    # result, including those with no grouped statistics.
    grnts_stats = pd.merge(funded_grnts_funder, grnts_stats, how='left', on='grant_id')

    return grnts_stats

In [None]:
def pub_groupby(pubs_mrg, yearly=False):
    """Summarise the researcher's publications around each funded grant.

    ``pubs_mrg`` holds funded grant/researcher/window-year rows joined to the
    researcher's publications. Rows are grouped by funded grant, researcher
    and pre/post status (plus ``year`` when ``yearly`` is set), and three
    families of statistics are computed per group: publication output,
    collaboration breadth, and overlap with the funded grant's RCDC topics.
    The result is ``funded_grnts_funder`` left-joined to those statistics.
    """
    group_keys = ['grant_id', 'rsr_id', 'status']
    # Equality test retained so that non-bool arguments behave as before.
    if yearly == True:
        group_keys = group_keys + ['year']

    # Output: number of publications, mean citations and mean author count.
    with_info = pubs_mrg.merge(pubs_info, how='left', on=['pub_id', 'year'])
    by_group = with_info.groupby(group_keys)
    pubs_stats = pd.DataFrame({'nb_pubs': by_group['pub_id'].nunique()
                               , 'citations_per_pub': by_group['citations'].mean()
                               , 'team_size': by_group['nb_authors'].mean()
                              }).reset_index()

    # Collaboration: distinct authors and distinct author countries across
    # the group's publications.
    authors = pubs_rsr.rename(columns={'rsr_id': 'author_id'})
    with_authors = pubs_mrg.merge(authors, how='left', on='pub_id')
    by_group = with_authors.groupby(group_keys)
    pubs_collab = pd.DataFrame({'nb_collabs': by_group['author_id'].nunique()
                                , 'nb_collab_countries': by_group['rsr_country'].nunique()
                               }).reset_index()

    # Topic overlap: a publication is "in topic" when at least one of its
    # RCDC names equals one of the funded grant's RCDC names.
    topic_pairs = (pubs_mrg
                   .merge(grnts_rcdc, how='left', on='grant_id')
                   .merge(pubs_rcdc, how='left', on='pub_id'))
    topic_pairs['in_topic'] = topic_pairs['rcdc_name_x'] == topic_pairs['rcdc_name_y']
    per_pub = topic_pairs.groupby(list(pubs_mrg.columns))['in_topic'].sum().reset_index()
    per_pub['in_topic'] = per_pub['in_topic'] > 0
    by_group = per_pub.groupby(group_keys)
    pubs_topic = pd.DataFrame({'nb_pubs_in_topic': by_group['in_topic'].sum()
                               , 'pct_pubs_in_topic': by_group['in_topic'].mean()
                              }).reset_index()

    # Stitch the three families together, then attach the funder of each
    # funded grant.
    combined = pubs_stats.merge(pubs_collab, how='outer', on=group_keys)
    combined = combined.merge(pubs_topic, how='outer', on=group_keys)
    return funded_grnts_funder.merge(combined, how='left', on='grant_id')

## 1. Funded Grants Statistics

### Number of Grants per Funder

In [None]:
# Summary of every column (numeric and non-numeric) of the funded grants.
funded_grnts.describe(include='all')

In [None]:
results['nb_unique_rsrs'] = {}

# Distinct funded researchers across all agencies.
nb_rsrs_overall = funded_grnts['rsr_id'].nunique()
print("Number of unique researchers funded:")
print(nb_rsrs_overall)
results['nb_unique_rsrs']['overall'] = nb_rsrs_overall

# Distinct funded researchers per agency, computed once and then looked up.
nb_rsrs_by_funder = funded_grnts.groupby('funder_name')['rsr_id'].nunique()
print("\nNumber of unique researchers funded by each agency:")
print(nb_rsrs_by_funder)
for funder in funders:
    results['nb_unique_rsrs'][funder] = nb_rsrs_by_funder[funder]

In [None]:
results['nb_unique_grnts'] = {}

# Distinct funded grants across all agencies.
nb_grnts_overall = funded_grnts['grant_id'].nunique()
print("Number of unique grants funded:")
print(nb_grnts_overall)
results['nb_unique_grnts']['overall'] = nb_grnts_overall

# Distinct funded grants per agency, computed once and then looked up.
nb_grnts_by_funder = funded_grnts.groupby('funder_name')['grant_id'].nunique()
print("\nNumber of unique grants funded by each agency:")
print(nb_grnts_by_funder)
for funder in funders:
    results['nb_unique_grnts'][funder] = nb_grnts_by_funder[funder]

### Number of Grants per Researcher

In [None]:
# Number of distinct funded grants held by each researcher: the mean, then
# the five most frequent counts as fractions of all researchers.
grants_per_rsr = funded_grnts.groupby('rsr_id')['grant_id'].nunique()
print("Average Number of Grants per Researcher:")
print(grants_per_rsr.mean())
print("\nNumber of grants from agencies per researcher (as % of total):\n")
print(grants_per_rsr.value_counts(normalize=True).head())

In [None]:
# Number of distinct agencies funding each researcher: the mean, then the
# share of researchers at each count.
agencies_per_rsr = funded_grnts.groupby('rsr_id')['funder_name'].nunique()
print("Average Number of Agencies per Researcher:")
print(agencies_per_rsr.mean())
print("\nBy how many agencies are the researchers funded?\n")
print(agencies_per_rsr.value_counts(normalize=True))

In [None]:
# For each agency: the three most common numbers of grants it gives to a
# single researcher (as fractions of that agency's researchers), followed by
# the number of researchers it funds.
# Note: this loop rebinds the notebook-level name `temp`.
print("How many grants does each agency give to its researchers in the 5 focal years?")
for funder in funders:
    print("\n{}:".format(funder))
    temp = funded_grnts[funded_grnts['funder_name']==funder].groupby('rsr_id')['grant_id'].nunique().reset_index()
    print(temp['grant_id'].value_counts(normalize=True).head(3))
    print("(Total researchers: {})".format(temp.shape[0]))

### Grant Characteristics

In [None]:
results['grant_characteristics'] = {}

# One row per (funder, funded grant) with the grant-level attributes attached.
df = funded_grnts_funder.merge(funded_grnts_info, how='left', on='grant_id')
stat_cols = ['funding_amount', 'funding_len', 'nb_rsrs']
print("Average Grant Amount: {}".format(df['funding_amount'].mean()))
print("Average Funding Length: {}".format(df['funding_len'].mean()))
print("Average Team Size: {}".format(df['nb_rsrs'].mean()))
results['grant_characteristics']['overall'] = df[stat_cols].describe()

# Accumulators kept from the original notebook; nothing in this cell fills them.
funded_grnts_amt_avg = []
funded_grnts_amt_med = []
funded_grnts_len_avg = []
funded_grnts_len_med = []
funded_grnts_team_size_avg = []
funded_grnts_team_size_med = []

# Same description, agency by agency.
for funder in funders:
    temp = df[df['funder_name'] == funder].copy()
    funder_summary = temp[stat_cols].describe()
    print("\n{}:".format(funder))
    print(funder_summary)
    print("(Total grants: {})".format(temp.shape[0]))
    results['grant_characteristics'][funder] = funder_summary

In [None]:
# Cross-tabulate the funded (funder, grant) rows by funder and by the calendar
# year of the grant's start date. Rebinds the notebook-level name `temp`.
temp = pd.merge(funded_grnts_funder, funded_grnts_info, how='left', on='grant_id')
temp['year'] = pd.DatetimeIndex(temp['start_date']).year
pd.crosstab(temp['funder_name'], temp['year'])

## 2. Prior and Subsequent Grants

In [None]:
# Grant statistics per funded grant, researcher and pre/post period.
grnts_stats = grant_groupby(grnts_mrg)

In [None]:
results['pre_grnt_stats'] = {}
results['post_grnt_stats'] = {}

cols = ['nb_grnts', 'fund_amt', 'avg_fund_len', 'avg_team_size']

# Overall averages across all funders, before and after the funded grant.
temp = grnts_stats[grnts_stats['status']=="pre"].copy()
print("Average Funding Length Pre-Grant: {}".format(temp['avg_fund_len'].mean()))
print("Average Team Size Pre-Grant: {}".format(temp['avg_team_size'].mean()))
print("Average Funding Amount Pre-Grant: {}".format(temp['fund_amt'].mean()))
print("Average Total Number of Grants Pre-Grant: {}".format(temp['nb_grnts'].mean()))
results['pre_grnt_stats']['overall'] = temp[cols].describe()

temp = grnts_stats[grnts_stats['status']=="post"].copy()
print("\nAverage Funding Length Post-Grant: {}".format(temp['avg_fund_len'].mean()))
print("Average Team Size Post-Grant: {}".format(temp['avg_team_size'].mean()))
print("Average Funding Amount Post-Grant: {}".format(temp['fund_amt'].mean()))
print("Average Total Number of Grants Post-Grant: {}".format(temp['nb_grnts'].mean()))
results['post_grnt_stats']['overall'] = temp[cols].describe()

print("\n")

# Per-funder summaries of the pre-grant period.
print("Pre-Funding Grant Statistics:")
df = grnts_stats[grnts_stats['status']=="pre"].copy()
for funder in funders:
    print("\n{}:".format(funder))
    temp = df[df['funder_name']==funder]
    summary = temp[cols].describe()
    print(summary)
    results['pre_grnt_stats'][funder] = summary

print("\n")

# Per-funder summaries of the post-grant period. These are stored under
# 'post_grnt_stats'; storing them under 'pre_grnt_stats' would overwrite the
# pre-grant entries written by the loop above.
print("Post-Funding Grant Statistics:")
df = grnts_stats[grnts_stats['status']=="post"].copy()
for funder in funders:
    print("\n{}:".format(funder))
    temp = df[df['funder_name']==funder]
    summary = temp[cols].describe()
    print(summary)
    results['post_grnt_stats'][funder] = summary

## 3. Prior and Subsequent Publications

In [None]:
# Publication statistics per funded grant, researcher and pre/post period.
pubs_stats = pub_groupby(pubs_mrg)

In [None]:
results['pre_pubs_stats'] = {}
results['post_pubs_stats'] = {}

cols = ['citations_per_pub', 'nb_pubs', 'team_size', 'nb_collabs', 'nb_collab_countries']

# Overall averages across all funders, before and after the funded grant.
temp = pubs_stats[pubs_stats['status'] == "pre"].copy()
print("Average Citations Pre-Grant: {}".format(temp['citations_per_pub'].mean()))
print("Average Total Number of Publications Pre-Grant: {}".format(temp['nb_pubs'].mean()))
results['pre_pubs_stats']['overall'] = temp[cols].describe()

temp = pubs_stats[pubs_stats['status'] == "post"].copy()
print("\nAverage Citations Post-Grant: {}".format(temp['citations_per_pub'].mean()))
print("Average Total Number of Publications Post-Grant: {}".format(temp['nb_pubs'].mean()))
results['post_pubs_stats']['overall'] = temp[cols].describe()

print("\n")

# Per-funder summaries of the pre-grant period.
print("Pre-Funding Publication Statistics:")
df = pubs_stats[pubs_stats['status'] == "pre"].copy()
for funder in funders:
    temp = df[df['funder_name'] == funder]
    funder_summary = temp[cols].describe()
    print("\n{}:".format(funder))
    print(funder_summary)
    results['pre_pubs_stats'][funder] = funder_summary

print("\n")

# Per-funder summaries of the post-grant period (same columns as above).
print("Post-Funding Publication Statistics:")
df = pubs_stats[pubs_stats['status'] == "post"].copy()
for funder in funders:
    temp = df[df['funder_name'] == funder]
    funder_summary = temp[cols].describe()
    print("\n{}:".format(funder))
    print(funder_summary)
    results['post_pubs_stats'][funder] = funder_summary

## 4. Topic Analyses

### Funded Grant RCDC Codes Analysis

In [None]:
# Attach the RCDC rows to each funded grant; the left join keeps grants that
# have no RCDC row.
df = pd.merge(funded_grnts_info, grnts_rcdc, how='left', on='grant_id')

In [None]:
# Frequency of each RCDC name over the funded grants; the five most frequent
# names are kept for later use.
rcdc_counts = df['rcdc_name'].value_counts()
print("Most Common RCDC Codes:")
print(rcdc_counts.head())
funded_grnts_rcdc_tot = rcdc_counts.index[0:5]

In [None]:
# Same RCDC join, this time keyed by (funder, grant) so results can be split
# by agency.
df = pd.merge(funded_grnts_funder, grnts_rcdc, how='left', on='grant_id')

In [None]:
# Mean of nb_rcdc per funder, taken over distinct (funder, grant, nb_rcdc)
# rows so that each grant is counted once.
print("Average number of RCDC's per Grant:\n")
print(df[['funder_name', 'grant_id', 'nb_rcdc']].drop_duplicates().groupby('funder_name')['nb_rcdc'].mean())

In [None]:
# funded_grnts_rcdc_k collects, in `funders` order, each agency's k-th most
# frequent RCDC name.
funded_grnts_rcdc_1 = []
funded_grnts_rcdc_2 = []
funded_grnts_rcdc_3 = []
funded_grnts_rcdc_4 = []
funded_grnts_rcdc_5 = []
rcdc_rank_lists = (funded_grnts_rcdc_1, funded_grnts_rcdc_2, funded_grnts_rcdc_3,
                   funded_grnts_rcdc_4, funded_grnts_rcdc_5)

for funder in funders:
    temp = df[df['funder_name'] == funder]
    funder_counts = temp['rcdc_name'].value_counts()
    print("\n{}:".format(funder))
    print(funder_counts.head())
    # Indexing raises IndexError if the agency has fewer than five names.
    for rank, rank_list in enumerate(rcdc_rank_lists):
        rank_list.append(funder_counts.index[rank])

In [None]:
# Keep only the first listed CSO row for each publication.
pubs_cso_1 = pubs_cso.drop_duplicates('pub_id', keep='first').reset_index(drop=True)

### Publication CSO Codes

In [None]:
# Attach CSO rows to every publication in the funded researchers' windows and
# reduce to distinct (funder, publication, CSO) rows.
# NOTE(review): assumes pubs_cso has a 'cso' column — confirm against the file.
df = pd.merge(pubs_mrg, pubs_cso, how='left', on='pub_id')
df = df[['funder_name', 'pub_id', 'cso', 'nb_cso']].drop_duplicates().reset_index(drop=True)

In [None]:
# Mean of nb_cso per funder, taken over distinct (funder, publication, nb_cso)
# rows so that each publication is counted once per funder.
print("Average number of CSO Codes per Publication Associated to Grant:\n")
print(df[['funder_name', 'pub_id', 'nb_cso']].drop_duplicates().groupby('funder_name')['nb_cso'].mean())

In [None]:
# cso_k collects, in `funders` order, each agency's k-th most frequent CSO code.
cso_1 = []
cso_2 = []
cso_3 = []
cso_4 = []
cso_5 = []
cso_rank_lists = (cso_1, cso_2, cso_3, cso_4, cso_5)

for funder in funders:
    temp = df[df['funder_name'] == funder]
    nb_missing = temp[temp['cso'].isnull()].shape[0]
    funder_counts = temp['cso'].value_counts()
    print("\n{}:".format(funder))
    print("Number of Publications with missing CSO Codes: {}".format(nb_missing))
    print(funder_counts.head())
    # Indexing raises IndexError if the agency has fewer than five codes.
    for rank, rank_list in enumerate(cso_rank_lists):
        rank_list.append(funder_counts.index[rank])

### CSO * RCDC Distributions

In [None]:
# RCDC names that are filtered out of the topic-share tables built below.
rcdc_codes_to_remove = ['CANCER', 'CLINICAL RESEARCH', 'PREVENTION', 'DIAGNOSTIC RADIOLOGY', 
                        'BEHAVIORAL AND SOCIAL SCIENCE', 'PATIENT SAFETY', 'HEALTH SERVICES', 'IMMUNIZATION', 
                        'COMPLEMENTARY AND ALTERNATIVE MEDICINE', 'BASIC BEHAVIORAL AND SOCIAL SCIENCE', 
                        'COMPARATIVE EFFECTIVENESS RESEARCH', 'CLINICAL TRIALS AND SUPPORTIVE ACTIVITIES',
                        'NETWORKING AND INFORMATION TECHNOLOGY R&D', 'BURDEN OF ILLNESS']

In [None]:
# Build one long table: window publications, their CSO rows, the CSO lookup
# (joined on 'cso_name'), and their RCDC rows. All joins are left joins, so
# publications without a topic are kept with missing values.
topic_distributions = pd.merge(pubs_mrg, pubs_cso, on='pub_id', how='left')
topic_distributions = pd.merge(topic_distributions, cso_lookup, on='cso_name', how='left')
topic_distributions = pd.merge(topic_distributions, pubs_rcdc, on='pub_id', how='left')

In [None]:
# Which CSO?
# Use the 'cso_cat' column as the 'cso' variable analysed below.
# NOTE(review): presumably 'cso_cat' comes from cso_lookup — confirm.
topic_distributions.rename(columns={'cso_cat': 'cso'}, inplace=True)

In [None]:
# Restrictions
# Keep only publications from the years after the funded grant's start year.
topic_distributions = topic_distributions[topic_distributions['status']=="post"]

In [None]:
# RCDC:
# Distinct (funder, researcher, publication, RCDC name) rows, restricted to
# real RCDC names that are not on the exclusion list.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'rcdc_name']].drop_duplicates()
excluded_rcdc = ["NAN"] + rcdc_codes_to_remove
df = df[df['rcdc_name'].notnull() & ~df['rcdc_name'].isin(excluded_rcdc)].reset_index(drop=True)

# For each agency, every (publication, researcher) pair carries a total weight
# of 1 split equally over its RCDC names; an RCDC name's share is its summed
# weight divided by the number of pairs.
out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col = agency + ' pubs'
    share_col = agency + ' share'
    temp = df[df['funder_name'] == agency].copy().reset_index(drop=True)
    den = temp[['pub_id', 'rsr_id']].drop_duplicates().shape[0]
    temp['nb_rcdc'] = temp.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp[pubs_col] = 1 / temp['nb_rcdc']
    temp = temp.groupby('rcdc_name')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp[share_col] = temp[pubs_col] / den
    print(temp[share_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False).drop(columns=pubs_col)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='rcdc_name')

# Unweighted mean of the five agencies' shares; a name missing for any agency
# yields NaN here.
share_cols = ['INCa/INSERM/DGOS share', 'National Cancer Institute share',
              'Cancer Research UK share', 'Wellcome Trust share',
              'National Health and Medical Research Council share']
out['average_share'] = sum(out[col] for col in share_cols) / 5

out.to_csv('../output/top_topics/rcdc.csv', index=False)
print(out.shape[0])

In [None]:
# CSO
# Distinct (funder, researcher, publication, CSO) rows with a real CSO value.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'cso']].drop_duplicates()
df = df[df['cso'].notnull() & (df['cso'] != "NAN")].reset_index(drop=True)

# For each agency, every (publication, researcher) pair carries a total weight
# of 1 split equally over its CSO values; a CSO value's share is its summed
# weight divided by the number of pairs.
out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col = agency + ' pubs'
    share_col = agency + ' share'
    temp = df[df['funder_name'] == agency].copy().reset_index(drop=True)
    den = temp[['pub_id', 'rsr_id']].drop_duplicates().shape[0]
    temp['nb_cso'] = temp.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp[pubs_col] = 1 / temp['nb_cso']
    temp = temp.groupby('cso')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp[share_col] = temp[pubs_col] / den
    print(temp[share_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False).drop(columns=pubs_col)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='cso')

# Unweighted mean of the five agencies' shares; a value missing for any agency
# yields NaN here.
share_cols = ['INCa/INSERM/DGOS share', 'National Cancer Institute share',
              'Cancer Research UK share', 'Wellcome Trust share',
              'National Health and Medical Research Council share']
out['average_share'] = sum(out[col] for col in share_cols) / 5

out.to_csv('../output/top_topics/cso.csv', index=False)
print(out.shape[0])

In [None]:
# RCDC x CSO combinations: distinct rows with a real, non-excluded RCDC name
# and a real CSO value, labelled "<rcdc_name> * <cso>".
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'rcdc_name', 'cso']].drop_duplicates()
excluded_rcdc = ["NAN"] + rcdc_codes_to_remove
df = df[df['rcdc_name'].notnull() & ~df['rcdc_name'].isin(excluded_rcdc)].reset_index(drop=True)
# df = df[df['rcdc_name'].str.upper().str.contains("CANCER")]
df = df[df['cso'].notnull() & (df['cso'] != "NAN")].reset_index(drop=True)
df['rcdc_cso'] = df['rcdc_name'] + " * " + df['cso']

# For each agency, every (publication, researcher) pair carries a total weight
# of 1 split equally over its combinations; the table holds the summed weight
# per combination in one "<agency> pubs" column per agency.
out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col = agency + ' pubs'
    temp = df[df['funder_name'] == agency].copy().reset_index(drop=True)
    den = temp[['pub_id', 'rsr_id']].drop_duplicates().shape[0]
    print(den)
    temp['nb'] = temp.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp[pubs_col] = 1 / temp['nb']
    temp = temp.groupby('rcdc_cso')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='rcdc_cso')

out.to_csv('../output/top_topics/rcdc_cso.csv', index=False)
print(out.shape[0])

In [None]:
# RCDC x CSO combinations by calendar year: same filtering and labelling as
# the combined table, with 'year' carried along.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'year', 'rcdc_name', 'cso']].drop_duplicates()
excluded_rcdc = ["NAN"] + rcdc_codes_to_remove
df = df[df['rcdc_name'].notnull() & ~df['rcdc_name'].isin(excluded_rcdc)].reset_index(drop=True)
# df = df[df['rcdc_name'].str.upper().str.contains("CANCER")]
df = df[df['cso'].notnull() & (df['cso'] != "NAN")].reset_index(drop=True)
df['rcdc_cso'] = df['rcdc_name'] + " * " + df['cso']

# For each agency, every (publication, researcher) pair carries a total weight
# of 1 split equally over its combinations; weights are summed per
# (combination, year) into one "<agency> pubs" column per agency.
out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col = agency + ' pubs'
    temp = df[df['funder_name'] == agency].copy().reset_index(drop=True)
    den = temp[['pub_id', 'rsr_id']].drop_duplicates().shape[0]
    print(den)
    temp['nb'] = temp.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp[pubs_col] = 1 / temp['nb']
    temp = temp.groupby(['rcdc_cso', 'year'])[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
#     temp[agency+' share'] = temp[agency+' pubs']
#     print(temp[agency+' share'].sum())
    temp = temp.sort_values(pubs_col, ascending=False)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on=['rcdc_cso', 'year'])


# out['average_share'] = (out['INCa/INSERM/DGOS share']+out['National Cancer Institute share']
#                           +out['Cancer Research UK share']+out['Wellcome Trust share']
#                           +out['National Health and Medical Research Council share'])/5

out.to_csv('../output/top_topics/rcdc_cso_by_year.csv', index=False)
print(out.shape[0])

## 5. Researcher Level Statistics

In [None]:
# Summary of every column (numeric and non-numeric) of the researcher table.
rsrs.describe(include='all')

## 6. Balanced Prior Publications

For this section, we restrict data to the first grant of every researcher (which is why averages are not quite the same as before).

In [None]:
# Publication statistics per funded grant, researcher, period and window year.
pubs_stats = pub_groupby(pubs_mrg, yearly=True)

In [None]:
# Keep only the (researcher, funder, grant) combinations present in the
# propensity-score table (pd.merge defaults to an inner join).
balanced_pubs_stats = pd.merge(pubs_stats, propensity_scores, on=['rsr_id', 'funder_name', 'grant_id'])
# Add researcher gender and career start year.
balanced_pubs_stats = pd.merge(balanced_pubs_stats, rsrs[['rsr_id', 'rsr_gender', 'rsr_career_start_year']]
                               , how='left', on='rsr_id')
# Career age in a given window year = that year minus the career start year.
balanced_pubs_stats['rsr_career_age'] = balanced_pubs_stats['year']-balanced_pubs_stats['rsr_career_start_year']
# Rows without a weight cannot enter the weighted comparison.
balanced_pubs_stats = balanced_pubs_stats[balanced_pubs_stats['weight'].notnull()]

In [None]:
# Variables compared across funders in the two balance tables that follow.
cols = ['citations_per_pub', 'nb_pubs', 'team_size', 'nb_collabs', 'nb_collab_countries', 'rsr_career_age']

In [None]:
# Unweighted means: for every variable, one row per status with one column per
# funder; the per-variable tables are stacked into a single frame.
df = pd.DataFrame()
for var in cols:
    temp = balanced_pubs_stats[['funder_name', var, 'status']].copy()
    temp = temp.pivot_table(index='status', columns='funder_name',
                            values=var, aggfunc='mean').reset_index()
    temp['var'] = var
    temp['propensity_weight'] = False
    df = temp if df.empty else pd.concat([df, temp], sort=False)
df[['propensity_weight', 'status', 'var']+funders].sort_values(['status', 'var'], ascending=[False,True])

In [None]:
# Propensity-weighted means: sum(value * weight) per status and funder,
# divided by the sum of the weights of the rows where the value is present.
df = pd.DataFrame()
for var in cols:
    weighted_col = var + '_temp'
    temp = balanced_pubs_stats[['funder_name', var, 'status', 'weight']].copy()
    temp[weighted_col] = temp[var] * temp['weight']
    temp = temp.pivot_table(index='status', columns='funder_name',
                            values=weighted_col, aggfunc='sum').reset_index()
    temp['var'] = var

    for funder in funders:
        funder_rows = balanced_pubs_stats[balanced_pubs_stats['funder_name'] == funder]
        # Weight counts only where the variable is observed.
        usable_weight = np.where(funder_rows[var].notnull(), funder_rows['weight'], 0)
        # "pre" denominator: everything that is not "post", and vice versa.
        pre_den = np.where(funder_rows['status'] == "post", 0, usable_weight).sum()
        post_den = np.where(funder_rows['status'] == "pre", 0, usable_weight).sum()
        temp[funder] = np.where(temp['status'] == "pre",
                                temp[funder] / pre_den,
                                temp[funder] / post_den)

    temp['propensity_weight'] = True
    df = temp if df.empty else pd.concat([df, temp])
df[['propensity_weight', 'status', 'var']+funders].sort_values(['status', 'var'], ascending=[False,True])

In [None]:
# Unweighted gender mix: share of rows per gender within each funder (each
# funder's row sums to 1).
temp = balanced_pubs_stats[['funder_name', 'rsr_gender', 'status']]
temp = pd.crosstab(temp['funder_name'], temp['rsr_gender'], normalize='index').reset_index()
df = temp
df[['funder_name', 'F', 'M', 'UNKNOWN']]

In [None]:
# Propensity-weighted gender mix: weights are summed per (funder, gender) and
# normalised within each funder so that every row sums to 1.
# aggfunc is given as the string 'sum'; passing the builtin `sum` is a
# deprecated alias in recent pandas releases.
temp = balanced_pubs_stats[['funder_name', 'rsr_gender', 'status', 'weight']]
temp = pd.crosstab(temp['funder_name'], temp['rsr_gender'], temp['weight'], aggfunc='sum',
                   normalize='index').reset_index()
df = temp
df[['funder_name', 'F', 'M', 'UNKNOWN']]

## Creating Output Comparison Table

### Overall Table

In [None]:
# df = pd.DataFrame({
#     'funded_grnts_per_rsrs_tot': funded_grnts_per_rsrs_tot
#     , 'agencies_per_rsrs_tot': agencies_per_rsrs_tot
#     , 'funded_amt_tot': funded_amt_tot
#     , 'funded_len_tot': funded_len_tot
#     , 'nb_grnt_rsrs_tot': nb_grnt_rsrs_tot
#     , 'grnt_fund_len_tot': grnt_fund_len_tot
#     , 'grnt_team_size_tot': grnt_team_size_tot
#     , 'grnt_fund_amt_tot': grnt_fund_amt_tot
#     , 'pre_avg_fund_len_tot': pre_avg_fund_len_tot
#     , 'pre_avg_team_size_tot': pre_avg_team_size_tot
#     , 'pre_fund_amt_tot': pre_fund_amt_tot
#     , 'pre_nb_grnts_tot': pre_nb_grnts_tot
#     , 'post_avg_fund_len_tot': post_avg_fund_len_tot
#     , 'post_avg_team_size_tot': post_avg_team_size_tot
#     , 'post_fund_amt_tot': post_fund_amt_tot
#     , 'post_nb_grnts_tot': post_nb_grnts_tot
#     , 'pub_cit_tot': pub_cit_tot
#     , 'pub_team_size_tot': pub_team_size_tot
#     , 'pre_pub_cit_tot': pre_pub_cit_tot
#     , 'pre_nb_pubs_tot': pre_nb_pubs_tot
#     , 'post_pub_cit_tot': post_pub_cit_tot
#     , 'post_nb_pubs_tot': post_nb_pubs_tot
#     }, index=['mean', 'std']).transpose()
# df

In [None]:
# # Export to Excel
# ls = !ls ../output/
# if 'comparison_statistics.xlsx' in ls:
#     book = load_workbook('../output/comparison_statistics.xlsx')
#     writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
#     writer.book = book
#     writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#     df.to_excel(writer, "raw_all")
#     writer.save()
# else:
#     df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'raw_all')

### Table by Funding Agency

In [None]:
# df = pd.DataFrame({'rcdc_1':rcdc_1
#                     , 'rcdc_2':rcdc_2
#                     , 'rcdc_3':rcdc_3
#                     , 'rcdc_4':rcdc_4
#                     , 'rcdc_5':rcdc_5                   
#                     , 'nb_unique_rsrs': nb_unique_rsrs
#                     , 'nb_unique_grnts': nb_unique_grnts
#                     , 'funded_amt_avg': funded_amt_avg
#                     , 'funded_amt_med': funded_amt_med
#                     , 'funded_len_avg': funded_len_avg
#                     , 'funded_len_med': funded_len_med
#                     , 'nb_grnt_rsrs_avg': nb_grnt_rsrs_avg
#                     , 'nb_grnt_rsrs_med': nb_grnt_rsrs_med
#                     , 'pre_avg_fund_len_avg': pre_avg_fund_len_avg
#                     , 'pre_avg_fund_len_med': pre_avg_fund_len_med
#                     , 'pre_avg_team_size_avg': pre_avg_team_size_avg
#                     , 'pre_avg_team_size_med': pre_avg_team_size_med
#                     , 'pre_fund_amt_avg': pre_fund_amt_avg
#                     , 'pre_fund_amt_med': pre_fund_amt_med
#                     , 'pre_nb_grnts_avg': pre_nb_grnts_avg
#                     , 'pre_nb_grnts_med': pre_nb_grnts_med
#                     , 'post_avg_fund_len_avg': post_avg_fund_len_avg
#                     , 'post_avg_fund_len_med': post_avg_fund_len_med
#                     , 'post_avg_team_size_avg': post_avg_team_size_avg
#                     , 'post_avg_team_size_med': post_avg_team_size_med
#                     , 'post_fund_amt_avg': post_fund_amt_avg
#                     , 'post_fund_amt_med': post_fund_amt_med
#                     , 'post_nb_grnts_avg': post_nb_grnts_avg
#                     , 'post_nb_grnts_med': post_nb_grnts_med
#                     , 'pre_citations_avg': pre_citations_avg
#                     , 'pre_nb_pubs_avg': pre_nb_pubs_avg
#                     , 'pre_citations_med': pre_citations_med
#                     , 'pre_nb_pubs_med': pre_nb_pubs_med
#                     , 'post_citations_avg': post_citations_avg
#                     , 'post_citations_med': post_citations_med
#                     , 'post_nb_pubs_avg': post_nb_pubs_avg
#                     , 'post_nb_pubs_med': post_nb_pubs_med
#                    , 'pre_team_size_avg': pre_team_size_avg
#                    , 'pre_team_size_med': pre_team_size_med
#                    , 'post_team_size_avg': post_team_size_avg
#                    , 'post_team_size_med': post_team_size_med
#                   }, index=funders).transpose()
# df

In [None]:
# # Export to Excel
# ls = !ls ../output/
# if 'comparison_statistics.xlsx' in ls:
#     book = load_workbook('../output/comparison_statistics.xlsx')
#     writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
#     writer.book = book
#     writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#     df.to_excel(writer, "raw_by_agency")
#     writer.save()
# else:
#     df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'raw_by_agency')

## Preparing Data for Regression Analysis

In [None]:
# Switch forwarded to grant_groupby() and pub_groupby() below; when set, 'year'
# is also added to the key used by the uniqueness checks on their results.
# NOTE(review): the merges under "Combine All" join on 'year' unconditionally,
# so they presumably only work with yearly = True -- confirm.
yearly = True

### Researcher Info

In [None]:
# Summary statistics for every column of the researcher table; include='all'
# asks for non-numeric columns to be summarised alongside numeric ones.
rsrs.describe(include='all')

### Grant Information

In [None]:
# Sanity check: prints True when no grant_id value is repeated in
# funded_grnts_info, i.e. grant_id can serve as the key of that table.
print(funded_grnts_info.set_index(['grant_id']).index.is_unique)
# (rows, columns) of the grant information table.
print(funded_grnts_info.shape)
# Preview of the first rows (shown as the cell output).
funded_grnts_info.head()

### Prior/Subsequent Grant Statistics

In [None]:
# Build the grant statistics table from grnts_mrg, forwarding the `yearly` switch.
# grant_groupby() is defined outside this section; presumably it aggregates to one
# row per funder/grant/researcher/status (and year when yearly is set), which is
# the key verified in the next cell -- confirm against its definition.
grnts_stats = grant_groupby(grnts_mrg, yearly=yearly)

In [None]:
# Check that the unit of observation of grnts_stats is
# Funder-Grant-Researcher-Status (plus Year when `yearly` is set):
# prints True when that key identifies every row uniquely.
cols = ['funder_name', 'grant_id', 'rsr_id', 'status']
if yearly:
    cols.append('year')
print(grnts_stats.set_index(cols).index.is_unique)
# (rows, columns) of the statistics table, then a preview of its first rows.
print(grnts_stats.shape)
grnts_stats.head()

### Prior/Subsequent Publication Statistics

In [None]:
# Build the publication statistics table from pubs_mrg, forwarding the `yearly` switch.
# pub_groupby() is defined outside this section; presumably it aggregates to one
# row per funder/grant/researcher/status (and year when yearly is set), which is
# the key verified in the next cell -- confirm against its definition.
pubs_stats = pub_groupby(pubs_mrg, yearly=yearly)

In [None]:
# Check that the unit of observation of pubs_stats is
# Funder-Grant-Researcher-Status (plus Year when `yearly` is set):
# prints True when that key identifies every row uniquely.
cols = ['funder_name', 'grant_id', 'rsr_id', 'status']
if yearly:
    cols.append('year')
print(pubs_stats.set_index(cols).index.is_unique)
# (rows, columns) of the statistics table, then a preview of its first rows.
print(pubs_stats.shape)
pubs_stats.head()

### Add CSO Codes

In [None]:
# Preview the publication -> CSO lookup loaded from
# ../data/topic_lookups/publications_cso.csv at the top of the notebook.
pubs_cso.head()

In [None]:
# Attach the CSO rows of each publication to its funder / grant / researcher /
# year context; the inner join keeps only publications present in pubs_cso.
pubs_cso_features = pubs_mrg[
    ['funder_name', 'grant_id', 'rsr_id', 'year', 'pub_id']
].merge(pubs_cso, on='pub_id', how='inner')

# Give every distinct cso_name an integer id and derive a short label
# ("cso_<id>") from it; the label is what later cells group and pivot on.
pubs_cso_features['cso_id'] = pubs_cso_features.groupby('cso_name').ngroup()
pubs_cso_features['cso'] = "cso_" + pubs_cso_features['cso_id'].astype(str)

In [None]:
# Number of publication rows per funder / grant / researcher / CSO label,
# returned as a flat table with the count stored in 'nb_pubs'.
# 'year' is not part of the grouping, so counts span all years.
pubs_cso_features = (
    pubs_cso_features
    .groupby(['funder_name', 'grant_id', 'rsr_id', 'cso'])['pub_id']
    .count()
    .reset_index(name='nb_pubs')
)

In [None]:
# Reshape long -> wide: one row per funder / grant / researcher and one column
# per CSO label holding the summed publication count for that label.
pubs_cso_features = pd.pivot_table(
    data=pubs_cso_features,
    values='nb_pubs',
    index=['funder_name', 'grant_id', 'rsr_id'],
    columns='cso',
    aggfunc='sum',
)
# Turn the index levels back into ordinary columns.
pubs_cso_features = pubs_cso_features.reset_index()

In [None]:
# Sanity check: prints True when funder / grant / researcher identifies each
# row of the pivoted CSO table uniquely.
print(pubs_cso_features.set_index(['funder_name', 'grant_id', 'rsr_id']).index.is_unique)
# (rows, columns) of the pivoted table.
print(pubs_cso_features.shape)
# Preview of the first rows (shown as the cell output).
pubs_cso_features.head()

### Combine All

In [None]:
# Assemble the regression dataset step by step.
# (A former first merge of funded_grnts with merge_key was dropped: its result
# was overwritten by the assignment below before ever being read.)

# Grant-level tables, joined on grant_id; outer joins keep grants found in only one table.
df = pd.merge(funded_grnts_funder, funded_grnts_rsr, how='outer', on='grant_id')
df = pd.merge(df, funded_grnts_info, how='outer', on='grant_id')

# NOTE(review): merge_key presumably supplies the 'year' / 'status' columns used
# as join keys below, looked up by grant start_year -- confirm against its definition.
df = pd.merge(df, merge_key, how='left', on='start_year')

# Grant and publication statistics, matched on the full observation key.
df = pd.merge(df, grnts_stats, how='outer', on=['funder_name', 'grant_id', 'rsr_id', 'year', 'status'])
df = pd.merge(df, pubs_stats, how='outer', on=['funder_name', 'grant_id', 'rsr_id', 'year', 'status'])

# Researcher attributes and propensity-score columns, matched on rsr_id.
df = pd.merge(df, rsrs, how='left', on='rsr_id')
df = pd.merge(df, propensity_scores[['rsr_id', 'inca_prob', 'weight']], on='rsr_id', how='left')

# Difference between the observation year and the researcher's career start year.
df['rsr_career_age'] = df['year']-df['rsr_career_start_year']

# The CSO publication counts are currently not merged into the dataset:
# df = pd.merge(df, pubs_cso_features, how='outer', on=['funder_name', 'grant_id', 'rsr_id'])

In [None]:
# Sanity check: prints True when funder / grant / researcher / year / status
# identifies each row of the combined dataset uniquely.
print(df.set_index(['funder_name', 'grant_id', 'rsr_id', 'year', 'status']).index.is_unique)
# (rows, columns) of the combined dataset.
print(df.shape)
# Summary statistics for every column (shown as the cell output).
df.describe(include='all')

In [None]:
# Write the combined dataset to disk; index=False leaves the row index out of the CSV.
df.to_csv('../data/regression_dataset.csv', index=False)

## Sandbox