# Comparison Statistics between Cohorts

## Python Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
import glob
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100

## Load In Data

In [89]:
# Read the five main input tables in one pass; low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
funded_grnts, grnts, pubs, rsrs, propensity_scores = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/funded_grants.csv',
        '../data/researcher_grants.csv',
        '../data/researcher_publications.csv',
        '../data/researcher_info.csv',
        '../data/rsr_weights.csv',
    )
)

In [3]:
# Topic lookup tables (cancer type, CSO, RCDC) for grants and publications.
grnts_ct, grnts_cso, grnts_rcdc, pubs_ct, pubs_cso, pubs_rcdc = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/topic_lookups/grants_cancer_type.csv',
        '../data/topic_lookups/grants_cso.csv',
        '../data/topic_lookups/grants_rcdc.csv',
        '../data/topic_lookups/publications_cancer_type.csv',
        '../data/topic_lookups/publications_cso.csv',
        '../data/topic_lookups/publications_rcdc.csv',
    )
)

# Lookup keyed on cso_name (it is joined on that column further down).
cso_lookup = pd.read_csv('../data/cso_codes/cso_lookup.csv')

In [16]:
# Preview the first rows of the funded-grants table.
funded_grnts.head()

Unnamed: 0,funder_name,grant_id,rsr_id,first_name,last_name,start_date,end_date,funding_amount,funding_len,start_year,nb_rsrs
0,INCa/INSERM/DGOS,grant.7426242,ur.0642054564.81,Jérôme,ABADIE,2011-12-13,2013-06-12,65061.0,1.49863,2011,1
1,INCa/INSERM/DGOS,grant.7426178,ur.01177206360.47,Julien,ADAM,2011-11-01,2014-11-01,240110.0,3.00274,2011,2
2,INCa/INSERM/DGOS,grant.7154464,ur.01303404424.36&ur.01067706306.01,Antoine,ADENIS,2010-06-01,2013-06-01,248109.0,3.00274,2010,2
3,INCa/INSERM/DGOS,grant.7154483,ur.0673152200.72,Eric,ADRIAENSSENS,2010-11-04,2013-11-04,400306.0,3.00274,2010,1
4,INCa/INSERM/DGOS,grant.7154359,ur.0673152200.72,Eric,ADRIAENSSENS,2007-12-18,2010-06-30,140617.0,2.534247,2007,2


In [92]:
# Attach the RCDC topics to every funded grant, keep the rows tagged with the
# topic of interest, then discard the two topic columns again.
funded_grnts = funded_grnts.merge(grnts_rcdc, how='left', on='grant_id')
funded_grnts = (
    funded_grnts
    .loc[funded_grnts['rcdc_name'] == "CERVICAL CANCER"]
    .drop(['rcdc_name', 'nb_rcdc'], axis=1)
)

In [93]:
# Same topic restriction for publications: join the RCDC topics, keep the
# rows tagged with the topic of interest, drop the helper columns.
pubs = pubs.merge(pubs_rcdc, how='left', on='pub_id')
pubs = (
    pubs
    .loc[pubs['rcdc_name'] == "CERVICAL CANCER"]
    .drop(['rcdc_name', 'nb_rcdc'], axis=1)
)

In [94]:
# Print the shape of the rows that have a missing value in each key column.
for frame, column in (
    (funded_grnts, 'rsr_id'),
    (grnts, 'rsr_id'),
    (pubs, 'pub_id'),
    (pubs, 'date'),
):
    print(frame[frame[column].isnull()].shape)

(0, 8)
(0, 6)
(0, 17)
(2, 17)


In [95]:
# Discard rows whose identifying columns are missing and renumber the index.
funded_grnts = funded_grnts.dropna(subset=['rsr_id', 'grant_id']).reset_index(drop=True)
grnts = grnts.dropna(subset=['rsr_id', 'grant_id']).reset_index(drop=True)
pubs = pubs.dropna(subset=['pub_id', 'date']).reset_index(drop=True)

In [96]:
# Alphabetical list of the distinct funder names present in the funded grants.
funders = funded_grnts['funder_name'].unique().tolist()
funders.sort()
print(funders)

['Cancer Research UK', 'INCa/INSERM/DGOS', 'National Cancer Institute', 'National Health and Medical Research Council', 'Wellcome Trust']


## Cleaning Data

In [97]:
# Convert the count columns to numbers: a bad 'citations' value raises,
# a bad 'nb_authors' value becomes NaN.
for column, on_error in (('citations', 'raise'), ('nb_authors', 'coerce')):
    pubs[column] = pd.to_numeric(pubs[column], errors=on_error)

In [98]:
# Parse the grant dates, then derive the funding length (in 365-day years)
# and the calendar year in which each grant starts, for both grant tables.
for frame in (funded_grnts, grnts):
    for date_col in ('start_date', 'end_date'):
        frame[date_col] = pd.to_datetime(frame[date_col])
    frame['funding_len'] = (frame['end_date'] - frame['start_date']) / timedelta(days=365)
    frame['start_year'] = pd.DatetimeIndex(frame['start_date']).year

In [99]:
# Team size = number of distinct researchers attached to each grant;
# a count of zero is recorded as missing.
for frame in (funded_grnts, grnts):
    team_size = frame.groupby('grant_id')['rsr_id'].transform('nunique')
    frame['nb_rsrs'] = team_size.replace(0, np.nan)

In [100]:
def _dedupe_with_count(table, key, count_col):
    """Drop duplicate rows, sort by `key`, and add the number of rows per `key` as `count_col`."""
    table = table.drop_duplicates().sort_values(key).reset_index(drop=True)
    table[count_col] = table.groupby(key)[key].transform('count')
    return table


# Number of RCDC / CSO codes attached to each grant and each publication.
grnts_rcdc = _dedupe_with_count(grnts_rcdc, 'grant_id', 'nb_rcdc')

grnts_cso = _dedupe_with_count(grnts_cso, 'grant_id', 'nb_cso')

pubs_rcdc = _dedupe_with_count(pubs_rcdc, 'pub_id', 'nb_rcdc')

pubs_cso = _dedupe_with_count(pubs_cso, 'pub_id', 'nb_cso')

### Create Lookup Tables

In [101]:
def _distinct_rows(frame, columns):
    """Return the distinct combinations of `columns` in `frame`, re-indexed from 0."""
    return frame[columns].drop_duplicates().reset_index(drop=True)


# grant <-> funder links
funded_grnts_funder = _distinct_rows(funded_grnts, ['funder_name', 'grant_id'])
grnts_funder = _distinct_rows(grnts, ['funder_name', 'grant_id'])

# grant <-> researcher links
funded_grnts_rsr = _distinct_rows(funded_grnts, ['rsr_id', 'grant_id'])
grnts_rsr = _distinct_rows(grnts, ['rsr_id', 'grant_id'])

# publication <-> researcher links, with the researcher columns of the publication rows
pub_rsr_cols = ['rsr_id', 'rsr_country', 'rsr_city', 'rsr_affiliation', 'rsr_affiliation_id']
pubs_rsr = _distinct_rows(pubs, pub_rsr_cols + ['pub_id'])

In [102]:
# Grant-level attributes of the funded grants: every column except the
# researcher and funder identifiers, de-duplicated on all remaining columns.
to_remove = ('rsr_id', 'funder_name', 'first_name', 'last_name')
cols = [name for name in funded_grnts.columns if name not in to_remove]
funded_grnts_info = funded_grnts.loc[:, cols].drop_duplicates().reset_index(drop=True)

In [103]:
# Grant-level attributes for all researcher grants: the first row per grant_id.
to_remove = ('rsr_id', 'funder_name')
cols = [name for name in grnts.columns if name not in to_remove]
grnts_info = grnts.loc[:, cols].drop_duplicates(subset='grant_id').reset_index(drop=True)

In [104]:
# Publication-level attributes (everything but the researcher columns):
# the first row per pub_id.
cols = [name for name in pubs.columns if name not in pub_rsr_cols]
pubs_info = pubs.loc[:, cols].drop_duplicates(subset='pub_id').reset_index(drop=True)

### Merging on Prior and Subsequent Grants and Publications

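Only grants and publications dated from five calendar years before to five calendar years after the funded grant's start year are kept. The start year itself is counted as "pre"; later years are "post".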

In [105]:
start_year = 2007
end_year = 2012

# One row per (funded-grant start year, calendar year) pair within a window of
# five years either side of the start year. Years up to and including the
# start year are labelled "pre", later years "post".
window = [
    (grant_year, "post" if award_year > grant_year else "pre", award_year)
    for grant_year in range(start_year, end_year + 1)
    for award_year in range(grant_year - 5, grant_year + 6)
]
col1 = [row[0] for row in window]
col2 = [row[1] for row in window]
col3 = [row[2] for row in window]
merge_key = pd.DataFrame({'start_year': col1, 'status': col2, 'year': col3})

In [106]:
# Expand every funded (grant, researcher) pair into one row per calendar year
# of its pre/post window.
funded_keys = funded_grnts[['funder_name', 'grant_id', 'start_year', 'start_date', 'rsr_id']].drop_duplicates()
temp = funded_keys.merge(merge_key, how='left', on='start_year')

In [107]:
# Attach each researcher's grants that start in a window year; the matched
# grant's id lands in 'grant_id_2'.
grnt_cols = ['rsr_id', 'grant_id', 'start_year']
other_grnts = grnts[grnt_cols].drop_duplicates().rename(columns={'start_year': 'year'})
grnts_mrg = temp.merge(other_grnts, how='left', on=['rsr_id', 'year'], suffixes=('', '_2'))

In [108]:
# Attach each researcher's publications from every window year.
pub_cols = ['rsr_id', 'pub_id', 'year']
rsr_pubs = pubs[pub_cols].drop_duplicates()
pubs_mrg = temp.merge(rsr_pubs, how='left', on=['rsr_id', 'year'], suffixes=('', '_2'))

## Store Results in Dictionary

In [109]:
# Container for every summary table produced below, keyed by statistic name.
results = {}

## Create Statistics Functions

In [110]:
def grant_groupby(grnts_mrg, yearly=False):
    """Summarise the grants matched to each funded (grant, researcher) pair.

    Parameters
    ----------
    grnts_mrg : pandas.DataFrame
        Must contain 'grant_id', 'rsr_id', 'status', 'year' and 'grant_id_2'
        (the id of the matched grant).
    yearly : bool, default False
        When true, 'year' is added to the grouping keys so the statistics are
        computed per calendar year rather than per status.

    Returns
    -------
    pandas.DataFrame
        `funded_grnts_funder` left-joined on 'grant_id' with, per group, the
        number of distinct matched grants ('nb_grnts') and the means of their
        funding amount ('fund_amt'), funding length ('avg_fund_len') and team
        size ('avg_team_size').
    """
    # Suffix a copy of the grant-level lookup instead of renaming the columns
    # of the shared `grnts_info` in place: the global is never modified, so a
    # failure in the merge cannot leave it with "_2" column names.
    matched_info = grnts_info.add_suffix('_2').rename(columns={'start_year_2': 'year'})
    df = pd.merge(grnts_mrg, matched_info, how='left', on=['grant_id_2', 'year'])

    cols = ['grant_id', 'rsr_id', 'status']
    if yearly:
        cols.append('year')
    grouped = df.groupby(cols)
    grnts_stats = pd.DataFrame({'nb_grnts': grouped['grant_id_2'].nunique(),
                                'fund_amt': grouped['funding_amount_2'].mean(),
                                'avg_fund_len': grouped['funding_len_2'].mean(),
                                'avg_team_size': grouped['nb_rsrs_2'].mean(),
                                }).reset_index()

    # Left join from the funder table: a funded grant with no statistics
    # still gets a row.
    return pd.merge(funded_grnts_funder, grnts_stats, how='left', on='grant_id')

In [111]:
def pub_groupby(pubs_mrg, yearly=False):
    """Summarise the publications matched to each funded (grant, researcher) pair.

    Groups by 'grant_id', 'rsr_id' and 'status' (plus 'year' when `yearly` is
    True) and returns `funded_grnts_funder` left-joined with, per group:
    the number of distinct publications, mean citations, mean number of
    authors, the number of distinct authors and the number of distinct author
    countries across the matched publications.
    """
    group_keys = ['grant_id', 'rsr_id', 'status']
    if yearly == True:
        group_keys = group_keys + ['year']

    # Publication statistics
    by_pub = pubs_mrg.merge(pubs_info, how='left', on=['pub_id', 'year']).groupby(group_keys)
    output = pd.DataFrame({
        'nb_pubs': by_pub['pub_id'].nunique(),
        'citations_per_pub': by_pub['citations'].mean(),
        'team_size': by_pub['nb_authors'].mean(),
    }).reset_index()

    # Network statistics: every author row of every matched publication
    authors = pubs_rsr.rename(columns={'rsr_id': 'author_id'})
    by_author = pubs_mrg.merge(authors, how='left', on='pub_id').groupby(group_keys)
    network = pd.DataFrame({
        'nb_collabs': by_author['author_id'].nunique(),
        'nb_collab_countries': by_author['rsr_country'].nunique(),
    }).reset_index()

    # Combine both, then attach the funder of each funded grant
    combined = pd.merge(output, network, how='outer', on=group_keys)
    return pd.merge(funded_grnts_funder, combined, how='left', on='grant_id')

## 1. Funded Grants Statistics

### Number of Grants per Funder

In [43]:
# Summary of every column (numeric and non-numeric) of the funded grants.
funded_grnts.describe(include='all')

Unnamed: 0,funder_name,grant_id,rsr_id,first_name,last_name,start_date,end_date,funding_amount,funding_len,start_year,nb_rsrs
count,3249,3249,3249,3248,3249,3249,3241,3043.0,3241.0,3249.0,3249.0
unique,5,2399,2484,1978,1997,533,344,,,,
top,National Cancer Institute,grant.2695966,ur.01117731572.33,David,HEIMBROOK,2009-01-01 00:00:00,2013-01-01 00:00:00,,,,
freq,2359,18,40,64,40,172,103,,,,
first,,,,,,2007-01-01 00:00:00,2007-09-30 00:00:00,,,,
last,,,,,,2012-12-31 00:00:00,2023-08-31 00:00:00,,,,
mean,,,,,,,,2066182.0,4.278447,2009.567559,2.244691
std,,,,,,,,4177095.0,2.377289,1.693713,2.399999
min,,,,,,,,3500.0,0.073973,2007.0,1.0
25%,,,,,,,,330779.5,2.665753,2008.0,1.0


In [44]:
# Distinct funded researchers, overall and per funder.
results['nb_unique_rsrs'] = {}

overall_rsrs = funded_grnts['rsr_id'].nunique()
print("Number of unique researchers funded:")
print(overall_rsrs)
results['nb_unique_rsrs']['overall'] = overall_rsrs

rsrs_per_funder = funded_grnts.groupby('funder_name')['rsr_id'].nunique()
print("\nNumber of unique researchers funded by each agency:")
print(rsrs_per_funder)
for funder in funders:
    results['nb_unique_rsrs'][funder] = rsrs_per_funder[funder]

Number of unique researchers funded:
2484

Number of unique researchers funded by each agency:
funder_name
Cancer Research UK                               155
INCa/INSERM/DGOS                                  89
National Cancer Institute                       1848
National Health and Medical Research Council     352
Wellcome Trust                                    55
Name: rsr_id, dtype: int64


In [45]:
# Distinct funded grants, overall and per funder.
results['nb_unique_grnts'] = {}

overall_grnts = funded_grnts['grant_id'].nunique()
print("Number of unique grants funded:")
print(overall_grnts)
results['nb_unique_grnts']['overall'] = overall_grnts

grnts_per_funder = funded_grnts.groupby('funder_name')['grant_id'].nunique()
print("\nNumber of unique grants funded by each agency:")
print(grnts_per_funder)
for funder in funders:
    results['nb_unique_grnts'][funder] = grnts_per_funder[funder]

Number of unique grants funded:
2399

Number of unique grants funded by each agency:
funder_name
Cancer Research UK                               201
INCa/INSERM/DGOS                                 118
National Cancer Institute                       1892
National Health and Medical Research Council     156
Wellcome Trust                                    32
Name: grant_id, dtype: int64


### Number of Grants per Researcher

In [46]:
# Number of distinct funded grants held by each researcher: the mean, then
# the share of researchers holding 1, 2, 3, ... grants.
# The heading previously read "Agerage"; the typo is fixed here.
grants_per_rsr = funded_grnts.groupby('rsr_id')['grant_id'].nunique()
print("Average Number of Grants per Researcher:")
print(grants_per_rsr.mean())
print("\nNumber of grants from agencies per researcher (as % of total):\n")
print(grants_per_rsr.value_counts(normalize=True).head())

Agerage Number of Grants per Researcher:
1.3079710144927537

Number of grants from agencies per researcher (as % of total):

1    0.816023
2    0.121981
3    0.038245
4    0.012480
5    0.006039
Name: grant_id, dtype: float64


In [47]:
# Number of distinct funders behind each researcher: the mean, then the share
# of researchers funded by 1, 2, 3, ... agencies.
# The heading previously read "Agerage"; the typo is fixed here.
agencies_per_rsr = funded_grnts.groupby('rsr_id')['funder_name'].nunique()
print("Average Number of Agencies per Researcher:")
print(agencies_per_rsr.mean())
print("\nBy how many agencies are the researchers funded?\n")
print(agencies_per_rsr.value_counts(normalize=True))

Agerage Number of Agencies per Researcher:
1.0060386473429952

By how many agencies are the researchers funded?

1    0.994767
2    0.004428
3    0.000805
Name: funder_name, dtype: float64


In [48]:
# Per funder: the three most common numbers of grants per researcher (as a
# share of that funder's researchers) and the number of researchers funded.
# The heading previously read "reserachers"; the typo is fixed here.
print("How many grants does each agency give to its researchers in the 5 focal years?")
for funder in funders:
    print("\n{}:".format(funder))
    funder_grnts = funded_grnts[funded_grnts['funder_name']==funder]
    temp = funder_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()
    print(temp['grant_id'].value_counts(normalize=True).head(3))
    print("(Total researchers: {})".format(temp.shape[0]))

How many grants does each agency give to its reserachers in the 5 focal years?

Cancer Research UK:
1    0.838710
2    0.090323
3    0.038710
Name: grant_id, dtype: float64
(Total researchers: 155)

INCa/INSERM/DGOS:
1    0.662921
2    0.191011
4    0.056180
Name: grant_id, dtype: float64
(Total researchers: 89)

National Cancer Institute:
1    0.820887
2    0.127706
3    0.035173
Name: grant_id, dtype: float64
(Total researchers: 1848)

National Health and Medical Research Council:
1    0.812500
2    0.099432
3    0.051136
Name: grant_id, dtype: float64
(Total researchers: 352)

Wellcome Trust:
1    0.836364
2    0.109091
3    0.036364
Name: grant_id, dtype: float64
(Total researchers: 55)


### Grant Characteristics

In [49]:
# Amount, length and team size of the funded grants, overall and per funder.
results['grant_characteristics'] = {}
stat_cols = ['funding_amount', 'funding_len', 'nb_rsrs']

df = funded_grnts_funder.merge(funded_grnts_info, how='left', on='grant_id')
print("Average Grant Amount: {}".format(df['funding_amount'].mean()))
print("Average Funding Length: {}".format(df['funding_len'].mean()))
print("Average Team Size: {}".format(df['nb_rsrs'].mean()))
results['grant_characteristics']['overall'] = df[stat_cols].describe()

# Placeholders kept from the original cell; nothing fills them here.
funded_grnts_amt_avg, funded_grnts_amt_med = [], []
funded_grnts_len_avg, funded_grnts_len_med = [], []
funded_grnts_team_size_avg, funded_grnts_team_size_med = [], []

for funder in funders:
    temp = df[df['funder_name']==funder].copy()
    print("\n{}:".format(funder))
    print(temp[stat_cols].describe())
    print("(Total grants: {})".format(temp.shape[0]))
    results['grant_characteristics'][funder] = temp[stat_cols].describe()

Average Grant Amount: 1445552.075978162
Average Funding Length: 4.168169551993043
Average Team Size: 1.35431429762401

Cancer Research UK:
       funding_amount  funding_len     nb_rsrs
count             0.0   201.000000  201.000000
mean              NaN     3.622027    1.024876
std               NaN     1.924575    0.156135
min               NaN     0.073973    1.000000
25%               NaN     2.663014    1.000000
50%               NaN     3.328767    1.000000
75%               NaN     5.000000    1.000000
max               NaN    10.504110    2.000000
(Total grants: 201)

INCa/INSERM/DGOS:
       funding_amount  funding_len     nb_rsrs
count    1.180000e+02   117.000000  118.000000
mean     4.550756e+05     2.835874    1.237288
std      3.262651e+05     0.651797    0.427235
min      3.070900e+04     0.997260    1.000000
25%      2.632482e+05     2.997260    1.000000
50%      3.998370e+05     3.000000    1.000000
75%      5.924550e+05     3.002740    1.000000
max      2.123158e+06  

In [50]:
# Number of funded grants per funder and start year.
temp = funded_grnts_funder.merge(funded_grnts_info, how='left', on='grant_id')
start_years = pd.DatetimeIndex(temp['start_date']).year
temp['year'] = start_years
pd.crosstab(temp['funder_name'], temp['year'])

year,2007,2008,2009,2010,2011,2012
funder_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cancer Research UK,35,42,27,30,37,30
INCa/INSERM/DGOS,19,13,28,12,29,17
National Cancer Institute,303,317,420,283,264,305
National Health and Medical Research Council,23,28,23,15,33,34
Wellcome Trust,3,8,5,7,5,4


## 2. Prior and Subsequent Grants

In [51]:
# Per funded (grant, researcher) pair and pre/post status: statistics of the
# researcher's grants that start inside the window (not split by year).
grnts_stats = grant_groupby(grnts_mrg)

In [52]:
# Summary statistics of the researchers' grants before ("pre") and after
# ("post") the funded grant, overall and per funder.
results['pre_grnt_stats'] = {}
results['post_grnt_stats'] = {}

cols = ['nb_grnts', 'fund_amt', 'avg_fund_len', 'avg_team_size']

# Overall means; the "post" group is separated from "pre" by a blank line.
for status, label, lead in (("pre", "Pre", ""), ("post", "Post", "\n")):
    temp = grnts_stats[grnts_stats['status']==status].copy()
    print("{}Average Funding Length {}-Grant: {}".format(lead, label, temp['avg_fund_len'].mean()))
    print("Average Team Size {}-Grant: {}".format(label, temp['avg_team_size'].mean()))
    print("Average Funding Amount {}-Grant: {}".format(label, temp['fund_amt'].mean()))
    print("Average Total Number of Grants {}-Grant: {}".format(label, temp['nb_grnts'].mean()))
    results[status + '_grnt_stats']['overall'] = temp[cols].describe()

# Per-funder breakdown.
for status, label in (("pre", "Pre"), ("post", "Post")):
    print("\n")
    print("{}-Funding Grant Statistics:".format(label))
    df = grnts_stats[grnts_stats['status']==status].copy()
    for funder in funders:
        print("\n{}:".format(funder))
        temp = df[df['funder_name']==funder]
        summary = temp[cols].describe()
        print(summary)
        # Store under the key of the status being processed. The post-funding
        # loop used to write into 'pre_grnt_stats', overwriting the pre
        # results and leaving 'post_grnt_stats' without per-funder entries.
        results[status + '_grnt_stats'][funder] = summary

Average Funding Length Pre-Grant: 4.068297631223557
Average Team Size Pre-Grant: 2.2994527252597536
Average Funding Amount Pre-Grant: 1991879.5988396415
Average Total Number of Grants Pre-Grant: 8.297014465989536

Average Funding Length Post-Grant: 3.6753175598654364
Average Team Size Post-Grant: 2.5709125856938693
Average Funding Amount Post-Grant: 1988254.8848456023
Average Total Number of Grants Post-Grant: 1.6189596799015082


Pre-Funding Grant Statistics:

Cancer Research UK:
         nb_grnts      fund_amt  avg_fund_len  avg_team_size
count  206.000000  8.600000e+01    206.000000     206.000000
mean     4.893204  1.809503e+06      3.581729       1.616840
std      4.594615  2.308962e+06      1.457221       1.363630
min      1.000000  5.328500e+03      0.079452       1.000000
25%      1.000000  4.563517e+05      2.929990       1.000000
50%      3.000000  1.054485e+06      3.500685       1.000000
75%      7.000000  2.008241e+06      4.392955       1.685897
max     21.000000  1.57437

## 3. Prior and Subsequent Publications

In [53]:
# Per funded (grant, researcher) pair and pre/post status: statistics of the
# researcher's publications inside the window (not split by year).
pubs_stats = pub_groupby(pubs_mrg)

In [54]:
def _describe_by_funder(frame, stat_cols, funder_names, store):
    """Print and store `describe()` of `stat_cols` for each funder's rows of `frame`."""
    for name in funder_names:
        print("\n{}:".format(name))
        subset = frame[frame['funder_name']==name]
        print(subset[stat_cols].describe())
        store[name] = subset[stat_cols].describe()


# Publication statistics before ("pre") and after ("post") the funded grant,
# overall and per funder.
results['pre_pubs_stats'] = {}
results['post_pubs_stats'] = {}

cols = ['citations_per_pub', 'nb_pubs', 'team_size', 'nb_collabs', 'nb_collab_countries']

temp = pubs_stats[pubs_stats['status']=="pre"].copy()
print("Average Citations Pre-Grant: {}".format(temp['citations_per_pub'].mean()))
print("Average Total Number of Publications Pre-Grant: {}".format(temp['nb_pubs'].mean()))
results['pre_pubs_stats']['overall'] = temp[cols].describe()

temp = pubs_stats[pubs_stats['status']=="post"].copy()
print("\nAverage Citations Post-Grant: {}".format(temp['citations_per_pub'].mean()))
print("Average Total Number of Publications Post-Grant: {}".format(temp['nb_pubs'].mean()))
results['post_pubs_stats']['overall'] = temp[cols].describe()

print("\n")

print("Pre-Funding Publication Statistics:")
df = pubs_stats[pubs_stats['status']=="pre"].copy()
_describe_by_funder(df, cols, funders, results['pre_pubs_stats'])

print("\n")

print("Post-Funding Publication Statistics:")
df = pubs_stats[pubs_stats['status']=="post"].copy()
_describe_by_funder(df, cols, funders, results['post_pubs_stats'])

Average Citations Pre-Grant: 88.89044448273924
Average Total Number of Publications Pre-Grant: 8.587873191751308

Average Citations Post-Grant: 41.301844690468236
Average Total Number of Publications Post-Grant: 11.12588488765774


Pre-Funding Publication Statistics:

Cancer Research UK:
       citations_per_pub     nb_pubs   team_size  nb_collabs  \
count         130.000000  206.000000  130.000000  206.000000   
mean          115.830292   13.407767   13.255647   77.155340   
std           121.002273   22.226303   26.174716  135.590471   
min             1.000000    0.000000    2.000000    0.000000   
25%            45.000000    0.000000    6.440789    0.000000   
50%            96.775000    3.000000    8.704762   12.000000   
75%           147.181338   16.000000   11.458333   96.750000   
max           923.000000   93.000000  283.000000  720.000000   

       nb_collab_countries  
count           206.000000  
mean              4.169903  
std               6.419123  
min               

## 4. Topic Analyses

### Funded Grant RCDC Codes Analysis

In [55]:
# One row per (funded grant, RCDC code); grants without a code are kept.
df = funded_grnts_info.merge(grnts_rcdc, how='left', on='grant_id')

In [56]:
# Frequency of every RCDC code across the funded grants; keep the top five names.
rcdc_counts = df['rcdc_name'].value_counts()
print("Most Common RCDC Codes:")
print(rcdc_counts.head())
funded_grnts_rcdc_tot = rcdc_counts.index[0:5]

Most Common RCDC Codes:
CANCER               2399
BREAST CANCER        2399
CLINICAL RESEARCH    1030
PREVENTION            850
GENETICS              834
Name: rcdc_name, dtype: int64


In [57]:
# One row per (funder, funded grant, RCDC code).
df = funded_grnts_funder.merge(grnts_rcdc, how='left', on='grant_id')

In [58]:
# Collapse to one row per (funder, grant) before averaging the code counts.
print("Average number of RCDC's per Grant:\n")
rcdc_per_grant = df[['funder_name', 'grant_id', 'nb_rcdc']].drop_duplicates()
print(rcdc_per_grant.groupby('funder_name')['nb_rcdc'].mean())

Average number of RCDC's per Grant:

funder_name
Cancer Research UK                              5.179104
INCa/INSERM/DGOS                                4.915254
National Cancer Institute                       5.833510
National Health and Medical Research Council    4.493590
Wellcome Trust                                  4.218750
Name: nb_rcdc, dtype: float64


In [59]:
# The i-th list collects, funder by funder, the i-th most frequent RCDC code.
funded_grnts_rcdc_1, funded_grnts_rcdc_2, funded_grnts_rcdc_3 = [], [], []
funded_grnts_rcdc_4, funded_grnts_rcdc_5 = [], []
rcdc_rank_lists = (funded_grnts_rcdc_1, funded_grnts_rcdc_2, funded_grnts_rcdc_3,
                   funded_grnts_rcdc_4, funded_grnts_rcdc_5)

for funder in funders:
    print("\n{}:".format(funder))
    temp = df[df['funder_name']==funder]
    ranked = temp['rcdc_name'].value_counts()
    print(ranked.head())
    for rank, bucket in enumerate(rcdc_rank_lists):
        bucket.append(ranked.index[rank])


Cancer Research UK:
CANCER               201
BREAST CANCER        201
CLINICAL RESEARCH     97
GENETICS              86
PREVENTION            57
Name: rcdc_name, dtype: int64

INCa/INSERM/DGOS:
CANCER               118
BREAST CANCER        118
CLINICAL RESEARCH     58
GENETICS              40
PREVENTION            28
Name: rcdc_name, dtype: int64

National Cancer Institute:
CANCER               1892
BREAST CANCER        1892
CLINICAL RESEARCH     825
PREVENTION            719
GENETICS              652
Name: rcdc_name, dtype: int64

National Health and Medical Research Council:
BREAST CANCER        156
CANCER               156
PREVENTION            44
CLINICAL RESEARCH     41
GENETICS              40
Name: rcdc_name, dtype: int64

Wellcome Trust:
BREAST CANCER        32
CANCER               32
GENETICS             16
HUMAN GENOME          9
CLINICAL RESEARCH     9
Name: rcdc_name, dtype: int64


In [60]:
# Keep only the first CSO row of each publication.
pubs_cso_1 = pubs_cso.drop_duplicates(subset='pub_id', keep='first')
pubs_cso_1 = pubs_cso_1.reset_index(drop=True)

### Publication CSO Codes

In [63]:
# Distinct (funder, publication, CSO code) rows for the matched publications.
df = (
    pubs_mrg
    .merge(pubs_cso, how='left', on='pub_id')
    [['funder_name', 'pub_id', 'cso_name', 'nb_cso']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [64]:
# Collapse to one row per (funder, publication) before averaging the code counts.
print("Average number of CSO Codes per Publication Associated to Grant:\n")
cso_per_pub = df[['funder_name', 'pub_id', 'nb_cso']].drop_duplicates()
print(cso_per_pub.groupby('funder_name')['nb_cso'].mean())

Average number of CSO Codes per Publication Associated to Grant:

funder_name
Cancer Research UK                              1.548647
INCa/INSERM/DGOS                                1.543735
National Cancer Institute                       1.543091
National Health and Medical Research Council    1.516412
Wellcome Trust                                  1.667308
Name: nb_cso, dtype: float64


In [65]:
# The i-th list collects, funder by funder, the i-th most frequent CSO code.
cso_name_1, cso_name_2, cso_name_3, cso_name_4, cso_name_5 = [], [], [], [], []
cso_rank_lists = (cso_name_1, cso_name_2, cso_name_3, cso_name_4, cso_name_5)

for funder in funders:
    print("\n{}:".format(funder))
    temp = df[df['funder_name']==funder]
    nb_missing = temp[temp['cso_name'].isnull()].shape[0]
    print("Number of Publications with missing cso_name Codes: {}".format(nb_missing))
    ranked = temp['cso_name'].value_counts()
    print(ranked.head())
    for rank, bucket in enumerate(cso_rank_lists):
        bucket.append(ranked.index[rank])


Cancer Research UK:
Number of Publications with missing cso_name Codes: 381
Endogenous Factors in the Origin and Cause of Cancer       349
Cancer Initiation: Oncogenes and Tumor Suppressor Genes    267
Technology Development and/or Marker Discovery             253
Cancer Progression and Metastasis                          242
Systemic Therapies - Discovery and Development             236
Name: cso_name, dtype: int64

INCa/INSERM/DGOS:
Number of Publications with missing cso_name Codes: 276
Technology and/or Marker Testing in a Clinical Setting    251
Technology Development and/or Marker Discovery            222
Endogenous Factors in the Origin and Cause of Cancer      202
Systemic Therapies - Clinical Applications                189
Cancer Progression and Metastasis                         181
Name: cso_name, dtype: int64

National Cancer Institute:
Number of Publications with missing cso_name Codes: 3011
Cancer Progression and Metastasis                          2855
Cancer Initiatio

### CSO * RCDC Distributions

In [66]:
# RCDC codes that are filtered out before the topic shares are computed.
rcdc_codes_to_remove = [
    'CANCER',
    'CLINICAL RESEARCH',
    'PREVENTION',
    'DIAGNOSTIC RADIOLOGY',
    'BEHAVIORAL AND SOCIAL SCIENCE',
    'PATIENT SAFETY',
    'HEALTH SERVICES',
    'IMMUNIZATION',
    'COMPLEMENTARY AND ALTERNATIVE MEDICINE',
    'BASIC BEHAVIORAL AND SOCIAL SCIENCE',
    'COMPARATIVE EFFECTIVENESS RESEARCH',
    'CLINICAL TRIALS AND SUPPORTIVE ACTIVITIES',
    'NETWORKING AND INFORMATION TECHNOLOGY R&D',
    'BURDEN OF ILLNESS',
]

In [67]:
# Matched publications with their CSO codes, the CSO lookup columns and
# their RCDC codes (all left joins, so untagged publications are kept).
topic_distributions = (
    pubs_mrg
    .merge(pubs_cso, on='pub_id', how='left')
    .merge(cso_lookup, on='cso_name', how='left')
    .merge(pubs_rcdc, on='pub_id', how='left')
)

In [68]:
# Choose which CSO column is analysed as 'cso' below: here the 'cso_cat' column.
topic_distributions = topic_distributions.rename(columns={'cso_cat': 'cso'})

In [69]:
# Restriction: only publications from the "post" part of the window.
is_post = topic_distributions['status'] == "post"
topic_distributions = topic_distributions[is_post]

In [70]:
# RCDC: share of each funder's (publication, researcher) pairs per RCDC code.
# A pair's weight of 1 is split equally over the codes it carries.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'rcdc_name']].drop_duplicates()
usable = (
    df['rcdc_name'].notnull()
    & (df['rcdc_name'] != "NAN")
    & ~df['rcdc_name'].isin(rcdc_codes_to_remove)
)
df = df[usable].reset_index(drop=True)
# df = df[df['rcdc_name'].str.upper().str.contains("CANCER")]

out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col, share_col = agency+' pubs', agency+' share'
    rows = df[df['funder_name']==agency].copy().reset_index(drop=True)
    den = len(rows[['pub_id', 'rsr_id']].drop_duplicates())
    rows[pubs_col] = 1/rows.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp = rows.groupby('rcdc_name')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp[share_col] = temp[pubs_col]/den
    print(temp[share_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False).drop(pubs_col, axis=1)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='rcdc_name')

# Unweighted mean over the five funders; NaN when a code is missing for any of them.
share_total = (out['INCa/INSERM/DGOS share']+out['National Cancer Institute share']
               +out['Cancer Research UK share']+out['Wellcome Trust share']
               +out['National Health and Medical Research Council share'])
out['average_share'] = share_total/5

out.to_csv('../output/top_topics/rcdc.csv', index=False)
print(out.shape[0])

1355.0000000000027
1.0000000000000018
17252.99999999987
0.9999999999999925
395.99999999999994
0.9999999999999999
1653.0000000000048
1.000000000000003
3404.9999999999836
0.9999999999999958
137


In [71]:
# CSO: share of each funder's (publication, researcher) pairs per CSO value.
# A pair's weight of 1 is split equally over the values it carries.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'cso']].drop_duplicates()
usable = df['cso'].notnull() & (df['cso'] != "NAN")
df = df[usable].reset_index(drop=True)

out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col, share_col = agency+' pubs', agency+' share'
    rows = df[df['funder_name']==agency].copy().reset_index(drop=True)
    den = len(rows[['pub_id', 'rsr_id']].drop_duplicates())
    rows[pubs_col] = 1/rows.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp = rows.groupby('cso')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp[share_col] = temp[pubs_col]/den
    print(temp[share_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False).drop(pubs_col, axis=1)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='cso')

# Unweighted mean over the five funders; NaN when a value is missing for any of them.
share_total = (out['INCa/INSERM/DGOS share']+out['National Cancer Institute share']
               +out['Cancer Research UK share']+out['Wellcome Trust share']
               +out['National Health and Medical Research Council share'])
out['average_share'] = share_total/5

out.to_csv('../output/top_topics/cso.csv', index=False)
print(out.shape[0])

1109.0
1.0000000000000002
14521.000000000007
1.0000000000000004
1369.0
1.0
347.0
1.0
2829.0
1.0
6


In [72]:
# RCDC x CSO: share of each funder's (publication, researcher) pairs per
# combination of RCDC code and CSO value. A pair's weight of 1 is split
# equally over the combinations it carries.
df = topic_distributions[['funder_name', 'rsr_id', 'pub_id', 'rcdc_name', 'cso']].drop_duplicates()
usable = (
    df['rcdc_name'].notnull()
    & (df['rcdc_name'] != "NAN")
    & ~df['rcdc_name'].isin(rcdc_codes_to_remove)
    & df['cso'].notnull()
    & (df['cso'] != "NAN")
)
# df = df[df['rcdc_name'].str.upper().str.contains("CANCER")]
df = df[usable].reset_index(drop=True)
df['rcdc_cso'] = df['rcdc_name']+" * "+df['cso']

out = pd.DataFrame()
for agency in list(df['funder_name'].unique()):
    pubs_col, share_col = agency+' pubs', agency+' share'
    rows = df[df['funder_name']==agency].copy().reset_index(drop=True)
    den = len(rows[['pub_id', 'rsr_id']].drop_duplicates())
    print(den)
    rows[pubs_col] = 1/rows.groupby(['pub_id', 'rsr_id'])['pub_id'].transform('count')
    temp = rows.groupby('rcdc_cso')[pubs_col].sum().reset_index()
    print(temp[pubs_col].sum())
    temp[share_col] = temp[pubs_col]/den
    print(temp[share_col].sum())
    temp = temp.sort_values(pubs_col, ascending=False).drop(pubs_col, axis=1)

    out = temp if out.empty else pd.merge(out, temp, how='outer', on='rcdc_cso')

# Unweighted mean over the five funders; NaN when a combination is missing for any of them.
share_total = (out['INCa/INSERM/DGOS share']+out['National Cancer Institute share']
               +out['Cancer Research UK share']+out['Wellcome Trust share']
               +out['National Health and Medical Research Council share'])
out['average_share'] = share_total/5

out.to_csv('../output/top_topics/rcdc_cso.csv', index=False)
print(out.shape[0])

1109
1108.9999999999998
1.0
14521
14521.000000000005
1.0000000000000004
1369
1368.9999999999998
1.0
347
347.00000000000017
1.0000000000000004
2829
2828.999999999999
0.9999999999999997
467


## 5. Researcher Level Statistics

In [73]:
# Summary statistics for every column of the researcher table (numeric and non-numeric alike).
rsrs.describe(include='all')

Unnamed: 0,rsr_id,first_name,last_name,rsr_gender,rsr_career_start_year,rsr_affiliation,rsr_affiliation_id,rsr_country,rsr_city,orcid_confirmed,rsr_nb_early_pubs,rsr_nb_early_citations,cso_0,cso_1,cso_10,cso_11,cso_12,cso_13,cso_14,cso_15,cso_16,cso_17,cso_18,cso_19,cso_2,cso_20,cso_21,cso_22,cso_23,cso_24,cso_25,cso_26,cso_27,cso_28,cso_29,cso_3,cso_30,cso_31,cso_32,cso_33,cso_4,cso_5,cso_6,cso_7,cso_8,cso_9,rcdc_0,rcdc_1,rcdc_10,rcdc_100,...,rcdc_54,rcdc_55,rcdc_56,rcdc_57,rcdc_58,rcdc_59,rcdc_6,rcdc_60,rcdc_61,rcdc_62,rcdc_63,rcdc_64,rcdc_65,rcdc_66,rcdc_67,rcdc_68,rcdc_69,rcdc_7,rcdc_70,rcdc_71,rcdc_72,rcdc_73,rcdc_74,rcdc_75,rcdc_76,rcdc_77,rcdc_78,rcdc_79,rcdc_8,rcdc_80,rcdc_81,rcdc_82,rcdc_83,rcdc_84,rcdc_85,rcdc_86,rcdc_87,rcdc_88,rcdc_89,rcdc_9,rcdc_90,rcdc_91,rcdc_92,rcdc_93,rcdc_94,rcdc_95,rcdc_96,rcdc_97,rcdc_98,rcdc_99
count,14004,13978,14004,14004,14004.0,12173,9804,9804,9804,14004,11897.0,11897.0,63.0,754.0,22.0,621.0,4.0,1150.0,240.0,314.0,130.0,224.0,2703.0,259.0,1089.0,596.0,175.0,67.0,63.0,34.0,13.0,34.0,220.0,549.0,1608.0,627.0,796.0,335.0,399.0,81.0,99.0,75.0,3.0,7.0,224.0,137.0,55.0,5.0,2.0,4.0,...,18.0,47.0,21.0,26.0,52.0,185.0,3.0,126.0,189.0,557.0,1450.0,9.0,24.0,5.0,141.0,7.0,1.0,37.0,17.0,9.0,446.0,8.0,9.0,18.0,208.0,134.0,2.0,2.0,37.0,5.0,3.0,2.0,35.0,5.0,146.0,163.0,3731.0,5.0,1.0,18.0,8.0,332.0,329.0,121.0,1231.0,107.0,50.0,30.0,458.0,14.0
unique,14004,7683,9132,3,,4030,1709,60,727,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,ur.01006037001.20,David,WANG,M,,Harvard University,grid.38142.3c,US,Cambridge,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,1,142,91,7720,,197,197,6492,414,13834,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,,,1996.344045,,,,,,3.131041,210.915777,1.190476,1.472149,1.409091,1.399356,1.5,1.76087,1.595833,1.761146,1.284615,1.616071,1.669256,1.494208,1.586777,1.380872,1.16,1.179104,1.349206,1.088235,1.153846,1.235294,1.368182,1.68306,1.640547,1.524721,1.66206,1.567164,1.513784,1.419753,1.363636,1.173333,1.333333,1.0,1.40625,1.306569,1.290909,1.2,1.0,1.25,...,1.611111,1.468085,1.285714,1.615385,1.269231,1.335135,1.0,1.325397,1.550265,1.834829,1.647586,1.111111,2.041667,1.2,1.496454,1.0,1.0,1.27027,1.235294,1.333333,1.392377,1.125,1.222222,1.277778,1.495192,1.313433,1.0,1.5,1.297297,1.0,1.0,1.0,1.085714,2.0,1.452055,1.245399,1.962745,1.0,1.0,1.277778,1.0,1.454819,1.407295,1.487603,1.706742,1.794393,1.7,1.666667,1.539301,1.071429
std,,,,,10.437329,,,,,,2.444039,477.321106,0.715207,0.85302,0.503236,0.891682,1.0,1.374022,1.263911,1.346022,0.625487,1.36415,1.065632,0.903785,1.025435,0.803921,0.464362,0.423741,0.699279,0.287902,0.5547,1.207522,0.904523,1.198275,1.091985,1.065256,1.424005,1.108269,1.227485,0.756046,0.813841,0.665224,0.57735,0.0,0.81475,0.601005,0.598539,0.447214,0.0,0.5,...,2.354942,0.974703,0.717137,1.202561,0.597885,0.756212,0.0,0.73571,1.168679,1.567891,1.16034,0.333333,1.267629,0.447214,1.092991,0.0,,0.560191,0.437237,0.5,0.884745,0.353553,0.440959,0.574513,0.947907,0.665112,0.0,0.707107,0.617561,0.0,0.0,0.0,0.373491,1.0,0.839394,0.5993,1.397798,0.0,,0.460889,0.0,0.859147,0.89276,0.837812,1.360635,1.138837,1.147313,0.994236,1.06649,0.267261
min,,,,,1949.0,,,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,,,,,1989.0,,,,,,1.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,,,,,1997.0,,,,,,2.0,81.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,,,,,2005.0,,,,,,4.0,216.0,1.0,2.0,2.0,1.0,1.5,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.5,1.0,2.0,1.0,1.0,1.0,1.0,1.25,...,1.0,1.5,1.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.75,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.75,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0


## 6. Balanced Prior Publications

For this section, we restrict data to the first grant of every researcher (which is why averages are not quite the same as before).

In [85]:
# Aggregate the merged publication records with pub_groupby (defined elsewhere in this
# notebook), requesting the yearly breakdown.
pubs_stats = pub_groupby(pubs_mrg, yearly=True)

In [86]:
# Attach propensity scores (inner join on researcher, funder and grant) and the researcher's
# gender / career start year to the yearly publication statistics.
rsr_traits = rsrs[['rsr_id', 'rsr_gender', 'rsr_career_start_year']]
balanced_pubs_stats = (
    pubs_stats
    .merge(propensity_scores, on=['rsr_id', 'funder_name', 'grant_id'])
    .merge(rsr_traits, how='left', on='rsr_id')
)
# Career age = observation year minus the researcher's career start year.
balanced_pubs_stats['rsr_career_age'] = (balanced_pubs_stats['year']
                                         - balanced_pubs_stats['rsr_career_start_year'])
# Only observations that carry a propensity weight are kept.
has_weight = balanced_pubs_stats['weight'].notnull()
balanced_pubs_stats = balanced_pubs_stats[has_weight]

In [103]:
# Variables averaged per funder and status in the unweighted and weighted tables below.
cols = ['citations_per_pub', 'nb_pubs', 'team_size', 'nb_collabs', 'nb_collab_countries', 'rsr_career_age']

In [106]:
# Unweighted mean of every variable in `cols`, by status (rows) and funder (columns).
frames = []
for var in cols:
    means = pd.pivot_table(balanced_pubs_stats[['funder_name', var, 'status']].copy(),
                           index='status', columns='funder_name', values=var,
                           aggfunc='mean').reset_index()
    means['var'] = var
    means['propensity_weight'] = False  # marks these rows as the unweighted version
    frames.append(means)
df = pd.concat(frames, sort=False)
df[['propensity_weight', 'status', 'var']+funders].sort_values(['status', 'var'], ascending=[False,True])

funder_name,propensity_weight,status,var,Cancer Research UK,INCa/INSERM/DGOS,National Cancer Institute,National Health and Medical Research Council,Wellcome Trust
1,False,pre,citations_per_pub,68.28364,44.73851,65.441251,57.661015,61.658512
1,False,pre,nb_collab_countries,1.256419,1.93629,1.343677,1.557676,1.402547
1,False,pre,nb_collabs,13.233238,29.136256,20.440187,17.213628,14.612403
1,False,pre,nb_pubs,2.958393,5.682977,4.812341,3.714716,3.299003
1,False,pre,rsr_career_age,7.03923,11.917941,11.120431,8.861125,7.124585
1,False,pre,team_size,6.892701,8.868371,6.972869,8.233057,7.480734
0,False,post,citations_per_pub,40.038041,25.674743,34.756748,30.560683,38.049873
0,False,post,nb_collab_countries,2.217404,3.317839,2.213013,2.738322,2.870432
0,False,post,nb_collabs,27.269187,52.785729,38.925458,35.345756,36.549502
0,False,post,nb_pubs,4.21669,8.198165,6.804695,5.806128,4.813289


In [108]:
# Propensity-weighted mean of every variable in `cols`, by status (rows) and funder (columns).
# Numerator: sum of value * weight (the pivot's 'sum' skips missing values).
# Denominator: sum of the weights of the rows whose value is not missing.
frames = []
for var in cols:
    weighted = balanced_pubs_stats[['funder_name', var, 'status', 'weight']].copy()
    weighted[var+'_temp'] = weighted[var]*weighted['weight']
    temp = pd.pivot_table(weighted, index='status', columns='funder_name',
                          values=var+'_temp', aggfunc='sum').reset_index()

    for funder in funders:
        # Select this funder's rows once instead of re-filtering for every term.
        rows = balanced_pubs_stats[balanced_pubs_stats['funder_name']==funder]
        # A weight only counts when the variable is observed on that row.
        usable_weight = np.where(rows[var].notnull(), rows['weight'], 0)
        # "pre" denominator = every row that is not "post", and vice versa.
        pre_den = np.where(rows['status']=="post", 0, usable_weight).sum()
        post_den = np.where(rows['status']=="pre", 0, usable_weight).sum()
        temp[funder] = np.where(temp['status']=="pre", temp[funder]/pre_den, temp[funder]/post_den)

    temp['var'] = var
    temp['propensity_weight'] = True  # marks these rows as the weighted version
    frames.append(temp)
df = pd.concat(frames)
df[['propensity_weight', 'status', 'var']+funders].sort_values(['status', 'var'], ascending=[False,True])

funder_name,propensity_weight,status,var,Cancer Research UK,INCa/INSERM/DGOS,National Cancer Institute,National Health and Medical Research Council,Wellcome Trust
1,True,pre,citations_per_pub,64.344293,44.73851,62.380557,56.050739,41.479237
1,True,pre,nb_collab_countries,2.020728,1.93629,1.486949,2.000478,1.651419
1,True,pre,nb_collabs,22.657661,29.136256,23.429964,21.552185,16.06576
1,True,pre,nb_pubs,4.692293,5.682977,5.244051,4.704514,3.850436
1,True,pre,rsr_career_age,12.373125,11.917941,12.941043,11.51637,12.027242
1,True,pre,team_size,7.279211,8.868371,7.153443,7.382119,6.499943
0,True,post,citations_per_pub,36.575277,25.674743,34.595996,29.514192,32.887755
0,True,post,nb_collab_countries,3.607258,3.317839,2.49358,3.342572,2.779446
0,True,post,nb_collabs,49.617983,52.785729,45.176102,44.449205,32.532506
0,True,post,nb_pubs,6.578703,8.198165,7.456653,7.218277,4.886758


In [46]:
# Unweighted gender composition by funder: share of observations per rsr_gender value.
# normalize='index' makes every funder's row sum to 1.
df = pd.crosstab(balanced_pubs_stats['funder_name'], balanced_pubs_stats['rsr_gender'],
                 normalize='index').reset_index()
df[['funder_name', 'F', 'M', 'UNKNOWN']]

rsr_gender,funder_name,F,M,UNKNOWN
0,Cancer Research UK,0.28602,0.631241,0.082739
1,INCa/INSERM/DGOS,0.309888,0.594292,0.095821
2,National Cancer Institute,0.282131,0.529103,0.188766
3,National Health and Medical Research Council,0.327976,0.573581,0.098443
4,Wellcome Trust,0.318937,0.578073,0.10299


In [47]:
# Propensity-weighted gender composition by funder: the weights are summed per
# (funder, gender) cell and normalize='index' rescales every funder's row to sum to 1.
df = pd.crosstab(balanced_pubs_stats['funder_name'], balanced_pubs_stats['rsr_gender'],
                 values=balanced_pubs_stats['weight'], aggfunc='sum',
                 normalize='index').reset_index()
df[['funder_name', 'F', 'M', 'UNKNOWN']]

rsr_gender,funder_name,F,M,UNKNOWN
0,Cancer Research UK,0.348805,0.58665,0.064545
1,INCa/INSERM/DGOS,0.309888,0.594292,0.095821
2,National Cancer Institute,0.364846,0.497372,0.137781
3,National Health and Medical Research Council,0.371612,0.526615,0.101773
4,Wellcome Trust,0.495249,0.479236,0.025515


## Creating Output Comparison Table

### Overall Table

In [None]:
# df = pd.DataFrame({
#     'funded_grnts_per_rsrs_tot': funded_grnts_per_rsrs_tot
#     , 'agencies_per_rsrs_tot': agencies_per_rsrs_tot
#     , 'funded_amt_tot': funded_amt_tot
#     , 'funded_len_tot': funded_len_tot
#     , 'nb_grnt_rsrs_tot': nb_grnt_rsrs_tot
#     , 'grnt_fund_len_tot': grnt_fund_len_tot
#     , 'grnt_team_size_tot': grnt_team_size_tot
#     , 'grnt_fund_amt_tot': grnt_fund_amt_tot
#     , 'pre_avg_fund_len_tot': pre_avg_fund_len_tot
#     , 'pre_avg_team_size_tot': pre_avg_team_size_tot
#     , 'pre_fund_amt_tot': pre_fund_amt_tot
#     , 'pre_nb_grnts_tot': pre_nb_grnts_tot
#     , 'post_avg_fund_len_tot': post_avg_fund_len_tot
#     , 'post_avg_team_size_tot': post_avg_team_size_tot
#     , 'post_fund_amt_tot': post_fund_amt_tot
#     , 'post_nb_grnts_tot': post_nb_grnts_tot
#     , 'pub_cit_tot': pub_cit_tot
#     , 'pub_team_size_tot': pub_team_size_tot
#     , 'pre_pub_cit_tot': pre_pub_cit_tot
#     , 'pre_nb_pubs_tot': pre_nb_pubs_tot
#     , 'post_pub_cit_tot': post_pub_cit_tot
#     , 'post_nb_pubs_tot': post_nb_pubs_tot
#     }, index=['mean', 'std']).transpose()
# df

In [None]:
# # Export to Excel
# ls = !ls ../output/
# if 'comparison_statistics.xlsx' in ls:
#     book = load_workbook('../output/comparison_statistics.xlsx')
#     writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
#     writer.book = book
#     writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#     df.to_excel(writer, "raw_all")
#     writer.save()
# else:
#     df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'raw_all')

### Table by Funding Agency

In [None]:
# df = pd.DataFrame({'rcdc_1':rcdc_1
#                     , 'rcdc_2':rcdc_2
#                     , 'rcdc_3':rcdc_3
#                     , 'rcdc_4':rcdc_4
#                     , 'rcdc_5':rcdc_5                   
#                     , 'nb_unique_rsrs': nb_unique_rsrs
#                     , 'nb_unique_grnts': nb_unique_grnts
#                     , 'funded_amt_avg': funded_amt_avg
#                     , 'funded_amt_med': funded_amt_med
#                     , 'funded_len_avg': funded_len_avg
#                     , 'funded_len_med': funded_len_med
#                     , 'nb_grnt_rsrs_avg': nb_grnt_rsrs_avg
#                     , 'nb_grnt_rsrs_med': nb_grnt_rsrs_med
#                     , 'pre_avg_fund_len_avg': pre_avg_fund_len_avg
#                     , 'pre_avg_fund_len_med': pre_avg_fund_len_med
#                     , 'pre_avg_team_size_avg': pre_avg_team_size_avg
#                     , 'pre_avg_team_size_med': pre_avg_team_size_med
#                     , 'pre_fund_amt_avg': pre_fund_amt_avg
#                     , 'pre_fund_amt_med': pre_fund_amt_med
#                     , 'pre_nb_grnts_avg': pre_nb_grnts_avg
#                     , 'pre_nb_grnts_med': pre_nb_grnts_med
#                     , 'post_avg_fund_len_avg': post_avg_fund_len_avg
#                     , 'post_avg_fund_len_med': post_avg_fund_len_med
#                     , 'post_avg_team_size_avg': post_avg_team_size_avg
#                     , 'post_avg_team_size_med': post_avg_team_size_med
#                     , 'post_fund_amt_avg': post_fund_amt_avg
#                     , 'post_fund_amt_med': post_fund_amt_med
#                     , 'post_nb_grnts_avg': post_nb_grnts_avg
#                     , 'post_nb_grnts_med': post_nb_grnts_med
#                     , 'pre_citations_avg': pre_citations_avg
#                     , 'pre_nb_pubs_avg': pre_nb_pubs_avg
#                     , 'pre_citations_med': pre_citations_med
#                     , 'pre_nb_pubs_med': pre_nb_pubs_med
#                     , 'post_citations_avg': post_citations_avg
#                     , 'post_citations_med': post_citations_med
#                     , 'post_nb_pubs_avg': post_nb_pubs_avg
#                     , 'post_nb_pubs_med': post_nb_pubs_med
#                    , 'pre_team_size_avg': pre_team_size_avg
#                    , 'pre_team_size_med': pre_team_size_med
#                    , 'post_team_size_avg': post_team_size_avg
#                    , 'post_team_size_med': post_team_size_med
#                   }, index=funders).transpose()
# df

In [None]:
# # Export to Excel
# ls = !ls ../output/
# if 'comparison_statistics.xlsx' in ls:
#     book = load_workbook('../output/comparison_statistics.xlsx')
#     writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
#     writer.book = book
#     writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#     df.to_excel(writer, "raw_by_agency")
#     writer.save()
# else:
#     df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'raw_by_agency')

### Balance Table

## Preparing Data for Regression Analysis

In [112]:
# Passed as the `yearly` argument to grant_groupby / pub_groupby below; when set, the
# uniqueness checks also include 'year' in their key.
yearly = True

### Researcher Info

In [113]:
# Summary statistics for every column of the researcher table (numeric and non-numeric alike).
rsrs.describe(include='all')

Unnamed: 0,rsr_id,first_name,last_name,rsr_gender,rsr_career_start_year,rsr_affiliation,rsr_affiliation_id,rsr_country,rsr_city,orcid_confirmed,rsr_nb_early_pubs,rsr_nb_early_citations,cso_0,cso_1,cso_10,cso_11,cso_12,cso_13,cso_14,cso_15,cso_16,cso_17,cso_18,cso_19,cso_2,cso_20,cso_21,cso_22,cso_23,cso_24,cso_25,cso_26,cso_27,cso_28,cso_29,cso_3,cso_30,cso_31,cso_32,cso_33,cso_4,cso_5,cso_6,cso_7,cso_8,cso_9,rcdc_0,rcdc_1,rcdc_10,rcdc_100,...,rcdc_54,rcdc_55,rcdc_56,rcdc_57,rcdc_58,rcdc_59,rcdc_6,rcdc_60,rcdc_61,rcdc_62,rcdc_63,rcdc_64,rcdc_65,rcdc_66,rcdc_67,rcdc_68,rcdc_69,rcdc_7,rcdc_70,rcdc_71,rcdc_72,rcdc_73,rcdc_74,rcdc_75,rcdc_76,rcdc_77,rcdc_78,rcdc_79,rcdc_8,rcdc_80,rcdc_81,rcdc_82,rcdc_83,rcdc_84,rcdc_85,rcdc_86,rcdc_87,rcdc_88,rcdc_89,rcdc_9,rcdc_90,rcdc_91,rcdc_92,rcdc_93,rcdc_94,rcdc_95,rcdc_96,rcdc_97,rcdc_98,rcdc_99
count,14004,13978,14004,14004,14004.0,12173,9804,9804,9804,14004,11897.0,11897.0,63.0,754.0,22.0,621.0,4.0,1150.0,240.0,314.0,130.0,224.0,2703.0,259.0,1089.0,596.0,175.0,67.0,63.0,34.0,13.0,34.0,220.0,549.0,1608.0,627.0,796.0,335.0,399.0,81.0,99.0,75.0,3.0,7.0,224.0,137.0,55.0,5.0,2.0,4.0,...,18.0,47.0,21.0,26.0,52.0,185.0,3.0,126.0,189.0,557.0,1450.0,9.0,24.0,5.0,141.0,7.0,1.0,37.0,17.0,9.0,446.0,8.0,9.0,18.0,208.0,134.0,2.0,2.0,37.0,5.0,3.0,2.0,35.0,5.0,146.0,163.0,3731.0,5.0,1.0,18.0,8.0,332.0,329.0,121.0,1231.0,107.0,50.0,30.0,458.0,14.0
unique,14004,7683,9132,3,,4030,1709,60,727,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,ur.01006037001.20,David,WANG,M,,Harvard University,grid.38142.3c,US,Cambridge,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,1,142,91,7720,,197,197,6492,414,13834,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,,,1996.344045,,,,,,3.131041,210.915777,1.190476,1.472149,1.409091,1.399356,1.5,1.76087,1.595833,1.761146,1.284615,1.616071,1.669256,1.494208,1.586777,1.380872,1.16,1.179104,1.349206,1.088235,1.153846,1.235294,1.368182,1.68306,1.640547,1.524721,1.66206,1.567164,1.513784,1.419753,1.363636,1.173333,1.333333,1.0,1.40625,1.306569,1.290909,1.2,1.0,1.25,...,1.611111,1.468085,1.285714,1.615385,1.269231,1.335135,1.0,1.325397,1.550265,1.834829,1.647586,1.111111,2.041667,1.2,1.496454,1.0,1.0,1.27027,1.235294,1.333333,1.392377,1.125,1.222222,1.277778,1.495192,1.313433,1.0,1.5,1.297297,1.0,1.0,1.0,1.085714,2.0,1.452055,1.245399,1.962745,1.0,1.0,1.277778,1.0,1.454819,1.407295,1.487603,1.706742,1.794393,1.7,1.666667,1.539301,1.071429
std,,,,,10.437329,,,,,,2.444039,477.321106,0.715207,0.85302,0.503236,0.891682,1.0,1.374022,1.263911,1.346022,0.625487,1.36415,1.065632,0.903785,1.025435,0.803921,0.464362,0.423741,0.699279,0.287902,0.5547,1.207522,0.904523,1.198275,1.091985,1.065256,1.424005,1.108269,1.227485,0.756046,0.813841,0.665224,0.57735,0.0,0.81475,0.601005,0.598539,0.447214,0.0,0.5,...,2.354942,0.974703,0.717137,1.202561,0.597885,0.756212,0.0,0.73571,1.168679,1.567891,1.16034,0.333333,1.267629,0.447214,1.092991,0.0,,0.560191,0.437237,0.5,0.884745,0.353553,0.440959,0.574513,0.947907,0.665112,0.0,0.707107,0.617561,0.0,0.0,0.0,0.373491,1.0,0.839394,0.5993,1.397798,0.0,,0.460889,0.0,0.859147,0.89276,0.837812,1.360635,1.138837,1.147313,0.994236,1.06649,0.267261
min,,,,,1949.0,,,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,,,,,1989.0,,,,,,1.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,,,,,1997.0,,,,,,2.0,81.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.5,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,,,,,2005.0,,,,,,4.0,216.0,1.0,2.0,2.0,1.0,1.5,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.5,1.0,2.0,1.0,1.0,1.0,1.0,1.25,...,1.0,1.5,1.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.75,1.0,1.0,1.0,1.0,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.75,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0


### Grant Information

In [114]:
# Sanity check: grant_id should identify each row of funded_grnts_info.
grant_index = funded_grnts_info.set_index(['grant_id']).index
print(grant_index.is_unique)
print(funded_grnts_info.shape)
funded_grnts_info.head()

True
(261, 7)


Unnamed: 0,grant_id,start_date,end_date,funding_amount,funding_len,start_year,nb_rsrs
0,grant.7426196,2011-11-01,2012-11-01,45589.0,1.00274,2011,2
1,grant.7154625,2007-02-02,2009-02-02,405251.0,2.00274,2007,1
2,grant.7426176,2011-08-27,2014-08-26,660293.0,3.0,2011,1
3,grant.7426175,2011-08-27,2014-08-26,660293.0,3.0,2011,1
4,grant.7154081,2009-06-01,2012-06-01,338542.0,3.00274,2009,1


### Prior/Subsequent Grant Statistics

In [115]:
# Aggregate the merged grant records with grant_groupby (defined elsewhere in this notebook),
# using the `yearly` flag set above.
grnts_stats = grant_groupby(grnts_mrg, yearly=yearly)

In [116]:
# Check that the unit of observation is Funder-Grant-Researcher-Status
# (plus Year when the statistics are yearly):
cols = ['funder_name', 'grant_id', 'rsr_id', 'status']
if yearly:
    cols += ['year']
print(grnts_stats.set_index(cols).index.is_unique)
print(grnts_stats.shape)
grnts_stats.head()

True
(4224, 9)


Unnamed: 0,funder_name,grant_id,rsr_id,status,year,nb_grnts,fund_amt,avg_fund_len,avg_team_size
0,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2012,0,,,
1,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2013,0,,,
2,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2014,0,,,
3,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2015,0,,,
4,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2016,0,,,


### Prior/Subsequent Publication Statistics

In [117]:
# Aggregate the merged publication records with pub_groupby (defined elsewhere in this
# notebook), using the `yearly` flag set above.
pubs_stats = pub_groupby(pubs_mrg, yearly=yearly)

In [118]:
# Check that the unit of observation is Funder-Grant-Researcher-Status
# (plus Year when the statistics are yearly):
cols = ['funder_name', 'grant_id', 'rsr_id', 'status']
if yearly:
    cols += ['year']
print(pubs_stats.set_index(cols).index.is_unique)
print(pubs_stats.shape)
pubs_stats.head()

True
(4224, 10)


Unnamed: 0,funder_name,grant_id,rsr_id,status,year,nb_pubs,citations_per_pub,team_size,nb_collabs,nb_collab_countries
0,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2012,0,,,0,0
1,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2013,1,22.0,6.0,6,1
2,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2014,0,,,0,0
3,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2015,1,8.0,7.0,7,1
4,INCa/INSERM/DGOS,grant.7426196,ur.01010477152.04,post,2016,0,,,0,0


### Add CSO Codes

In [119]:
# Preview the publication -> CSO lookup table loaded at the top of the notebook.
pubs_cso.head()

Unnamed: 0,pub_id,cso_name,nb_cso
0,pub.1000000285,Cancer Initiation: Alterations in Chromosomes,2
1,pub.1000000285,Normal Functioning,2
2,pub.1000000357,Normal Functioning,1
3,pub.1000000436,Exogenous Factors in the Origin and Cause of C...,1
4,pub.1000000461,Normal Functioning,1


In [120]:
# Tag every funded publication with its CSO categories (inner join: publications without a
# CSO entry are dropped), then give each CSO name a numeric id and a "cso_<id>" label.
pub_keys = pubs_mrg[['funder_name', 'grant_id', 'rsr_id', 'year', 'pub_id']]
pubs_cso_features = pub_keys.merge(pubs_cso, how='inner', on='pub_id')
pubs_cso_features['cso_id'] = pubs_cso_features.groupby(['cso_name']).ngroup()
cso_id_text = pubs_cso_features['cso_id'].astype(str)
pubs_cso_features['cso'] = "cso_" + cso_id_text

In [121]:
# Number of publication rows per funder, grant, researcher and CSO label.
pubs_cso_features = (
    pubs_cso_features
    .groupby(['funder_name', 'grant_id', 'rsr_id', 'cso'])['pub_id']
    .count()
    .reset_index(name='nb_pubs')
)

In [122]:
# Reshape to wide format: one row per funder-grant-researcher, one column per CSO label.
feature_index = ['funder_name', 'grant_id', 'rsr_id']
pubs_cso_features = pubs_cso_features.pivot_table(index=feature_index, columns='cso',
                                                  values='nb_pubs', aggfunc='sum').reset_index()

In [123]:
# Sanity check: funder-grant-researcher should identify each row of the wide CSO table.
feature_keys = ['funder_name', 'grant_id', 'rsr_id']
print(pubs_cso_features.set_index(feature_keys).index.is_unique)
print(pubs_cso_features.shape)
pubs_cso_features.head()

True
(257, 31)


cso,funder_name,grant_id,rsr_id,cso_0,cso_1,cso_10,cso_11,cso_12,cso_13,cso_14,cso_15,cso_16,cso_17,cso_18,cso_19,cso_2,cso_20,cso_21,cso_22,cso_23,cso_24,cso_25,cso_26,cso_27,cso_3,cso_4,cso_5,cso_6,cso_7,cso_8,cso_9
0,Cancer Research UK,grant.5134272,ur.01204716745.88,,3.0,5.0,,,,,,,,,,,,,1.0,,1.0,,1.0,,2.0,,,,,,17.0
1,Cancer Research UK,grant.5134565,ur.01257735610.85,,,1.0,,,,,,,,1.0,,,,2.0,,,,,19.0,2.0,,1.0,,3.0,,,5.0
2,Cancer Research UK,grant.5134771,ur.01131445706.82,,3.0,,,,,1.0,,,,,,1.0,,,,,,,,,,,,,,,7.0
3,Cancer Research UK,grant.5135882,ur.01360521656.25,,1.0,2.0,,,,,,,,,,1.0,,,,,,,,,,,,,,,10.0
4,Cancer Research UK,grant.5136019,ur.01136740201.43,,,,,,,,,,,,,,,,,,5.0,,34.0,7.0,,1.0,,1.0,,,9.0


### Combine All

In [124]:
# Assemble the regression dataset by joining, in order: grant -> funder and grant -> researcher
# links, grant information, merge_key (joined on start_year), prior/subsequent grant and
# publication statistics, researcher attributes and propensity scores.
# A previous first statement merged funded_grnts with merge_key into `df`, but that result was
# overwritten before being read, so the dead merge has been removed.
df = pd.merge(funded_grnts_funder, funded_grnts_rsr, how='outer', on='grant_id')
df = pd.merge(df, funded_grnts_info, how='outer', on='grant_id')
df = pd.merge(df, merge_key, how='left', on='start_year')

# Both statistics tables share the same observation key.
stats_keys = ['funder_name', 'grant_id', 'rsr_id', 'year', 'status']
df = pd.merge(df, grnts_stats, how='outer', on=stats_keys)
df = pd.merge(df, pubs_stats, how='outer', on=stats_keys)

df = pd.merge(df, rsrs, how='left', on='rsr_id')
# NOTE(review): propensity scores are joined on rsr_id only here, whereas the balanced
# publication table joins them on rsr_id, funder_name and grant_id. Confirm that
# propensity_scores has a single row per researcher, otherwise rows would be duplicated.
df = pd.merge(df, propensity_scores[['rsr_id', 'inca_prob', 'weight']], on='rsr_id', how='left')
# Career age = observation year minus the researcher's career start year.
df['rsr_career_age'] = df['year']-df['rsr_career_start_year']
# df = pd.merge(df, pubs_cso_features, how='outer', on=['funder_name', 'grant_id', 'rsr_id'])

In [125]:
# Sanity check: funder-grant-researcher-year-status should identify each row of the
# combined dataset.
obs_keys = ['funder_name', 'grant_id', 'rsr_id', 'year', 'status']
print(df.set_index(obs_keys).index.is_unique)
print(df.shape)
df.describe(include='all')

True
(4224, 303)


Unnamed: 0,funder_name,grant_id,rsr_id,start_date,end_date,funding_amount,funding_len,start_year,nb_rsrs,status,year,nb_grnts,fund_amt,avg_fund_len,avg_team_size,nb_pubs,citations_per_pub,team_size,nb_collabs,nb_collab_countries,first_name,last_name,rsr_gender,rsr_career_start_year,rsr_affiliation,rsr_affiliation_id,rsr_country,rsr_city,orcid_confirmed,rsr_nb_early_pubs,rsr_nb_early_citations,cso_0,cso_1,cso_10,cso_11,cso_12,cso_13,cso_14,cso_15,cso_16,cso_17,cso_18,cso_19,cso_2,cso_20,cso_21,cso_22,cso_23,cso_24,cso_25,...,rcdc_57,rcdc_58,rcdc_59,rcdc_6,rcdc_60,rcdc_61,rcdc_62,rcdc_63,rcdc_64,rcdc_65,rcdc_66,rcdc_67,rcdc_68,rcdc_69,rcdc_7,rcdc_70,rcdc_71,rcdc_72,rcdc_73,rcdc_74,rcdc_75,rcdc_76,rcdc_77,rcdc_78,rcdc_79,rcdc_8,rcdc_80,rcdc_81,rcdc_82,rcdc_83,rcdc_84,rcdc_85,rcdc_86,rcdc_87,rcdc_88,rcdc_89,rcdc_9,rcdc_90,rcdc_91,rcdc_92,rcdc_93,rcdc_94,rcdc_95,rcdc_96,rcdc_97,rcdc_98,rcdc_99,inca_prob,weight,rsr_career_age
count,4224,4224,4224,4224,4224,3905.0,4224.0,4224.0,4224.0,4224,4224.0,4224.0,976.0,1054.0,1056.0,4224.0,1421.0,1421.0,4224.0,4224.0,4213,4224,4224,4224.0,3751,3014,3014,3014,4224,3685.0,3685.0,77.0,110.0,11.0,88.0,0.0,1034.0,165.0,286.0,33.0,77.0,363.0,66.0,154.0,418.0,0.0,66.0,22.0,0.0,0.0,...,0.0,33.0,187.0,0.0,66.0,88.0,132.0,253.0,0.0,0.0,0.0,99.0,11.0,0.0,22.0,55.0,11.0,165.0,11.0,0.0,0.0,88.0,33.0,0.0,0.0,22.0,0.0,0.0,0.0,44.0,0.0,44.0,11.0,902.0,0.0,0.0,33.0,0.0,341.0,66.0,22.0,297.0,22.0,11.0,0.0,616.0,0.0,4224.0,4224.0,4224.0
unique,5,261,318,144,116,,,,,2,,,,,,,,,,,293,297,3,,218,164,14,113,2,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,National Cancer Institute,grant.6717347,ur.01117731572.33,2011-01-01 00:00:00,2020-08-31 00:00:00,,,,,pre,,,,,,,,,,,David,HEIMBROOK,M,,National Cancer Institute,grid.48336.3a,US,Melbourne,False,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,3014,143,121,429,275,,,,,2304,,,,,,,,,,,143,121,1991,,165,165,2068,165,4202,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
first,,,,2007-01-01 00:00:00,2008-01-01 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
last,,,,2012-12-01 00:00:00,2021-06-30 00:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,,,,2244098.0,4.51499,2009.520833,3.151042,,2009.520833,1.410748,2509532.0,4.241348,3.191212,1.237453,51.189165,8.188295,6.763731,0.883049,,,,1996.765625,,,,,,3.223881,187.062687,2.714286,1.4,2.0,1.125,,2.170213,1.533333,1.692308,1.0,1.0,1.848485,1.5,1.214286,1.210526,,1.166667,1.0,,,...,,1.0,1.411765,,1.166667,1.25,1.666667,1.391304,,,,1.777778,1.0,,1.0,1.2,2.0,1.4,1.0,,,1.25,1.333333,,,1.0,,,,1.0,,1.5,1.0,1.804878,,,1.0,,1.83871,1.5,1.0,1.37037,1.5,1.0,,1.892857,,0.12216,0.585938,12.755208
std,,,,,,3073548.0,2.659224,1.544522,3.641645,,3.519647,19.7565,4746639.0,2.515355,3.587582,2.893272,74.394915,4.659218,15.431593,2.256898,,,,9.620849,,,,,,2.61273,364.992392,2.199624,0.49214,0.0,0.332614,,1.693345,0.500406,1.614543,0.0,0.0,1.397182,0.769615,0.559695,0.46836,,0.375534,0.0,,,...,,0.0,0.773534,,0.375534,0.435494,1.183001,0.707838,,,,1.233508,0.0,,0.0,0.403687,0.0,1.022908,0.0,,,0.435494,0.478714,,,0.0,,,,0.0,,0.876038,0.0,1.163474,,,0.0,,1.299467,1.126601,0.0,0.72889,0.511766,0.0,,1.62347,,0.226232,6.297892,10.094895
min,,,,,,3000.0,0.493151,2007.0,1.0,,2002.0,0.0,2162.0,0.079452,1.0,0.0,0.0,1.0,0.0,0.0,,,,1966.0,,,,,,1.0,0.0,1.0,1.0,2.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,,,...,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,1.0,1.0,,1.0,1.0,2.0,1.0,1.0,,,1.0,1.0,,,1.0,,,,1.0,,1.0,1.0,1.0,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,0.0,0.0,-5.0
25%,,,,,,340943.0,2.444521,2008.0,1.0,,2007.0,0.0,370923.5,2.50137,1.0,0.0,14.5,5.0,0.0,0.0,,,,1990.0,,,,,,1.0,27.0,1.0,1.0,2.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,,,...,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,1.0,1.0,,1.0,1.0,2.0,1.0,1.0,,,1.0,1.0,,,1.0,,,,1.0,,1.0,1.0,1.0,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,0.000484,0.000484,5.0


In [126]:
# Write the combined dataset to disk without the row index.
df.to_csv('../data/regression_dataset_cervical_cancer.csv', index=False)

## Sandbox

### Subsequent Publication RCDC Codes

### Create Lookup Tables

In [None]:
# grnts_rcdc_1 = grnts_rcdc.drop_duplicates('grant_id', keep='first').reset_index(drop=True)
# pubs_rcdc_1 = pubs_rcdc.drop_duplicates('pub_id', keep='first').reset_index(drop=True)

### Method 1: 1-to-1

In [None]:
# cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
# rcdc_comp = pubs_mrg[cols].copy()

In [None]:
# rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc_1, how='left', on='grant_id', suffixes=('', '_1_grnt'))
# rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
# rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_1_grnt', 'rcdc_code': 'rcdc_code_1_grnt'}, inplace=True)

In [None]:
# # Get rid or ignore Null Values?
# process = 'ignore' # or 'get_rid'
# if process == 'get_rid':
#     rcdc_comp = rcdc_comp[(rcdc_comp['rcdc_name_1_grnt'].notnull())&(rcdc_comp['rcdc_name_1_pub'].notnull())]
# elif process == 'ignore':
#     rcdc_comp['rcdc_name_1_grnt'] = rcdc_comp['rcdc_name_1_grnt'].fillna('')
#     rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
# else:
#     print("Don't forget to choose !")

In [None]:
# rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_1_grnt', 'rcdc_name_1_pub'])
# rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
#                                    , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
#                                    , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
#                                   }).reset_index()

In [None]:
# rcdc_comp_agency.head()

In [None]:
# rcdc_comp_agency.tail()

In [None]:
# rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
# rcdc_comp_agency.to_csv('../output/rcdc_grnt_1_pubs_1_comp.csv', index=False)

### Method 2: 1-to-Many

In [None]:
# cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
# cols += ['rcdc_names_pub', 'rcdc_codes_pub']
# rcdc_comp = pubs_mrg[cols].copy()

In [None]:
# rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc_1, how='left', on='grant_id', suffixes=('', '_1_grnt'))
# rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
# rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_1_grnt', 'rcdc_code': 'rcdc_code_1_grnt'}, inplace=True)

In [None]:
# # Get rid of or ignore null values?
# process = 'ignore' # or 'get_rid'
# if process == 'get_rid':
#     rcdc_comp = rcdc_comp[(rcdc_comp['rcdc_name_1_grnt'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
# elif process == 'ignore':
#     rcdc_comp['rcdc_name_1_grnt'] = rcdc_comp['rcdc_name_1_grnt'].fillna('')
#     rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
#     rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
# else:
#     print("Don't forget to choose !")

In [None]:
# def regin(df):
#     return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name_1_grnt']), df['rcdc_names_pub']))
# rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [None]:
# rcdc_comp['rcdc_name_pub_impute'] = np.where(rcdc_comp['flag']==True
#                                              , rcdc_comp['rcdc_name_1_grnt'], rcdc_comp['rcdc_name_1_pub'])

In [None]:
# rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_1_grnt', 'rcdc_name_pub_impute'])
# rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
#                                  , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
#                                  , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
#                                 }).reset_index()

In [None]:
# rcdc_comp_agency.head()

In [None]:
# rcdc_comp_agency.tail()

In [None]:
# rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
# rcdc_comp_agency.to_csv('../output/rcdc_grnt_1_pubs_comp.csv', index=False)

### Method 3: Many-to-Many

In [None]:
# cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
# cols += ['rcdc_names_pub', 'rcdc_codes_pub']
# rcdc_comp = pubs_mrg[cols].copy()

In [None]:
# rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc, how='left', on='grant_id', suffixes=('', '_grnt'))
# rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
# rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_grnt', 'rcdc_code': 'rcdc_code_grnt'}, inplace=True)

In [None]:
# # Get rid of or ignore null values?
# process = 'ignore' # or 'get_rid'
# if process == 'get_rid':
#     rcdc_comp = rcdc_comp[(rcdc_comp['rcdc_name_grnt'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
# elif process == 'ignore':
#     rcdc_comp['rcdc_name_grnt'] = rcdc_comp['rcdc_name_grnt'].fillna('')
#     rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
#     rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
# else:
#     print("Don't forget to choose !")

In [None]:
# def regin(df):
#     return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name_grnt']), df['rcdc_names_pub']))
# rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [None]:
# rcdc_comp['rcdc_name_pub_impute'] = np.where(rcdc_comp['flag']==True
#                                              , rcdc_comp['rcdc_name_grnt'], rcdc_comp['rcdc_name_1_pub'])

In [None]:
# rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_grnt', 'rcdc_name_pub_impute'])
# rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
#                                  , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
#                                  , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
#                                 }).reset_index()

In [None]:
# rcdc_comp_agency.head()

In [None]:
# rcdc_comp_agency.tail()

In [None]:
# rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
# rcdc_comp_agency.to_csv('../output/rcdc_grnt_pubs_comp.csv', index=False)