# Comparison Statistics between Cohorts

## Python Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100

## Load In Data

### Original Data

In [2]:
inca_grnts = pd.read_csv('../data/inca/inca_grants_details.csv', low_memory=False)
inca_pubs = pd.read_csv('../data/inca/inca_pub_details.csv', low_memory=False)
inca_orcid_responses = pd.read_csv('../output/researcher_info/researcher_info_ORCID_returns.csv')

In [3]:
# Clean Grants File
inca_grnts.columns = [x.lower().replace(' ', '_') for x in inca_grnts.columns]
inca_grnts.rename(columns={'inca_id': 'rsr_id'
                            , 'funding_amount_($)': 'funding_amount'
                            , 'dimensions_grant_id': 'grant_id'
                            , 'funder': 'funder_name'
                            , 'rcdc': 'rcdc_names'
                           }, inplace=True)
del inca_grnts['prenom_port'], inca_grnts['nom_port'], inca_grnts['title']
del inca_grnts['abstract'], inca_grnts['reference'], inca_grnts['organisme_port']
del inca_grnts['research_org_names'], inca_grnts['research_org_ids'], inca_grnts['for']
inca_grnts['rcdc_names'] = inca_grnts['rcdc_names'].replace(';', '; ', regex=True)

In [4]:
# Get INCA-funded Grants from file
inca_funders = ["French National Cancer Institute", "French Institute of Health and Medical Research"]
inca_funders += ["Ministère des Affaires sociales et de la Santé"]
inca_funded_grnts = inca_grnts[(inca_grnts['funder_name'].isin(inca_funders))
                               & (pd.DatetimeIndex(inca_grnts['start_date']).year>=2007)
                               & (pd.DatetimeIndex(inca_grnts['start_date']).year<=2012)].reset_index(drop=True)

In [5]:
# Inca ORCID-responses: keep only when there was an ORCID response.
inca_orcid_responses = inca_orcid_responses[inca_orcid_responses['ORCID Return']=="YES"].reset_index(drop=True)
conf_ids = list(inca_orcid_responses['INCA ID'])

# Change Funder Name in INCa-Funded grants when they responded to ORCID
inca_funded_grnts['funder_name'] = (inca_funded_grnts['funder_name'] 
                                    + np.where(inca_funded_grnts['rsr_id'].isin(conf_ids), ' - ORCID Confirmed', ''))

In [6]:
# Clean Pubs File
inca_pubs.columns = [x.lower().replace(' ', '_') for x in inca_pubs.columns]
inca_pubs.rename(columns={'inca_id': 'rsr_id'
                          , 'dimensions_publication_id': 'pub_id'
                          , 'publication_year': 'date'
                          , 'rcdc': 'rcdc_names'
                          , 'times_cited': 'citations'
                         }, inplace=True)
del inca_pubs['prenom_port'], inca_pubs['nom_port'], inca_pubs['organisme_port']
del inca_pubs['dimensions_researcher_id'], inca_pubs['additional_researcher_dim_id_to_combine']
del inca_pubs['additional_researcher_dim_id_to_combine_2'], inca_pubs['orcid'], inca_pubs['title'], inca_pubs['issue']
del inca_pubs['pages'], inca_pubs['pubmed_id'], inca_pubs['volume'], inca_pubs['relative_citation_ratio']
del inca_pubs['altmetric'], inca_pubs['open_access'], inca_pubs['author_names'], inca_pubs['research_org_names']
del inca_pubs['research_org_ids'], inca_pubs['for'], inca_pubs['journal_id'], inca_pubs['journal_title']
del inca_pubs['publication_date']
inca_pubs['date'] = inca_pubs['date'].apply(str).replace('\.0', '', regex=True)
inca_pubs['date'] = inca_pubs['date'].apply(lambda x: np.nan if x=="nan" else x+"-01-01")
inca_pubs['rcdc_names'] = inca_pubs['rcdc_names'].replace(';', '; ', regex=True)
inca_pubs['citations'] = pd.to_numeric(inca_pubs['citations'])

### Counterfactual Data

In [7]:
funded_grnts = pd.read_csv('../data/counterfactual/counterfactual_funded_grants.csv')
grnts = pd.read_csv('../data/counterfactual/counterfactual_researcher_grants.csv')
pubs = pd.read_csv('../data/counterfactual/counterfactual_researcher_publications.csv')

In [8]:
# Restrict to Grants funded between 2007 and 2012
funded_grnts = funded_grnts[(pd.DatetimeIndex(funded_grnts['start_date']).year>=2007)
                            & (pd.DatetimeIndex(funded_grnts['start_date']).year<=2012)].reset_index(drop=True)

### Combining Data

In [9]:
funded_grnts = pd.concat([funded_grnts, inca_funded_grnts]).reset_index(drop = True)
grnts = pd.concat([grnts, inca_grnts]).reset_index(drop = True)
pubs = pd.concat([pubs, inca_pubs]).reset_index(drop = True)

### List of All Funders

In [10]:
funders = sorted(list(funded_grnts['funder_name'].unique()))
print(funders)

['Cancer Research UK', 'French Institute of Health and Medical Research', 'French Institute of Health and Medical Research - ORCID Confirmed', 'French National Cancer Institute', 'French National Cancer Institute - ORCID Confirmed', 'Ministère des Affaires sociales et de la Santé', 'Ministère des Affaires sociales et de la Santé - ORCID Confirmed', 'National Cancer Institute', 'National Health and Medical Research Council', 'Wellcome Trust']


## Cleaning Data

In [11]:
funded_grnts['start_date'] = pd.to_datetime(funded_grnts['start_date'])
funded_grnts['end_date'] = pd.to_datetime(funded_grnts['end_date'])
grnts['start_date'] = pd.to_datetime(grnts['start_date'])
grnts['end_date'] = pd.to_datetime(grnts['end_date'])
pubs['date'] = pd.to_datetime(pubs['date'])

In [12]:
funded_grnts['funding_len'] = (funded_grnts['end_date']-funded_grnts['start_date'])/timedelta(days=365)
grnts['funding_len'] = (grnts['end_date']-grnts['start_date'])/timedelta(days=365)

In [13]:
funded_grnts['nb_rsrs'] = funded_grnts.groupby('grant_id')['rsr_id'].transform('nunique')
funded_grnts['nb_rsrs'] = funded_grnts['nb_rsrs'].replace(0, np.nan)
grnts['nb_rsrs'] = grnts.groupby('grant_id')['rsr_id'].transform('nunique')
grnts['nb_rsrs'] = grnts['nb_rsrs'].replace(0, np.nan)
pubs['nb_rsrs'] = pubs.groupby('pub_id')['rsr_id'].transform('nunique')
pubs['nb_rsrs'] = pubs['nb_rsrs'].replace(0, np.nan)

In [14]:
grnt_cols = list(funded_grnts)
grnt_cols.remove('rsr_id')
funded_grnts_info = funded_grnts[grnt_cols].drop_duplicates().reset_index(drop=True)

## Funded Grants Statistics

### Number of Grants per Funder

In [15]:
funded_grnts.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,rsr_id,start_date,funding_len,nb_rsrs
count,21496,20041,21543,19276.0,21543,18580.0,20062,20282,21543,21496.0,20909.0
unique,1088,5,10,,17351,7436.0,7697,13146,1178,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,ur.01117731572.33,2009-01-01 00:00:00,,
freq,668,13701,13701,,17,740.0,777,384,1566,,
first,2007-01-31 00:00:00,,,,,,,,2007-01-01 00:00:00,,
last,2024-10-31 00:00:00,,,,,,,,2012-12-31 00:00:00,,
mean,,,,1603461.0,,,,,,3.863223,1.694773
std,,,,8492969.0,,,,,,2.413701,1.611995
min,,,,0.0,,,,,,0.00274,1.0
25%,,,,270963.0,,,,,,2.00274,1.0


In [16]:
print("Number of unique researchers funded by each agency:\n")
print(funded_grnts.groupby('funder_name')['rsr_id'].nunique())
nb_unique_rsrs = list(funded_grnts.groupby('funder_name')['rsr_id'].nunique())

Number of unique researchers funded by each agency:

funder_name
Cancer Research UK                                                   1370
French Institute of Health and Medical Research                       122
French Institute of Health and Medical Research - ORCID Confirmed      34
French National Cancer Institute                                      669
French National Cancer Institute - ORCID Confirmed                     95
Ministère des Affaires sociales et de la Santé                        333
Ministère des Affaires sociales et de la Santé - ORCID Confirmed       50
National Cancer Institute                                            8450
National Health and Medical Research Council                         1944
Wellcome Trust                                                        328
Name: rsr_id, dtype: int64


In [17]:
print("Number of unique grants funded by each agency:\n")
print(funded_grnts.groupby('funder_name')['grant_id'].nunique())
nb_unique_grnts = list(funded_grnts.groupby('funder_name')['grant_id'].nunique())

Number of unique grants funded by each agency:

funder_name
Cancer Research UK                                                    2225
French Institute of Health and Medical Research                        114
French Institute of Health and Medical Research - ORCID Confirmed       35
French National Cancer Institute                                       730
French National Cancer Institute - ORCID Confirmed                     132
Ministère des Affaires sociales et de la Santé                         406
Ministère des Affaires sociales et de la Santé - ORCID Confirmed        64
National Cancer Institute                                            12468
National Health and Medical Research Council                          1064
Wellcome Trust                                                         261
Name: grant_id, dtype: int64


### Number of Grants per Researcher

In [18]:
print("There are {} total researchers.".format(funded_grnts['rsr_id'].nunique()))
print("There are {} total grants.".format(funded_grnts['grant_id'].nunique()))

There are 13146 total researchers.
There are 17351 total grants.


In [19]:
print("Number of grants from agencies per researcher (as % of total):\n")
print(funded_grnts.groupby('rsr_id')['grant_id'].nunique().value_counts(normalize=True).head())

Number of grants from agencies per researcher (as % of total):

1    0.727674
2    0.165678
3    0.053628
4    0.023886
5    0.013768
Name: grant_id, dtype: float64


In [20]:
print("By how many agencies are the researchers funded?\n")
print(funded_grnts.groupby('rsr_id')['funder_name'].nunique().value_counts(normalize=True))

By how many agencies are the researchers funded?

1    0.982200
2    0.016735
3    0.000989
4    0.000076
Name: funder_name, dtype: float64


In [21]:
print("How many grants does each agency give to its reserachers in the 5 focal years?")
for funder in funders:
    print("\n{}:".format(funder))
    temp = funded_grnts[funded_grnts['funder_name']==funder].groupby('rsr_id')['grant_id'].nunique().reset_index()
    print(temp['grant_id'].value_counts(normalize=True).head(3))
    print("(Total researchers: {})".format(temp.shape[0]))

How many grants does each agency give to its reserachers in the 5 focal years?

Cancer Research UK:
1    0.705109
2    0.161314
3    0.053285
Name: grant_id, dtype: float64
(Total researchers: 1370)

French Institute of Health and Medical Research:
1    0.983607
2    0.016393
Name: grant_id, dtype: float64
(Total researchers: 122)

French Institute of Health and Medical Research - ORCID Confirmed:
1    0.970588
2    0.029412
Name: grant_id, dtype: float64
(Total researchers: 34)

French National Cancer Institute:
1    0.814649
2    0.140508
3    0.026906
Name: grant_id, dtype: float64
(Total researchers: 669)

French National Cancer Institute - ORCID Confirmed:
1    0.726316
2    0.168421
3    0.073684
Name: grant_id, dtype: float64
(Total researchers: 95)

Ministère des Affaires sociales et de la Santé:
1    0.756757
2    0.165165
3    0.039039
Name: grant_id, dtype: float64
(Total researchers: 333)

Ministère des Affaires sociales et de la Santé - ORCID Confirmed:
1    0.78
2    0.16

### Grant Characteristics

In [22]:
funded_grnts_info.head()

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs
0,2018-11-30,grid.48336.3a,National Cancer Institute,3181288.0,grant.2482176,503; 546; 337; 316; 507,Cancer; Patient Safety; Bioengineering; Breast...,2012-12-31,5.917808,2.0
1,2015-12-27,grid.455095.8,French National Cancer Institute,727240.0,grant.7154902,526; 559; 344; 503,Genetics; Rare Diseases; Digestive Diseases; C...,2012-12-27,3.0,1.0
2,2015-12-27,grid.455095.8,French National Cancer Institute,799578.0,grant.7154248,526; 559; 344; 503; 411,Genetics; Rare Diseases; Digestive Diseases; C...,2012-12-27,3.0,1.0
3,2015-12-27,grid.455095.8,French National Cancer Institute,564589.0,grant.7154673,583; 503; 580,Vaccine Related; Cancer; Urologic Diseases,2012-12-27,3.0,1.0
4,2015-12-27,grid.455095.8,French National Cancer Institute,615157.0,grant.7154822,559; 484; 313; 387; 503; 501; 568; 569,Rare Diseases; Stem Cell Research; Brain Cance...,2012-12-27,3.0,


In [23]:
funded_amt_avg = []
funded_amt_med = []
funded_len_avg = []
funded_len_med = []
nb_grnt_rsrs_avg = []
nb_grnt_rsrs_med = []
for funder in funders:
    print("\n{}:".format(funder))
    temp = funded_grnts_info[funded_grnts_info['funder_name']==funder]
    print(temp.describe())
    print("(Total grants: {})".format(funded_grnts_info[funded_grnts_info['funder_name']==funder].shape[0]))
    funded_amt_avg.append(temp['funding_amount'].mean())
    funded_amt_med.append(temp['funding_amount'].median())
    funded_len_avg.append(temp['funding_len'].mean())
    funded_len_med.append(temp['funding_len'].median())
    nb_grnt_rsrs_avg.append(temp['nb_rsrs'].mean())
    nb_grnt_rsrs_med.append(temp['nb_rsrs'].median())


Cancer Research UK:
       funding_amount  funding_len      nb_rsrs
count             0.0  2225.000000  2223.000000
mean              NaN     3.372391     1.016644
std               NaN     2.076851     0.127963
min               NaN     0.002740     1.000000
25%               NaN     1.997260     1.000000
50%               NaN     3.000000     1.000000
75%               NaN     5.000000     1.000000
max               NaN    13.008219     2.000000
(Total grants: 2225)

French Institute of Health and Medical Research:
       funding_amount  funding_len     nb_rsrs
count    1.140000e+02   113.000000  114.000000
mean     2.342411e+05     2.150152    1.114035
std      1.869081e+05     1.021059    0.345867
min      2.448100e+04     0.509589    1.000000
25%      1.279798e+05     1.408219    1.000000
50%      2.015395e+05     2.000000    1.000000
75%      3.020678e+05     2.997260    1.000000
max      1.259731e+06     5.002740    3.000000
(Total grants: 114)

French Institute of Health and M

In [24]:
funded_grnts_info['year'] = pd.DatetimeIndex(funded_grnts_info['start_date']).year
pd.crosstab(funded_grnts_info['funder_name'], funded_grnts_info['year'])

year,2007,2008,2009,2010,2011,2012
funder_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cancer Research UK,549,428,346,306,331,265
French Institute of Health and Medical Research,0,0,0,0,61,53
French Institute of Health and Medical Research - ORCID Confirmed,0,0,0,0,16,19
French National Cancer Institute,364,170,222,215,179,191
French National Cancer Institute - ORCID Confirmed,34,21,20,21,21,15
Ministère des Affaires sociales et de la Santé,76,47,67,69,71,76
Ministère des Affaires sociales et de la Santé - ORCID Confirmed,15,9,7,15,11,7
National Cancer Institute,1777,2592,2870,1780,1743,1706
National Health and Medical Research Council,141,198,198,106,210,211
Wellcome Trust,33,40,37,53,55,43


### RCDC Codes Analysis

In [25]:
funded_grnts_info['nb_rcdc'] = (funded_grnts_info['rcdc_names'].str.count(';')+1)

In [26]:
print("Average number of RCDC's per Grant:\n")
print(funded_grnts_info.groupby('funder_name')['nb_rcdc'].mean())

Average number of RCDC's per Grant:

funder_name
Cancer Research UK                                                   4.223325
French Institute of Health and Medical Research                      4.711712
French Institute of Health and Medical Research - ORCID Confirmed    5.117647
French National Cancer Institute                                     5.169324
French National Cancer Institute - ORCID Confirmed                   5.190840
Ministère des Affaires sociales et de la Santé                       6.109181
Ministère des Affaires sociales et de la Santé - ORCID Confirmed     6.333333
National Cancer Institute                                            5.754231
National Health and Medical Research Council                         4.725564
Wellcome Trust                                                       4.498084
Name: nb_rcdc, dtype: float64


In [27]:
rcdc = funded_grnts_info[['grant_id', 'rcdc_names']]
rcdc = rcdc[rcdc['rcdc_names'].notnull()]

In [28]:
rcdc = pd.concat([Series(row['grant_id'], row['rcdc_names'].split('; '))
                  for _, row in rcdc.iterrows()]).reset_index()
rcdc.columns = ['rcdc_name', 'grant_id']

In [29]:
rcdc = pd.merge(funded_grnts_info[['funder_name', 'grant_id', 'year']], rcdc, how='left', on='grant_id')

In [30]:
rcdc.head()

Unnamed: 0,funder_name,grant_id,year,rcdc_name
0,National Cancer Institute,grant.2482176,2012,Cancer
1,National Cancer Institute,grant.2482176,2012,Patient Safety
2,National Cancer Institute,grant.2482176,2012,Bioengineering
3,National Cancer Institute,grant.2482176,2012,Breast Cancer
4,National Cancer Institute,grant.2482176,2012,Clinical Research


In [31]:
rcdc_1 = []
rcdc_2 = []
rcdc_3 = []
rcdc_4 = []
rcdc_5 = []

for funder in funders:
    print("\n{}:".format(funder))
    temp = rcdc[rcdc['funder_name']==funder]
    print(temp['rcdc_name'].value_counts().head())
    rcdc_1.append(temp['rcdc_name'].value_counts().index[0])
    rcdc_2.append(temp['rcdc_name'].value_counts().index[1])
    rcdc_3.append(temp['rcdc_name'].value_counts().index[2])
    rcdc_4.append(temp['rcdc_name'].value_counts().index[3])
    rcdc_5.append(temp['rcdc_name'].value_counts().index[4])


Cancer Research UK:
Cancer               1415
Clinical Research     628
Genetics              574
Rare Diseases         472
Biotechnology         316
Name: rcdc_name, dtype: int64

French Institute of Health and Medical Research:
Cancer               93
Rare Diseases        41
Clinical Research    36
Genetics             36
Biotechnology        31
Name: rcdc_name, dtype: int64

French Institute of Health and Medical Research - ORCID Confirmed:
Cancer               31
Genetics             16
Rare Diseases        15
Clinical Research    14
Biotechnology         9
Name: rcdc_name, dtype: int64

French National Cancer Institute:
Cancer               2500
Rare Diseases        1125
Genetics             1057
Clinical Research     954
Biotechnology         769
Name: rcdc_name, dtype: int64

French National Cancer Institute - ORCID Confirmed:
Cancer               268
Rare Diseases        119
Clinical Research    117
Genetics             114
Biotechnology         89
Name: rcdc_name, dtype: int6

## Prior and Subsequent Grants

In [32]:
def grant_groupby(grnts_mrg):
    cols = [col for col in grnts_mrg.columns if col[-2:]!="_2"]
    grnts_mrg['pre_flag'] = grnts_mrg['start_date']>grnts_mrg['start_date_2']
    grnts_mrg['post_flag'] = grnts_mrg['start_date']<grnts_mrg['start_date_2']
    for col in ['funding_amount', 'funding_len', 'nb_rsrs']:
        grnts_mrg['pre_'+col] = grnts_mrg[col+'_2']*grnts_mrg['pre_flag']
        grnts_mrg['post_'+col] = grnts_mrg[col+'_2']*grnts_mrg['post_flag']

    grnts_stats = grnts_mrg.groupby(['grant_id', 'rsr_id'])
    grnts_stats = pd.DataFrame({'pre_nb_grnts':grnts_stats['pre_flag'].sum()
                                , 'pre_fund_amt':grnts_stats['pre_funding_amount'].sum()
                                , 'pre_avg_fund_len':grnts_stats['pre_funding_len'].mean()
                                , 'pre_avg_team_size':grnts_stats['pre_nb_rsrs'].mean()
                                , 'post_nb_grnts':grnts_stats['post_flag'].sum()
                                , 'post_fund_amt':grnts_stats['post_funding_amount'].sum()
                                , 'post_avg_fund_len':grnts_stats['post_funding_len'].mean()
                                , 'post_avg_team_size':grnts_stats['post_nb_rsrs'].mean()
                               }).reset_index()
    temp = grnts_mrg[cols].copy().drop_duplicates()
    grnts_stats = pd.merge(temp, grnts_stats, how='left', on=['grant_id', 'rsr_id'])
    
    return grnts_stats

In [33]:
def grant_stats(grnts_stats):
    
    print("Pre-Funding Grant Statistics:")
    pre_avg_fund_len_avg = []
    pre_avg_fund_len_med = []
    pre_avg_team_size_avg = []
    pre_avg_team_size_med = []
    pre_fund_amt_avg = []
    pre_fund_amt_med = []
    pre_nb_grnts_avg = []
    pre_nb_grnts_med = []
    cols = [col for col in grnts_stats.columns if col[:4]=="pre_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = grnts_stats[grnts_stats['funder_name']==funder]
        print(temp[cols].describe())
        pre_avg_fund_len_avg.append(temp['pre_avg_fund_len'].mean())
        pre_avg_fund_len_med.append(temp['pre_avg_fund_len'].median())
        pre_avg_team_size_avg.append(temp['pre_avg_team_size'].mean())
        pre_avg_team_size_med.append(temp['pre_avg_team_size'].median())
        pre_fund_amt_avg.append(temp['pre_fund_amt'].mean())
        pre_fund_amt_med.append(temp['pre_fund_amt'].median())
        pre_nb_grnts_avg.append(temp['pre_nb_grnts'].mean())
        pre_nb_grnts_med.append(temp['pre_nb_grnts'].median())
        
    print("\n")
    
    print("Post-Funding Grant Statistics:")
    post_avg_fund_len_avg = []
    post_avg_fund_len_med = []
    post_avg_team_size_avg = []
    post_avg_team_size_med = []
    post_fund_amt_avg = []
    post_fund_amt_med = []
    post_nb_grnts_avg = []
    post_nb_grnts_med = []
    cols = [col for col in grnts_stats.columns if col[:5]=="post_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = grnts_stats[grnts_stats['funder_name']==funder]
        print(temp[cols].describe())
        post_avg_fund_len_avg.append(temp['post_avg_fund_len'].mean())
        post_avg_fund_len_med.append(temp['post_avg_fund_len'].median())
        post_avg_team_size_avg.append(temp['post_avg_team_size'].mean())
        post_avg_team_size_med.append(temp['post_avg_team_size'].median())
        post_fund_amt_avg.append(temp['post_fund_amt'].mean())
        post_fund_amt_med.append(temp['post_fund_amt'].median())
        post_nb_grnts_avg.append(temp['post_nb_grnts'].mean())
        post_nb_grnts_med.append(temp['post_nb_grnts'].median())

    return pre_avg_fund_len_avg, pre_avg_fund_len_med, pre_avg_team_size_avg, pre_avg_team_size_med, pre_fund_amt_avg, pre_fund_amt_med, pre_nb_grnts_avg, pre_nb_grnts_med, post_avg_fund_len_avg, post_avg_fund_len_med, post_avg_team_size_avg, post_avg_team_size_med, post_fund_amt_avg, post_fund_amt_med, post_nb_grnts_avg, post_nb_grnts_med

### For all Grants

In [34]:
# grnts_mrg = pd.merge(funded_grnts, grnts, how='left', on='rsr_id', suffixes=('', '_2'))
# grnts_stats_all = grant_groupby(grnts_mrg)

In [35]:
# grnts_stats_all.shape

In [36]:
# grnts_stats_all.describe(include='all')

In [37]:
# grant_stats(grnts_stats_all)

### Within 5 years of the funded grant

In [38]:
grnts_mrg = pd.merge(funded_grnts, grnts, how='left', on='rsr_id', suffixes=('', '_2'))
grnts_mrg = grnts_mrg[abs(grnts_mrg['start_date']-grnts_mrg['start_date_2'])/timedelta(days=365)<=5]
grnts_mrg = pd.merge(funded_grnts_info, grnts_mrg, how='left', on=grnt_cols)
grnts_mrg = grnts_mrg.reset_index(drop=True)
grnts_stats_5y = grant_groupby(grnts_mrg)

In [39]:
grnts_stats_5y.shape

(21542, 21)

In [40]:
grnts_stats_5y.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs,year,nb_rcdc,rsr_id,post_avg_fund_len,post_avg_team_size,post_fund_amt,post_nb_grnts,pre_avg_fund_len,pre_avg_team_size,pre_fund_amt,pre_nb_grnts
count,21495,20041,21542,19275.0,21542,18580.0,20061,21542,21495.0,20908.0,21542.0,20061.0,20281,20258.0,20281.0,20281.0,20281.0,20258.0,20281.0,20281.0,20281.0
unique,1088,5,10,,17351,7436.0,7697,1178,,,,,13146,,,,,,,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,2009-01-01 00:00:00,,,,,ur.01117731572.33,,,,,,,,
freq,668,13701,13701,,17,740.0,777,1566,,,,,384,,,,,,,,
first,2007-01-31 00:00:00,,,,,,,2007-01-01 00:00:00,,,,,,,,,,,,,
last,2024-10-31 00:00:00,,,,,,,2012-12-31 00:00:00,,,,,,,,,,,,,
mean,,,,1603514.0,,,,,3.863217,1.694806,2009.416953,5.506655,,0.896033,0.588547,2156845.0,1.728169,0.815547,0.449696,1552499.0,1.600414
std,,,,8493186.0,,,,,2.413757,1.612027,1.676731,2.73898,,1.045806,0.955106,8154655.0,4.519345,1.081906,0.792079,5382502.0,4.305996
min,,,,0.0,,,,,0.00274,1.0,2007.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,270963.0,,,,,2.00274,1.0,2008.0,3.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
pre_avg_fund_len_avg, pre_avg_fund_len_med, pre_avg_team_size_avg, pre_avg_team_size_med, pre_fund_amt_avg, pre_fund_amt_med, pre_nb_grnts_avg, pre_nb_grnts_med, post_avg_fund_len_avg, post_avg_fund_len_med, post_avg_team_size_avg, post_avg_team_size_med, post_fund_amt_avg, post_fund_amt_med, post_nb_grnts_avg, post_nb_grnts_med = grant_stats(grnts_stats_5y)

Pre-Funding Grant Statistics:

Cancer Research UK:
       pre_avg_fund_len  pre_avg_team_size  pre_fund_amt  pre_nb_grnts
count       2260.000000        2260.000000  2.260000e+03   2260.000000
mean           0.996603           0.546492  9.956313e+05      2.298230
std            1.035040           0.891362  2.824911e+06      3.028198
min            0.000000           0.000000  0.000000e+00      0.000000
25%            0.000000           0.000000  0.000000e+00      0.000000
50%            0.885073           0.333333  0.000000e+00      1.000000
75%            1.708333           0.666667  5.200978e+05      3.000000
max            7.153973          12.000000  3.148749e+07     19.000000

French Institute of Health and Medical Research:
       pre_avg_fund_len  pre_avg_team_size  pre_fund_amt  pre_nb_grnts
count        123.000000         124.000000  1.240000e+02    124.000000
mean           0.373357           0.204397  2.722241e+05      0.564516
std            0.681347           0.386033  7.8

       post_avg_fund_len  post_avg_team_size  post_fund_amt  post_nb_grnts
count          65.000000           65.000000   6.500000e+01      65.000000
mean            0.631883            0.330769   5.399397e+05       0.692308
std             0.832159            0.671850   1.742532e+06       1.029610
min             0.000000            0.000000   0.000000e+00       0.000000
25%             0.000000            0.000000   0.000000e+00       0.000000
50%             0.000000            0.000000   0.000000e+00       0.000000
75%             1.500000            0.500000   3.859360e+05       1.000000
max             2.335160            4.500000   1.074226e+07       5.000000

National Cancer Institute:
       post_avg_fund_len  post_avg_team_size  post_fund_amt  post_nb_grnts
count       13084.000000        13084.000000   1.308400e+04   13084.000000
mean            0.945569            0.430331   2.466314e+06       1.685952
std             1.118963            0.632308   9.646855e+06       5.3039

## Prior and Subsequent Publications

In [42]:
def pub_groupby(pubs_mrg):
    cols = [col for col in pubs_mrg.columns if col[-2:]!="_2"]
    pubs_mrg['pre_flag'] = pubs_mrg['start_date']>pubs_mrg['date_2']
    pubs_mrg['post_flag'] = pubs_mrg['start_date']<pubs_mrg['date_2']
    for col in ['citations']:
        pubs_mrg['pre_'+col] = pubs_mrg[col+'_2']*pubs_mrg['pre_flag']
        pubs_mrg['post_'+col] = pubs_mrg[col+'_2']*pubs_mrg['post_flag']

    pubs_stats = pubs_mrg.groupby(['grant_id', 'rsr_id'])
    pubs_stats = pd.DataFrame({'pre_nb_pubs':pubs_stats['pre_flag'].sum()
                                , 'pre_citations':pubs_stats['pre_citations'].sum()
                                , 'post_nb_pubs':pubs_stats['post_flag'].sum()
                                , 'post_citations':pubs_stats['post_citations'].sum()
                               }).reset_index()
    temp = pubs_mrg[cols].copy().drop_duplicates()
    pubs_stats = pd.merge(temp, pubs_stats, how='left', on=['grant_id', 'rsr_id'])
    
    return pubs_stats

In [43]:
def pub_stats(pubs_stats):
    
    print("Pre-Funding Publication Statistics:")
    pre_citations_avg = []
    pre_nb_pubs_avg = []
    pre_citations_med = []
    pre_nb_pubs_med = []
    cols = [col for col in pubs_stats.columns if col[:4]=="pre_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = pubs_stats[pubs_stats['funder_name']==funder]
        print(temp[cols].describe())
        pre_citations_avg.append(temp['pre_citations'].mean())
        pre_citations_med.append(temp['pre_citations'].median())
        pre_nb_pubs_avg.append(temp['pre_nb_pubs'].mean())
        pre_nb_pubs_med.append(temp['pre_nb_pubs'].median())
    
    print("\n")
    
    print("Post-Funding Publication Statistics:")
    post_citations_avg = []
    post_citations_med = []
    post_nb_pubs_avg = []
    post_nb_pubs_med = []
    cols = [col for col in pubs_stats.columns if col[:5]=="post_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = pubs_stats[pubs_stats['funder_name']==funder]
        print(temp[cols].describe())
        post_citations_avg.append(temp['post_citations'].mean())
        post_citations_med.append(temp['post_citations'].median())
        post_nb_pubs_avg.append(temp['post_nb_pubs'].mean())
        post_nb_pubs_med.append(temp['post_nb_pubs'].median())
        
    return pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med

### For all Publications

In [44]:
# pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id')
# pubs_mrg.columns = [col+"_2" if (col not in list(funded_grnts))&(col[-2:]!="_2") else col for col in pubs_mrg.columns]
# pubs_stats_all = pub_groupby(pubs_mrg)

In [45]:
# pubs_stats_all.shape

In [46]:
# pubs_stats_all.describe(include='all')

In [47]:
# pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med = pub_stats(pubs_stats_all)

### Within 5 years of the funded grant

In [48]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_2'))
pubs_mrg.columns = [col+"_2" if (col not in list(funded_grnts))&(col[-2:]!="_2") else col for col in pubs_mrg.columns]
pubs_mrg = pubs_mrg[abs(pubs_mrg['start_date']-pubs_mrg['date_2'])/timedelta(days=365)<=5]
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)
pubs_mrg = pubs_mrg.reset_index(drop=True)
pubs_stats_5y = pub_groupby(pubs_mrg)

In [49]:
pubs_stats_5y.shape

(20685, 17)

In [50]:
pubs_stats_5y.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs,year,nb_rcdc,rsr_id,post_citations,post_nb_pubs,pre_citations,pre_nb_pubs
count,20650,19197,20685,18438.0,20685,17739.0,19207,20685,20650.0,20051.0,20685.0,19207.0,16434,16434.0,16434.0,16434.0,16434.0
unique,1088,5,10,,17351,7436.0,7697,1178,,,,,10695,,,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,2009-01-01 00:00:00,,,,,ur.0634201432.39,,,,
freq,657,13524,13524,,17,699.0,734,1482,,,,,127,,,,
first,2007-01-31 00:00:00,,,,,,,2007-01-01 00:00:00,,,,,,,,,
last,2024-10-31 00:00:00,,,,,,,2012-12-31 00:00:00,,,,,,,,,
mean,,,,1622509.0,,,,,3.88307,1.592988,2009.419096,5.518821,,1493.255568,41.868991,1858.438238,30.436534
std,,,,8667741.0,,,,,2.426774,1.496852,1.672491,2.746756,,2712.108424,47.322576,3131.28332,35.534517
min,,,,0.0,,,,,0.00274,1.0,2007.0,1.0,,0.0,0.0,0.0,0.0
25%,,,,264352.8,,,,,2.00274,1.0,2008.0,3.0,,235.0,13.0,324.0,9.0


In [51]:
pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med = pub_stats(pubs_stats_5y)

Pre-Funding Publication Statistics:

Cancer Research UK:
       pre_citations  pre_nb_pubs
count    1462.000000  1462.000000
mean     2095.625171    29.878933
std      2728.844275    31.750003
min         0.000000     0.000000
25%       418.750000    10.000000
50%      1080.500000    21.000000
75%      2543.250000    40.000000
max     21056.000000   393.000000

French Institute of Health and Medical Research:
       pre_citations  pre_nb_pubs
count     116.000000   116.000000
mean      794.896552    28.413793
std      1336.182390    37.870734
min         0.000000     0.000000
25%        88.250000     7.000000
50%       350.000000    18.500000
75%       849.250000    35.250000
max      9027.000000   277.000000

French Institute of Health and Medical Research - ORCID Confirmed:
       pre_citations  pre_nb_pubs
count      35.000000    35.000000
mean      518.228571    22.714286
std       719.033505    23.951632
min         0.000000     0.000000
25%        73.500000     8.000000
50%      

## Subsequent Publication RCDC Codes

In [52]:
funded_grnt_rcdc = funded_grnts[['grant_id', 'rcdc_codes', 'rcdc_names']].drop_duplicates()
funded_grnt_rcdc = funded_grnt_rcdc[funded_grnt_rcdc['rcdc_codes'].notnull()].reset_index()
temp1 = pd.concat([Series(row['grant_id'], row['rcdc_names'].split('; '))
                   for _, row in funded_grnt_rcdc.iterrows()]).reset_index()
temp1.columns = ['rcdc_name', 'grant_id']
temp2 = pd.concat([Series(row['grant_id'], row['rcdc_codes'].split('; '))
                   for _, row in funded_grnt_rcdc.iterrows()]).reset_index()
temp2.columns = ['rcdc_code', 'grant_id_2']
funded_grnt_rcdc = pd.concat([temp1, temp2], axis=1)
del funded_grnt_rcdc['grant_id_2']

In [53]:
funded_grnt_rcdc.head()

Unnamed: 0,rcdc_name,grant_id,rcdc_code
0,Cancer,grant.2482176,503
1,Patient Safety,grant.2482176,546
2,Bioengineering,grant.2482176,337
3,Breast Cancer,grant.2482176,316
4,Clinical Research,grant.2482176,507


In [54]:
funded_grnt_rcdc_1 = funded_grnt_rcdc.drop_duplicates('grant_id', keep='first').reset_index(drop=True)

In [55]:
funded_grnt_rcdc_1.head()

Unnamed: 0,rcdc_name,grant_id,rcdc_code
0,Cancer,grant.2482176,503
1,Genetics,grant.7154902,526
2,Genetics,grant.7154248,526
3,Vaccine Related,grant.7154673,583
4,Rare Diseases,grant.7154822,559


In [56]:
pub_rcdc = pubs[['pub_id', 'rcdc_codes', 'rcdc_names']].drop_duplicates()
pub_rcdc = pub_rcdc[pub_rcdc['rcdc_codes'].notnull()].reset_index()
temp1 = pd.concat([Series(row['pub_id'], row['rcdc_names'].split('; '))
                   for _, row in pub_rcdc.iterrows()]).reset_index()
temp1.columns = ['rcdc_name', 'pub_id']
temp2 = pd.concat([Series(row['pub_id'], row['rcdc_codes'].split('; '))
                   for _, row in pub_rcdc.iterrows()]).reset_index()
temp2.columns = ['rcdc_code', 'pub_id_2']
pub_rcdc = pd.concat([temp1, temp2], axis=1)
del pub_rcdc['pub_id_2']

In [57]:
pub_rcdc.head()

Unnamed: 0,rcdc_name,pub_id,rcdc_code
0,Biotechnology,pub.1101436524,338
1,Human Genome,pub.1101436524,363
2,Bioengineering,pub.1101436524,337
3,Genetics,pub.1101436524,526
4,Breast Cancer,pub.1100823773,316


In [58]:
pub_rcdc_1 = pub_rcdc.drop_duplicates('pub_id', keep='first').reset_index(drop=True)

In [59]:
pub_rcdc_1.head()

Unnamed: 0,rcdc_name,pub_id,rcdc_code
0,Biotechnology,pub.1101436524,338
1,Breast Cancer,pub.1100823773,316
2,Estrogen,pub.1101201637,353
3,Nutrition,pub.1100619667,388
4,Digestive Diseases,pub.1101006083,344


### Using First RCDC Code

In [60]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_pub'))
pubs_mrg = pubs_mrg[(abs(pubs_mrg['date']-pubs_mrg['start_date'])/timedelta(days=365)<=5)
                    & (pubs_mrg['date']>pubs_mrg['start_date'])].reset_index(drop=True)
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)

In [61]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
rcdc_1_comp = pubs_mrg[cols].copy()

In [62]:
rcdc_1_comp = pd.merge(rcdc_1_comp, funded_grnt_rcdc_1, how='left', on='grant_id', suffixes=('', '_1_grnt'))
rcdc_1_comp = pd.merge(rcdc_1_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
rcdc_1_comp.rename(columns={'rcdc_name': 'rcdc_name_1_grnt', 'rcdc_code': 'rcdc_code_1_grnt'}, inplace=True)

In [63]:
rcdc_1_comp.head()

Unnamed: 0,funder_name,funding_amount,grant_id,rsr_id,start_date,pub_id,citations,rcdc_name_1_grnt,rcdc_code_1_grnt,rcdc_name_1_pub,rcdc_code_1_pub
0,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Cancer,503,Clinical Research,507
1,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1085311781,0.0,Cancer,503,Breast Cancer,316
2,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1091145053,1.0,Cancer,503,Prevention,558
3,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1085122718,0.0,Cancer,503,Cancer,503
4,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1084430951,57.0,Cancer,503,Breast Cancer,316


In [64]:
rcdc_1_comp_agency = rcdc_1_comp.groupby(['funder_name', 'rcdc_name_1_grnt', 'rcdc_name_1_pub'])
rcdc_1_comp_agency = pd.DataFrame({'nb_obs': rcdc_1_comp_agency.size()
                                   , 'nb_grnts': rcdc_1_comp_agency['grant_id'].nunique()
                                   , 'nb_pubs': rcdc_1_comp_agency['pub_id'].nunique()
                                   , 'nb_rsrs': rcdc_1_comp_agency['rsr_id'].nunique()
                                  }).reset_index()

In [65]:
rcdc_1_comp_agency.head()

Unnamed: 0,funder_name,rcdc_name_1_grnt,rcdc_name_1_pub,nb_grnts,nb_obs,nb_pubs,nb_rsrs
0,Cancer Research UK,Aging,Aging,2,3,3,2
1,Cancer Research UK,Aging,Bioengineering,1,1,1,1
2,Cancer Research UK,Aging,Cancer,2,11,11,2
3,Cancer Research UK,Aging,Cervical Cancer,1,4,4,1
4,Cancer Research UK,Aging,Clinical Research,2,4,4,2


In [66]:
rcdc_1_comp_agency.to_csv('../output/first_rcdc_grnt_pubs_comp.csv', index=False)

### Using All RCDC Codes

In [67]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_pub'))
pubs_mrg = pubs_mrg[(abs(pubs_mrg['date']-pubs_mrg['start_date'])/timedelta(days=365)<=5)
                    & (pubs_mrg['date']>pubs_mrg['start_date'])].reset_index(drop=True)
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)

In [68]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
cols += ['rcdc_names_pub', 'rcdc_codes_pub']
rcdc_comp = pubs_mrg[cols].copy()

In [69]:
rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc, how='left', on='grant_id')

# Get rid or ignore Null Values?
process = 'ignore' # or 'get_rid'
if process == 'get_rid':
    rcdc_comp = rcdc_comp[(rcdc_comp['rcdc_name'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
elif process == 'ignore':
    rcdc_comp['rcdc_name'] = rcdc_comp['rcdc_name'].fillna('')
    rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
else:
    print("Don't forget to choose !")

rcdc_comp = rcdc_comp.reset_index(drop=True)

In [70]:
def regin(df):
    return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name']), df['rcdc_names_pub']))
rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [71]:
rcdc_comp.head()

Unnamed: 0,funder_name,funding_amount,grant_id,rsr_id,start_date,pub_id,citations,rcdc_names_pub,rcdc_codes_pub,rcdc_name,rcdc_code,flag
0,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Clinical Research; Breast Cancer; Obesity; Can...,507; 316; 389; 503; 546; 498,Cancer,503,True
1,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Clinical Research; Breast Cancer; Obesity; Can...,507; 316; 389; 503; 546; 498,Patient Safety,546,True
2,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Clinical Research; Breast Cancer; Obesity; Can...,507; 316; 389; 503; 546; 498,Bioengineering,337,False
3,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Clinical Research; Breast Cancer; Obesity; Can...,507; 316; 389; 503; 546; 498,Breast Cancer,316,True
4,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Clinical Research; Breast Cancer; Obesity; Can...,507; 316; 389; 503; 546; 498,Clinical Research,507,True


In [72]:
rcdc_comp_grnts = rcdc_comp.copy()
# del rcdc_comp_grnts['rsr_id']
# rcdc_comp_grnts = rcdc_comp_grnts.drop_duplicates().reset_index(drop=True)

rcdc_comp_grnts = rcdc_comp_grnts.groupby(['rcdc_name', 'grant_id', 'rcdc_code', 'funder_name', 'funding_amount'])
rcdc_comp_grnts = pd.DataFrame({'nb_pubs': rcdc_comp_grnts.size()
                                , 'rcdc_pubs': rcdc_comp_grnts['flag'].sum()
                               }).reset_index()
rcdc_comp_grnts['rcdc_pct'] = rcdc_comp_grnts['rcdc_pubs']/rcdc_comp_grnts['nb_pubs']
rcdc_comp_grnts.sort_values('grant_id').head()

Unnamed: 0,rcdc_name,grant_id,rcdc_code,funder_name,funding_amount,nb_pubs,rcdc_pubs,rcdc_pct
3194,Bioengineering,grant.2343411,337,National Cancer Institute,1200000.0,1,0.0,0.0
13247,Cancer,grant.2343411,503,National Cancer Institute,1200000.0,1,0.0,0.0
54564,Networking and Information Technology R&D,grant.2343411,541,National Cancer Institute,1200000.0,1,0.0,0.0
13248,Cancer,grant.2343414,503,National Cancer Institute,1463779.0,1,0.0,0.0
13249,Cancer,grant.2343415,503,National Cancer Institute,1931503.0,1,0.0,0.0


In [73]:
rcdc_comp_fund = rcdc_comp_grnts.groupby(['rcdc_name', 'rcdc_code', 'funder_name'])
rcdc_comp_fund = pd.DataFrame({'nb_grants': rcdc_comp_fund.size()
                               , 'nb_pubs': rcdc_comp_fund['nb_pubs'].sum()
                               , 'rcdc_pubs': rcdc_comp_fund['rcdc_pubs'].sum()
                               , 'avg_rcdc_pct': rcdc_comp_fund['rcdc_pct'].mean()
                              }).reset_index()
rcdc_comp_fund['rcdc_pct'] = rcdc_comp_fund['rcdc_pubs']/rcdc_comp_fund['nb_pubs']
rcdc_comp_fund.sort_values(['rcdc_name', 'funder_name']).head()

Unnamed: 0,rcdc_name,rcdc_code,funder_name,avg_rcdc_pct,nb_grants,nb_pubs,rcdc_pubs,rcdc_pct
0,ALS,292,National Cancer Institute,0.052632,2,76,4.0,0.052632
1,Acquired Cognitive Impairment,487,National Cancer Institute,0.055772,12,282,20.0,0.070922
2,Acquired Cognitive Impairment,487,National Health and Medical Research Council,0.119929,3,97,6.0,0.061856
3,Acute Respiratory Distress Syndrome,293,National Cancer Institute,0.083333,1,60,5.0,0.083333
4,Acute Respiratory Distress Syndrome,293,National Health and Medical Research Council,0.026801,5,858,13.0,0.015152


## Creating Output Table

Table with funders as columns and:
- 1st RCDC Code
- 2nd RCDC Code
- 3rd RCDC Code
- Mean number of previous grants
- Median number of previous grants
- Mean amount of previous grants
- Median number of previous grants
- Mean length of previous grants
- Median length of previous grants

In [74]:
df = pd.DataFrame({'rcdc_1':rcdc_1
                    , 'rcdc_2':rcdc_2
                    , 'rcdc_3':rcdc_3
                    , 'rcdc_4':rcdc_4
                    , 'rcdc_5':rcdc_5                   
                    , 'nb_unique_rsrs': nb_unique_rsrs
                    , 'nb_unique_grnts': nb_unique_grnts
                    , 'funded_amt_avg': funded_amt_avg
                    , 'funded_amt_med': funded_amt_med
                    , 'funded_len_avg': funded_len_avg
                    , 'funded_len_med': funded_len_med
                    , 'nb_grnt_rsrs_avg': nb_grnt_rsrs_avg
                    , 'nb_grnt_rsrs_med': nb_grnt_rsrs_med
                    , 'pre_avg_fund_len_avg': pre_avg_fund_len_avg
                    , 'pre_avg_fund_len_med': pre_avg_fund_len_med
                    , 'pre_avg_team_size_avg': pre_avg_team_size_avg
                    , 'pre_avg_team_size_med': pre_avg_team_size_med
                    , 'pre_fund_amt_avg': pre_fund_amt_avg
                    , 'pre_fund_amt_med': pre_fund_amt_med
                    , 'pre_nb_grnts_avg': pre_nb_grnts_avg
                    , 'pre_nb_grnts_med': pre_nb_grnts_med
                    , 'post_avg_fund_len_avg': post_avg_fund_len_avg
                    , 'post_avg_fund_len_med': post_avg_fund_len_med
                    , 'post_avg_team_size_avg': post_avg_team_size_avg
                    , 'post_avg_team_size_med': post_avg_team_size_med
                    , 'post_fund_amt_avg': post_fund_amt_avg
                    , 'post_fund_amt_med': post_fund_amt_med
                    , 'post_nb_grnts_avg': post_nb_grnts_avg
                    , 'post_nb_grnts_med': post_nb_grnts_med
                    , 'pre_citations_avg': pre_citations_avg
                    , 'pre_nb_pubs_avg': pre_nb_pubs_avg
                    , 'pre_citations_med': pre_citations_med
                    , 'pre_nb_pubs_med': pre_nb_pubs_med
                    , 'post_citations_avg': post_citations_avg
                    , 'post_citations_med': post_citations_med
                    , 'post_nb_pubs_avg': post_nb_pubs_avg
                    , 'post_nb_pubs_med': post_nb_pubs_med
                  }, index=funders).transpose()
df

Unnamed: 0,Cancer Research UK,French Institute of Health and Medical Research,French Institute of Health and Medical Research - ORCID Confirmed,French National Cancer Institute,French National Cancer Institute - ORCID Confirmed,Ministère des Affaires sociales et de la Santé,Ministère des Affaires sociales et de la Santé - ORCID Confirmed,National Cancer Institute,National Health and Medical Research Council,Wellcome Trust
funded_amt_avg,,234241,232874,557125,464459,442563,420509,1.85636e+06,609689,797714
funded_amt_med,,201540,235320,417676,471555,390888,363145,732061,433722,325304
funded_len_avg,3.37239,2.15015,2.08211,2.73308,2.65841,3.19443,3.11434,4.25496,2.8267,3.22012
funded_len_med,3,2,1.99726,3,3,3.00274,3.00274,3.9589,3,3.08493
nb_grnt_rsrs_avg,1.01664,1.11404,1.08571,1.38386,1.49242,1.15764,1.23438,1.10404,2.69925,1.57088
nb_grnt_rsrs_med,1,1,1,1,1,1,1,1,2,1
nb_unique_grnts,2225,114,35,730,132,406,64,12468,1064,261
nb_unique_rsrs,1370,122,34,669,95,333,50,8450,1944,328
post_avg_fund_len_avg,0.962587,0.197712,0.170896,0.501654,0.689976,0.557197,0.631883,0.945569,0.847795,0.847452
post_avg_fund_len_med,0.833333,0,0,0,0,0,0,0.500913,0.81868,0.500587


In [75]:
# Export to Excel
ls = !ls ../output/
if 'comparison_statistics.xlsx' in ls:
    book = load_workbook('../output/comparison_statistics.xlsx')
    writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    df.to_excel(writer, "RAW")
    writer.save()
else:
    df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'RAW')

## Sandbox