# Comparison Statistics between Cohorts

## Python Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100

## Load In Data

### Original Data

In [2]:
inca_grnts = pd.read_csv('../data/inca/inca_grants_details.csv', low_memory=False)
inca_pubs = pd.read_csv('../data/inca/inca_pub_details.csv', low_memory=False)
inca_orcid_responses = pd.read_csv('../output/researcher_info/researcher_info_ORCID_returns.csv')

In [3]:
# Clean Grants File
inca_grnts.columns = [x.lower().replace(' ', '_') for x in inca_grnts.columns]
inca_grnts.rename(columns={'inca_id': 'rsr_id'
                            , 'funding_amount_($)': 'funding_amount'
                            , 'dimensions_grant_id': 'grant_id'
                            , 'funder': 'funder_name'
                            , 'rcdc': 'rcdc_names'
                           }, inplace=True)
del inca_grnts['prenom_port'], inca_grnts['nom_port'], inca_grnts['title']
del inca_grnts['abstract'], inca_grnts['reference'], inca_grnts['organisme_port']
del inca_grnts['research_org_names'], inca_grnts['research_org_ids'], inca_grnts['for']
inca_grnts['rcdc_names'] = inca_grnts['rcdc_names'].replace(';', '; ', regex=True)

In [4]:
# Get INCA-funded Grants from file
inca_funders = ["French National Cancer Institute", "French Institute of Health and Medical Research"]
inca_funders += ["Ministère des Affaires sociales et de la Santé"]
inca_funded_grnts = inca_grnts[(inca_grnts['funder_name'].isin(inca_funders))
                               & (pd.DatetimeIndex(inca_grnts['start_date']).year>=2007)
                               & (pd.DatetimeIndex(inca_grnts['start_date']).year<=2012)].reset_index(drop=True)

In [5]:
# Replace Funder Names by "French Funders"
del inca_funded_grnts['funder_name']
inca_funded_grnts['funder_name'] = "INCa/INSERM/DGOS"

In [6]:
# Inca ORCID-responses: keep only when there was an ORCID response.
inca_orcid_responses = inca_orcid_responses[inca_orcid_responses['ORCID Return']=="YES"].reset_index(drop=True)
conf_ids = list(inca_orcid_responses['INCA ID'])

# Change Funder Name in INCa-Funded grants when they responded to ORCID
inca_funded_grnts['funder_name'] = (inca_funded_grnts['funder_name'] 
                                    + np.where(inca_funded_grnts['rsr_id'].isin(conf_ids), ' - ORCID Confirmed', ''))

In [7]:
# Clean Pubs File
inca_pubs.columns = [x.lower().replace(' ', '_') for x in inca_pubs.columns]
inca_pubs.rename(columns={'inca_id': 'rsr_id'
                          , 'dimensions_publication_id': 'pub_id'
                          , 'publication_year': 'date'
                          , 'rcdc': 'rcdc_names'
                          , 'times_cited': 'citations'
                         }, inplace=True)
del inca_pubs['prenom_port'], inca_pubs['nom_port'], inca_pubs['organisme_port']
del inca_pubs['dimensions_researcher_id'], inca_pubs['additional_researcher_dim_id_to_combine']
del inca_pubs['additional_researcher_dim_id_to_combine_2'], inca_pubs['orcid'], inca_pubs['title'], inca_pubs['issue']
del inca_pubs['pages'], inca_pubs['pubmed_id'], inca_pubs['volume'], inca_pubs['relative_citation_ratio']
del inca_pubs['altmetric'], inca_pubs['open_access'], inca_pubs['author_names'], inca_pubs['research_org_names']
del inca_pubs['research_org_ids'], inca_pubs['for'], inca_pubs['journal_id'], inca_pubs['journal_title']
del inca_pubs['publication_date']
inca_pubs['date'] = inca_pubs['date'].apply(str).replace('\.0', '', regex=True)
inca_pubs['date'] = inca_pubs['date'].apply(lambda x: np.nan if x=="nan" else x+"-01-01")
inca_pubs['rcdc_names'] = inca_pubs['rcdc_names'].replace(';', '; ', regex=True)
inca_pubs['citations'] = pd.to_numeric(inca_pubs['citations'])

### Counterfactual Data

In [8]:
funded_grnts = pd.read_csv('../data/counterfactual/counterfactual_funded_grants.csv')
grnts = pd.read_csv('../data/counterfactual/counterfactual_researcher_grants.csv')
pubs = pd.read_csv('../data/counterfactual/counterfactual_researcher_publications.csv')

In [9]:
# Restrict to Grants funded between 2007 and 2012
funded_grnts = funded_grnts[(pd.DatetimeIndex(funded_grnts['start_date']).year>=2007)
                            & (pd.DatetimeIndex(funded_grnts['start_date']).year<=2012)].reset_index(drop=True)

### Combining Data

In [10]:
funded_grnts = pd.concat([funded_grnts, inca_funded_grnts]).reset_index(drop = True)
grnts = pd.concat([grnts, inca_grnts]).reset_index(drop = True)
pubs = pd.concat([pubs, inca_pubs]).reset_index(drop = True)

### List of All Funders

In [11]:
funders = sorted(list(funded_grnts['funder_name'].unique()))
print(funders)

['Cancer Research UK', 'French Cancer Funders', 'French Cancer Funders - ORCID Confirmed', 'National Cancer Institute', 'National Health and Medical Research Council', 'Wellcome Trust']


## Cleaning Data

In [12]:
funded_grnts['start_date'] = pd.to_datetime(funded_grnts['start_date'])
funded_grnts['end_date'] = pd.to_datetime(funded_grnts['end_date'])
grnts['start_date'] = pd.to_datetime(grnts['start_date'])
grnts['end_date'] = pd.to_datetime(grnts['end_date'])
pubs['date'] = pd.to_datetime(pubs['date'])

In [13]:
funded_grnts['funding_len'] = (funded_grnts['end_date']-funded_grnts['start_date'])/timedelta(days=365)
grnts['funding_len'] = (grnts['end_date']-grnts['start_date'])/timedelta(days=365)

In [14]:
funded_grnts['nb_rsrs'] = funded_grnts.groupby('grant_id')['rsr_id'].transform('nunique')
funded_grnts['nb_rsrs'] = funded_grnts['nb_rsrs'].replace(0, np.nan)
grnts['nb_rsrs'] = grnts.groupby('grant_id')['rsr_id'].transform('nunique')
grnts['nb_rsrs'] = grnts['nb_rsrs'].replace(0, np.nan)
pubs['nb_rsrs'] = pubs.groupby('pub_id')['rsr_id'].transform('nunique')
pubs['nb_rsrs'] = pubs['nb_rsrs'].replace(0, np.nan)

In [15]:
grnt_cols = list(funded_grnts)
grnt_cols.remove('rsr_id')
funded_grnts_info = funded_grnts[grnt_cols].drop_duplicates().reset_index(drop=True)

## Funded Grants Statistics

### Number of Grants per Funder

In [16]:
funded_grnts.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,rsr_id,start_date,funding_len,nb_rsrs
count,21068,19614,21116,18821.0,21116,18163.0,19645,20492,21116,21068.0,20492.0
unique,1086,4,6,,17545,7263.0,7795,13178,1177,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,ur.01117731572.33,2009-01-01 00:00:00,,
freq,695,13795,13795,,17,727.0,764,384,1562,,
first,2007-01-31 00:00:00,,,,,,,,2007-01-01 00:00:00,,
last,2024-10-31 00:00:00,,,,,,,,2012-12-31 00:00:00,,
mean,,,,1641116.0,,,,,,3.893504,1.703884
std,,,,8594130.0,,,,,,2.433762,1.620099
min,,,,0.0,,,,,,0.00274,1.0
25%,,,,281155.0,,,,,,2.00274,1.0


In [17]:
print("Number of unique researchers funded by each agency:\n")
print(funded_grnts.groupby('funder_name')['rsr_id'].nunique())
nb_unique_rsrs = list(funded_grnts.groupby('funder_name')['rsr_id'].nunique())

Number of unique researchers funded by each agency:

funder_name
Cancer Research UK                              1384
French Cancer Funders                            830
French Cancer Funders - ORCID Confirmed          151
National Cancer Institute                       8485
National Health and Medical Research Council    2071
Wellcome Trust                                   333
Name: rsr_id, dtype: int64


In [18]:
print("Number of unique grants funded by each agency:\n")
print(funded_grnts.groupby('funder_name')['grant_id'].nunique())
nb_unique_grnts = list(funded_grnts.groupby('funder_name')['grant_id'].nunique())

Number of unique grants funded by each agency:

funder_name
Cancer Research UK                               2254
French Cancer Funders                            1132
French Cancer Funders - ORCID Confirmed           231
National Cancer Institute                       12555
National Health and Medical Research Council     1155
Wellcome Trust                                    263
Name: grant_id, dtype: int64


### Number of Grants per Researcher

In [19]:
print("There are {} total researchers.".format(funded_grnts['rsr_id'].nunique()))
print("There are {} total grants.".format(funded_grnts['grant_id'].nunique()))

There are 13178 total researchers.
There are 17545 total grants.


In [20]:
print("Number of grants from agencies per researcher (as % of total):\n")
print(funded_grnts.groupby('rsr_id')['grant_id'].nunique().value_counts(normalize=True).head())

Number of grants from agencies per researcher (as % of total):

1    0.722492
2    0.168007
3    0.054712
4    0.024738
5    0.014039
Name: grant_id, dtype: float64


In [21]:
print("By how many agencies are the researchers funded?\n")
print(funded_grnts.groupby('rsr_id')['funder_name'].nunique().value_counts(normalize=True))

By how many agencies are the researchers funded?

1    0.994536
2    0.005236
3    0.000152
4    0.000076
Name: funder_name, dtype: float64


In [22]:
print("How many grants does each agency give to its reserachers in the 5 focal years?")
for funder in funders:
    print("\n{}:".format(funder))
    temp = funded_grnts[funded_grnts['funder_name']==funder].groupby('rsr_id')['grant_id'].nunique().reset_index()
    print(temp['grant_id'].value_counts(normalize=True).head(3))
    print("(Total researchers: {})".format(temp.shape[0]))

How many grants does each agency give to its reserachers in the 5 focal years?

Cancer Research UK:
1    0.705202
2    0.159682
3    0.054191
Name: grant_id, dtype: float64
(Total researchers: 1384)

French Cancer Funders:
1    0.718072
2    0.160241
3    0.060241
Name: grant_id, dtype: float64
(Total researchers: 830)

French Cancer Funders - ORCID Confirmed:
1    0.675497
2    0.192053
4    0.066225
Name: grant_id, dtype: float64
(Total researchers: 151)

National Cancer Institute:
1    0.724337
2    0.173954
3    0.053742
Name: grant_id, dtype: float64
(Total researchers: 8485)

National Health and Medical Research Council:
1    0.733462
2    0.148238
3    0.058909
Name: grant_id, dtype: float64
(Total researchers: 2071)

Wellcome Trust:
1    0.801802
2    0.165165
3    0.018018
Name: grant_id, dtype: float64
(Total researchers: 333)


### Grant Characteristics

In [23]:
funded_grnts_info.head()

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs
0,2018-11-30,grid.48336.3a,National Cancer Institute,3181288.0,grant.2482176,503; 546; 337; 316; 507,Cancer; Patient Safety; Bioengineering; Breast...,2012-12-31,5.917808,2.0
1,2017-11-30,grid.48336.3a,National Cancer Institute,896400.0,grant.2411281,526; 414; 344; 503; 507,Genetics; Colo-Rectal Cancer; Digestive Diseas...,2012-12-18,4.953425,1.0
2,2013-12-16,grid.48336.3a,National Cancer Institute,45010.0,grant.2344785,,,2012-12-17,0.99726,1.0
3,2015-12-14,grid.48336.3a,National Cancer Institute,109427.0,grant.2358038,344; 363; 338; 503; 526; 414,Digestive Diseases; Human Genome; Biotechnolog...,2012-12-15,2.99726,1.0
4,2018-11-30,grid.48336.3a,National Cancer Institute,1264455.0,grant.2482260,559; 439; 313; 503; 501; 337,Rare Diseases; Diagnostic Radiology; Brain Can...,2012-12-15,5.961644,1.0


In [24]:
funded_amt_avg = []
funded_amt_med = []
funded_len_avg = []
funded_len_med = []
nb_grnt_rsrs_avg = []
nb_grnt_rsrs_med = []
for funder in funders:
    print("\n{}:".format(funder))
    temp = funded_grnts_info[funded_grnts_info['funder_name']==funder]
    print(temp.describe())
    print("(Total grants: {})".format(funded_grnts_info[funded_grnts_info['funder_name']==funder].shape[0]))
    funded_amt_avg.append(temp['funding_amount'].mean())
    funded_amt_med.append(temp['funding_amount'].median())
    funded_len_avg.append(temp['funding_len'].mean())
    funded_len_med.append(temp['funding_len'].median())
    nb_grnt_rsrs_avg.append(temp['nb_rsrs'].mean())
    nb_grnt_rsrs_med.append(temp['nb_rsrs'].median())


Cancer Research UK:
       funding_amount  funding_len     nb_rsrs
count             0.0  2254.000000  2252.00000
mean              NaN     3.372958     1.01643
std               NaN     2.070945     0.12715
min               NaN     0.002740     1.00000
25%               NaN     1.997260     1.00000
50%               NaN     3.000000     1.00000
75%               NaN     5.000000     1.00000
max               NaN    13.008219     2.00000
(Total grants: 2254)

French Cancer Funders:
       funding_amount  funding_len      nb_rsrs
count    1.131000e+03  1128.000000  1132.000000
mean     4.853801e+05     2.833887     1.159894
std      8.455426e+05     0.810038     0.390039
min      2.028000e+04     0.509589     1.000000
25%      1.730930e+05     2.169863     1.000000
50%      3.698380e+05     3.000000     1.000000
75%      5.984260e+05     3.002740     1.000000
max      1.406214e+07     5.424658     3.000000
(Total grants: 1132)

French Cancer Funders - ORCID Confirmed:
       funding_a

In [25]:
funded_grnts_info['year'] = pd.DatetimeIndex(funded_grnts_info['start_date']).year
pd.crosstab(funded_grnts_info['funder_name'], funded_grnts_info['year'])

year,2007,2008,2009,2010,2011,2012
funder_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cancer Research UK,549,428,346,314,346,271
French Cancer Funders,244,125,170,167,210,216
French Cancer Funders - ORCID Confirmed,49,30,27,36,48,41
National Cancer Institute,1777,2596,2873,1814,1771,1724
National Health and Medical Research Council,141,208,198,132,243,233
Wellcome Trust,33,40,37,54,55,44


### RCDC Codes Analysis

In [26]:
funded_grnts_info['nb_rcdc'] = (funded_grnts_info['rcdc_names'].str.count(';')+1)

In [27]:
print("Average number of RCDC's per Grant:\n")
print(funded_grnts_info.groupby('funder_name')['nb_rcdc'].mean())

Average number of RCDC's per Grant:

funder_name
Cancer Research UK                              4.235653
French Cancer Funders                           5.466368
French Cancer Funders - ORCID Confirmed         5.495614
National Cancer Institute                       5.760232
National Health and Medical Research Council    4.793074
Wellcome Trust                                  4.498099
Name: nb_rcdc, dtype: float64


In [28]:
rcdc = funded_grnts_info[['grant_id', 'rcdc_names']]
rcdc = rcdc[rcdc['rcdc_names'].notnull()]

In [29]:
rcdc = pd.concat([Series(row['grant_id'], row['rcdc_names'].split('; '))
                  for _, row in rcdc.iterrows()]).reset_index()
rcdc.columns = ['rcdc_name', 'grant_id']

In [30]:
rcdc = pd.merge(funded_grnts_info[['funder_name', 'grant_id', 'year']], rcdc, how='left', on='grant_id')

In [31]:
rcdc.head()

Unnamed: 0,funder_name,grant_id,year,rcdc_name
0,National Cancer Institute,grant.2482176,2012,Cancer
1,National Cancer Institute,grant.2482176,2012,Patient Safety
2,National Cancer Institute,grant.2482176,2012,Bioengineering
3,National Cancer Institute,grant.2482176,2012,Breast Cancer
4,National Cancer Institute,grant.2482176,2012,Clinical Research


In [32]:
rcdc_1 = []
rcdc_2 = []
rcdc_3 = []
rcdc_4 = []
rcdc_5 = []

for funder in funders:
    print("\n{}:".format(funder))
    temp = rcdc[rcdc['funder_name']==funder]
    print(temp['rcdc_name'].value_counts().head())
    rcdc_1.append(temp['rcdc_name'].value_counts().index[0])
    rcdc_2.append(temp['rcdc_name'].value_counts().index[1])
    rcdc_3.append(temp['rcdc_name'].value_counts().index[2])
    rcdc_4.append(temp['rcdc_name'].value_counts().index[3])
    rcdc_5.append(temp['rcdc_name'].value_counts().index[4])


Cancer Research UK:
Cancer               1440
Clinical Research     636
Genetics              581
Rare Diseases         484
Biotechnology         322
Name: rcdc_name, dtype: int64

French Cancer Funders:
Cancer               1077
Clinical Research     593
Rare Diseases         529
Genetics              368
Biotechnology         264
Name: rcdc_name, dtype: int64

French Cancer Funders - ORCID Confirmed:
Cancer               250
Clinical Research    144
Rare Diseases        117
Genetics             100
Hematology            70
Name: rcdc_name, dtype: int64

National Cancer Institute:
Cancer               10859
Clinical Research     4821
Genetics              4197
Rare Diseases         3962
Biotechnology         3956
Name: rcdc_name, dtype: int64

National Health and Medical Research Council:
Cancer               1155
Rare Diseases         364
Genetics              300
Prevention            271
Clinical Research     244
Name: rcdc_name, dtype: int64

Wellcome Trust:
Cancer               

## Prior and Subsequent Grants

In [33]:
def grant_groupby(grnts_mrg):
    cols = [col for col in grnts_mrg.columns if col[-2:]!="_2"]
    grnts_mrg['pre_flag'] = grnts_mrg['start_date']>grnts_mrg['start_date_2']
    grnts_mrg['post_flag'] = grnts_mrg['start_date']<grnts_mrg['start_date_2']
    for col in ['funding_amount', 'funding_len', 'nb_rsrs']:
        grnts_mrg['pre_'+col] = grnts_mrg[col+'_2']*grnts_mrg['pre_flag']
        grnts_mrg['post_'+col] = grnts_mrg[col+'_2']*grnts_mrg['post_flag']

    grnts_stats = grnts_mrg.groupby(['grant_id', 'rsr_id'])
    grnts_stats = pd.DataFrame({'pre_nb_grnts':grnts_stats['pre_flag'].sum()
                                , 'pre_fund_amt':grnts_stats['pre_funding_amount'].sum()
                                , 'pre_avg_fund_len':grnts_stats['pre_funding_len'].mean()
                                , 'pre_avg_team_size':grnts_stats['pre_nb_rsrs'].mean()
                                , 'post_nb_grnts':grnts_stats['post_flag'].sum()
                                , 'post_fund_amt':grnts_stats['post_funding_amount'].sum()
                                , 'post_avg_fund_len':grnts_stats['post_funding_len'].mean()
                                , 'post_avg_team_size':grnts_stats['post_nb_rsrs'].mean()
                               }).reset_index()
    temp = grnts_mrg[cols].copy().drop_duplicates()
    grnts_stats = pd.merge(temp, grnts_stats, how='left', on=['grant_id', 'rsr_id'])
    
    return grnts_stats

In [34]:
def grant_stats(grnts_stats):
    
    print("Pre-Funding Grant Statistics:")
    pre_avg_fund_len_avg = []
    pre_avg_fund_len_med = []
    pre_avg_team_size_avg = []
    pre_avg_team_size_med = []
    pre_fund_amt_avg = []
    pre_fund_amt_med = []
    pre_nb_grnts_avg = []
    pre_nb_grnts_med = []
    cols = [col for col in grnts_stats.columns if col[:4]=="pre_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = grnts_stats[grnts_stats['funder_name']==funder]
        print(temp[cols].describe())
        pre_avg_fund_len_avg.append(temp['pre_avg_fund_len'].mean())
        pre_avg_fund_len_med.append(temp['pre_avg_fund_len'].median())
        pre_avg_team_size_avg.append(temp['pre_avg_team_size'].mean())
        pre_avg_team_size_med.append(temp['pre_avg_team_size'].median())
        pre_fund_amt_avg.append(temp['pre_fund_amt'].mean())
        pre_fund_amt_med.append(temp['pre_fund_amt'].median())
        pre_nb_grnts_avg.append(temp['pre_nb_grnts'].mean())
        pre_nb_grnts_med.append(temp['pre_nb_grnts'].median())
        
    print("\n")
    
    print("Post-Funding Grant Statistics:")
    post_avg_fund_len_avg = []
    post_avg_fund_len_med = []
    post_avg_team_size_avg = []
    post_avg_team_size_med = []
    post_fund_amt_avg = []
    post_fund_amt_med = []
    post_nb_grnts_avg = []
    post_nb_grnts_med = []
    cols = [col for col in grnts_stats.columns if col[:5]=="post_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = grnts_stats[grnts_stats['funder_name']==funder]
        print(temp[cols].describe())
        post_avg_fund_len_avg.append(temp['post_avg_fund_len'].mean())
        post_avg_fund_len_med.append(temp['post_avg_fund_len'].median())
        post_avg_team_size_avg.append(temp['post_avg_team_size'].mean())
        post_avg_team_size_med.append(temp['post_avg_team_size'].median())
        post_fund_amt_avg.append(temp['post_fund_amt'].mean())
        post_fund_amt_med.append(temp['post_fund_amt'].median())
        post_nb_grnts_avg.append(temp['post_nb_grnts'].mean())
        post_nb_grnts_med.append(temp['post_nb_grnts'].median())

    return pre_avg_fund_len_avg, pre_avg_fund_len_med, pre_avg_team_size_avg, pre_avg_team_size_med, pre_fund_amt_avg, pre_fund_amt_med, pre_nb_grnts_avg, pre_nb_grnts_med, post_avg_fund_len_avg, post_avg_fund_len_med, post_avg_team_size_avg, post_avg_team_size_med, post_fund_amt_avg, post_fund_amt_med, post_nb_grnts_avg, post_nb_grnts_med

### For all Grants

In [35]:
# grnts_mrg = pd.merge(funded_grnts, grnts, how='left', on='rsr_id', suffixes=('', '_2'))
# grnts_stats_all = grant_groupby(grnts_mrg)

In [36]:
# grnts_stats_all.shape

In [37]:
# grnts_stats_all.describe(include='all')

In [38]:
# grant_stats(grnts_stats_all)

### Within 5 years of the funded grant

In [39]:
grnts_mrg = pd.merge(funded_grnts, grnts, how='left', on='rsr_id', suffixes=('', '_2'))
grnts_mrg = grnts_mrg[abs(grnts_mrg['start_date']-grnts_mrg['start_date_2'])/timedelta(days=365)<=5]
grnts_mrg = pd.merge(funded_grnts_info, grnts_mrg, how='left', on=grnt_cols)
grnts_mrg = grnts_mrg.reset_index(drop=True)
grnts_stats_5y = grant_groupby(grnts_mrg)

In [40]:
grnts_stats_5y.shape

(21115, 21)

In [41]:
grnts_stats_5y.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs,year,nb_rcdc,rsr_id,post_avg_fund_len,post_avg_team_size,post_fund_amt,post_nb_grnts,pre_avg_fund_len,pre_avg_team_size,pre_fund_amt,pre_nb_grnts
count,21067,19614,21115,18820.0,21115,18163.0,19644,21115,21067.0,20491.0,21115.0,19644.0,20491,20468.0,20491.0,20491.0,20491.0,20468.0,20491.0,20491.0,20491.0
unique,1086,4,6,,17545,7263.0,7795,1177,,,,,13178,,,,,,,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,2009-01-01 00:00:00,,,,,ur.01117731572.33,,,,,,,,
freq,695,13795,13795,,17,727.0,764,1562,,,,,384,,,,,,,,
first,2007-01-31 00:00:00,,,,,,,2007-01-01 00:00:00,,,,,,,,,,,,,
last,2024-10-31 00:00:00,,,,,,,2012-12-31 00:00:00,,,,,,,,,,,,,
mean,,,,1641171.0,,,,,3.893499,1.703919,2009.447881,5.524893,,0.900586,0.59588,2165427.0,1.739056,0.821059,0.455808,1570626.0,1.612806
std,,,,8594355.0,,,,,2.433819,1.620131,1.670547,2.750802,,1.04403,0.962875,8124279.0,4.501351,1.081654,0.798978,5398971.0,4.29116
min,,,,0.0,,,,,0.00274,1.0,2007.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,281077.5,,,,,2.00274,1.0,2008.0,3.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
pre_avg_fund_len_avg, pre_avg_fund_len_med, pre_avg_team_size_avg, pre_avg_team_size_med, pre_fund_amt_avg, pre_fund_amt_med, pre_nb_grnts_avg, pre_nb_grnts_med, post_avg_fund_len_avg, post_avg_fund_len_med, post_avg_team_size_avg, post_avg_team_size_med, post_fund_amt_avg, post_fund_amt_med, post_nb_grnts_avg, post_nb_grnts_med = grant_stats(grnts_stats_5y)

Pre-Funding Grant Statistics:

Cancer Research UK:
       pre_avg_fund_len  pre_avg_team_size  pre_fund_amt  pre_nb_grnts
count       2289.000000        2289.000000  2.289000e+03   2289.000000
mean           0.996940           0.546290  9.970536e+05      2.288772
std            1.035881           0.890745  2.823252e+06      3.015557
min            0.000000           0.000000  0.000000e+00      0.000000
25%            0.000000           0.000000  0.000000e+00      0.000000
50%            0.886675           0.333333  0.000000e+00      1.000000
75%            1.708219           0.666667  5.273810e+05      3.000000
max            7.153973          12.000000  3.148749e+07     19.000000

French Cancer Funders:
       pre_avg_fund_len  pre_avg_team_size  pre_fund_amt  pre_nb_grnts
count       1263.000000        1267.000000  1.267000e+03   1267.000000
mean           0.410819           0.213581  2.849481e+05      0.602999
std            0.674880           0.570949  8.281651e+05      1.122549
mi

## Prior and Subsequent Publications

In [43]:
def pub_groupby(pubs_mrg):
    cols = [col for col in pubs_mrg.columns if col[-2:]!="_2"]
    pubs_mrg['pre_flag'] = pubs_mrg['start_date']>pubs_mrg['date_2']
    pubs_mrg['post_flag'] = pubs_mrg['start_date']<pubs_mrg['date_2']
    for col in ['citations']:
        pubs_mrg['pre_'+col] = pubs_mrg[col+'_2']*pubs_mrg['pre_flag']
        pubs_mrg['post_'+col] = pubs_mrg[col+'_2']*pubs_mrg['post_flag']

    pubs_stats = pubs_mrg.groupby(['grant_id', 'rsr_id'])
    pubs_stats = pd.DataFrame({'pre_nb_pubs':pubs_stats['pre_flag'].sum()
                                , 'pre_citations':pubs_stats['pre_citations'].sum()
                                , 'post_nb_pubs':pubs_stats['post_flag'].sum()
                                , 'post_citations':pubs_stats['post_citations'].sum()
                               }).reset_index()
    temp = pubs_mrg[cols].copy().drop_duplicates()
    pubs_stats = pd.merge(temp, pubs_stats, how='left', on=['grant_id', 'rsr_id'])
    
    return pubs_stats

In [44]:
def pub_stats(pubs_stats):
    
    print("Pre-Funding Publication Statistics:")
    pre_citations_avg = []
    pre_nb_pubs_avg = []
    pre_citations_med = []
    pre_nb_pubs_med = []
    cols = [col for col in pubs_stats.columns if col[:4]=="pre_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = pubs_stats[pubs_stats['funder_name']==funder]
        print(temp[cols].describe())
        pre_citations_avg.append(temp['pre_citations'].mean())
        pre_citations_med.append(temp['pre_citations'].median())
        pre_nb_pubs_avg.append(temp['pre_nb_pubs'].mean())
        pre_nb_pubs_med.append(temp['pre_nb_pubs'].median())
    
    print("\n")
    
    print("Post-Funding Publication Statistics:")
    post_citations_avg = []
    post_citations_med = []
    post_nb_pubs_avg = []
    post_nb_pubs_med = []
    cols = [col for col in pubs_stats.columns if col[:5]=="post_"]
    for funder in funders:
        print("\n{}:".format(funder))
        temp = pubs_stats[pubs_stats['funder_name']==funder]
        print(temp[cols].describe())
        post_citations_avg.append(temp['post_citations'].mean())
        post_citations_med.append(temp['post_citations'].median())
        post_nb_pubs_avg.append(temp['post_nb_pubs'].mean())
        post_nb_pubs_med.append(temp['post_nb_pubs'].median())
        
    return pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med

### For all Publications

In [45]:
# pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id')
# pubs_mrg.columns = [col+"_2" if (col not in list(funded_grnts))&(col[-2:]!="_2") else col for col in pubs_mrg.columns]
# pubs_stats_all = pub_groupby(pubs_mrg)

In [46]:
# pubs_stats_all.shape

In [47]:
# pubs_stats_all.describe(include='all')

In [48]:
# pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med = pub_stats(pubs_stats_all)

### Within 5 years of the funded grant

In [49]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_2'))
pubs_mrg.columns = [col+"_2" if (col not in list(funded_grnts))&(col[-2:]!="_2") else col for col in pubs_mrg.columns]
pubs_mrg = pubs_mrg[abs(pubs_mrg['start_date']-pubs_mrg['date_2'])/timedelta(days=365)<=5]
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)
pubs_mrg = pubs_mrg.reset_index(drop=True)
pubs_stats_5y = pub_groupby(pubs_mrg)

In [50]:
pubs_stats_5y.shape

(20224, 17)

In [51]:
pubs_stats_5y.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,start_date,funding_len,nb_rsrs,year,nb_rcdc,rsr_id,post_citations,post_nb_pubs,pre_citations,pre_nb_pubs
count,20188,18736,20224,17949.0,20224,17288.0,18756,20224,20188.0,19600.0,20224.0,18756.0,16601,16601.0,16601.0,16601.0,16601.0
unique,1086,4,6,,17545,7263.0,7795,1177,,,,,10716,,,,
top,2017-01-01 00:00:00,grid.48336.3a,National Cancer Institute,,grant.2695966,503.0,Cancer,2009-01-01 00:00:00,,,,,ur.0634201432.39,,,,
freq,682,13618,13618,,17,686.0,721,1478,,,,,127,,,,
first,2007-01-31 00:00:00,,,,,,,2007-01-01 00:00:00,,,,,,,,,
last,2024-10-31 00:00:00,,,,,,,2012-12-31 00:00:00,,,,,,,,,
mean,,,,1662556.0,,,,,3.91658,1.596837,2009.446944,5.537748,,1516.161617,41.951087,1884.646889,30.565327
std,,,,8783430.0,,,,,2.448979,1.502828,1.66649,2.758804,,2689.184604,47.658717,3167.167413,35.572271
min,,,,0.0,,,,,0.00274,1.0,2007.0,1.0,,0.0,0.0,0.0,0.0
25%,,,,273960.0,,,,,2.00274,1.0,2008.0,3.0,,240.0,13.0,330.0,9.0


In [52]:
pre_citations_avg, pre_nb_pubs_avg, pre_citations_med, pre_nb_pubs_med, post_citations_avg, post_citations_med, post_nb_pubs_avg, post_nb_pubs_med = pub_stats(pubs_stats_5y)

Pre-Funding Publication Statistics:

Cancer Research UK:
       pre_citations  pre_nb_pubs
count    1482.000000  1482.000000
mean     2121.664642    30.028340
std      2759.298535    31.926684
min         0.000000     0.000000
25%       420.500000    10.000000
50%      1087.000000    21.000000
75%      2591.500000    40.000000
max     21325.000000   401.000000

French Cancer Funders:
       pre_citations  pre_nb_pubs
count    1213.000000  1213.000000
mean     1651.693322    42.713108
std      2148.234397    48.562964
min         0.000000     0.000000
25%       343.000000    13.000000
50%       854.000000    30.000000
75%      2080.000000    60.000000
max     17255.000000   569.000000

French Cancer Funders - ORCID Confirmed:
       pre_citations  pre_nb_pubs
count     234.000000   234.000000
mean     1968.619658    36.012821
std      4539.566273    34.398882
min         0.000000     0.000000
25%       250.250000    13.000000
50%       820.000000    25.500000
75%      1749.000000    51.

## Subsequent Publication RCDC Codes

### Create Lookup Tables

#### Grant RCDC Codes

In [53]:
funded_grnt_rcdc = funded_grnts[['grant_id', 'rcdc_codes', 'rcdc_names']].drop_duplicates()
funded_grnt_rcdc = funded_grnt_rcdc[funded_grnt_rcdc['rcdc_names'].notnull()].reset_index()
temp1 = pd.concat([Series(row['grant_id'], row['rcdc_names'].split('; '))
                   for _, row in funded_grnt_rcdc.iterrows()]).reset_index()
temp1.columns = ['rcdc_name', 'grant_id']
# temp2 = pd.concat([Series(row['grant_id'], row['rcdc_codes'].split('; '))
#                    for _, row in funded_grnt_rcdc.iterrows()]).reset_index()
# temp2.columns = ['rcdc_code', 'grant_id_2']
# funded_grnt_rcdc = pd.concat([temp1, temp2], axis=1)
# del funded_grnt_rcdc['grant_id_2']
funded_grnt_rcdc = temp1.copy()

In [54]:
funded_grnt_rcdc.head()

Unnamed: 0,rcdc_name,grant_id
0,Cancer,grant.2482176
1,Patient Safety,grant.2482176
2,Bioengineering,grant.2482176
3,Breast Cancer,grant.2482176
4,Clinical Research,grant.2482176


In [55]:
funded_grnt_rcdc_1 = funded_grnt_rcdc.drop_duplicates('grant_id', keep='first').reset_index(drop=True)

In [56]:
funded_grnt_rcdc_1.head()

Unnamed: 0,rcdc_name,grant_id
0,Cancer,grant.2482176
1,Genetics,grant.2411281
2,Digestive Diseases,grant.2358038
3,Rare Diseases,grant.2482260
4,Stem Cell Research,grant.2482151


#### Publication RCDC Codes

In [57]:
pub_rcdc = pubs[['pub_id', 'rcdc_codes', 'rcdc_names']].drop_duplicates()
pub_rcdc = pub_rcdc[pub_rcdc['rcdc_names'].notnull()].reset_index()
temp1 = pd.concat([Series(row['pub_id'], row['rcdc_names'].split('; '))
                   for _, row in pub_rcdc.iterrows()]).reset_index()
temp1.columns = ['rcdc_name', 'pub_id']
# temp2 = pd.concat([Series(row['pub_id'], row['rcdc_codes'].split('; '))
#                    for _, row in pub_rcdc.iterrows()]).reset_index()
# temp2.columns = ['rcdc_code', 'pub_id_2']
# pub_rcdc = pd.concat([temp1, temp2], axis=1)
# del pub_rcdc['pub_id_2']
pub_rcdc = temp1.copy()

In [58]:
pub_rcdc.head()

Unnamed: 0,rcdc_name,pub_id
0,Vaccine Related,pub.1090680336
1,Clinical Research,pub.1090680336
2,Sexually Transmitted Diseases/Herpes,pub.1090680336
3,Prevention,pub.1090680336
4,HPV and/or Cervical Cancer Vaccines,pub.1090680336


In [59]:
pub_rcdc_1 = pub_rcdc.drop_duplicates('pub_id', keep='first').reset_index(drop=True)

In [60]:
pub_rcdc_1.head()

Unnamed: 0,rcdc_name,pub_id
0,Vaccine Related,pub.1090680336
1,Biotechnology,pub.1103817474
2,Breast Cancer,pub.1101698094
3,Health Services,pub.1101528382
4,Multiple Sclerosis,pub.1100212452


#### Grant-Publications Crossfile

In [61]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_pub'))
pubs_mrg = pubs_mrg[(abs(pubs_mrg['date']-pubs_mrg['start_date'])/timedelta(days=365)<=5)
                    & (pubs_mrg['date']>pubs_mrg['start_date'])].reset_index(drop=True)
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)

### Method 1: 1-to-1

In [62]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
rcdc_comp = pubs_mrg[cols].copy()

In [63]:
rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc_1, how='left', on='grant_id', suffixes=('', '_1_grnt'))
rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_1_grnt', 'rcdc_code': 'rcdc_code_1_grnt'}, inplace=True)

In [64]:
# Get rid or ignore Null Values?
process = 'ignore' # or 'get_rid'
if process == 'get_rid':
    rcdc_comp = rcdc_1_comp[(rcdc_comp['rcdc_name_1_grnt'].notnull())&(rcdc_comp['rcdc_name_1_pub'].notnull())]
elif process == 'ignore':
    rcdc_comp['rcdc_name_1_grnt'] = rcdc_comp['rcdc_name_1_grnt'].fillna('')
    rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
else:
    print("Don't forget to choose !")

In [65]:
rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_1_grnt', 'rcdc_name_1_pub'])
rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
                                   , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
                                   , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
                                  }).reset_index()

In [66]:
rcdc_comp_agency.head()

Unnamed: 0,funder_name,rcdc_name_1_grnt,rcdc_name_1_pub,nb_grnts,nb_obs,nb_pubs
0,Cancer Research UK,,,595,4953,3451
1,Cancer Research UK,,Acquired Cognitive Impairment,1,1,1
2,Cancer Research UK,,Adolescent Sexual Activity,1,1,1
3,Cancer Research UK,,Aging,78,191,114
4,Cancer Research UK,,"Alcoholism, Alcohol Use and Health",1,1,1


In [67]:
rcdc_comp_agency.tail()

Unnamed: 0,funder_name,rcdc_name_1_grnt,rcdc_name_1_pub,nb_grnts,nb_obs,nb_pubs
14266,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Neurodegenerative,1,2,1
14267,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Neurosciences,1,1,1
14268,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Prevention,1,1,1
14269,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Stem Cell Research,2,3,1
14270,Wellcome Trust,Vaccine Related,,2,2,0


In [68]:
rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
rcdc_comp_agency.to_csv('../output/rcdc_grnt_1_pubs_1_comp.csv', index=False)

### Method 2: 1 to Many

In [69]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
cols += ['rcdc_names_pub', 'rcdc_codes_pub']
rcdc_comp = pubs_mrg[cols].copy()

In [70]:
rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc_1, how='left', on='grant_id', suffixes=('', '_1_grnt'))
rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_1_grnt', 'rcdc_code': 'rcdc_code_1_grnt'}, inplace=True)

In [71]:
# Get rid or ignore Null Values?
process = 'ignore' # or 'get_rid'
if process == 'get_rid':
    rcdc_comp = rcdc_1_comp[(rcdc_comp['rcdc_name_1_grnt'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
elif process == 'ignore':
    rcdc_comp['rcdc_name_1_grnt'] = rcdc_comp['rcdc_name_1_grnt'].fillna('')
    rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
    rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
else:
    print("Don't forget to choose !")

In [72]:
def regin(df):
    return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name_1_grnt']), df['rcdc_names_pub']))
rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [73]:
rcdc_comp['rcdc_name_pub_impute'] = np.where(rcdc_comp['flag']==True
                                             , rcdc_comp['rcdc_name_1_grnt'], rcdc_comp['rcdc_name_1_pub'])

In [74]:
rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_1_grnt', 'rcdc_name_pub_impute'])
rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
                                 , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
                                 , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
                                }).reset_index()

In [75]:
rcdc_comp_agency.head()

Unnamed: 0,funder_name,rcdc_name_1_grnt,rcdc_name_pub_impute,nb_grnts,nb_obs,nb_pubs
0,Cancer Research UK,,,595,4953,3451
1,Cancer Research UK,,Acquired Cognitive Impairment,1,1,1
2,Cancer Research UK,,Adolescent Sexual Activity,1,1,1
3,Cancer Research UK,,Aging,78,191,114
4,Cancer Research UK,,"Alcoholism, Alcohol Use and Health",1,1,1


In [76]:
rcdc_comp_agency.tail()

Unnamed: 0,funder_name,rcdc_name_1_grnt,rcdc_name_pub_impute,nb_grnts,nb_obs,nb_pubs
13196,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Neurodegenerative,1,2,1
13197,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Neurosciences,1,1,1
13198,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Prevention,1,1,1
13199,Wellcome Trust,Stem Cell Research - Nonembryonic - Non-Human,Stem Cell Research - Nonembryonic - Non-Human,2,3,1
13200,Wellcome Trust,Vaccine Related,,2,2,0


In [77]:
rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
rcdc_comp_agency.to_csv('../output/rcdc_grnt_1_pubs_comp.csv', index=False)

### Method 3: Many to Many

In [78]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
cols += ['rcdc_names_pub', 'rcdc_codes_pub']
rcdc_comp = pubs_mrg[cols].copy()

In [79]:
rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc, how='left', on='grant_id', suffixes=('', '_grnt'))
rcdc_comp = pd.merge(rcdc_comp, pub_rcdc_1, how='left', on='pub_id', suffixes=('', '_1_pub'))
rcdc_comp.rename(columns={'rcdc_name': 'rcdc_name_grnt', 'rcdc_code': 'rcdc_code_grnt'}, inplace=True)

In [80]:
# Get rid or ignore Null Values?
process = 'ignore' # or 'get_rid'
if process == 'get_rid':
    rcdc_comp = rcdc_1_comp[(rcdc_comp['rcdc_name_grnt'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
elif process == 'ignore':
    rcdc_comp['rcdc_name_grnt'] = rcdc_comp['rcdc_name_grnt'].fillna('')
    rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
    rcdc_comp['rcdc_name_1_pub'] = rcdc_comp['rcdc_name_1_pub'].fillna('')
else:
    print("Don't forget to choose !")

In [81]:
def regin(df):
    return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name_grnt']), df['rcdc_names_pub']))
rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [82]:
rcdc_comp['rcdc_name_pub_impute'] = np.where(rcdc_comp['flag']==True
                                             , rcdc_comp['rcdc_name_grnt'], rcdc_comp['rcdc_name_1_pub'])

In [83]:
rcdc_comp_agency = rcdc_comp.groupby(['funder_name', 'rcdc_name_grnt', 'rcdc_name_pub_impute'])
rcdc_comp_agency = pd.DataFrame({'nb_obs': rcdc_comp_agency.size()
                                 , 'nb_grnts': rcdc_comp_agency['grant_id'].nunique()
                                 , 'nb_pubs': rcdc_comp_agency['pub_id'].nunique()
                                }).reset_index()

In [84]:
rcdc_comp_agency.head()

Unnamed: 0,funder_name,rcdc_name_grnt,rcdc_name_pub_impute,nb_grnts,nb_obs,nb_pubs
0,Cancer Research UK,,,595,4953,3451
1,Cancer Research UK,,Acquired Cognitive Impairment,1,1,1
2,Cancer Research UK,,Adolescent Sexual Activity,1,1,1
3,Cancer Research UK,,Aging,78,191,114
4,Cancer Research UK,,"Alcoholism, Alcohol Use and Health",1,1,1


In [85]:
rcdc_comp_agency.tail()

Unnamed: 0,funder_name,rcdc_name_grnt,rcdc_name_pub_impute,nb_grnts,nb_obs,nb_pubs
42166,Wellcome Trust,Vaccine related (AIDS),Immunization,1,1,1
42167,Wellcome Trust,Vaccine related (AIDS),Infectious Diseases,1,1,1
42168,Wellcome Trust,Vector-Borne Diseases,,1,3,3
42169,Wellcome Trust,Vector-Borne Diseases,Behavioral and Social Science,1,1,1
42170,Wellcome Trust,Vector-Borne Diseases,Clinical Trials and Supportive Activities,1,1,1


In [86]:
rcdc_comp_agency.columns = [['funder_name', 'grnt_rcdc', 'pub_rcdc', 'nb_grnts', 'nb_obs', 'nb_pubs']]
rcdc_comp_agency.to_csv('../output/rcdc_grnt_pubs_comp.csv', index=False)

### Method 4: Many to many with score *(PRELIMINARY)*

In [87]:
pubs_mrg = pd.merge(funded_grnts, pubs, how='left', on='rsr_id', suffixes=('', '_pub'))
pubs_mrg = pubs_mrg[(abs(pubs_mrg['date']-pubs_mrg['start_date'])/timedelta(days=365)<=5)
                    & (pubs_mrg['date']>pubs_mrg['start_date'])].reset_index(drop=True)
pubs_mrg = pd.merge(funded_grnts_info, pubs_mrg, how='left', on=grnt_cols)

In [88]:
cols = ['funder_name', 'funding_amount', 'grant_id', 'rsr_id', 'start_date', 'pub_id', 'citations']
cols += ['rcdc_names_pub', 'rcdc_codes_pub']
rcdc_comp = pubs_mrg[cols].copy()

In [89]:
rcdc_comp = pd.merge(rcdc_comp, funded_grnt_rcdc, how='left', on='grant_id')

# Get rid or ignore Null Values?
process = 'ignore' # or 'get_rid'
if process == 'get_rid':
    rcdc_comp = rcdc_comp[(rcdc_comp['rcdc_name'].notnull())&(rcdc_comp['rcdc_names_pub'].notnull())]
elif process == 'ignore':
    rcdc_comp['rcdc_name'] = rcdc_comp['rcdc_name'].fillna('')
    rcdc_comp['rcdc_names_pub'] = rcdc_comp['rcdc_names_pub'].fillna('')
else:
    print("Don't forget to choose !")

rcdc_comp = rcdc_comp.reset_index(drop=True)

In [90]:
def regin(df):
    return bool(re.search(r"(^|; )\b{}\b(; |$)".format(df['rcdc_name']), df['rcdc_names_pub']))
rcdc_comp['flag'] = rcdc_comp.apply(regin, axis=1)

In [91]:
rcdc_comp.head()

Unnamed: 0,funder_name,funding_amount,grant_id,rsr_id,start_date,pub_id,citations,rcdc_names_pub,rcdc_codes_pub,rcdc_name,flag
0,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Behavioral and Social Science; Patient Safety;...,498; 546; 503; 389; 316; 507,Cancer,True
1,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Behavioral and Social Science; Patient Safety;...,498; 546; 503; 389; 316; 507,Patient Safety,True
2,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Behavioral and Social Science; Patient Safety;...,498; 546; 503; 389; 316; 507,Bioengineering,False
3,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Behavioral and Social Science; Patient Safety;...,498; 546; 503; 389; 316; 507,Breast Cancer,True
4,National Cancer Institute,3181288.0,grant.2482176,ur.0755627762.12,2012-12-31,pub.1022823306,2.0,Behavioral and Social Science; Patient Safety;...,498; 546; 503; 389; 316; 507,Clinical Research,True


In [92]:
rcdc_comp_grnts = rcdc_comp.copy()
# del rcdc_comp_grnts['rsr_id']
# rcdc_comp_grnts = rcdc_comp_grnts.drop_duplicates().reset_index(drop=True)

rcdc_comp_grnts = rcdc_comp_grnts.groupby(['rcdc_name', 'grant_id', 'funder_name', 'funding_amount'])
rcdc_comp_grnts = pd.DataFrame({'nb_pubs': rcdc_comp_grnts.size()
                                , 'rcdc_pubs': rcdc_comp_grnts['flag'].sum()
                               }).reset_index()
rcdc_comp_grnts['rcdc_pct'] = rcdc_comp_grnts['rcdc_pubs']/rcdc_comp_grnts['nb_pubs']
rcdc_comp_grnts.sort_values('grant_id').head(10)

Unnamed: 0,rcdc_name,grant_id,funder_name,funding_amount,nb_pubs,rcdc_pubs,rcdc_pct
58167,Networking and Information Technology R&D,grant.2343411,National Cancer Institute,1200000.0,1,0.0,0.0
14466,Cancer,grant.2343411,National Cancer Institute,1200000.0,1,0.0,0.0
4092,Bioengineering,grant.2343411,National Cancer Institute,1200000.0,1,0.0,0.0
0,,grant.2343412,National Cancer Institute,1098058.0,1,0.0,0.0
1,,grant.2343413,National Cancer Institute,1270813.0,1,0.0,0.0
14467,Cancer,grant.2343414,National Cancer Institute,1463779.0,1,0.0,0.0
14468,Cancer,grant.2343415,National Cancer Institute,1931503.0,1,0.0,0.0
14469,Cancer,grant.2343416,National Cancer Institute,605841.0,1,0.0,0.0
2,,grant.2343417,National Cancer Institute,892129.0,1,0.0,0.0
3,,grant.2343418,National Cancer Institute,957999.0,1,0.0,0.0


In [93]:
rcdc_comp_fund = rcdc_comp_grnts.groupby(['rcdc_name', 'funder_name'])
rcdc_comp_fund = pd.DataFrame({'nb_grants': rcdc_comp_fund.size()
                               , 'nb_pubs': rcdc_comp_fund['nb_pubs'].sum()
                               , 'rcdc_pubs': rcdc_comp_fund['rcdc_pubs'].sum()
                               , 'avg_rcdc_pct': rcdc_comp_fund['rcdc_pct'].mean()
                              }).reset_index()
rcdc_comp_fund['rcdc_pct'] = rcdc_comp_fund['rcdc_pubs']/rcdc_comp_fund['nb_pubs']
rcdc_comp_fund.sort_values(['rcdc_name', 'funder_name']).head(10)

Unnamed: 0,rcdc_name,funder_name,avg_rcdc_pct,nb_grants,nb_pubs,rcdc_pubs,rcdc_pct
0,,French Cancer Funders,0.0,16,297,0.0,0.0
1,,French Cancer Funders - ORCID Confirmed,0.0,3,194,0.0,0.0
2,,National Cancer Institute,0.0,827,6029,0.0,0.0
3,ALS,National Cancer Institute,0.052632,2,76,4.0,0.052632
4,Acquired Cognitive Impairment,National Cancer Institute,0.055827,12,279,20.0,0.071685
5,Acquired Cognitive Impairment,National Health and Medical Research Council,0.120879,3,98,6.0,0.061224
6,Acute Respiratory Distress Syndrome,French Cancer Funders,0.0,1,42,0.0,0.0
7,Acute Respiratory Distress Syndrome,National Cancer Institute,0.083333,1,60,5.0,0.083333
8,Acute Respiratory Distress Syndrome,National Health and Medical Research Council,0.026777,5,836,13.0,0.01555
9,Adolescent Sexual Activity,National Cancer Institute,0.044483,3,104,7.0,0.067308


## Creating Output Table

Table with funders as columns and:
- 1st RCDC Code
- 2nd RCDC Code
- 3rd RCDC Code
- Mean number of previous grants
- Median number of previous grants
- Mean amount of previous grants
- Median number of previous grants
- Mean length of previous grants
- Median length of previous grants

In [94]:
df = pd.DataFrame({'rcdc_1':rcdc_1
                    , 'rcdc_2':rcdc_2
                    , 'rcdc_3':rcdc_3
                    , 'rcdc_4':rcdc_4
                    , 'rcdc_5':rcdc_5                   
                    , 'nb_unique_rsrs': nb_unique_rsrs
                    , 'nb_unique_grnts': nb_unique_grnts
                    , 'funded_amt_avg': funded_amt_avg
                    , 'funded_amt_med': funded_amt_med
                    , 'funded_len_avg': funded_len_avg
                    , 'funded_len_med': funded_len_med
                    , 'nb_grnt_rsrs_avg': nb_grnt_rsrs_avg
                    , 'nb_grnt_rsrs_med': nb_grnt_rsrs_med
                    , 'pre_avg_fund_len_avg': pre_avg_fund_len_avg
                    , 'pre_avg_fund_len_med': pre_avg_fund_len_med
                    , 'pre_avg_team_size_avg': pre_avg_team_size_avg
                    , 'pre_avg_team_size_med': pre_avg_team_size_med
                    , 'pre_fund_amt_avg': pre_fund_amt_avg
                    , 'pre_fund_amt_med': pre_fund_amt_med
                    , 'pre_nb_grnts_avg': pre_nb_grnts_avg
                    , 'pre_nb_grnts_med': pre_nb_grnts_med
                    , 'post_avg_fund_len_avg': post_avg_fund_len_avg
                    , 'post_avg_fund_len_med': post_avg_fund_len_med
                    , 'post_avg_team_size_avg': post_avg_team_size_avg
                    , 'post_avg_team_size_med': post_avg_team_size_med
                    , 'post_fund_amt_avg': post_fund_amt_avg
                    , 'post_fund_amt_med': post_fund_amt_med
                    , 'post_nb_grnts_avg': post_nb_grnts_avg
                    , 'post_nb_grnts_med': post_nb_grnts_med
                    , 'pre_citations_avg': pre_citations_avg
                    , 'pre_nb_pubs_avg': pre_nb_pubs_avg
                    , 'pre_citations_med': pre_citations_med
                    , 'pre_nb_pubs_med': pre_nb_pubs_med
                    , 'post_citations_avg': post_citations_avg
                    , 'post_citations_med': post_citations_med
                    , 'post_nb_pubs_avg': post_nb_pubs_avg
                    , 'post_nb_pubs_med': post_nb_pubs_med
                  }, index=funders).transpose()
df

Unnamed: 0,Cancer Research UK,French Cancer Funders,French Cancer Funders - ORCID Confirmed,National Cancer Institute,National Health and Medical Research Council,Wellcome Trust
funded_amt_avg,,485380,417194,1.85648e+06,623422,807210
funded_amt_med,,369838,346935,734829,436411,325304
funded_len_avg,3.37296,2.83389,2.69741,4.25441,2.82565,3.22667
funded_len_med,3,3,3,3.9589,2.00274,3.08493
nb_grnt_rsrs_avg,1.01643,1.15989,1.22078,1.10391,2.69437,1.58175
nb_grnt_rsrs_med,1,1,1,1,2,1
nb_unique_grnts,2254,1132,231,12555,1155,263
nb_unique_rsrs,1384,830,151,8485,2071,333
post_avg_fund_len_avg,0.962713,0.533526,0.596199,0.943564,0.852547,0.840639
post_avg_fund_len_med,0.833333,0,0,0.5,0.833333,0.500228


In [95]:
# Export to Excel
ls = !ls ../output/
if 'comparison_statistics.xlsx' in ls:
    book = load_workbook('../output/comparison_statistics.xlsx')
    writer = pd.ExcelWriter('../output/comparison_statistics.xlsx', engine='openpyxl') 
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    df.to_excel(writer, "RAW")
    writer.save()
else:
    df.to_excel('../output/comparison_statistics.xlsx', sheet_name = 'RAW')

## Sandbox