# Subsequent Grants

## Python Setup

In [2]:
import pandas as pd

## Data Read In

In [3]:
# Load the three raw extracts this notebook works from: grant details,
# publication details and the HELIOS actors table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
grnt_dtl, pub_dtl, actors = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/inca_grants_details.csv',
        '../data/inca_pub_details.csv',
        '../data/HELIOSv2_ACTEURS_all_2007-2012.csv',
    )
)

In [4]:
# Add the working columns used downstream: parsed start/end dates and a short
# alias for the grant reference.
# NOTE(review): no explicit format is given, so pandas infers how to read the
# raw date strings — confirm they are not day-first before relying on the
# 5-year window computed later.
grnt_dtl = grnt_dtl.assign(
    st_date=pd.to_datetime(grnt_dtl['Start Date']),
    end_date=pd.to_datetime(grnt_dtl['End Date']),
    grnt_ref=grnt_dtl['Reference'],
)

In [5]:
# Columns that identify one grant row: the holder's name (used below as the
# researcher key), the grant's dates, and its reference.
cols = [
    'prenom_port', 'nom_port',   # holder name
    'st_date', 'end_date',       # parsed grant dates
    'grnt_ref',                  # grant reference
]

In [6]:
# Reduce the detail table to one row per distinct (holder, dates, reference)
# combination and give it a clean 0..n-1 index.
grnts = (
    grnt_dtl
    .loc[:, cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

## Flag Subsequent Grants

In [7]:
# Build every (grant, subsequent grant) pair for the same researcher.
#
# A pair is kept when the two rows share the holder's first and last name,
# carry different grant references, and the second grant starts on or after
# the first one and no later than 5 years after it.
holder_keys = ['prenom_port', 'nom_port']
grant_fields = ['st_date', 'end_date', 'grnt_ref']

# pandas joins rows whose keys are both NaN, so grants with a missing holder
# name would all be paired with each other as if they belonged to a single
# researcher. Leave them out of the pairing; they keep a count of 0 later on.
named_grnts = grnts[holder_keys + grant_fields].dropna(subset=holder_keys)

# Self-join on the holder name. The right-hand copy gets its 'sub_' prefix
# before the join, which avoids the '_x'/'_y' suffixes and a rename pass.
sub_grnts = pd.merge(
    named_grnts,
    named_grnts.rename(columns={fld: 'sub_' + fld for fld in grant_fields}),
    on=holder_keys,
)

# A grant is not its own subsequent grant.
# NOTE(review): a missing grnt_ref compares unequal to everything, itself
# included, so rows without a reference are not removed by this test.
is_other_grant = sub_grnts['grnt_ref'] != sub_grnts['sub_grnt_ref']

# Both bounds are inclusive, so two grants starting on the same day count as
# subsequent to each other. Rows with a missing date fail both comparisons.
starts_after = sub_grnts['st_date'] <= sub_grnts['sub_st_date']
starts_within_window = (
    sub_grnts['sub_st_date'] <= sub_grnts['st_date'] + pd.DateOffset(years=5)
)

sub_grnts = sub_grnts[is_other_grant & starts_after & starts_within_window]

In [8]:
# Count the surviving pairs per reference grant: one output row per grant that
# has at least one subsequent grant, with the count in 'nb_sub_grnts'.
# NOTE(review): by default groupby leaves out rows where any key is missing,
# so a grant with e.g. an empty end_date gets no row here and ends up with a
# count of 0 after the merge back — confirm that is acceptable.
sub_grnts_g = (
    sub_grnts
    .groupby(['prenom_port', 'nom_port', 'st_date', 'end_date', 'grnt_ref'])
    .size()
    .reset_index(name='nb_sub_grnts')
)

In [9]:
# Attach the subsequent-grant count to every grant.
#
# - grnts[cols] drops any 'nb_sub_grnts' column left by a previous run of this
#   cell, so re-running it no longer yields '_x'/'_y' columns and a KeyError.
# - A left join keeps exactly the rows of grnts in their existing order; the
#   counts only ever describe grants already in grnts, so nothing is lost
#   compared with an outer join.
# - Grants without a match have no subsequent grant: fill with 0 and store the
#   count as an integer rather than a float.
grnts = pd.merge(grnts[cols], sub_grnts_g, on=cols, how='left')
grnts['nb_sub_grnts'] = grnts['nb_sub_grnts'].fillna(0).astype(int)

## Summary Statistics

In [10]:
# Frequency table over all grants: how many grants have 0, 1, 2, ...
# subsequent grants. value_counts orders the rows by frequency.
sub_grnt_dist = grnts['nb_sub_grnts'].astype(int).value_counts()

print("The distribution of subsequent grants by a given researcher within 5 years of the first grant is:")
print(sub_grnt_dist)

The distribution of subsequent grants by a given researcher within 5 years of the first grant is:
0    1081
1     352
2     155
3      82
4      33
5      16
6       6
7       3
Name: nb_sub_grnts, dtype: int64


In [11]:
# Distinct award codes present in the actors table, in order of first appearance.
actors_awards = actors['awardcode'].drop_duplicates().tolist()

In [12]:
# Keep only the grants whose reference appears among the actors' award codes.
# Series.isin does one vectorised lookup instead of scanning the Python list
# once per row. A missing reference is explicitly never treated as a match,
# whatever actors_awards contains.
has_actor_award = grnts['grnt_ref'].notna() & grnts['grnt_ref'].isin(actors_awards)
grnts_o = grnts[has_actor_award].reset_index(drop=True)

In [13]:
# Same frequency table as above, restricted to the grants kept in grnts_o.
sub_grnt_dist_o = grnts_o['nb_sub_grnts'].astype(int).value_counts()

print("Among the original INCa grants:")
print("The distribution of subsequent grants by a given researcher within 5 years of the first grant is:")
print(sub_grnt_dist_o)

Among the original INCa grants:
The distribution of subsequent grants by a given researcher within 5 years of the first grant is:
0    912
1    293
2    122
3     69
4     25
5     13
6      4
7      3
Name: nb_sub_grnts, dtype: int64


## Sandbox

In [14]:
# Scratch check: distinct (first name, last name, organisation) combinations
# found in the grant details.
test = (
    grnt_dtl
    .loc[:, ['prenom_port', 'nom_port', 'organisme_port']]
    .drop_duplicates()
)

In [15]:
# Per-column summary of the deduplicated table (count, unique, top, freq);
# left as the cell's last expression so the notebook renders it.
test.describe()

Unnamed: 0,prenom_port,nom_port,organisme_port
count,1000,1000,1000
unique,418,969,506
top,Philippe,ROCHE,Institut Gustave Roussy
freq,28,3,44
