# Data Read-In and Summary Statistics

## Python Setup

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
pd.options.display.max_rows = 50
pd.options.display.max_columns = 100
import unidecode

## Data Read-In

In [3]:
grnt_dtl = pd.read_csv('../data/inca_grants_details.csv', low_memory=False)
pub_dtl = pd.read_csv('../data/inca_pub_details.csv', low_memory=False)

### Drop Duplicate Lines

In [4]:
print(grnt_dtl.shape)
print(pub_dtl.shape)

(1821, 16)
(145121, 28)


In [5]:
grnt_dtl = grnt_dtl.drop_duplicates()
pub_dtl = pub_dtl.drop_duplicates()

In [6]:
print(grnt_dtl.shape)
print(pub_dtl.shape)

(1820, 16)
(145121, 28)


## Descriptive Statistics

In [14]:
grnt_dtl.describe(include = 'all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
count,1820,1820,1820,1820,1820,1820,1778,1820,1820,1734,1734,1773,1730,1786.0,1782,1764
unique,1001,418,969,506,1627,1546,1491,39,1550,543,543,160,1054,,382,494
top,inca_502,Philippe,FEIL,Institut Gustave Roussy,grant.7155004,Design of a predictive algorithm to assess cli...,Scientific context: Renal cell cancer (RCC) ac...,French National Cancer Institute,INCa_DGOS_1376,Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,,2007-02-02,2010-02-02
freq,12,45,12,110,3,4,4,848,4,124,124,470,49,,99,87
mean,,,,,,,,,,,,,,531239.6,,
std,,,,,,,,,,,,,,959213.5,,
min,,,,,,,,,,,,,,0.0,,
25%,,,,,,,,,,,,,,177413.0,,
50%,,,,,,,,,,,,,,386935.0,,
75%,,,,,,,,,,,,,,615486.5,,


In [15]:
pub_dtl.describe(include = 'all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date
count,145121,145121,145121,145121,145121,29643,2763,32080,145121,145121,131335,127929.0,142897,105386.0,140591.0,97820.0,145121.0,31666.0,43809,145121,104194,104194,97697,88764,140096,140096,145109.0,0.0
unique,965,398,934,483,965,95,9,198,107243,109001,97583,1607.0,53204,,1591.0,,,,1,102384,36032,36044,826,30275,4565,4558,,
top,inca_542,Philippe,FRANCESCHI,Institut Gustave Roussy,ur.01074225776.01,ur.01204666360.03,ur.010226620247.19,0000-0003-4181-8071,Reply,pub.1026893639,10.1038/bmt.2016.102,1.0,1-9,,28.0,,,,True,"Jeffrey W. Pollard, Alla Danilkovitch-Miagkova...",Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,jour.1312191,Journal of Clinical Oncology,,
freq,1272,4892,1272,9567,1272,1272,488,1272,37,10,10,16673.0,322,,2505.0,,,,43809,538,4352,4352,24728,3197,4577,4577,,
mean,,,,,,,,,,,,,,17701330.0,,1.56983,29.756348,8.959736,,,,,,,,,2006.900737,
std,,,,,,,,,,,,,,7572868.0,,3.939371,97.346231,47.376261,,,,,,,,,8.572783,
min,,,,,,,,,,,,,,7239.0,,0.0,0.0,0.0,,,,,,,,,1949.0,
25%,,,,,,,,,,,,,,11282560.0,,0.23,0.0,1.0,,,,,,,,,2002.0,
50%,,,,,,,,,,,,,,18687560.0,,0.72,6.0,3.0,,,,,,,,,2009.0,
75%,,,,,,,,,,,,,,24025140.0,,1.69,27.0,5.0,,,,,,,,,2014.0,


## Funder Name

In [10]:
grnt_dtl['funder_clean'] = grnt_dtl['Funder'].fillna('')

In [11]:
grnt_dtl['funder_flag'] = grnt_dtl['funder_clean'].apply(lambda x: x!='')
print("{}% of the grants have a Funder.".format(int(grnt_dtl['funder_flag'].value_counts(normalize=True)[True]*100)))
print("There are {} different Funder Names".format(len(grnt_dtl['funder_clean'].value_counts())))
print("Here are the most frequent:")
grnt_dtl['funder_clean'].value_counts().head()

100% of the grants have a Funder.
There are 39 different Funder Names
Here are the most frequent:


French National Cancer Institute                   848
Ministère des Affaires sociales et de la Santé     523
French Institute of Health and Medical Research    162
French National Research Agency                    122
Swiss National Science Foundation                   42
Name: funder_clean, dtype: int64

### INCA vs. Non-INCA

In [12]:
grnt_dtl['inca_flag'] = (grnt_dtl['funder_clean']=="French National Cancer Institute")
grants = grnt_dtl.groupby('Dimensions Grant ID')['inca_flag'].sum().reset_index()
grants['inca_flag_any'] = (grants['inca_flag']>0)
print("{}% of the grants are INCA-funded.".format(int(grants['inca_flag_any'].value_counts(normalize=True)[True]*100)))

45% of the grants are INCA-funded.


## Institution Names

In [13]:
grnt_dtl['org_clean'] = grnt_dtl['Research Org Names'].fillna('')

In [14]:
grnt_dtl['org_flag'] = grnt_dtl['org_clean'].apply(lambda x: x!='')
print("{}% of the grants have an Institution.".format(int(grnt_dtl['org_flag'].value_counts(normalize=True)[True]*100)))
print("There are {} different Institution Names".format(len(grnt_dtl['org_clean'].value_counts())))
print("Here are the most frequent:")
grnt_dtl[grnt_dtl['org_clean']!=""]['org_clean'].value_counts().head(10)

95% of the grants have an Institution.
There are 544 different Institution Names
Here are the most frequent:


Institut Gustave Roussy                                     124
Institute Curie                                              87
French Institute of Health and Medical Research              63
Centre Léon Bérard                                           51
Institute Paoli-Calmettes                                    43
Institute of Genetics and Molecular and Cellular Biology     31
Hôpital Saint-Louis                                          30
Institut Bergonié                                            24
UniCancer Group                                              22
Institut Claudius Regaud                                     18
Name: org_clean, dtype: int64

In [15]:
print("Organisation names are clean for the most part:")
string = "Paoli"
grnt_dtl[grnt_dtl['org_clean'].str.contains(string)]['org_clean'].value_counts()

Organisation names are clean for the most part:


Institute Paoli-Calmettes                                             43
Centre Oscar Lambret;UniCancer Group;Institute Paoli-Calmettes         1
Hôpital René Huguenin;Institute Paoli-Calmettes;Centre Jean Perrin     1
Institute Paoli-Calmettes;Hôpital Sainte-Marguerite                    1
Name: org_clean, dtype: int64

## ORCID Number

In [16]:
pub_dtl['orcid_flag'] = pub_dtl['ORCID'].notnull()

In [17]:
print("{}% of the grants have an ORCID.".format(int(pub_dtl['orcid_flag'].value_counts(normalize = True)[True]*100)))
print("Here are the most frequent ORCIDs:")
pub_dtl['ORCID'].value_counts().head()

22% of the grants have an ORCID.
Here are the most frequent ORCIDs:


0000-0003-4181-8071    1272
0000-0002-9334-4405    1114
0000-0001-7190-120X     875
0000-0003-2574-3874     776
0000-0002-0400-1954     767
Name: ORCID, dtype: int64

## Researcher Name

### Number of Reserachers

In [18]:
grnt_dtl['researcher_name'] = grnt_dtl['prenom_port'] + " " + grnt_dtl['nom_port']
researchers = set(grnt_dtl['researcher_name'])
print("There are {} unique researchers (defined by unique names).".format(len(researchers)))
print("There are {} unique researchers (defined by unique INCA IDs).".format(len(set(grnt_dtl['INCA ID']))))

There are 1000 unique researchers (defined by unique names).
There are 1001 unique researchers (defined by unique INCA IDs).


### Link between Researcher Name and INCA ID:

In [19]:
grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates().describe()

Unnamed: 0,INCA ID,researcher_name
count,1001,1001
unique,1001,1000
top,inca_1068,Jean BOURHIS
freq,1,2


In [20]:
grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates()[grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates()['researcher_name']=="Jean BOURHIS"]

Unnamed: 0,INCA ID,researcher_name
319,inca_209,Jean BOURHIS
325,inca_210,Jean BOURHIS


### Linking a Dimesions ID to Reseracher Names

In [21]:
researchers = pd.merge(grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates()
                       , pub_dtl[['prenom_port', 'nom_port', 'Dimensions Researcher ID']].drop_duplicates()
                       , how = 'outer')
researchers = researchers.drop_duplicates()
researchers.columns = ['first_name', 'last_name', 'id']
researchers['name'] = researchers['last_name'] + ", " + researchers['first_name']
researchers = researchers.sort_values('name')
researchers = researchers.reset_index(drop = True)
researchers.describe()

Unnamed: 0,first_name,last_name,id,name
count,1004,1004,965,1004
unique,420,972,965,1004
top,Philippe,BENHAMOU,ur.0647453126.48,"CAZAUX, Christophe"
freq,28,3,1,1


In [22]:
researchers.to_csv('../data/researchers.csv', index = False)

## Number of Grants per Researcher

In [23]:
grnt_dtl['researcher'] = grnt_dtl['nom_port'] + ", " + grnt_dtl['prenom_port']

In [24]:
rsrs_grants = grnt_dtl.groupby('researcher').size().reset_index()
rsrs_grants = rsrs_grants.rename(index=str, columns={0: "grant_count"})

In [25]:
print("{} researchers have 1 grant.".format(rsrs_grants['grant_count'].value_counts()[1]))
print("The maximum number of grants for given researcher is {}.".format(max(rsrs_grants['grant_count'])))
print("Here is the distribution:")
rsrs_grants['grant_count'].hist(bins = 11)
rsrs_grants['grant_count'].value_counts()

618 researchers have 1 grant.
The maximum number of grants for given researcher is 12.
Here is the distribution:


1     618
2     186
3      81
4      61
5      25
6      10
7       8
8       4
9       3
10      2
12      1
11      1
Name: grant_count, dtype: int64

## Analysis of Key Words (FORs)

In [26]:
grnt_dtl['nb_for'] = grnt_dtl['FOR'].str.count(';')+1
grnt_dtl['nb_for'] = grnt_dtl['nb_for'].fillna(0)
grnt_dtl['nb_for'] = grnt_dtl['nb_for'].apply(lambda x: int(x))
print("{} grants have no FOR.".format(grnt_dtl['nb_for'].value_counts()[0]))
print("A grant has at most {} FORs.".format(max(grnt_dtl['nb_for'])))
print("The distribution is:")
grnt_dtl['nb_for'].value_counts()

47 grants have no FOR.
A grant has at most 5 FORs.
The distribution is:


1    1035
2     663
3      71
0      47
4       3
5       1
Name: nb_for, dtype: int64

In [27]:
for_df = pd.concat([grnt_dtl['Dimensions Grant ID'], grnt_dtl['FOR'].str.split(';', expand=True)], axis = 1)
for_df = for_df.rename(index=str, columns={'Dimensions Grant ID': 'grant_id'})
grant_id = []
for_name = []
for i in range(for_df.shape[1]-1):
    temp = for_df[['grant_id', i]]
    temp = temp[temp[i].notnull()]
    grant_id.extend(list(temp['grant_id']))
    for_name.extend(list(temp[i]))
for_df_t = pd.DataFrame({'grant_id': grant_id, 'for': for_name}).sort_values('grant_id').reset_index(drop = True)
for_df = pd.merge(for_df[['grant_id']], for_df_t, how = 'left', on = 'grant_id')
print("There are {} different FOR codes. The most frequent are:".format(len(for_df_t['for'].value_counts())))
for_df_t['for'].value_counts().head()

There are 48 different FOR codes. The most frequent are:


1112 Oncology and Carcinogenesis          1033
0601 Biochemistry and Cell Biology         365
1117 Public Health and Health Services     272
0604 Genetics                              266
1103 Clinical Sciences                     238
Name: for, dtype: int64

## Sandbox