# Data Read-In and Preliminary Analysis

## Python Setup

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
# Notebook-wide display limits: show up to 50 rows and 100 columns.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
import unidecode

## Data Read-In

In [3]:
# Load the grants extract and the publications extract from ../data.
grnt_dtl = pd.read_csv(filepath_or_buffer='../data/inca_grants_details.csv')
pub_dtl = pd.read_csv(
    filepath_or_buffer='../data/inca_pub_details.csv',
    low_memory=False,
)

In [4]:
# (rows, columns) of each frame as loaded, one per line.
print(grnt_dtl.shape, pub_dtl.shape, sep='\n')

(1821, 16)
(145121, 28)


In [5]:
# Remove fully identical rows from both frames, keeping the first occurrence.
grnt_dtl = grnt_dtl.drop_duplicates(keep='first')
pub_dtl = pub_dtl.drop_duplicates(keep='first')

In [6]:
# (rows, columns) of each frame after de-duplication, one per line.
print(grnt_dtl.shape, pub_dtl.shape, sep='\n')

(1820, 16)
(145121, 28)


In [7]:
# Preview the first two rows of the grants frame.
grnt_dtl.head(2)

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
0,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,grant.7426242,"Cancer, Environment and metabolomics: the dog ...",Our innovative project aims to use the dog as ...,French Institute of Health and Medical Research,Inserm_6181,Oniris,grid.418682.1,1117 Public Health and Health Services;1112 On...,Rare Diseases;Prevention;Cancer;Clinical Research,65061.0,2011-12-13,2013-06-12
1,inca_2,Julien,ADAM,Hôpital Necker- Enfants malades APHP,grant.7426178,Creating tools to assess DNA repair dysfunctio...,Scientific context Non-small cell lung carcin...,French Institute of Health and Medical Research,Inserm_4631,Necker-Enfants Malades Hospital;Institut Gusta...,grid.412134.1;grid.14925.3b,0601 Biochemistry and Cell Biology;1112 Oncolo...,Biotechnology;Lung;Cancer;Lung Cancer;Genetics...,240110.0,2011-11-01,2014-11-01


In [8]:
# Preview the first two rows of the publications frame.
pub_dtl.head(2)

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date
0,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,Spontaneous Canine Mammary Carcinoma as a Mode...,pub.1017598066,10.1016/j.jcpa.2011.11.136,1,79,,146,,1,,,"J. Abadie, F. Nguyen, D. Loussouarn, I. Bemelm...",Institut de Cancérologie de l'Ouest;Université...,grid.418191.4;grid.449623.e,,,jour.1007441,Journal of Comparative Pathology,2012.0,
1,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,FC‐15 \nImmunohistochemical detection of c‐kit...,pub.1013784597,10.1111/j.1365-3164.2004.411_15.x,s1,24-24,,15,,0,,,"M. C. Cadiergues, F. Degorce‐Rubiales, J. Abad...",,,1103 Clinical Sciences,,jour.1108160,Veterinary Dermatology,2004.0,


In [9]:
# Per-column summary of the grants frame; include='all' covers the text
# columns (count/unique/top/freq) as well as the numeric ones.
grnt_dtl.describe(include = 'all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
count,1820,1820,1820,1820,1820,1820,1778,1820,1820,1734,1734,1773,1730,1786.0,1782,1764
unique,1001,418,969,506,1627,1546,1491,39,1550,543,543,160,1054,,382,494
top,inca_502,Philippe,FEIL,Institut Gustave Roussy,grant.7154985,Role of cancer stem cells during metastatic di...,Background: The complete success of cancer the...,French National Cancer Institute,INCa_DGOS_4046,Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,,2007-02-02,2010-02-02
freq,12,45,12,110,3,4,4,848,4,124,124,470,49,,99,87
mean,,,,,,,,,,,,,,531239.6,,
std,,,,,,,,,,,,,,959213.5,,
min,,,,,,,,,,,,,,0.0,,
25%,,,,,,,,,,,,,,177413.0,,
50%,,,,,,,,,,,,,,386935.0,,
75%,,,,,,,,,,,,,,615486.5,,


In [10]:
# Per-column summary of the publications frame; include='all' covers the text
# columns (count/unique/top/freq) as well as the numeric ones.
pub_dtl.describe(include='all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date
count,145121,145121,145121,145121,145121,29643,2763,32080,145121,145121,131335,127929.0,142897,105386.0,140591.0,97820.0,145121.0,31666.0,43809,145121,104194,104194,97697,88764,140096,140096,145109.0,0.0
unique,965,398,934,483,965,95,9,198,107243,109001,97583,1607.0,53204,,1591.0,,,,1,102384,36032,36044,826,30275,4565,4558,,
top,inca_542,Philippe,FRANCESCHI,Institut Gustave Roussy,ur.01074225776.01,ur.01204666360.03,ur.010226620247.19,0000-0003-4181-8071,Reply,pub.1026893639,10.18632/oncotarget.13665,1.0,1-9,,28.0,,,,True,"Jeffrey W. Pollard, Alla Danilkovitch-Miagkova...",Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,jour.1312191,Journal of Clinical Oncology,,
freq,1272,4892,1272,9567,1272,1272,488,1272,37,10,10,16673.0,322,,2505.0,,,,43809,538,4352,4352,24728,3197,4577,4577,,
mean,,,,,,,,,,,,,,17701330.0,,1.56983,29.756348,8.959736,,,,,,,,,2006.900737,
std,,,,,,,,,,,,,,7572868.0,,3.939371,97.346231,47.376261,,,,,,,,,8.572783,
min,,,,,,,,,,,,,,7239.0,,0.0,0.0,0.0,,,,,,,,,1949.0,
25%,,,,,,,,,,,,,,11282560.0,,0.23,0.0,1.0,,,,,,,,,2002.0,
50%,,,,,,,,,,,,,,18687560.0,,0.72,6.0,3.0,,,,,,,,,2009.0,
75%,,,,,,,,,,,,,,24025140.0,,1.69,27.0,5.0,,,,,,,,,2014.0,


## Funder Name

In [11]:
# Working copy of the Funder column with missing values replaced by ''.
grnt_dtl['funder_clean'] = grnt_dtl['Funder'].fillna('')

In [12]:
# True where a funder name is present (i.e. not the '' placeholder), followed
# by the share of rows for each flag value.
grnt_dtl['funder_flag'] = grnt_dtl['funder_clean'].ne('')
grnt_dtl['funder_flag'].value_counts(normalize=True)

True    1.0
Name: funder_flag, dtype: float64

All grants have a Funder.

In [13]:
# Number of distinct funder names before normalisation.
print("{} different Funder Names".format(grnt_dtl['funder_clean'].nunique()))

39 different Funder Names


In [14]:
# Five most frequent funder names with their row counts.
grnt_dtl['funder_clean'].value_counts().head()

French National Cancer Institute                   848
Ministère des Affaires sociales et de la Santé     523
French Institute of Health and Medical Research    162
French National Research Agency                    122
Swiss National Science Foundation                   42
Name: funder_clean, dtype: int64

In [15]:
# Normalise funder names so that spelling variants collapse to one value:
# upper-case, strip accents, collapse whitespace runs, trim both ends.
grnt_dtl['funder_clean'] = (
    grnt_dtl['funder_clean']
    .str.upper()
    # Remove accents
    .map(unidecode.unidecode)
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave whitespace runs untouched.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [16]:
# Number of distinct funder names after normalisation.
print("{} different Funder Names".format(grnt_dtl['funder_clean'].nunique()))

39 different Funder Names


In [17]:
# Five most frequent normalised funder names with their row counts.
grnt_dtl['funder_clean'].value_counts().head()

FRENCH NATIONAL CANCER INSTITUTE                   848
MINISTERE DES AFFAIRES SOCIALES ET DE LA SANTE     523
FRENCH INSTITUTE OF HEALTH AND MEDICAL RESEARCH    162
FRENCH NATIONAL RESEARCH AGENCY                    122
SWISS NATIONAL SCIENCE FOUNDATION                   42
Name: funder_clean, dtype: int64

Funder names are clean.

## Institution Names

In [18]:
# Working copy of the Research Org Names column with missing values replaced by ''.
grnt_dtl['org_clean'] = grnt_dtl['Research Org Names'].fillna('')

In [19]:
# True where an institution name is present (i.e. not the '' placeholder),
# followed by the share of rows for each flag value.
grnt_dtl['org_flag'] = grnt_dtl['org_clean'].ne('')
grnt_dtl['org_flag'].value_counts(normalize=True)

True     0.952747
False    0.047253
Name: org_flag, dtype: float64

In [20]:
# Number of distinct org_clean values; the '' placeholder counts as one of them.
print("{} different Institution Names".format(grnt_dtl['org_clean'].nunique()))

544 different Institution Names


In [21]:
# Ten most frequent org_clean values with their row counts. The empty-string
# entry is the placeholder for rows that had no Research Org Names.
grnt_dtl['org_clean'].value_counts().head(10)

Institut Gustave Roussy                                     124
Institute Curie                                              87
                                                             86
French Institute of Health and Medical Research              63
Centre Léon Bérard                                           51
Institute Paoli-Calmettes                                    43
Institute of Genetics and Molecular and Cellular Biology     31
Hôpital Saint-Louis                                          30
Institut Bergonié                                            24
UniCancer Group                                              22
Name: org_clean, dtype: int64

In [22]:
# Spot-check: list every org_clean value containing the search term, with counts.
string = "Paoli"
grnt_dtl.loc[grnt_dtl['org_clean'].str.contains(string), 'org_clean'].value_counts()

Institute Paoli-Calmettes                                             43
Hôpital René Huguenin;Institute Paoli-Calmettes;Centre Jean Perrin     1
Centre Oscar Lambret;UniCancer Group;Institute Paoli-Calmettes         1
Institute Paoli-Calmettes;Hôpital Sainte-Marguerite                    1
Name: org_clean, dtype: int64

Organisation names are clean for the most part.

## ORCID Number

In [23]:
# True for publication rows that carry an ORCID value.
pub_dtl['orcid_flag'] = pub_dtl['ORCID'].notna()

In [24]:
# Share of publication rows with and without an ORCID value.
pub_dtl['orcid_flag'].value_counts(normalize = True)

False    0.778943
True     0.221057
Name: orcid_flag, dtype: float64

Only 22% of publication rows carry an ORCID. The ORCID identifies the researcher the row is attached to, not the publication itself.

In [25]:
# Five most frequent ORCID values with the number of rows carrying each.
pub_dtl['ORCID'].value_counts().head()

0000-0003-4181-8071    1272
0000-0002-9334-4405    1114
0000-0001-7190-120X     875
0000-0003-2574-3874     776
0000-0002-0400-1954     767
Name: ORCID, dtype: int64

In [26]:
# (INCA ID, ORCID) pairs for the rows that have an ORCID, re-indexed from 0.
orcid = (
    pub_dtl[['INCA ID', 'ORCID']]
    .dropna(subset=['ORCID'])
    .reset_index(drop=True)
)

In [27]:
# First five rows when ordered by ORCID, to eyeball the lowest values.
orcid.sort_values('ORCID').head()

Unnamed: 0,INCA ID,ORCID
23860,inca_823,0000-0001-5088-0155
23865,inca_823,0000-0001-5088-0155
23866,inca_823,0000-0001-5088-0155
23867,inca_823,0000-0001-5088-0155
23868,inca_823,0000-0001-5088-0155


In [28]:
# Last five rows when ordered by ORCID, to eyeball the highest values.
orcid.sort_values('ORCID').tail()

Unnamed: 0,INCA ID,ORCID
24247,inca_847,0000-0003-4839-4347
24249,inca_847,0000-0003-4839-4347
24250,inca_847,0000-0003-4839-4347
24236,inca_847,0000-0003-4839-4347
24253,inca_847,0000-0003-4839-4347


ORCID codes seem clean.

## Researcher Name

In [34]:
# Full name as "<first name> <LAST NAME>", then a summary of the distinct
# (INCA ID, name) pairs to see whether IDs and names map one-to-one.
grnt_dtl['researcher_name'] = grnt_dtl['prenom_port'] + " " + grnt_dtl['nom_port']
(
    grnt_dtl
    .loc[:, ['INCA ID', 'researcher_name']]
    .drop_duplicates()
    .describe()
)

Unnamed: 0,INCA ID,researcher_name
count,1001,1001
unique,1001,1000
top,inca_1071,Jean BOURHIS
freq,1,2


In [35]:
# Show the distinct (INCA ID, name) pairs for the one name that appears twice.
(
    grnt_dtl[['INCA ID', 'researcher_name']]
    .drop_duplicates()
    .loc[lambda pairs: pairs['researcher_name'] == "Jean BOURHIS"]
)

Unnamed: 0,INCA ID,researcher_name
319,inca_209,Jean BOURHIS
325,inca_210,Jean BOURHIS


In [61]:
# Build one lookup table of researchers: every distinct (first name, last name)
# seen in the grants frame, outer-joined with the distinct
# (first name, last name, Dimensions Researcher ID) triples from the
# publications frame. Names present in only one frame are kept.
researchers = (
    pd.merge(
        grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates(),
        pub_dtl[['prenom_port', 'nom_port', 'Dimensions Researcher ID']].drop_duplicates(),
        on=['prenom_port', 'nom_port'],
        how='outer',
    )
    .drop_duplicates()
    .rename(columns={
        'prenom_port': 'first_name',
        'nom_port': 'last_name',
        'Dimensions Researcher ID': 'id',
    })
    # Display/sort key in "LAST, First" form.
    .assign(name=lambda df: df['last_name'] + ", " + df['first_name'])
    .sort_values('name')
    .reset_index(drop=True)
)
researchers.describe()

Unnamed: 0,first_name,last_name,id,name
count,1004,1004,965,1004
unique,420,972,965,1004
top,Philippe,ANDRE,ur.0630750367.35,"SABLIN, Marie-Paule"
freq,28,3,1,1


In [31]:
# Write the researcher lookup table to disk without the row index.
# NOTE(review): this cell's recorded execution count (31) is lower than that of
# the cell that builds `researchers` (61) — re-run the notebook top to bottom
# to confirm the saved file matches the table shown above.
researchers.to_csv('../data/researchers.csv', index = False)

## Sandbox

In [65]:
# Column names of the publications frame, as a plain list.
pub_dtl.columns.tolist()

['INCA ID',
 'prenom_port',
 'nom_port',
 'organisme_port',
 'Dimensions Researcher ID',
 'Additional Researcher DIM ID to combine',
 'Additional Researcher DIM ID to combine 2',
 'ORCID',
 'title',
 'Dimensions Publication ID',
 'doi',
 'issue',
 'pages',
 'Pubmed ID',
 'volume',
 'Relative Citation Ratio',
 'Times Cited',
 'altmetric',
 'Open access',
 'Author Names',
 'Research Org Names',
 'Research Org IDs',
 'FOR',
 'RCDC',
 'Journal ID',
 'Journal Title',
 'Publication Year',
 'Publication Date',
 'orcid_flag']

In [74]:
# All grant rows for one researcher, matched on first and last name.
grnt_dtl.loc[
    grnt_dtl['prenom_port'].eq("Sebastian") & grnt_dtl['nom_port'].eq("AMIGORENA")
]

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date,funder_clean,funder_flag,org_clean,org_flag,researcher_name
28,inca_22,Sebastian,AMIGORENA,Institut Curie,grant.7153854,Immune response against dying tumor cells,"For immunologists, cell death is not an endpoi...",French National Cancer Institute,INCa_1676,Institute Curie,grid.418596.7,1112 Oncology and Carcinogenesis;1107 Immunology,Rare Diseases;Cancer;Breast Cancer,556952.0,2007-12-03,2011-02-02,FRENCH NATIONAL CANCER INSTITUTE,True,Institute Curie,True,Sebastian AMIGORENA
29,inca_22,Sebastian,AMIGORENA,Institut Curie,grant.3800931,Phagosome functions and antigen cross presenta...,T cell cross priming (the initiation of CD8+ T...,European Research Council,340046,Institute Curie,grid.418596.7,0604 Genetics;1107 Immunology,Immunization;Vaccine Related;Prevention;Human ...,2907641.0,2014-09-01,2019-08-31,EUROPEAN RESEARCH COUNCIL,True,Institute Curie,True,Sebastian AMIGORENA
30,inca_22,Sebastian,AMIGORENA,Institut Curie,grant.4731058,Study and development of new anti-cancer thera...,The IMOCA Industrial Chair (Immuno-MOdulation ...,French National Research Agency,ANR-15-CHIN-0002,Institute Curie,grid.418596.7,1107 Immunology,Biotechnology;Immunization;Vaccine Related;Cancer,1860365.0,2015-11-01,2019-10-31,FRENCH NATIONAL RESEARCH AGENCY,True,Institute Curie,True,Sebastian AMIGORENA
31,inca_22,Sebastian,AMIGORENA,Institut Curie,grant.4525380,Targeting of the Tn antigen by a specific chim...,Therapeutic monoclonal antibodies (mAb...,French National Research Agency,ANR-10-EMMA-0015,Institute Curie,grid.418596.7,1107 Immunology;1112 Oncology and Carcinogenesis,Immunization;Ovarian Cancer;Biotechnology;Canc...,495490.0,2011-04-01,2013-03-31,FRENCH NATIONAL RESEARCH AGENCY,True,Institute Curie,True,Sebastian AMIGORENA
32,inca_22,Sebastian,AMIGORENA,Institut Curie,grant.7154861,New strategies for dendritic cells based cance...,The present project aims at coordinating and i...,French National Cancer Institute,INCa_1642,Pasteur Institute;Institute Curie,grid.428999.7;grid.418596.7,1107 Immunology,Biotechnology;Rare Diseases;Immunization;Vacci...,854679.0,2007-11-20,2009-12-19,FRENCH NATIONAL CANCER INSTITUTE,True,Pasteur Institute;Institute Curie,True,Sebastian AMIGORENA


In [71]:
# The same researcher's entry in the lookup table.
researchers.loc[researchers['name'].eq("AMIGORENA, Sebastian")]

Unnamed: 0,first_name,last_name,id,name
18,Sebastian,AMIGORENA,ur.01265037264.14,"AMIGORENA, Sebastian"


In [77]:
# Per-column summary of the same researcher's publication rows.
(
    pub_dtl
    .loc[pub_dtl['prenom_port'].eq("Sebastian") & pub_dtl['nom_port'].eq("AMIGORENA")]
    .describe(include='all')
)

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date,orcid_flag
count,182,182,182,182,182,0.0,0.0,182,182,182,172,170.0,180,177.0,181.0,169.0,182.0,74.0,93,182,168,168,155,127,176,176,182.0,0.0,182
unique,1,1,1,1,1,0.0,0.0,1,181,182,172,40.0,180,,107.0,,,,1,168,99,99,10,92,65,65,,,1
top,inca_22,Sebastian,AMIGORENA,Institut Curie,ur.01265037264.14,,,0000-0001-8583-8416,Dissecting the Tumor Myeloid Compartment Revea...,pub.1037700469,10.1016/j.cell.2011.11.021,1.0,1379-1385,,11.0,,,,True,"Elodie Segura, Sebastian Amigorena",Institute Curie,grid.418596.7,1107 Immunology,Cancer,jour.1077134,The Journal of Immunology,,,True
freq,182,182,182,182,182,,,182,2,1,1,28.0,1,,7.0,,,,93,5,57,57,100,6,20,20,,,182
mean,,,,,,,,,,,,,,16019320.0,,2.991538,123.994505,6.783784,,,,,,,,,2004.620879,,
std,,,,,,,,,,,,,,7827017.0,,5.108748,237.622513,12.06747,,,,,,,,,8.513506,,
min,,,,,,,,,,,,,,1287114.0,,0.0,0.0,0.0,,,,,,,,,1986.0,,
25%,,,,,,,,,,,,,,10545500.0,,0.44,10.0,1.0,,,,,,,,,1999.0,,
50%,,,,,,,,,,,,,,16020720.0,,1.4,38.5,3.0,,,,,,,,,2005.0,,
75%,,,,,,,,,,,,,,22430490.0,,2.96,115.75,6.0,,,,,,,,,2011.75,,
