# Data Read-In and Preliminary Analysis

## Python Setup

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
pd.options.display.max_rows = 50
pd.options.display.max_columns = 100
import unidecode

## Data Read-In

In [2]:
grants_details = pd.read_csv('../data/inca_grants_details.csv')
pub_details = pd.read_csv('../data/inca_pub_details.csv', low_memory=False)

In [3]:
print(grants_details.shape)
print(pub_details.shape)

(1821, 10)
(145121, 19)


In [4]:
grants_details = grants_details.drop_duplicates()
pub_details = pub_details.drop_duplicates()

In [5]:
print(grants_details.shape)
print(pub_details.shape)

(1820, 10)
(145121, 19)


In [6]:
grants_details.head()

Unnamed: 0,INCA ID,Dimensions ID,Title,Abstract,Funder,Reference,Organisation,Funding Amount ($),Start Date,End Date
0,inca_1,grant.7426242,"Cancer, Environment and metabolomics: the dog ...",Our innovative project aims to use the dog as ...,French Institute of Health and Medical Research,Inserm_6181,Ecole Vétérinaire de Nantes,65061.0,2011-12-13,2013-06-12
1,inca_2,grant.7426178,Creating tools to assess DNA repair dysfunctio...,Scientific context Non-small cell lung carcin...,French Institute of Health and Medical Research,Inserm_4631,Institut Gustave Roussy,240110.0,2011-11-01,2014-11-01
2,inca_3,grant.7154464,A multicenter randomized phase II study to eva...,Medical oncologists are used to treat patients...,Ministère des Affaires sociales et de la Santé,DGOS_2555,Centre Oscar Lambret,248109.0,2010-06-01,2013-06-01
3,inca_3,grant.7154160,National oesophageal and gastric carcinomas da...,"In 2005, the worldwide incidence of oesophagog...",French National Cancer Institute,INCa_6288,Centre Oscar Lambret,875159.0,2013-04-15,2016-04-14
4,inca_4,grant.7154483,Non-coding RNA and cancer,The aim of our project is to understand the mo...,French National Cancer Institute,INCa_2699,CNRS Inserm UMR8161,400306.0,2010-11-04,2013-11-04


In [7]:
pub_details.head()

Unnamed: 0,INCA ID,Dimensions ID,Additional DIM ID to combine,Additional DIM ID to combine 2,ORCID,title,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Journal Title,Publication Year,Publication Date
0,inca_1,ur.0642054564.81,,,,Canine invasive mammary carcinomas as models o...,10.1007/s10549-017-4548-2,3.0,635-648,29086231.0,167.0,,1,0.0,True,"Frédérique Nguyen, Laura Peña, Catherine Ibisc...",Breast Cancer Research and Treatment,2018.0,2018-02-09
1,inca_1,ur.0642054564.81,,,,MIB‐1 immunoreactivity correlates with biologi...,10.1046/j.1365-3164.2001.00236.x,3.0,139-147,11420929.0,12.0,1.42,35,,,"C. Laprie, J. Abadie, M.‐F. Amardeilh, J.‐L.L....",Veterinary Dermatology,2001.0,2001-06-09
2,inca_1,ur.0642054564.81,,,,Molecular cytogenetic characterization of cani...,10.1186/1471-2407-11-201,1.0,1-14,21615919.0,11.0,1.73,43,7.0,True,"Benoit Hedan, Rachael Thomas, Alison Motsinger...",BMC Cancer,2011.0,2011-12-09
3,inca_1,ur.0642054564.81,,,,Chapter 2 Cancer Prevalence and Etiology in Wi...,10.1016/b978-0-12-804310-3.00002-8,,11-46,,,,7,,,"Thomas Madsen, Audrey Arnal, Marion Vittecoq, ...",,2017.0,2017-05-09
4,inca_1,ur.0642054564.81,,,,Spontaneous Canine Mammary Carcinoma as a Mode...,10.1016/j.jcpa.2011.11.136,1.0,79,,146.0,,1,,,"J. Abadie, F. Nguyen, D. Loussouarn, I. Bemelm...",Journal of Comparative Pathology,2012.0,2012-01-09


In [8]:
grants_details.describe(include = 'all')

Unnamed: 0,INCA ID,Dimensions ID,Title,Abstract,Funder,Reference,Organisation,Funding Amount ($),Start Date,End Date
count,1820,1820,1820,1778,1820,1820,1819,1820.0,1782,1764
unique,1001,1627,1546,1491,39,1550,699,,382,493
top,inca_502,grant.7154229,Role of CXCL4L1 in Angiogenesis in Renal Cance...,"Approximately 15-20% of sarcomas, tumors deriv...",French National Cancer Institute,INCa_DGOS_1315,Institut Gustave Roussy,,2007-02-02,2010-02-02
freq,12,3,4,4,848,4,106,,99,87
mean,,,,,,,,521293.3,,
std,,,,,,,,952936.6,,
min,,,,,,,,0.0,,
25%,,,,,,,,165986.2,,
50%,,,,,,,,377323.5,,
75%,,,,,,,,611028.0,,


In [9]:
pub_details.describe(include='all')

Unnamed: 0,INCA ID,Dimensions ID,Additional DIM ID to combine,Additional DIM ID to combine 2,ORCID,title,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Journal Title,Publication Year,Publication Date
count,145121,145121,29643,2763,32080,145121,131335,127929.0,142897,105386.0,140591.0,97820.0,145121.0,30482.0,43809,145121,140296,145109.0,145109
unique,965,965,95,9,198,107243,97583,1607.0,53204,,1591.0,,,,1,102384,4566,,6237
top,inca_542,ur.01074225776.01,ur.01204666360.03,ur.010226620247.19,0000-0003-4181-8071,Reply,10.1038/bmt.2016.102,1.0,1-9,,28.0,,,,True,"Jeffrey W. Pollard, Alla Danilkovitch-Miagkova...",Journal of Clinical Oncology,,2011-05-09
freq,1272,1272,1272,488,1272,37,10,16673.0,322,,2505.0,,,,43809,538,4577,,1760
mean,,,,,,,,,,17701330.0,,1.569821,29.756348,9.063939,,,,2006.900737,
std,,,,,,,,,,7572868.0,,3.939357,97.346231,48.115127,,,,8.572783,
min,,,,,,,,,,7239.0,,0.0,0.0,0.0,,,,1949.0,
25%,,,,,,,,,,11282560.0,,0.23,0.0,1.0,,,,2002.0,
50%,,,,,,,,,,18687560.0,,0.72,6.0,2.0,,,,2009.0,
75%,,,,,,,,,,24025140.0,,1.69,27.0,5.0,,,,2014.0,


## Funder Name Quality

In [10]:
grants_details['funder_clean'] = grants_details['Funder'].fillna('')

In [11]:
grants_details['funder_flag'] = grants_details['funder_clean'].apply(lambda x: x!='')
grants_details['funder_flag'].value_counts(normalize = True)

True    1.0
Name: funder_flag, dtype: float64

All grants have a Funder.

In [12]:
print("{} different Funder Names".format(len(grants_details['funder_clean'].value_counts())))

39 different Funder Names


In [13]:
grants_details['funder_clean'].value_counts().head()

French National Cancer Institute                   848
Ministère des Affaires sociales et de la Santé     523
French Institute of Health and Medical Research    162
French National Research Agency                    122
Swiss National Science Foundation                   42
Name: funder_clean, dtype: int64

In [14]:
grants_details['funder_clean'] = grants_details['funder_clean'].str.upper()

# Remove accents
grants_details['funder_clean'] = grants_details['funder_clean'].apply(lambda x: unidecode.unidecode(x))

grants_details['funder_clean'] = grants_details['funder_clean'].str.replace(r'\s+', ' ')
grants_details['funder_clean'] = grants_details['funder_clean'].str.strip()

In [15]:
print("{} different Funder Names".format(len(grants_details['funder_clean'].value_counts())))

39 different Funder Names


In [16]:
grants_details['funder_clean'].value_counts().head()

FRENCH NATIONAL CANCER INSTITUTE                   848
MINISTERE DES AFFAIRES SOCIALES ET DE LA SANTE     523
FRENCH INSTITUTE OF HEALTH AND MEDICAL RESEARCH    162
FRENCH NATIONAL RESEARCH AGENCY                    122
SWISS NATIONAL SCIENCE FOUNDATION                   42
Name: funder_clean, dtype: int64

Funder names are clean.

## Institution Names Quality

In [17]:
grants_details['org_clean'] = grants_details['Organisation'].fillna('')

In [18]:
grants_details['org_flag'] = grants_details['org_clean'].apply(lambda x: x!='')
grants_details['org_flag'].value_counts(normalize = True)

True     0.999451
False    0.000549
Name: org_flag, dtype: float64

99.9% of grants have an organization.

In [19]:
print("{} different Institution Names".format(len(grants_details['org_clean'].value_counts())))

700 different Institution Names


In [20]:
grants_details['org_clean'].value_counts().head()

Institut Gustave Roussy     106
Institut Curie               84
Centre Léon Bérard           54
Institut Paoli-Calmettes     37
Institut Bergonié            24
Name: org_clean, dtype: int64

In [21]:
string = "Paoli"
grants_details[grants_details['org_clean'].str.contains(string)]['org_clean'].value_counts()

Institut Paoli-Calmettes                                                                       37
Institut Paoli Calmettes                                                                        5
Inserm U1068 -  Centre de recherche en cancérologie de Marseille - Institut Paoli Calmettes     3
UMR912 Inserm-IRD-Université d’Aix Marseille SESSTIM - Institut Paoli Calmettes                 2
Inserm U1068 - Centre de recherche en cancérologie de Marseille - Institut Paoli-Calmettes      1
UMR7258  Centre de recherche en cancérologie de Marseille (CRCM) - Institut Paoli Calmettes     1
Name: org_clean, dtype: int64

Organisation names are not clean at all.

### Cleaning Institution Names

In [22]:
grants_details['org_clean'] = grants_details['org_clean'].str.upper()
grants_details['org_clean'] = grants_details['org_clean'].str.strip()
grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'-', ' ')
grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'\s+', ' ')

# Remove accents
grants_details['org_clean'] = grants_details['org_clean'].apply(lambda x: unidecode.unidecode(x))

grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'^(CNRS INSERM|CNRS|INSERM|) ?UM?R? ?\d+ -? ?', ' ')
grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'APHP', ' ')
grants_details['org_clean'] = grants_details['org_clean'].str.replace('CURIE INSTITUTE', 'INSTITUT CURIE')
grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'\(.+\)', 'INSTITUT CURIE')
grants_details['org_clean'] = grants_details['org_clean'].str.replace(r'\s+', ' ')
grants_details['org_clean'] = grants_details['org_clean'].str.strip()

In [23]:
grants_details['org_clean'].value_counts().head(5)

INSTITUT GUSTAVE ROUSSY     132
INSTITUT CURIE              104
CENTRE LEON BERARD           54
HOPITAL SAINT LOUIS          44
INSTITUT PAOLI CALMETTES     42
Name: org_clean, dtype: int64

In [24]:
print("{} different Clean Institution Names".format(len(grants_details['org_clean'].value_counts())))

607 different Clean Institution Names


**To be continued...**

## ORCID Names Quality

In [25]:
pub_details['orcid_flag'] = pub_details['ORCID'].notnull()

In [26]:
pub_details['orcid_flag'].value_counts(normalize = True)

False    0.778943
True     0.221057
Name: orcid_flag, dtype: float64

Only 22% of publications have an ORCID code.

In [27]:
pub_details['ORCID'].value_counts().head()

0000-0003-4181-8071    1272
0000-0002-9334-4405    1114
0000-0001-7190-120X     875
0000-0003-2574-3874     776
0000-0002-0400-1954     767
Name: ORCID, dtype: int64

In [28]:
orcid = pub_details[pub_details['ORCID'].notnull()][['INCA ID', 'ORCID']].reset_index(drop = True)

In [29]:
orcid.sort_values('ORCID').head()

Unnamed: 0,INCA ID,ORCID
23860,inca_823,0000-0001-5088-0155
23865,inca_823,0000-0001-5088-0155
23866,inca_823,0000-0001-5088-0155
23867,inca_823,0000-0001-5088-0155
23868,inca_823,0000-0001-5088-0155


In [30]:
orcid.sort_values('ORCID').tail()

Unnamed: 0,INCA ID,ORCID
24247,inca_847,0000-0003-4839-4347
24249,inca_847,0000-0003-4839-4347
24250,inca_847,0000-0003-4839-4347
24236,inca_847,0000-0003-4839-4347
24253,inca_847,0000-0003-4839-4347


ORCID codes seem clean.

## Sandbox

In [31]:
grants = ['DGOS_5411', 'DGOS_5413', 'DGOS_5415', 'DGOS_5419', 'DGOS_5422', 'DGOS_5424', 'DGOS_5430', 'DGOS_5432', 'DGOS_5436', 'DGOS_5437', 'DGOS_5459', 'DGOS_5469', 'DGOS_5477', 'DGOS_5480', 'DGOS_5483', 'DGOS_5485', 'DGOS_5491', 'DGOS_5492', 'DGOS_5497', 'DGOS_5500', 'DGOS_5506', 'DGOS_5507', 'DGOS_5515', 'DGOS_5520', 'DGOS_5523', 'DGOS_5534', 'DGOS_5535', 'DGOS_5536', 'DGOS_5538', 'DGOS_5539', 'DGOS_5541', 'DGOS_5544', 'DGOS_5548', 'DGOS_5549', 'DGOS_5559', 'DGOS_5567', 'DGOS_5569', 'DGOS_5571', 'DGOS_5574', 'DGOS_5581', 'DGOS_5584', 'DGOS_5586', 'DGOS_5588', 'DGOS_5589', 'DGOS_5590', 'DGOS_5592', 'DGOS_5596', 'DGOS_5597', 'DGOS_5603', 'DGOS_5604', 'DGOS_5611', 'DGOS_5616', 'DGOS_5619', 'DGOS_5631', 'DGOS_5650', 'DGOS_5653', 'DGOS_5656', 'INCa_DGOS_5667', 'INCa_DGOS_5669', 'INCa_DGOS_5678', 'INCa_DGOS_5687', 'INCa_DGOS_5694', 'INCa_DGOS_5697', 'INCa_DGOS_5702', 'INCa_DGOS_5710', 'INCa_DGOS_5714', 'INCa_DGOS_5716', 'INCa_DGOS_5720', 'INCa_DGOS_5728', 'INCa_DGOS_5732', 'INCa_DGOS_5733', 'INCa_DGOS_5742', 'INCa_DGOS_5747', 'INCa_DGOS_5748', 'INCa_DGOS_5749', 'INCa_DGOS_5750', 'INCa_DGOS_5776', 'INCa_DGOS_5780', 'INCa_DGOS_5790', 'INCa_DGOS_5797', 'INCa_DGOS_5807', 'INCa_DGOS_5819', 'INCa_5827', 'INCa_5828', 'INCa_5835', 'INCa_5839', 'INCa_5850', 'INCa_5852', 'INCa_5864', 'INCa_5865', 'INCa_5866', 'INCa_5869', 'INCa_5879', 'INCa_5880', 'INCa_5907', 'INCa_5911', 'INCa_5918', 'INCa_5922', 'INCa_5930', 'INCa_5933', 'INCa_5934', 'INCa_5940', 'INCa_5944', 'INCa_5954', 'INCa_5959', 'INCa_5960', 'INCa_5964', 'INCa_5975', 'INCa_5976', 'INCa_5982', 'INCa_5992', 'INCa_5997', 'INCa_6000', 'INCa_6001', 'INCa_6016', 'INCa_6019', 'INCa_6022', 'INCa_6029', 'INCa_6032', 'INCa_DGOS_6037', 'INCa_6038', 'INCa_6039', 'INCa_6041', 'INCa_6043', 'INCa_6045', 'INCa_6046', 'INCa_ARC_Ligue_6050', 'INCa_ARC_Ligue_6052', 'INCa_ARC_Ligue_6054', 'INCa_ARC_Ligue_6057', 'INCa_ARC_Ligue_6058', 'INCa_ARC_Ligue_6078', 'INCa_6107', 'INCa_ARC_6123', 'INCa_ARC_6124', 'INCa_ARC_6125', 'INCa_6131', 'INCa_6132', 'INCa_6133', 'INCa_6137', 'INCa_6138', 'INCa_6139', 'INCa_6143', 'INCa_6144', 'INCa_6146', 'INCa_6150', 'INCa_6152', 'INCa_6155', 'INCa_6156', 'INCa_6162', 'INCa_6165', 'INCa_6166', 'Inserm_6233', 'Inserm_6237', 'Inserm_6238', 'Inserm_6240', 'Inserm_6279', 'Inserm_6280', 'Inserm_6281', 'Inserm_6282', 'Inserm_6283', 'Inserm_6284', 'Inserm_6285', 'Inserm_6286', 'INCa_6288', 'INCa_6291', 'INCa_6294', 'INCa_6298', 'INCa_6306', 'INCa_6307', 'INCa_6310', 'INCa_6311', 'INCa_6312', 'INCa_6313', 'INCa_6317', 'Inserm_6336', 'Inserm_6337', 'Inserm_6338', 'Inserm_6339', 'Inserm_6340', 'Inserm_6341', 'Inserm_6342', 'Inserm_6343', 'Inserm_6345', 'Inserm_6346', 'Inserm_6348', 'Inserm_6349', 'Inserm_6350', 'Inserm_6351', 'Inserm_6352', 'Inserm_6353', 'Inserm_6354', 'Inserm_6355', 'Inserm_6356', 'Inserm_6357', 'Inserm_6358', 'Inserm_6359', 'Inserm_6360', 'Inserm_6361', 'Inserm_6362', 'Inserm_6402', 'Inserm_6403', 'Inserm_6404', 'Inserm_6405', 'Inserm_6406', 'Inserm_6407', 'Inserm_6408', 'Inserm_6409', 'Inserm_6410', 'Inserm_6411', 'Inserm_6413', 'Inserm_6414', 'Inserm_6415', 'Inserm_6416', 'Inserm_6417', 'Inserm_6418', 'Inserm_6419', 'Inserm_6420', 'Inserm_6421', 'Inserm_6422', 'Inserm_6423', 'Inserm_6424', 'Inserm_6425', 'Inserm_6426', 'Inserm_6427', 'Inserm_6428', 'Inserm_6429', 'Inserm_6430', 'Inserm_6431', 'Inserm_6432', 'INCa_6716', 'Inserm_6927', 'Inserm_6928', 'Inserm_6929', 'Inserm_6930', 'Inserm_6987', 'Inserm_6988', 'Inserm_6989', 'Inserm_6990', 'Inserm_6991', 'Inserm_6992', 'Inserm_6993', 'DGOS_3668', 'DGOS_3669', 'DGOS_3674', 'DGOS_3677', 'DGOS_3687', 'DGOS_3688', 'DGOS_3694', 'DGOS_3695', 'DGOS_3706', 'DGOS_3712', 'DGOS_3713', 'DGOS_3715', 'DGOS_3718', 'DGOS_3722', 'DGOS_3743', 'DGOS_3758', 'DGOS_3761', 'DGOS_3764', 'DGOS_3766', 'DGOS_3768', 'DGOS_3770', 'DGOS_3774', 'DGOS_3776', 'DGOS_3781', 'DGOS_3782', 'DGOS_3792', 'DGOS_3796', 'DGOS_3807', 'DGOS_3809', 'DGOS_3810', 'DGOS_3813', 'DGOS_3815', 'DGOS_3823', 'DGOS_3827', 'DGOS_3830', 'DGOS_3838', 'DGOS_3844', 'DGOS_3847', 'DGOS_3855', 'DGOS_3856', 'DGOS_3860', 'DGOS_3867', 'DGOS_3869', 'DGOS_3870', 'DGOS_3881', 'DGOS_3882', 'DGOS_3887', 'DGOS_3889', 'DGOS_3892', 'DGOS_3896', 'DGOS_3901', 'DGOS_3909', 'DGOS_3915', 'DGOS_3916', 'DGOS_3921', 'DGOS_3926', 'DGOS_3927', 'DGOS_3931', 'DGOS_3937', 'DGOS_3941', 'DGOS_3945', 'DGOS_3955', 'DGOS_3959', 'DGOS_3961', 'INCa_DGOS_3966', 'INCa_DGOS_3974', 'INCa_DGOS_3986', 'INCa_DGOS_3989', 'INCa_DGOS_3996', 'INCa_DGOS_4010', 'INCa_DGOS_4014', 'INCa_DGOS_4016', 'INCa_DGOS_4020', 'INCa_DGOS_4024', 'INCa_DGOS_4046', 'INCa_DGOS_4053', 'DGOS_4077', 'INCa_4361', 'INCa_4376', 'INCa_4377', 'INCa_4379', 'INCa_4382', 'INCa_4393', 'INCa_4398', 'INCa_4402', 'INCa_4412', 'INCa_4415', 'INCa_4418', 'INCa_4419', 'INCa_4431', 'INCa_4444', 'INCa_4453', 'INCa_4454', 'INCa_4457', 'INCa_4458', 'INCa_4467', 'INCa_4468', 'INCa_4470', 'INCa_4476', 'INCa_4481', 'INCa_4486', 'INCa_4496', 'INCa_4505', 'INCa_4508', 'INCa_4513', 'INCa_4520', 'INCa_4546', 'INCa_4566', 'INCa_4567', 'INCa_4568', 'INCa_4569', 'INCa_4570', 'INCa_4571', 'INCa_4579', 'Inserm_4620', 'Inserm_4622', 'Inserm_4624', 'Inserm_4625', 'Inserm_4628', 'Inserm_4631', 'Inserm_4632', 'Inserm_4634', 'Inserm_4635', 'Inserm_4636', 'Inserm_4638', 'Inserm_4639', 'Inserm_4641', 'Inserm_4642', 'Inserm_4643', 'Inserm_4644', 'Inserm_4647', 'Inserm_4649', 'Inserm_4651', 'INCa_4654', 'INCa_4664', 'INCa_4670', 'INCa_4678', 'INCa_4680', 'INCa_4684', 'INCa_4688', 'INCa_4690', 'INCa_4692', 'INCa_4699', 'INCa_4703', 'INCa_4704', 'INCa_4705', 'INCa_4710', 'INCa_4712', 'INCa_4715', 'INCa_4719', 'INCa_4721', 'INCa_4727', 'INCa_4733', 'INCa_4734', 'INCa_4871', 'INCa_4876', 'INCa_4883', 'INCa_ARC_Ligue_4901', 'INCa_ARC_Ligue_4913', 'INCa_ARC_Ligue_4915', 'INCa_ARC_Ligue_4918', 'INCa_ARC_Ligue_4924', 'INCa_ARC_Ligue_4930', 'INCa_ARC_Ligue_4931', 'INCa_4957', 'INCa_4968', 'INCa_4987', 'DGOS_4990', 'Inserm_5205', 'Inserm_5206', 'Inserm_5207', 'Inserm_5208', 'Inserm_5209', 'Inserm_5210', 'Inserm_5211', 'Inserm_5212', 'Inserm_5213', 'Inserm_5214', 'Inserm_5215', 'Inserm_5216', 'Inserm_5217', 'Inserm_5218', 'Inserm_5219', 'Inserm_5220', 'Inserm_5221', 'INCa_5395', 'INCa_5400', 'INCa_5401', 'INCa_5402', 'INCa_5406', 'INCa_6035', 'INCa_6036', 'Inserm_6177', 'Inserm_6178', 'Inserm_6179', 'Inserm_6180', 'Inserm_6181', 'Inserm_6182', 'Inserm_6183', 'Inserm_6184', 'Inserm_6185', 'Inserm_6225', 'Inserm_6226', 'Inserm_6228', 'Inserm_6231', 'Inserm_6232', 'Inserm_6242', 'Inserm_6243', 'Inserm_6302', 'Inserm_6303', 'Inserm_6304', 'Inserm_6305', 'Inserm_6395', 'Inserm_6396', 'Inserm_6397', 'Inserm_6398', 'Inserm_6399', 'Inserm_6400', 'Inserm_6401', 'Inserm_6917', 'Inserm_6918', 'Inserm_6919', 'Inserm_6920', 'Inserm_6921', 'Inserm_6922', 'Inserm_6923', 'Inserm_6924', 'Inserm_6925', 'Inserm_6926', 'DGOS_2383', 'DGOS_2387', 'DGOS_2390', 'DGOS_2391', 'INCa_2408', 'INCa_2413', 'DGOS_2416', 'DGOS_2420', 'DGOS_2421', 'DGOS_2425', 'DGOS_2426', 'DGOS_2427', 'DGOS_2428', 'DGOS_2435', 'DGOS_2443', 'DGOS_2444', 'DGOS_2446', 'DGOS_2452', 'DGOS_2453', 'DGOS_2457', 'DGOS_2465', 'DGOS_2466', 'DGOS_2469', 'DGOS_2472', 'DGOS_2473', 'DGOS_2474', 'DGOS_2479', 'DGOS_2480', 'DGOS_2481', 'DGOS_2486', 'DGOS_2490', 'DGOS_2491', 'DGOS_2497', 'DGOS_2498', 'DGOS_2508', 'DGOS_2509', 'DGOS_2510', 'DGOS_2511', 'DGOS_2512', 'DGOS_2530', 'DGOS_2536', 'DGOS_2548', 'DGOS_2553', 'DGOS_2555', 'DGOS_2557', 'DGOS_2561', 'DGOS_2562', 'DGOS_2564', 'DGOS_2565', 'DGOS_2567', 'DGOS_2572', 'DGOS_2574', 'DGOS_2577', 'DGOS_2580', 'DGOS_2585', 'DGOS_2586', 'DGOS_2588', 'DGOS_2596', 'DGOS_2599', 'DGOS_2600', 'DGOS_2619', 'DGOS_2622', 'INCa_2637', 'INCa_DGOS_2641', 'INCa_2642', 'INCa_DGOS_2643', 'INCa_2648', 'INCa_2649', 'INCa_2650', 'INCa_DGOS_2659', 'INCa_DGOS_2662', 'INCa_DGOS_2664', 'INCa_2675', 'INCa_2679', 'INCa_2680', 'INCa_2685', 'DGOS_2690', 'INCa_DGOS_2698', 'INCa_2699', 'INCa_2700', 'INCa_DGOS_2714', 'INCa_DGOS_2717', 'INCa_DGOS_2719', 'INCa_DGOS_2720', 'INCa_DGOS_2721', 'INCa_DGOS_2726', 'INCa_DGOS_2740', 'INCa_DGOS_2747', 'INCa_DGOS_2752', 'INCa_DGOS_2758', 'INCa_2765', 'INCa_2768', 'INCa_2783', 'INCa_2784', 'INCa_2785', 'INCa_2814', 'INCa_2843', 'INCa_2848', 'INCa_2853', 'INCa_2854', 'INCa_2874', 'INCa_2876', 'INCa_2879', 'INCa_2881', 'INCa_2883', 'INCa_2893', 'INCa_2900', 'INCa_2903', 'INCa_2910', 'INCa_2915', 'INCa_2916', 'INCa_2921', 'INCa_2922', 'INCa_2934', 'INCa_2940', 'INCa_2941', 'INCa_2943', 'INCa_2960', 'INCa_2963', 'INCa_2966', 'INCa_3039', 'INCa_3041', 'INCa_3043', 'INCa_3046', 'INCa_3048', 'INCa_3049', 'INCa_3051', 'INCa_3052', 'INCa_3061', 'INCa_3067', 'INCa_3069', 'INCa_3070', 'INCa_3075', 'INCa_3078', 'INCa_3081', 'INCa_3085', 'INCa_3087', 'INCa_3099', 'INCa_3104', 'INCa_3106', 'INCa_3110', 'INCa_3111', 'INCa_3113', 'INCa_3115', 'INCa_3123', 'INCa_3126', 'INCa_3127', 'INCa_3129', 'INCa_3131', 'INCa_3133', 'INCa_3135', 'INCa_3136', 'INCa_3140', 'INCa_3141', 'INCa_3143', 'INCa_3145', 'INCa_3146', 'INCa_3147', 'INCa_3148', 'INCa_3154', 'INCa_3156', 'INCa_ARC_Ligue_3180', 'INCa_ARC_Ligue_3190', 'INCa_ARC_Ligue_3199', 'INCa_ARC_Ligue_3202', 'INCa_ARC_Ligue_3204', 'INCa_ARC_Ligue_3210', 'INCa_ARC_Ligue_3213', 'INCa_ARC_Ligue_3215', 'INCa_3410', 'INCa_3411', 'INCa_3485', 'INCa_3504', 'INCa_3506', 'INCa_3507', 'INCa_3509', 'INCa_3512', 'INCa_3513', 'INCa_3515', 'INCa_3516', 'INCa_3533', 'INCa_3534', 'INCa_3541', 'INCa_3544', 'INCa_3545', 'INCa_3549', 'INCa_3550', 'INCa_3650', 'INCa_3651', 'INCa_3652', 'INCa_3653', 'INCa_3654', 'DGOS_3655', 'DGOS_3656', 'INCa_3657', 'INCa_3658', 'INCa_3659', 'INCa_3660', 'INCa_3661', 'INCa_3662', 'INCa_3663', 'INCa_3664', 'DGOS_1076', 'DGOS_1078', 'DGOS_1079', 'DGOS_1093', 'DGOS_1097', 'DGOS_1098', 'DGOS_1100', 'DGOS_1103', 'DGOS_1104', 'DGOS_1107', 'DGOS_1108', 'DGOS_1110', 'DGOS_1111', 'DGOS_1117', 'DGOS_1122', 'DGOS_1126', 'DGOS_1129', 'DGOS_1131', 'DGOS_1133', 'DGOS_1141', 'DGOS_1142', 'DGOS_1145', 'DGOS_1146', 'DGOS_1155', 'DGOS_1165', 'DGOS_1169', 'DGOS_1170', 'DGOS_1172', 'DGOS_1174', 'DGOS_1186', 'DGOS_1190', 'DGOS_1191', 'DGOS_1203', 'DGOS_1204', 'DGOS_1208', 'DGOS_1217', 'DGOS_1218', 'DGOS_1225', 'DGOS_1229', 'DGOS_1231', 'DGOS_1235', 'DGOS_1242', 'DGOS_1244', 'DGOS_1245', 'DGOS_1247', 'DGOS_1249', 'DGOS_1262', 'DGOS_1285', 'DGOS_1287', 'DGOS_1289', 'DGOS_1291', 'DGOS_1296', 'DGOS_1298', 'INCa_DGOS_1302', 'INCa_DGOS_1304', 'INCa_DGOS_1306', 'INCa_DGOS_1307', 'INCa_DGOS_1308', 'INCa_DGOS_1312', 'INCa_DGOS_1313', 'INCa_DGOS_1315', 'INCa_DGOS_1321', 'INCa_DGOS_1324', 'INCa_DGOS_1332', 'DGOS_1335', 'DGOS_1336', 'INCa_DGOS_1339', 'INCa_DGOS_1345', 'INCa_DGOS_1349', 'INCa_DGOS_1355', 'INCa_DGOS_1359', 'INCa_DGOS_1368', 'INCa_DGOS_1376', 'INCa_DGOS_1393', 'INCa_DGOS_1415', 'INCa_DGOS_1429', 'INCa_DGOS_1450', 'INCa_DGOS_1458', 'INCa_DGOS_1461', 'DGOS_1465', 'INCa_1466', 'INCa_1467', 'INCa_1468', 'INCa_1470', 'INCa_1472', 'INCa_1473', 'INCa_1476', 'INCa_1482', 'INCa_1484', 'INCa_1486', 'INCa_1487', 'INCa_1488', 'INCa_1489', 'INCa_1490', 'INCa_1491', 'INCa_1496', 'INCa_1505', 'INCa_1506', 'INCa_1508', 'INCa_1509', 'INCa_1511', 'INCa_1512', 'INCa_1514', 'INCa_1515', 'INCa_1517', 'INCa_1518', 'INCa_1520', 'INCa_1522', 'INCa_1532', 'INCa_1534', 'INCa_1535', 'INCa_1538', 'INCa_1543', 'INCa_1545', 'INCa_1547', 'INCa_1549', 'INCa_1554', 'INCa_1555', 'INCa_1559', 'INCa_1560', 'INCa_1563', 'INCa_1564', 'INCa_1576', 'INCa_1580', 'INCa_1581', 'INCa_1744', 'INCa_1787', 'INCa_1789', 'INCa_1803', 'INCa_1818', 'INCa_1822', 'INCa_1841', 'INCa_1843', 'INCa_1871', 'INCa_1892', 'INCa_1893', 'INCa_1915', 'INCa_1925', 'INCa_1929', 'INCa_1930', 'INCa_1938', 'INCa_1946', 'INCa_1962', 'INCa_1965', 'INCa_1971', 'INCa_1976', 'INCa_1983', 'INCa_2001', 'INCa_2002', 'INCa_2048', 'INCa_2054', 'INCa_2078', 'INCa_2117', 'INCa_2125', 'INCa_2128', 'INCa_2349', 'INCa_2356', 'INCa_2357', 'INCa_2358', 'INCa_2801', 'INCa_2828', 'INCa_3217', 'INCa_3218', 'INCa_3260', 'DGOS_0274', 'DGOS_0278', 'DGOS_0279', 'DGOS_0282', 'DGOS_0284', 'DGOS_0292', 'DGOS_0293', 'DGOS_0298', 'DGOS_0310', 'DGOS_0315', 'DGOS_0320', 'DGOS_0322', 'DGOS_0325', 'DGOS_0327', 'DGOS_0337', 'DGOS_0341', 'DGOS_0342', 'DGOS_0344', 'DGOS_0346', 'DGOS_0347', 'DGOS_0348', 'DGOS_0354', 'DGOS_0357', 'DGOS_0360', 'DGOS_0362', 'DGOS_0363', 'DGOS_0365', 'DGOS_0366', 'DGOS_0378', 'DGOS_0380', 'DGOS_0383', 'DGOS_0385', 'DGOS_0387', 'DGOS_0392', 'DGOS_0397', 'DGOS_0401', 'DGOS_0410', 'DGOS_0415', 'DGOS_0416', 'DGOS_0417', 'DGOS_0420', 'DGOS_0424', 'DGOS_0427', 'DGOS_0428', 'DGOS_0435', 'DGOS_0442', 'DGOS_0444', 'DGOS_0447', 'DGOS_0448', 'DGOS_0459', 'DGOS_0478', 'DGOS_0480', 'DGOS_0485', 'DGOS_0489', 'DGOS_0494', 'INCa_0513', 'INCa_0516', 'INCa_0517', 'INCa_0519', 'INCa_0525', 'INCa_0543', 'INCa_0544', 'INCa_0548', 'INCa_0550', 'INCa_0554', 'INCa_0557', 'INCa_0561', 'INCa_0562', 'INCa_0563', 'INCa_0583', 'INCa_0585', 'INCa_0593', 'INCa_0594', 'INCa_0597', 'INCa_0600', 'INCa_0602', 'INCa_0607', 'INCa_0609', 'INCa_0614', 'INCa_0615', 'INCa_0617', 'INCa_0620', 'INCa_0624', 'INCa_0625', 'INCa_0627', 'INCa_0628', 'INCa_0637', 'INCa_0639', 'INCa_0644', 'INCa_0651', 'INCa_0657', 'INCa_0658', 'INCa_0659', 'INCa_0669', 'INCa_0673', 'INCa_0680', 'INCa_0700', 'INCa_0706', 'INCa_0707', 'INCa_0713', 'INCa_0714', 'INCa_0716', 'INCa_0717', 'INCa_0720', 'INCa_0726', 'INCa_0727', 'INCa_0729', 'INCa_0739', 'INCa_0748', 'INCa_0752', 'INCa_0759', 'INCa_0777', 'INCa_0779', 'INCa_0782', 'INCa_0794', 'INCa_0795', 'INCa_0810', 'INCa_0811', 'INCa_0815', 'INCa_0820', 'INCa_0826', 'INCa_0834', 'INCa_0839', 'INCa_0843', 'INCa_0849', 'INCa_0855', 'INCa_0859', 'INCa_0867', 'INCa_0870', 'INCa_0874', 'INCa_0876', 'INCa_0888', 'INCa_0892', 'INCa_0898', 'INCa_0946', 'INCa_0954', 'INCa_0972', 'INCa_0976', 'INCa_0977', 'INCa_0979', 'INCa_2138', 'INCa_2139', 'INCa_2140', 'INCa_2141', 'INCa_2163', 'INCa_2315', 'INCa_2316', 'INCa_2351', 'INCa_2352', 'INCa_2354', 'INCa_2355', 'INCa_2800', 'DGOS_0013', 'DGOS_0018', 'DGOS_0020', 'DGOS_0022', 'DGOS_0024', 'DGOS_0025', 'DGOS_0034', 'DGOS_0040', 'DGOS_0042', 'DGOS_0046', 'DGOS_0050', 'DGOS_0052', 'DGOS_0054', 'DGOS_0056', 'DGOS_0058', 'DGOS_0060', 'DGOS_0066', 'DGOS_0072', 'DGOS_0074', 'DGOS_0075', 'DGOS_0078', 'DGOS_0079', 'DGOS_0082', 'DGOS_0084', 'DGOS_0091', 'DGOS_0093', 'DGOS_0095', 'DGOS_0096', 'DGOS_0097', 'DGOS_0099', 'DGOS_0100', 'DGOS_0101', 'DGOS_0110', 'DGOS_0112', 'DGOS_0113', 'DGOS_0117', 'DGOS_0121', 'DGOS_0126', 'DGOS_0133', 'DGOS_0134', 'DGOS_0137', 'DGOS_0140', 'DGOS_0141', 'DGOS_0143', 'DGOS_0146', 'DGOS_0147', 'DGOS_0148', 'DGOS_0149', 'DGOS_0150', 'DGOS_0151', 'DGOS_0152', 'DGOS_0158', 'DGOS_0160', 'DGOS_0161', 'DGOS_0162', 'DGOS_0164', 'DGOS_0165', 'DGOS_0166', 'DGOS_0175', 'DGOS_0177', 'DGOS_0179', 'DGOS_0181', 'DGOS_0184', 'DGOS_0186', 'DGOS_0193', 'DGOS_0194', 'DGOS_0199', 'DGOS_0202', 'DGOS_0203', 'DGOS_0204', 'DGOS_0206', 'DGOS_0207', 'DGOS_0211', 'DGOS_0212', 'DGOS_0215', 'DGOS_0216', 'DGOS_0252', 'DGOS_0253', 'DGOS_0254', 'DGOS_0255', 'DGOS_0256', 'DGOS_0257', 'DGOS_0258', 'DGOS_0259', 'DGOS_0260', 'DGOS_0261', 'INCa_0981', 'INCa_0982', 'INCa_0983', 'INCa_0984', 'INCa_0985', 'INCa_0986', 'INCa_0987', 'INCa_0988', 'INCa_0989', 'INCa_0991', 'INCa_0992', 'INCa_0993', 'INCa_0994', 'INCa_0995', 'INCa_0996', 'INCa_0997', 'INCa_0998', 'INCa_1000', 'INCa_1001', 'INCa_1002', 'INCa_1003', 'INCa_1004', 'INCa_1005', 'INCa_1008', 'INCa_1009', 'INCa_1010', 'INCa_1011', 'INCa_1012', 'INCa_1013', 'INCa_1014', 'INCa_1015', 'INCa_1016', 'INCa_1017', 'INCa_1018', 'INCa_1019', 'INCa_1020', 'INCa_1022', 'INCa_1023', 'INCa_1024', 'INCa_1025', 'INCa_1026', 'INCa_1027', 'INCa_1028', 'INCa_ARC_1029', 'INCa_1030', 'INCa_1031', 'INCa_1032', 'INCa_1033', 'INCa_ARC_1034', 'INCa_ARC_1035', 'INCa_1036', 'INCa_1037', 'INCa_ARC_1038', 'INCa_ARC_1039', 'INCa_ARC_1040', 'INCa_ARC_1041', 'INCa_1042', 'INCa_1043', 'INCa_1044', 'INCa_1045', 'INCa_1046', 'INCa_1047', 'INCa_1050', 'INCa_1051', 'INCa_1052', 'INCa_1053', 'INCa_1054', 'INCa_1055', 'INCa_1056', 'INCa_1057', 'INCa_1058', 'INCa_1059', 'INCa_1060', 'INCa_1071', 'INCa_1072', 'INCa_1073', 'INCa_1074', 'INCa_1293', 'INCa_1294', 'INCa_1632', 'INCa_1633', 'INCa_1634', 'INCa_1635', 'INCa_1636', 'INCa_1637', 'INCa_1638', 'INCa_1639', 'INCa_1640', 'INCa_1641', 'INCa_1642', 'INCa_1643', 'INCa_1644', 'INCa_1645', 'INCa_1646', 'INCa_1647', 'INCa_1648', 'INCa_1649', 'INCa_1650', 'INCa_1651', 'INCa_1652', 'INCa_1653', 'INCa_1654', 'INCa_1655', 'INCa_1656', 'INCa_1657', 'INCa_1658', 'INCa_1659', 'INCa_1660', 'INCa_1661', 'INCa_1662', 'INCa_1663', 'INCa_1664', 'INCa_1665', 'INCa_1666', 'INCa_1667', 'INCa_1668', 'INCa_1669', 'INCa_1670', 'INCa_1671', 'INCa_1672', 'INCa_1673', 'INCa_1674', 'INCa_1675', 'INCa_1676', 'INCa_1677', 'INCa_1678', 'INCa_1679', 'INCa_1680', 'INCa_1681', 'INCa_1682', 'INCa_1683', 'INCa_1684', 'INCa_1685', 'INCa_1686', 'INCa_1687', 'INCa_1688', 'INCa_1689', 'INCa_1690', 'INCa_1691', 'INCa_1692', 'INCa_1693', 'INCa_1694', 'INCa_1695', 'INCa_1696', 'INCa_1697', 'INCa_1698', 'INCa_1699', 'INCa_1700', 'INCa_1701', 'INCa_1702', 'INCa_1703', 'INCa_1704', 'INCa_1705', 'INCa_1706', 'INCa_1707', 'INCa_1708', 'INCa_1709', 'INCa_1710', 'INCa_1711', 'INCa_1712', 'INCa_1713', 'INCa_1714', 'INCa_1715', 'INCa_1716', 'INCa_1717', 'INCa_1718', 'INCa_1719', 'INCa_1720', 'INCa_1721', 'INCa_1722', 'INCa_1723', 'INCa_1724', 'INCa_1725', 'INCa_1726', 'INCa_1727', 'INCa_1728', 'INCa_1729', 'INCa_1730', 'INCa_1731', 'INCa_2159', 'INCa_2160', 'INCa_2311', 'INCa_2312', 'INCa_2313', 'INCa_2314', 'INCa_5139', 'INCa_5140', 'INCa_5141', 'INCa_5142', 'INCa_5143', 'INCa_5144', 'INCa_5291', 'INCa_5292', 'INCa_5293', 'INCa_5294', 'INCa_5295', 'INCa_5296', 'INCa_5297', 'INCa_5298', 'INCa_5299', 'INCa_5344', 'INCa_5345', 'INCa_5346', 'INCa_5347', 'INCa_5348', 'INCa_5349', 'INCa_5350', 'INCa_5351', 'INCa_5352', 'INCa_5353', 'INCa_5354', 'INCa_5355', 'INCa_5363', 'INCa_5364', 'INCa_5365']

In [32]:
len(grants)

1290

In [33]:
grants_details['test'] = grants_details['Reference'].apply(lambda x: x in grants)

In [34]:
test = grants_details[['Reference', 'test']].drop_duplicates().reset_index(drop = True)

In [35]:
test.describe(include = 'all')

Unnamed: 0,Reference,test
count,1550,1550
unique,1550,2
top,ANR-08-PCVI-0021,True
freq,1,1269
