# Data Read-In and Summary Statistics

## Python Setup

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
pd.options.display.max_rows = 50
pd.options.display.max_columns = 100
import unidecode
import re

## Data Read-In

In [3]:
grnt_dtl = pd.read_csv('../data/inca_grants_details.csv', low_memory=False)
pub_dtl = pd.read_csv('../data/inca_pub_details.csv', low_memory=False)

In [3]:
actors = pd.read_csv('../data/HELIOSv2_ACTEURS_all_2007-2012.csv', low_memory=False)
cols = [col for col in actors.columns if bool(re.match(r'^Unnamed: \d\d\d$', col))==False]
actors = actors[cols]
budget = pd.read_csv('../data/HELIOSv2_BUDGET_all_2007-2012.csv', low_memory=False)
projects = pd.read_csv('../data/HELIOSv2_PROJETS_all_2007-2012.csv', low_memory=False)

### Drop Duplicate Lines

In [4]:
print(grnt_dtl.shape)
print(pub_dtl.shape)
grnt_dtl = grnt_dtl.drop_duplicates()
pub_dtl = pub_dtl.drop_duplicates()
print(grnt_dtl.shape)
print(pub_dtl.shape)

(1821, 16)
(145121, 28)
(1820, 16)
(145121, 28)


## Descriptive Statistics

In [5]:
grnt_dtl.describe(include = 'all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
count,1820,1820,1820,1820,1820,1820,1778,1820,1820,1734,1734,1773,1730,1786.0,1782,1764
unique,1001,418,969,506,1627,1546,1491,39,1550,543,543,160,1054,,382,494
top,inca_502,Philippe,FEIL,Institut Gustave Roussy,grant.7155004,"Identification, characterization and clinical ...",Scientific context: The initiation and progres...,French National Cancer Institute,INCa_DGOS_5694,Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,,2007-02-02,2010-02-02
freq,12,45,12,110,3,4,4,848,4,124,124,470,49,,99,87
mean,,,,,,,,,,,,,,531239.6,,
std,,,,,,,,,,,,,,959213.5,,
min,,,,,,,,,,,,,,0.0,,
25%,,,,,,,,,,,,,,177413.0,,
50%,,,,,,,,,,,,,,386935.0,,
75%,,,,,,,,,,,,,,615486.5,,


In [6]:
pub_dtl.describe(include = 'all')

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date
count,145121,145121,145121,145121,145121,29643,2763,32080,145121,145121,131335,127929.0,142897,105386.0,140591.0,97820.0,145121.0,31666.0,43809,145121,104194,104194,97697,88764,140096,140096,145109.0,0.0
unique,965,398,934,483,965,95,9,198,107243,109001,97583,1607.0,53204,,1591.0,,,,1,102384,36032,36044,826,30275,4565,4558,,
top,inca_542,Philippe,FRANCESCHI,Institut Gustave Roussy,ur.01074225776.01,ur.01204666360.03,ur.010226620247.19,0000-0003-4181-8071,Reply,pub.1046732593,10.3324/haematol.2013.085068,1.0,1-9,,28.0,,,,True,"Jeffrey W. Pollard, Alla Danilkovitch-Miagkova...",Institut Gustave Roussy,grid.14925.3b,1112 Oncology and Carcinogenesis,Cancer,jour.1312191,Journal of Clinical Oncology,,
freq,1272,4892,1272,9567,1272,1272,488,1272,37,10,10,16673.0,322,,2505.0,,,,43809,538,4352,4352,24728,3197,4577,4577,,
mean,,,,,,,,,,,,,,17701330.0,,1.56983,29.756348,8.959736,,,,,,,,,2006.900737,
std,,,,,,,,,,,,,,7572868.0,,3.939371,97.346231,47.376261,,,,,,,,,8.572783,
min,,,,,,,,,,,,,,7239.0,,0.0,0.0,0.0,,,,,,,,,1949.0,
25%,,,,,,,,,,,,,,11282560.0,,0.23,0.0,1.0,,,,,,,,,2002.0,
50%,,,,,,,,,,,,,,18687560.0,,0.72,6.0,3.0,,,,,,,,,2009.0,
75%,,,,,,,,,,,,,,24025140.0,,1.69,27.0,5.0,,,,,,,,,2014.0,


## Comparison Original Data – Dimensions Data

### Grants

In [7]:
# List of awards
actors_awards = list(actors['awardcode'].drop_duplicates())
budget_awards = list(budget['Awardcode'].drop_duplicates())
projects_awards = list(projects['AwardCode'].drop_duplicates())

awards_o = actors_awards
awards_o.extend(budget_awards)
awards_o.extend(projects_awards)
awards_o = set(awards_o)

In [8]:
awards_d = list(grnt_dtl['Reference'].drop_duplicates())

In [9]:
print("There are {} awards in the original data.".format(len(awards_o)))
print("There are {} awards in the Dimenions data.".format(len(awards_d)))

There are 1290 awards in the original data.
There are 1550 awards in the Dimenions data.


In [10]:
in_d = [award in awards_d for award in awards_o]
print("{} original awards are in the Dimensions data ({}%).".format(sum(in_d), int(100*sum(in_d)/len(in_d))))

1269 original awards are in the Dimensions data (98%).


In [11]:
print("The {} original awards not in the Dimensions data are:".format(len(awards_o)-sum(in_d)))
for award in awards_o:
    if award not in awards_d:
        print(" - {}".format(award))

The 21 original awards not in the Dimensions data are:
 - DGOS_0489
 - DGOS_5604
 - INCa_2316
 - INCa_2960
 - INCa_5976
 - DGOS_3921
 - INCa_2828
 - INCa_1645
 - INCa_0516
 - INCa_6165
 - INCa_5348
 - INCa_1508
 - INCa_2700
 - Inserm_5210
 - INCa_2354
 - INCa_4454
 - Inserm_5214
 - INCa_6306
 - INCa_6307
 - INCa_4393
 - INCa_5353


In [12]:
in_o = [award not in awards_o for award in awards_d]
print("There are {} additional awards in the Dimensions data.".format(sum(in_o)))

There are 281 additional awards in the Dimensions data.


### Researchers

In [13]:
researchers_o = actors[['prenom_port', 'nom_port']].drop_duplicates().reset_index(drop = True)
researchers_o = researchers_o[(researchers_o['prenom_port'].notnull())&(researchers_o['nom_port'].notnull())]
researchers_o['source'] = "0"
print("There are {} unique primary researchers in the original data.".format(researchers_o.shape[0]))

There are 1008 unique primary researchers in the original data.


In [14]:
for i in range(1, 5):
    temp = actors[['prenom_aut_port{}'.format(i), 'nom_aut_port{}'.format(i)]].drop_duplicates()
    temp = temp.rename(index=str, columns={'prenom_aut_port{}'.format(i): "prenom_port"
                                           , 'nom_aut_port{}'.format(i): "nom_port"})
    temp = temp[(temp['prenom_port'].notnull())&(temp['nom_port'].notnull())]
    temp['source'] = "{}".format(i)
    researchers_o = pd.concat([researchers_o, temp])
    researchers_o = researchers_o.drop_duplicates(['prenom_port', 'nom_port']).reset_index(drop = True)
r = researchers_o.shape[0]
print("There are {} unique researchers in the original data when accounting for the aut researchers.".format(r))

There are 1336 unique researchers in the original data when accounting for the aut researchers.


In [15]:
researchers_d = grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates()
researchers_d['source'] = 'dimensions'
r = researchers_d.shape[0]
print("There are {} unique researchers in the Dimensions data.".format(r))

There are 1000 unique researchers in the Dimensions data.


In [16]:
irreg_rsrs = pd.concat([researchers_o, researchers_d]).drop_duplicates(['prenom_port', 'nom_port'], keep=False)
irreg_rsrs = irreg_rsrs.reset_index(drop = True)

In [17]:
r = irreg_rsrs[irreg_rsrs['source'] == 'dimensions'].shape[0]
rp = 100*irreg_rsrs[irreg_rsrs['source'] == 'dimensions'].shape[0]/researchers_d.shape[0]
print("{} researchers ({}%) are in Dimensions data but not original data.".format(r, int(round(rp))))

0 researchers (0%) are in Dimensions data but not original data.


In [18]:
r = irreg_rsrs[irreg_rsrs['source'] != 'dimensions'].shape[0]
rp = 100*irreg_rsrs[irreg_rsrs['source'] != 'dimensions'].shape[0]/researchers_o.shape[0]
print("{} researchers ({}%) are in original data but not Dimensions data.".format(r, int(round(rp))))

336 researchers (25%) are in original data but not Dimensions data.


In [19]:
print("Of these:")
r = irreg_rsrs[irreg_rsrs['source']=='0'].shape[0]
rp = 100*irreg_rsrs[irreg_rsrs['source']=='0'].shape[0]/researchers_o[researchers_o['source']=='0'].shape[0]
print(" - {} primary researchers ({}%) are in original data but not the Dimensions data.".format(r, int(round(rp))))
for i in range(1, 5):
    r = irreg_rsrs[irreg_rsrs['source']==str(i)].shape[0]
    rp = 100*irreg_rsrs[irreg_rsrs['source']==str(i)].shape[0]/researchers_o[researchers_o['source']==str(i)].shape[0]
    print(" - {} aut{} researchers ({}%) are in original data but not Dimensions data.".format(r, i, int(round(rp))))

Of these:
 - 8 primary researchers (1%) are in original data but not the Dimensions data.
 - 240 aut1 researchers (100%) are in original data but not Dimensions data.
 - 66 aut2 researchers (100%) are in original data but not Dimensions data.
 - 16 aut3 researchers (100%) are in original data but not Dimensions data.
 - 6 aut4 researchers (100%) are in original data but not Dimensions data.


## Funder Name

In [20]:
grnt_dtl['funder_clean'] = grnt_dtl['Funder'].fillna('')

In [21]:
grnt_dtl['funder_flag'] = grnt_dtl['funder_clean'].apply(lambda x: x!='')
print("{}% of the grants have a Funder.".format(int(grnt_dtl['funder_flag'].value_counts(normalize=True)[True]*100)))
print("There are {} different Funder Names".format(len(grnt_dtl['funder_clean'].value_counts())))
print("Here are the most frequent:")
grnt_dtl['funder_clean'].value_counts().head()

100% of the grants have a Funder.
There are 39 different Funder Names
Here are the most frequent:


French National Cancer Institute                   848
Ministère des Affaires sociales et de la Santé     523
French Institute of Health and Medical Research    162
French National Research Agency                    122
Swiss National Science Foundation                   42
Name: funder_clean, dtype: int64

### INCA vs. Non-INCA

In [22]:
grnt_dtl['inca_flag'] = (grnt_dtl['funder_clean']=="French National Cancer Institute")
grants = grnt_dtl.groupby('Dimensions Grant ID')['inca_flag'].sum().reset_index()
grants['inca_flag_any'] = (grants['inca_flag']>0)
print("{}% of the grants are INCA-funded.".format(int(grants['inca_flag_any'].value_counts(normalize=True)[True]*100)))

45% of the grants are INCA-funded.


## Institution Names

In [23]:
grnt_dtl['org_clean'] = grnt_dtl['Research Org Names'].fillna('')

In [24]:
grnt_dtl['org_flag'] = grnt_dtl['org_clean'].apply(lambda x: x!='')
print("{}% of the grants have an Institution.".format(int(grnt_dtl['org_flag'].value_counts(normalize=True)[True]*100)))
print("There are {} different Institution Names".format(len(grnt_dtl['org_clean'].value_counts())))
print("Here are the most frequent:")
grnt_dtl[grnt_dtl['org_clean']!=""]['org_clean'].value_counts().head(10)

95% of the grants have an Institution.
There are 544 different Institution Names
Here are the most frequent:


Institut Gustave Roussy                                     124
Institute Curie                                              87
French Institute of Health and Medical Research              63
Centre Léon Bérard                                           51
Institute Paoli-Calmettes                                    43
Institute of Genetics and Molecular and Cellular Biology     31
Hôpital Saint-Louis                                          30
Institut Bergonié                                            24
UniCancer Group                                              22
Institut Claudius Regaud                                     18
Name: org_clean, dtype: int64

In [25]:
print("Organisation names are clean for the most part:")
string = "Paoli"
grnt_dtl[grnt_dtl['org_clean'].str.contains(string)]['org_clean'].value_counts()

Organisation names are clean for the most part:


Institute Paoli-Calmettes                                             43
Hôpital René Huguenin;Institute Paoli-Calmettes;Centre Jean Perrin     1
Institute Paoli-Calmettes;Hôpital Sainte-Marguerite                    1
Centre Oscar Lambret;UniCancer Group;Institute Paoli-Calmettes         1
Name: org_clean, dtype: int64

## ORCID Number

In [26]:
pub_dtl['orcid_flag'] = pub_dtl['ORCID'].notnull()

In [27]:
print("{}% of the grants have an ORCID.".format(int(pub_dtl['orcid_flag'].value_counts(normalize = True)[True]*100)))
print("Here are the most frequent ORCIDs:")
pub_dtl['ORCID'].value_counts().head()

22% of the grants have an ORCID.
Here are the most frequent ORCIDs:


0000-0003-4181-8071    1272
0000-0002-9334-4405    1114
0000-0001-7190-120X     875
0000-0003-2574-3874     776
0000-0002-0400-1954     767
Name: ORCID, dtype: int64

## Researcher Name

### Number of Reserachers

In [28]:
grnt_dtl['researcher_name'] = grnt_dtl['prenom_port'] + " " + grnt_dtl['nom_port']
researchers = set(grnt_dtl['researcher_name'])
print("There are {} unique researchers (defined by unique names).".format(len(researchers)))
print("There are {} unique researchers (defined by unique INCA IDs).".format(len(set(grnt_dtl['INCA ID']))))

There are 1000 unique researchers (defined by unique names).
There are 1001 unique researchers (defined by unique INCA IDs).


### Link between Researcher Name and INCA ID:

In [29]:
grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates().describe()

Unnamed: 0,INCA ID,researcher_name
count,1001,1001
unique,1001,1000
top,inca_638,Jean BOURHIS
freq,1,2


In [30]:
grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates()[grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates()['researcher_name']=="Jean BOURHIS"]

Unnamed: 0,INCA ID,researcher_name
319,inca_209,Jean BOURHIS
325,inca_210,Jean BOURHIS


### Linking a Dimesions ID to Reseracher Names

In [178]:
dim_ids = pub_dtl[['prenom_port', 'nom_port', 'Dimensions Researcher ID'
                   , 'Additional Researcher DIM ID to combine', 'Additional Researcher DIM ID to combine 2']]
dim_ids = dim_ids.drop_duplicates().reset_index(drop = True)

In [179]:
rsrs_ids = dim_ids[['prenom_port', 'nom_port', 'Dimensions Researcher ID']]
rsrs_ids = rsrs_ids.rename(index=str, columns={'Dimensions Researcher ID': 'id'})
rsrs_ids = rsrs_ids[rsrs_ids['id'].notnull()]
for string in ('', ' 2'):
    temp = dim_ids[['prenom_port', 'nom_port', 'Additional Researcher DIM ID to combine{}'.format(string)]]
    temp = temp.rename(index=str, columns={'Additional Researcher DIM ID to combine{}'.format(string): 'id'})
    temp = temp[temp['id'].notnull()]
    rsrs_ids = pd.concat([rsrs_ids, temp])
rsrs_ids = rsrs_ids.reset_index(drop = True)

In [180]:
researchers = pd.merge(grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates()
                       , rsrs_ids[['prenom_port', 'nom_port', 'id']].drop_duplicates()
                       , how = 'outer')
researchers = researchers.drop_duplicates()
researchers.columns = ['first_name', 'last_name', 'id']
researchers['name'] = researchers['last_name'] + ", " + researchers['first_name']
researchers = researchers.sort_values('name')
researchers = researchers.reset_index(drop = True)
researchers.describe()

Unnamed: 0,first_name,last_name,id,name
count,1108,1108,1069,1108
unique,420,972,1069,1004
top,Philippe,DELATTRE,ur.0773526204.78,"JOLY-LOBBEDEZ, Florence"
freq,37,4,1,3


In [181]:
researchers.to_csv('../data/researchers.csv', index = False)

## Number of Grants per Researcher

In [33]:
grnt_dtl['researcher'] = grnt_dtl['nom_port'] + ", " + grnt_dtl['prenom_port']

In [34]:
rsrs_grants = grnt_dtl.groupby('researcher').size().reset_index()
rsrs_grants = rsrs_grants.rename(index=str, columns={0: "grant_count"})

In [35]:
print("{} researchers have 1 grant.".format(rsrs_grants['grant_count'].value_counts()[1]))
print("The maximum number of grants for given researcher is {}.".format(max(rsrs_grants['grant_count'])))
print("Here is the distribution:")
rsrs_grants['grant_count'].hist(bins = 11)
rsrs_grants['grant_count'].value_counts()

618 researchers have 1 grant.
The maximum number of grants for given researcher is 12.
Here is the distribution:


1     618
2     186
3      81
4      61
5      25
6      10
7       8
8       4
9       3
10      2
12      1
11      1
Name: grant_count, dtype: int64

## Analysis of Key Words

In [125]:
def distribution(words, sep=';'):
    grnt_dtl['nb_'+words] = grnt_dtl[str.upper(words)].str.count(sep)+1
    grnt_dtl['nb_'+words] = grnt_dtl['nb_'+words].fillna(0)
    grnt_dtl['nb_'+words] = grnt_dtl['nb_'+words].apply(lambda x: int(x))
    dist = grnt_dtl[['Dimensions Grant ID', str.upper(words), 'nb_'+words]].drop_duplicates().reset_index(drop = True)
    print("{} grants have no {}.".format(dist['nb_'+words].value_counts()[0], str.upper(words)))
    print("A grant has at most {} {}s.".format(max(dist['nb_'+words]), str.upper(words)))
    print("The distribution is:\n")
    print(dist['nb_'+words].value_counts())

In [133]:
def league_table(words, sep=';'):
    df = grnt_dtl[['Dimensions Grant ID', str.upper(words)]].drop_duplicates().reset_index(drop = True)
    df = df.rename(index=str, columns={'Dimensions Grant ID': 'grant_id'})
    df = pd.concat([df['grant_id'], df[str.upper(words)].str.split(';', expand=True)], axis=1)
    grant_id = []
    name = []
    order = []
    for i in range(df.shape[1]-1):
        temp = df[['grant_id', i]]
        temp = temp[temp[i].notnull()]
        temp['order'] = int(i+1)
        grant_id.extend(list(temp['grant_id']))
        name.extend(list(temp[i]))
        order.extend(list(temp['order']))
    df = pd.DataFrame({'grant_id': grant_id, 'name': name, 'order': order}).reset_index(drop = True)
    df_p = df[df['order']==1]
    print("There are {} different Principal {} codes.".format(len(df_p['name'].value_counts()), str.upper(words)))
    print("The most frequent are:\n")
    print(df_p['name'].value_counts().head())
    print("\nThere are {} different {} codes.".format(len(df['name'].value_counts()), str.upper(words)))
    print("The most frequent are:\n")
    print(df['name'].value_counts().head())
    return df

### RCDC

In [127]:
distribution('rcdc')

89 grants have no RCDC.
A grant has at most 13 RCDCs.
The distribution is:

5     223
6     208
3     205
4     201
7     163
2     144
8     126
1      89
0      89
9      84
10     42
11     28
12     20
13      5
Name: nb_rcdc, dtype: int64


In [134]:
df = league_table('rcdc')

There are 66 different Principle RCDC codes.
The most frequent are:

Rare Diseases         385
Cancer                187
Genetics              164
Prevention             90
Digestive Diseases     79
Name: name, dtype: int64

There are 131 different RCDC codes.
The most frequent are:

Cancer               1318
Clinical Research     709
Rare Diseases         647
Genetics              522
Biotechnology         374
Name: name, dtype: int64


In [146]:
g1 = list(df[(df['name'] == 'Cancer') & (df['order']==1)].head()['grant_id'])
g2 = list(df[(df['name'] == 'Cancer') & (df['order']==1)].tail()['grant_id'])

In [147]:
g3 = list(df[(df['name'] == 'Cancer') & (df['order']!=1)].head()['grant_id'])
g4 = list(df[(df['name'] == 'Cancer') & (df['order']!=1)].tail()['grant_id'])

In [148]:
g = g1
g.extend(g2)
g.extend(g3)
g.extend(g4)

In [158]:
temp = grnt_dtl[grnt_dtl['Dimensions Grant ID'].apply(lambda x: x in g)].drop_duplicates(['Dimensions Grant ID'])
temp.to_csv('../output/cancer_rcdc.csv', index = False)

### FOR

In [129]:
distribution('for')

47 grants have no FOR.
A grant has at most 5 FORs.
The distribution is:

1    937
2    574
3     65
0     47
4      3
5      1
Name: nb_for, dtype: int64


In [130]:
league_table('for')

There are 45 different Principle FOR codes.
The most frequent are:

1112 Oncology and Carcinogenesis          624
0601 Biochemistry and Cell Biology        235
1103 Clinical Sciences                    166
1117 Public Health and Health Services    162
0604 Genetics                             129
Name: name, dtype: int64

There are 48 different FOR codes.
The most frequent are:

1112 Oncology and Carcinogenesis          892
0601 Biochemistry and Cell Biology        341
1117 Public Health and Health Services    242
0604 Genetics                             234
1103 Clinical Sciences                    210
Name: name, dtype: int64


## Sandbox

In [5]:
grnt_dtl.head()

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
0,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,grant.7426242,"Cancer, Environment and metabolomics: the dog ...",Our innovative project aims to use the dog as ...,French Institute of Health and Medical Research,Inserm_6181,Oniris,grid.418682.1,1117 Public Health and Health Services;1112 On...,Rare Diseases;Prevention;Cancer;Clinical Research,65061.0,2011-12-13,2013-06-12
1,inca_2,Julien,ADAM,Hôpital Necker- Enfants malades APHP,grant.7426178,Creating tools to assess DNA repair dysfunctio...,Scientific context Non-small cell lung carcin...,French Institute of Health and Medical Research,Inserm_4631,Necker-Enfants Malades Hospital;Institut Gusta...,grid.412134.1;grid.14925.3b,0601 Biochemistry and Cell Biology;1112 Oncolo...,Biotechnology;Lung;Cancer;Lung Cancer;Genetics...,240110.0,2011-11-01,2014-11-01
2,inca_3,Antoine,ADENIS,Centre Oscar Lambret,grant.7154464,A multicenter randomized phase II study to eva...,Medical oncologists are used to treat patients...,Ministère des Affaires sociales et de la Santé,DGOS_2555,Centre Oscar Lambret,grid.452351.4,1112 Oncology and Carcinogenesis,Cancer;Clinical Research;Clinical Trials and S...,248109.0,2010-06-01,2013-06-01
3,inca_3,Antoine,ADENIS,Centre Oscar Lambret,grant.7154160,National oesophageal and gastric carcinomas da...,"In 2005, the worldwide incidence of oesophagog...",French National Cancer Institute,INCa_6288,Centre Oscar Lambret;Hôpital Claude Huriez,grid.452351.4;grid.413875.c,1112 Oncology and Carcinogenesis,Rare Diseases;Digestive Diseases;Cancer,875159.0,2013-04-15,2016-04-14
4,inca_4,Eric,ADRIAENSSENS,CNRS Inserm UMR8161,grant.7154483,Non-coding RNA and cancer,The aim of our project is to understand the mo...,French National Cancer Institute,INCa_2699,,,1112 Oncology and Carcinogenesis;0604 Genetics,Genetics;Rare Diseases;Cancer;Breast Cancer,400306.0,2010-11-04,2013-11-04


In [6]:
pub_dtl.head()

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Researcher ID,Additional Researcher DIM ID to combine,Additional Researcher DIM ID to combine 2,ORCID,title,Dimensions Publication ID,doi,issue,pages,Pubmed ID,volume,Relative Citation Ratio,Times Cited,altmetric,Open access,Author Names,Research Org Names,Research Org IDs,FOR,RCDC,Journal ID,Journal Title,Publication Year,Publication Date
0,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,Spontaneous Canine Mammary Carcinoma as a Mode...,pub.1017598066,10.1016/j.jcpa.2011.11.136,1,79,,146,,1,,,"J. Abadie, F. Nguyen, D. Loussouarn, I. Bemelm...",Institut de Cancérologie de l'Ouest;Université...,grid.418191.4;grid.449623.e,,,jour.1007441,Journal of Comparative Pathology,2012.0,
1,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,FC‐15 \nImmunohistochemical detection of c‐kit...,pub.1013784597,10.1111/j.1365-3164.2004.411_15.x,s1,24-24,,15,,0,,,"M. C. Cadiergues, F. Degorce‐Rubiales, J. Abad...",,,1103 Clinical Sciences,,jour.1108160,Veterinary Dermatology,2004.0,
2,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,Chronic gastrointestinal inflammation in a dog...,pub.1093104577,10.24070/bjvp.1983-0246.v10i3p100-104,3,100-104,,10,,0,,True,"Mario Cervone, Julie Duboy, Caroline Laprie, J...",,,,,jour.1044051,Brazilian Journal of Veterinary Pathology,2017.0,
3,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,Persistent Left Cranial Vena Cava Causing Oeso...,pub.1023385850,10.1016/j.jcpa.2006.05.002,2-3,150-152,16952369.0,135,0.53,10,,,"T. Larcher, J. Abadie, F.A. Roux, J.-Y. Descha...",,,1102 Cardiorespiratory Medicine and Haematology,,jour.1007441,Journal of Comparative Pathology,2006.0,
4,inca_1,Jérôme,ABADIE,Ecole Vétérinaire de Nantes,ur.0642054564.81,,,,Pollutants in pet dogs: a model for environmen...,pub.1017994447,10.1186/s40064-015-0790-4,1,1-11,25646150.0,4,1.64,8,4.0,True,"Sabine Sévère, Philippe Marchand, Ingrid Guiff...",,,1112 Oncology and Carcinogenesis;1117 Public H...,Cancer;Breast Cancer,jour.1047790,SpringerPlus,2015.0,
