# Data Read-In and Summary Statistics

## Python Setup

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
pd.options.display.max_rows = 50
pd.options.display.max_columns = 100
import unidecode
import re

## Data Read-In

In [None]:
# Load the grant-detail and publication-detail tables from CSV.
# low_memory=False turns off chunked parsing, so each column's dtype is
# inferred from the whole file rather than piece by piece.
grnt_dtl = pd.read_csv('../data/inca_grants_details.csv', low_memory=False)
pub_dtl = pd.read_csv('../data/inca_pub_details.csv', low_memory=False)

In [None]:
# Load the HELIOS v2 actors, budget and projects CSV exports.
actors = pd.read_csv('../data/HELIOSv2_ACTEURS_all_2007-2012.csv', low_memory=False)
# Keep every actors column except those labelled "Unnamed: NNN" (exactly three
# digits); the pattern is compiled once instead of on every column.
unnamed_col = re.compile(r'^Unnamed: \d\d\d$')
cols = [col for col in actors.columns if not unnamed_col.match(col)]
actors = actors[cols]
budget = pd.read_csv('../data/HELIOSv2_BUDGET_all_2007-2012.csv', low_memory=False)
projects = pd.read_csv('../data/HELIOSv2_PROJETS_all_2007-2012.csv', low_memory=False)

### Drop Duplicate Lines

In [None]:
# Show each table's (rows, columns) before and after removing rows that are
# exact duplicates of an earlier row.
print(grnt_dtl.shape)
print(pub_dtl.shape)
grnt_dtl, pub_dtl = grnt_dtl.drop_duplicates(), pub_dtl.drop_duplicates()
print(grnt_dtl.shape)
print(pub_dtl.shape)

## Descriptive Statistics

In [None]:
# Summary statistics for every column of the grants table; include='all'
# covers the non-numeric columns as well as the numeric ones.
grnt_dtl.describe(include = 'all')

In [None]:
# Summary statistics for every column of the publications table; include='all'
# covers the non-numeric columns as well as the numeric ones.
pub_dtl.describe(include = 'all')

## Comparison Original Data – Dimensions Data

### Grants

In [None]:
# Distinct award codes found in each of the three original tables
# (note the differing column capitalisation across the files).
actors_awards = list(actors['awardcode'].drop_duplicates())
budget_awards = list(budget['Awardcode'].drop_duplicates())
projects_awards = list(projects['AwardCode'].drop_duplicates())

# Union of all original award codes. Built from fresh sets so that
# actors_awards is no longer extended in place through an alias and keeps
# holding only the codes from the actors table.
awards_o = set(actors_awards) | set(budget_awards) | set(projects_awards)

In [None]:
# Distinct grant references in the Dimensions data. Stored as a set because it
# is only counted, iterated and used for membership tests, and set lookups are
# constant-time whereas a list is scanned on every test.
awards_d = set(grnt_dtl['Reference'].drop_duplicates())

In [None]:
# Report how many distinct awards each data source contains.
print("There are {} awards in the original data.".format(len(awards_o)))
print("There are {} awards in the Dimensions data.".format(len(awards_d)))

In [None]:
# For every original award, record whether Dimensions also lists it.
# The set copy keeps each lookup constant-time even if awards_d is a list.
dimensions_lookup = set(awards_d)
in_d = [award in dimensions_lookup for award in awards_o]
n_found = sum(in_d)
# Guard the percentage so an empty original award set prints 0% instead of
# raising ZeroDivisionError.
pct_found = int(100*n_found/len(in_d)) if in_d else 0
print("{} original awards are in the Dimensions data ({}%).".format(n_found, pct_found))

In [None]:
# List the original awards that Dimensions does not contain. The missing awards
# are collected first so the headline count always matches the lines printed
# below it, and the set copy keeps each membership test constant-time.
dimensions_lookup = set(awards_d)
missing_awards = [award for award in awards_o if award not in dimensions_lookup]
print("The {} original awards not in the Dimensions data are:".format(len(missing_awards)))
for award in missing_awards:
    print(" - {}".format(award))

In [None]:
# Despite its name, in_o is True for each Dimensions award that is NOT in the
# original award set, so its sum is the number of Dimensions-only awards.
in_o = [award not in awards_o for award in awards_d]
print("There are {} additional awards in the Dimensions data.".format(sum(in_o)))

### Researchers

In [None]:
# Distinct (first name, last name) pairs of the primary researchers in the
# original actors table, keeping only rows where both parts are present.
researchers_o = actors[['prenom_port', 'nom_port']].drop_duplicates().reset_index(drop = True)
has_full_name = researchers_o['prenom_port'].notnull() & researchers_o['nom_port'].notnull()
# .copy() makes the filtered frame independent, so adding the column below
# does not write to a slice of another frame (SettingWithCopyWarning).
researchers_o = researchers_o[has_full_name].copy()
# "0" marks the primary-researcher columns; "1".."4" are used for the aut columns.
researchers_o['source'] = "0"
print("There are {} unique primary researchers in the original data.".format(researchers_o.shape[0]))

In [None]:
# Add the researchers named in the four "aut" column pairs to researchers_o.
# Because drop_duplicates keeps the first occurrence, each researcher's
# 'source' is the first column pair (0 = primary, then 1..4) they appear in.
for i in range(1, 5):
    first_name_col = 'prenom_aut_port{}'.format(i)
    last_name_col = 'nom_aut_port{}'.format(i)
    temp = actors[[first_name_col, last_name_col]].drop_duplicates()
    # Only the columns need renaming; the index is rebuilt by reset_index below.
    temp = temp.rename(columns={first_name_col: "prenom_port", last_name_col: "nom_port"})
    # .copy() so that adding 'source' does not write to a slice of another frame.
    temp = temp[temp['prenom_port'].notnull() & temp['nom_port'].notnull()].copy()
    temp['source'] = "{}".format(i)
    researchers_o = pd.concat([researchers_o, temp])
    researchers_o = researchers_o.drop_duplicates(['prenom_port', 'nom_port']).reset_index(drop = True)
r = researchers_o.shape[0]
print("There are {} unique researchers in the original data when accounting for the aut researchers.".format(r))

In [None]:
# Distinct (first name, last name) pairs in the Dimensions grant data.
researchers_d = grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates()
# Keep only complete names, matching how researchers_o is built; otherwise a
# row with a missing name would be counted as a Dimensions-only researcher.
# .copy() so that adding 'source' does not write to a slice of another frame.
has_full_name = researchers_d['prenom_port'].notnull() & researchers_d['nom_port'].notnull()
researchers_d = researchers_d[has_full_name].copy()
researchers_d['source'] = 'dimensions'
r = researchers_d.shape[0]
print("There are {} unique researchers in the Dimensions data.".format(r))

In [None]:
# Researchers present in exactly one of the two sources: stack both tables and
# drop every name that occurs more than once (keep=False removes all copies).
all_researchers = pd.concat([researchers_o, researchers_d])
irreg_rsrs = all_researchers.drop_duplicates(['prenom_port', 'nom_port'], keep=False).reset_index(drop = True)

In [None]:
# Count the unmatched researchers that come from Dimensions and express them as
# a share of all Dimensions researchers.
from_dimensions = irreg_rsrs['source'] == 'dimensions'
r = irreg_rsrs[from_dimensions].shape[0]
rp = 100*r/researchers_d.shape[0]
print("{} researchers ({}%) are in Dimensions data but not original data.".format(r, int(round(rp))))

In [None]:
# Count the unmatched researchers that come from the original data and express
# them as a share of all original researchers.
from_original = irreg_rsrs['source'] != 'dimensions'
r = irreg_rsrs[from_original].shape[0]
rp = 100*r/researchers_o.shape[0]
print("{} researchers ({}%) are in original data but not Dimensions data.".format(r, int(round(rp))))

In [None]:
# Break the original-only researchers down by the column pair they came from:
# source "0" is the primary researcher, "1".."4" are the aut researchers.
print("Of these:")
for i in range(0, 5):
    source_key = str(i)
    r = irreg_rsrs[irreg_rsrs['source'] == source_key].shape[0]
    source_total = researchers_o[researchers_o['source'] == source_key].shape[0]
    # A source can have no rows in researchers_o (de-duplication keeps the first
    # source a name appears in), so guard against dividing by zero.
    rp = 100*r/source_total if source_total else 0
    if i == 0:
        print(" - {} primary researchers ({}%) are in original data but not the Dimensions data.".format(r, int(round(rp))))
    else:
        print(" - {} aut{} researchers ({}%) are in original data but not Dimensions data.".format(r, i, int(round(rp))))

## Funder Name

In [None]:
# Copy of the Funder column with missing values replaced by the empty string.
grnt_dtl['funder_clean'] = grnt_dtl['Funder'].fillna('')

In [None]:
# A row "has a funder" when its cleaned name is not the '' placeholder.
grnt_dtl['funder_flag'] = grnt_dtl['funder_clean'] != ''
# The mean of a boolean column is the share of True rows. Unlike indexing
# value_counts() with [True], it does not raise KeyError when no row qualifies.
funder_share = grnt_dtl['funder_flag'].mean() if len(grnt_dtl) else 0
print("{}% of the grants have a Funder.".format(int(funder_share*100)))
# Leave out the '' placeholder so it is neither counted nor ranked as a funder
# name (the institution ranking excludes it in the same way).
funder_names = grnt_dtl.loc[grnt_dtl['funder_flag'], 'funder_clean']
print("There are {} different Funder Names".format(funder_names.nunique()))
print("Here are the most frequent:")
funder_names.value_counts().head()

### INCA vs. Non-INCA

In [None]:
# Flag rows funded by the French National Cancer Institute, then collapse to one
# line per Dimensions grant: a grant counts as INCA-funded if any of its rows is.
grnt_dtl['inca_flag'] = (grnt_dtl['funder_clean']=="French National Cancer Institute")
grants = grnt_dtl.groupby('Dimensions Grant ID')['inca_flag'].sum().reset_index()
grants['inca_flag_any'] = (grants['inca_flag']>0)
# The mean of the boolean column is the share of True grants. Unlike indexing
# value_counts() with [True], it does not raise KeyError when none is INCA-funded.
inca_share = grants['inca_flag_any'].mean() if len(grants) else 0
print("{}% of the grants are INCA-funded.".format(int(inca_share*100)))

## Institution Names

In [None]:
# Copy of the Research Org Names column with missing values replaced by the
# empty string.
grnt_dtl['org_clean'] = grnt_dtl['Research Org Names'].fillna('')

In [None]:
# A row "has an institution" when its cleaned name is not the '' placeholder.
grnt_dtl['org_flag'] = grnt_dtl['org_clean'] != ''
# The mean of a boolean column is the share of True rows. Unlike indexing
# value_counts() with [True], it does not raise KeyError when no row qualifies.
org_share = grnt_dtl['org_flag'].mean() if len(grnt_dtl) else 0
print("{}% of the grants have an Institution.".format(int(org_share*100)))
# Count and rank real names only; the '' placeholder is not an institution.
org_names = grnt_dtl.loc[grnt_dtl['org_flag'], 'org_clean']
print("There are {} different Institution Names".format(org_names.nunique()))
print("Here are the most frequent:")
org_names.value_counts().head(10)

In [None]:
# Spot check: list every institution spelling that contains the search term,
# with its frequency.
print("Organisation names are clean for the most part:")
string = "Paoli"
matches_term = grnt_dtl['org_clean'].str.contains(string)
grnt_dtl.loc[matches_term, 'org_clean'].value_counts()

## ORCID Number

In [None]:
# True for publication rows whose ORCID value is present (not null).
pub_dtl['orcid_flag'] = pub_dtl['ORCID'].notnull()

In [None]:
# Share of pub_dtl rows flagged as having an ORCID, then the five most frequent
# ORCID values.
# NOTE(review): the message says "grants" but the flag is computed on pub_dtl -
# confirm the intended wording.
# NOTE(review): indexing value_counts() with [True] raises KeyError if no row is
# flagged True.
print("{}% of the grants have an ORCID.".format(int(pub_dtl['orcid_flag'].value_counts(normalize = True)[True]*100)))
print("Here are the most frequent ORCIDs:")
pub_dtl['ORCID'].value_counts().head()

## Researcher Name

### Number of Researchers

In [None]:
# Full name as "First LAST"; the result is missing when either part is missing.
grnt_dtl['researcher_name'] = grnt_dtl['prenom_port'] + " " + grnt_dtl['nom_port']
# Missing values are dropped before counting, so they are not reported as
# researchers or as IDs (a set built from a column containing NaN can hold one
# or more NaN entries and inflate the count).
researchers = set(grnt_dtl['researcher_name'].dropna())
print("There are {} unique researchers (defined by unique names).".format(len(researchers)))
print("There are {} unique researchers (defined by unique INCA IDs).".format(grnt_dtl['INCA ID'].nunique()))

### Link between Researcher Name and INCA ID:

In [None]:
# Summarise the distinct (INCA ID, researcher name) pairs; the per-column
# figures give a quick view of whether IDs and names map one-to-one.
grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates().describe()

In [None]:
# Example: every distinct (INCA ID, name) pair recorded for one researcher name.
id_name_pairs = grnt_dtl[['INCA ID', 'researcher_name']].drop_duplicates()
id_name_pairs[id_name_pairs['researcher_name'] == "Jean BOURHIS"]

### Linking a Dimensions ID to Researcher Names

In [None]:
# Distinct combinations of researcher name and the three Dimensions ID columns
# found in the publication data.
dim_id_columns = [
    'prenom_port',
    'nom_port',
    'Dimensions Researcher ID',
    'Additional Researcher DIM ID to combine',
    'Additional Researcher DIM ID to combine 2',
]
dim_ids = pub_dtl[dim_id_columns].drop_duplicates().reset_index(drop = True)

In [None]:
# Stack the primary ID column and the two additional ID columns into one long
# (prenom_port, nom_port, id) table, leaving out rows whose ID is missing.
id_columns = ['Dimensions Researcher ID'] + [
    'Additional Researcher DIM ID to combine{}'.format(suffix) for suffix in ('', ' 2')
]
id_frames = []
for id_column in id_columns:
    frame = dim_ids[['prenom_port', 'nom_port', id_column]].rename(columns={id_column: 'id'})
    id_frames.append(frame[frame['id'].notnull()])
# The index is rebuilt here, so the per-frame index labels do not matter.
rsrs_ids = pd.concat(id_frames).reset_index(drop = True)

In [None]:
# Attach the known Dimensions IDs to every researcher name in the grant data.
# The outer merge joins on the shared name columns and keeps names without an ID
# as well as IDs whose name does not occur in the grant data.
grant_names = grnt_dtl[['prenom_port', 'nom_port']].drop_duplicates()
name_ids = rsrs_ids[['prenom_port', 'nom_port', 'id']].drop_duplicates()
researchers = grant_names.merge(name_ids, how = 'outer').drop_duplicates()
researchers.columns = ['first_name', 'last_name', 'id']
# "LAST, First" label used for sorting.
researchers['name'] = researchers['last_name'] + ", " + researchers['first_name']
researchers = researchers.sort_values('name').reset_index(drop = True)
researchers.describe()

In [None]:
# Write the researcher/ID table to CSV without the row index.
researchers.to_csv('../data/researchers.csv', index = False)

## Number of Grants per Researcher

In [None]:
# "LAST, First" label per row; missing when either name part is missing.
grnt_dtl['researcher'] = grnt_dtl['nom_port'] + ", " + grnt_dtl['prenom_port']

In [None]:
# Number of grnt_dtl rows per researcher label. size() yields an unnamed column
# labelled 0 after reset_index, which is then renamed to grant_count
# (index=str also converts the row labels to strings).
# NOTE(review): this counts rows, not distinct 'Dimensions Grant ID' values - if
# a grant can span several rows for one researcher it is counted more than once;
# confirm against the data.
rsrs_grants = grnt_dtl.groupby('researcher').size().reset_index()
rsrs_grants = rsrs_grants.rename(index=str, columns={0: "grant_count"})

In [None]:
# Counting with a comparison avoids the KeyError that value_counts()[1] raises
# when no researcher has exactly one grant.
single_grant = int((rsrs_grants['grant_count'] == 1).sum())
print("{} researchers have 1 grant.".format(single_grant))
print("The maximum number of grants for given researcher is {}.".format(rsrs_grants['grant_count'].max()))
print("Here is the distribution:")
rsrs_grants['grant_count'].hist(bins = 11)
rsrs_grants['grant_count'].value_counts()

## Analysis of Key Words

In [None]:
def distribution(words, sep=';', data=None):
    """Print how many keyword codes each grant carries.

    Adds an integer column ``'nb_' + words`` to the table in place (number of
    ``sep``-separated entries in the upper-cased ``words`` column, 0 when the
    value is missing), then prints summary lines computed over the distinct
    (grant id, keyword string, count) rows.

    Parameters
    ----------
    words : str
        Lower-case keyword family, e.g. ``'rcdc'``; the column read is
        ``words.upper()``.
    sep : str
        Separator between codes. It is matched literally, so characters such
        as ``'|'`` or ``'.'`` are not interpreted as regular expressions.
    data : pandas.DataFrame, optional
        Table to analyse; defaults to the notebook-level ``grnt_dtl``.

    Returns
    -------
    None
    """
    source = grnt_dtl if data is None else data
    column = words.upper()
    count_col = 'nb_' + words

    # N separators mean N + 1 entries; a missing value stays NaN and becomes 0.
    entries = source[column].str.count(re.escape(sep)) + 1
    source[count_col] = entries.fillna(0).astype(int)

    dist = source[['Dimensions Grant ID', column, count_col]].drop_duplicates().reset_index(drop = True)
    tally = dist[count_col].value_counts()
    # .get avoids a KeyError when every grant has at least one code.
    print("{} grants have no {}.".format(tally.get(0, 0), column))
    most = dist[count_col].max() if len(dist) else 0
    print("A grant has at most {} {}s.".format(most, column))
    print("The distribution is:\n")
    print(tally)

In [None]:
def league_table(words, sep=';', data=None):
    """Build and print a frequency ranking of keyword codes.

    Every distinct (grant id, keyword string) pair is split on ``sep`` into one
    row per code, tagged with the code's 1-based position in the string. The
    function prints the number of distinct codes and the most frequent ones,
    first for position 1 only ("Principal") and then for all positions.

    Parameters
    ----------
    words : str
        Lower-case keyword family, e.g. ``'rcdc'``; the column read is
        ``words.upper()``.
    sep : str
        Separator between codes, matched literally. The pieces are not
        stripped of surrounding whitespace.
    data : pandas.DataFrame, optional
        Table to analyse; defaults to the notebook-level ``grnt_dtl``.

    Returns
    -------
    pandas.DataFrame
        Columns ``grant_id``, ``name`` and ``order``; rows are grouped by
        ``order`` and keep the source row order within each group.
    """
    source = grnt_dtl if data is None else data
    column = words.upper()
    pairs = source[['Dimensions Grant ID', column]].drop_duplicates()

    # One record per code; values that are not strings (e.g. missing) yield none.
    records = [
        (grant_id, code, position)
        for grant_id, raw in zip(pairs['Dimensions Grant ID'], pairs[column])
        if isinstance(raw, str)
        for position, code in enumerate(raw.split(sep), start=1)
    ]
    df = pd.DataFrame(records, columns=['grant_id', 'name', 'order'])
    # Stable sort: all first-position codes, then second-position codes, and so
    # on, each group in the source row order.
    df = df.sort_values('order', kind='mergesort').reset_index(drop = True)

    df_p = df[df['order'] == 1]
    print("There are {} different Principal {} codes.".format(df_p['name'].nunique(), column))
    print("The most frequent are:\n")
    print(df_p['name'].value_counts().head())
    print("\nThere are {} different {} codes.".format(df['name'].nunique(), column))
    print("The most frequent are:\n")
    print(df['name'].value_counts().head())
    return df

### RCDC

In [None]:
# Print how many RCDC codes each grant carries.
distribution('rcdc')

In [None]:
# Print the RCDC code ranking and keep the long (grant_id, name, order) table.
df = league_table('rcdc')

In [None]:
# First and last five grants whose first-listed code is 'Cancer'.
cancer_first = df[(df['name'] == 'Cancer') & (df['order'] == 1)]
g1 = list(cancer_first.head()['grant_id'])
g2 = list(cancer_first.tail()['grant_id'])

In [None]:
# First and last five grants that list 'Cancer' in any position but the first.
cancer_later = df[(df['name'] == 'Cancer') & (df['order'] != 1)]
g3 = list(cancer_later.head()['grant_id'])
g4 = list(cancer_later.tail()['grant_id'])

In [None]:
# Combined sample of grant ids. Concatenation builds a new list, so g1 is no
# longer extended in place through an alias.
g = g1 + g2 + g3 + g4

In [None]:
# Keep one row per sampled grant and export it for manual inspection.
# isin() tests the whole column at once instead of scanning the list g for
# every row.
in_sample = grnt_dtl['Dimensions Grant ID'].isin(g)
temp = grnt_dtl[in_sample].drop_duplicates(['Dimensions Grant ID'])
temp.to_csv('../output/cancer_rcdc.csv', index = False)

### FOR

In [None]:
# Print how many FOR codes each grant carries.
distribution('for')

In [None]:
# Print the FOR code ranking; the returned table is shown as the cell output.
league_table('for')

## Sandbox