# INCA Data Cleaning and Extractions

## Python Setup

In [174]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100
from api_query import execute_query
from api_query import pull_data
import time

## Load In Data

In [175]:
# Read the three input tables in one pass: the grants export, the publications
# export, and the researcher ORCID-return file.
grnts, pubs, inca_orcid_responses = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/inca_raw/inca_grants_details.csv',
        '../data/inca_raw/inca_pub_details.csv',
        '../output/researcher_info/researcher_info_ORCID_returns.csv',
    )
)

## INCA ID - Dimensions ID Lookup

In [176]:
# One row per distinct (INCA ID, Dimensions Researcher ID) pair found in the publications file.
id_lookup = (
    pubs.loc[:, ['INCA ID', 'Dimensions Researcher ID']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [177]:
# Attach the Dimensions researcher ID to every grant row; rows left without one
# after the merge fall back to their INCA ID.
grnts = grnts.merge(id_lookup, on='INCA ID', how='left')
grnts['Dimensions Researcher ID'] = grnts['Dimensions Researcher ID'].where(
    grnts['Dimensions Researcher ID'].notnull(), grnts['INCA ID'])

## Account for Dimensions Manual Disambiguations

In [178]:
# Distinct combinations of a researcher's main Dimensions ID and the (up to two)
# additional IDs listed in the "to combine" columns.
ids = (
    pubs.loc[:, ['Dimensions Researcher ID',
                 'Additional Researcher DIM ID to combine',
                 'Additional Researcher DIM ID to combine 2']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [179]:
# Rows with a second additional ID: the combined identifier joins all three IDs with "&".
dupls_1 = ids.loc[ids['Additional Researcher DIM ID to combine 2'].notnull()].reset_index(drop=True)
dupls_1['new_id'] = (
    dupls_1['Dimensions Researcher ID']
    .add("&").add(dupls_1['Additional Researcher DIM ID to combine'])
    .add("&").add(dupls_1['Additional Researcher DIM ID to combine 2'])
)

# Rows with exactly one additional ID: the combined identifier joins the two IDs.
dupls_2 = ids.loc[
    ids['Additional Researcher DIM ID to combine'].notnull()
    & ids['Additional Researcher DIM ID to combine 2'].isnull()
].reset_index(drop=True)
dupls_2['new_id'] = (
    dupls_2['Dimensions Researcher ID']
    .add("&").add(dupls_2['Additional Researcher DIM ID to combine'])
)

# Stack both cases into a single table with a fresh 0..n-1 index.
dupls = pd.concat([dupls_1, dupls_2], ignore_index=True)

In [180]:
# Long-format lookup: every individual Dimensions ID that takes part in a manual
# disambiguation ("old_id") is mapped to its combined "&"-joined identifier
# ("new_id"). The pieces are collected first and concatenated once, rather than
# growing a DataFrame inside the loop.
id_replacer = pd.concat(
    [
        dupls.loc[dupls[id_col].notnull(), [id_col, 'new_id']].rename(columns={id_col: 'old_id'})
        for id_col in ('Dimensions Researcher ID',
                       'Additional Researcher DIM ID to combine',
                       'Additional Researcher DIM ID to combine 2')
    ],
    ignore_index=True,
)
# Identical (old_id, new_id) rows carry no extra information but would multiply
# rows in every left-merge against this table, so keep a single copy of each.
id_replacer = id_replacer.drop_duplicates().reset_index(drop=True)

In [181]:
# Persist the old_id -> new_id lookup as CSV (row index omitted).
id_replacer.to_csv('../data/id_replacer.csv', index=False)

In [182]:
def id_replace(df, replacer=None):
    """Replace disambiguated researcher IDs in ``df['rsr_id']`` by their combined ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``rsr_id`` column. It is not modified; a new frame is returned.
    replacer : pandas.DataFrame, optional
        Lookup with ``old_id`` and ``new_id`` columns. When omitted, the
        notebook-level ``id_replacer`` table is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged against the lookup: ``rsr_id`` is replaced by
        ``new_id`` wherever a match exists and left unchanged otherwise. The
        helper columns ``old_id`` / ``new_id`` are removed again. If the lookup
        lists the same ``old_id`` more than once, the matching rows of ``df``
        are repeated once per lookup row (left-merge semantics).
    """
    if replacer is None:
        # Resolved at call time so the function can be defined before the table exists.
        replacer = id_replacer
    merged = pd.merge(df, replacer[['old_id', 'new_id']],
                      how='left', left_on='rsr_id', right_on='old_id')
    merged['rsr_id'] = np.where(merged['new_id'].notnull(), merged['new_id'], merged['rsr_id'])
    return merged.drop(columns=['old_id', 'new_id'])

## Clean Up Grant and Publication Files

In [183]:
# Clean Grants File
# Headers become lower-case with underscores, selected columns get shorter
# names, and the listed text/organisation columns are dropped.
grnts.columns = [header.lower().replace(' ', '_') for header in grnts.columns]
grnts = grnts.rename(columns={
    'dimensions_researcher_id': 'rsr_id',
    'funding_amount_($)': 'funding_amount',
    'dimensions_grant_id': 'grant_id',
    'funder': 'funder_name',
    'rcdc': 'rcdc_names',
    'prenom_port': 'first_name',
    'nom_port': 'last_name',
})
grnts = grnts.drop(columns=[
    'title', 'reference', 'organisme_port', 'research_org_names',
    'research_org_ids', 'for', 'abstract',
])

In [184]:
# Clean Pubs File
# Headers become lower-case with underscores, selected columns get shorter
# names, and the listed columns are dropped.
pubs.columns = [header.lower().replace(' ', '_') for header in pubs.columns]
pubs = pubs.rename(columns={
    'dimensions_researcher_id': 'rsr_id',
    'dimensions_publication_id': 'pub_id',
    'publication_year': 'date',
    'rcdc': 'rcdc_names',
    'times_cited': 'citations',
    'pubmed_id': 'pmid',
})
pubs = pubs.drop(columns=[
    'prenom_port', 'nom_port', 'organisme_port',
    'additional_researcher_dim_id_to_combine', 'additional_researcher_dim_id_to_combine_2',
    'orcid', 'title', 'issue',
    'pages', 'volume', 'relative_citation_ratio',
    'altmetric', 'open_access', 'author_names', 'research_org_names',
    'research_org_ids', 'for', 'journal_id', 'journal_title',
    'publication_date',
])
# Turn the publication year into a "YYYY-01-01" date string. The year is
# stringified first (presumably it is read as a float such as 2010.0 when some
# values are missing); a trailing ".0" is stripped with a raw, end-anchored
# pattern so that no other part of the value can be touched.
pubs['date'] = pubs['date'].apply(str).replace(r'\.0$', '', regex=True)
# Missing years were stringified to "nan"; map them back to a real NaN.
pubs['date'] = pubs['date'].apply(lambda x: np.nan if x=="nan" else x+"-01-01")
pubs['citations'] = pd.to_numeric(pubs['citations'])

## Get Rid of RCDC Codes

In [185]:
# Keep RCDC Codes in Separate Table
# One output row per (grant, RCDC name): the ';'-separated names of each grant
# are split into individual rows, then trimmed and upper-cased.
grnts_rcdc = (
    grnts.loc[grnts['rcdc_names'].notnull(), ['grant_id', 'rcdc_names']]
    .drop_duplicates()
)
grnts_rcdc = pd.concat([
    Series(grant_id, names.split(';'))
    for grant_id, names in zip(grnts_rcdc['grant_id'], grnts_rcdc['rcdc_names'])
]).reset_index()
grnts_rcdc.columns = ['rcdc_name', 'grant_id']
grnts_rcdc['rcdc_name'] = grnts_rcdc['rcdc_name'].str.strip().str.upper()

# The names now live in their own table; remove them from the grants frame.
grnts = grnts.drop(columns=['rcdc_names'])

In [186]:
# Keep RCDC Codes in Separate Table
# One output row per (publication, RCDC name): the ';'-separated names of each
# publication are split into individual rows, then trimmed and upper-cased.
pubs_rcdc = (
    pubs.loc[pubs['rcdc_names'].notnull(), ['pub_id', 'rcdc_names']]
    .drop_duplicates()
)
pubs_rcdc = pd.concat([
    Series(publication_id, names.split(';'))
    for publication_id, names in zip(pubs_rcdc['pub_id'], pubs_rcdc['rcdc_names'])
]).reset_index()
pubs_rcdc.columns = ['rcdc_name', 'pub_id']
pubs_rcdc['rcdc_name'] = pubs_rcdc['rcdc_name'].str.strip().str.upper()

# The names now live in their own table; remove them from the publications frame.
pubs = pubs.drop(columns=['rcdc_names'])

In [187]:
# Write the grant -> RCDC and publication -> RCDC lookup tables (row index omitted).
grnts_rcdc.to_csv('../data/topic_lookups/inca_researcher_grants_rcdc.csv', index=False)
pubs_rcdc.to_csv('../data/topic_lookups/inca_researcher_publications_rcdc.csv', index=False)

## Grant Datasets

In [188]:
# Work on a separate frame without the INCA ID column; `grnts` itself keeps it.
inca_grnts = grnts.drop(columns=['inca_id'])

### Separate INCa-Funded Grants from Researcher Grants

In [189]:
# Get INCA-funded Grants from file
inca_funders = [
    "French National Cancer Institute",
    "French Institute of Health and Medical Research",
    "Ministère des Affaires sociales et de la Santé",
]

# Parse the start dates once. A missing start date yields a NaN year, for which
# every comparison below is False.
start_year = pd.DatetimeIndex(inca_grnts['start_date']).year

# A grant counts as INCa-funded when it comes from one of the listed funders
# AND started between 2007 and 2012 (inclusive).
is_inca_funded = (inca_grnts['funder_name'].isin(inca_funders)
                  & (start_year >= 2007)
                  & (start_year <= 2012))

# Split on the mask and its exact complement so that every grant lands in
# exactly one of the two tables. Grants from the listed funders that have no
# start date are kept with the researcher grants instead of matching neither filter.
inca_funded_grnts = inca_grnts[is_inca_funded].reset_index(drop=True)
inca_grnts = inca_grnts[~is_inca_funded].reset_index(drop=True)

In [190]:
# Replace the individual funder names by the single label "INCa/INSERM/DGOS".
# The column is dropped and re-added, so it ends up as the last column.
inca_funded_grnts = (
    inca_funded_grnts
    .drop(columns=['funder_name'])
    .assign(funder_name="INCa/INSERM/DGOS")
)

### Apply ID Replacer

In [191]:
# Map disambiguated researcher IDs to their combined ID in both grant tables.
inca_funded_grnts, inca_grnts = (
    id_replace(grant_table) for grant_table in (inca_funded_grnts, inca_grnts)
)

### Export

In [192]:
# Write the INCa-funded grants and the remaining researcher grants (row index omitted).
inca_funded_grnts.to_csv('../data/inca_funded_grants.csv', index=False)
inca_grnts.to_csv('../data/inca_researcher_grants.csv', index=False)

## Publication Dataset

### Pull Additional Publication Info from Dimensions API

In [19]:
# Distinct publication IDs, in order of first appearance, to send to the API.
pub_ids = [*pubs['pub_id'].unique()]

In [20]:
# Number of IDs placed in one `id in [...]` filter. Quoted limit:
# "Filter operator 'in' requires 0 < items < 512"; 100 stays below it.
max_in_items = 100

# Number of records requested per call. Quoted limit:
# "Limit exceeds maximum allowed limit 1000"; 500 stays below it.
max_return = 500

# Upper bound on the total number of records paged through. Quoted limit:
# "Offset cannot exceed 50000".
max_overall_returns = 50000

In [21]:
# Query template: `{}` receives one batch of publication IDs; the listed
# publication fields are requested for each match.
string = (
    "search publications where id in [{}]"
    " return publications[id+doi+pmid+author_affiliations+date+supporting_grant_ids+times_cited]"
)
full_resp = pull_data(
    string=string,
    in_list=pub_ids,
    in_type='publications',
    return_type='publications',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 18000-18100/109001 publications...
RESPONSE ERROR on i=180 and j=0.

Querying: 23900-24000/109001 publications...

HTTPError: 429 Client Error: Too Many Requests for url: https://app.dimensions.ai/api/auth.json

In [None]:
# Flatten the API response into one row per (publication, author).
# Missing fields become NaN; only the first listed affiliation of each author
# is kept. The response dictionaries are read, never modified.
api_rows = []

for pub in full_resp:
    # Authors are read from the first element of `author_affiliations`. A
    # missing or empty field is treated as "no authors" instead of raising.
    author_groups = pub.get('author_affiliations') or [[]]
    authors = author_groups[0]

    # Supporting grants are stored as a ';'-joined string, NaN when the field is absent.
    grant_ids = pub.get('supporting_grant_ids')
    supporting = np.nan if grant_ids is None else ';'.join(grant_ids)

    for author in authors:
        # `or` short-circuits, so an author without an `affiliations` key (or
        # with an empty list) falls back to an all-NaN affiliation.
        affiliations = author.get('affiliations') or [{}]
        first_listed = affiliations[0]
        api_rows.append({
            'pub_id': pub.get('id', np.nan),
            'pmid': pub.get('pmid', np.nan),
            'date': pub.get('date', np.nan),
            'doi': pub.get('doi', np.nan),
            'citations': float(pub.get('times_cited', np.nan)),
            'nb_authors': float(len(authors)),
            'rsr_id': author.get('researcher_id', np.nan),
            'rsr_country': first_listed.get('country_code', np.nan),
            'rsr_affiliation': first_listed.get('name', np.nan),
            'rsr_affiliation_id': first_listed.get('id', np.nan),
            'supporting_grants': supporting,
        })

# An explicit column list fixes the column order and keeps the columns present
# even when no rows were produced.
api_pubs = pd.DataFrame(api_rows, columns=[
    'pub_id', 'pmid', 'date', 'doi', 'citations', 'nb_authors',
    'rsr_id', 'rsr_country', 'rsr_affiliation', 'rsr_affiliation_id',
    'supporting_grants',
])

In [None]:
# Start from the distinct (publication, researcher) pairs of the publications
# file, keeping the researcher ID as `original_rsr_id`, and left-join the
# per-author rows built from the API response.
inca_pubs = (
    pubs[['pub_id', 'rsr_id']]
    .rename(columns={'rsr_id':'original_rsr_id'})
    .drop_duplicates()
    .merge(api_pubs, on='pub_id', how='left')
)

### Apply ID Replacer

In [None]:
# Map disambiguated researcher IDs to their combined ID. Only the `rsr_id`
# column is rewritten; `original_rsr_id` is left as it is.
inca_pubs = id_replace(inca_pubs)

### Export

In [None]:
# Write the publication table (row index omitted).
inca_pubs.to_csv('../data/inca_researcher_publications.csv', index=False)

### Export List of Publication IDs

For further matching.

In [None]:
# Distinct (researcher, PubMed ID) pairs, restricted to rows that have a PubMed ID.
inca_pub_ids = inca_pubs.loc[inca_pubs['pmid'].notnull(), ['rsr_id', 'pmid']].drop_duplicates()
inca_pub_ids.to_csv('../data/inca_pub_ids.csv', index=False)

## Researcher Dataset

### Researcher Name

In [212]:
# Distinct (INCA ID, first name, last name) rows taken from the grants table.
rsr_name = grnts.loc[:, ['inca_id', 'first_name', 'last_name']].drop_duplicates()

### First Year of Publication

In [213]:
# Get First Publication
# Sort by researcher and date, then keep the first row per researcher, i.e. the
# smallest date value.
rsr_first_year = (
    pubs.loc[:, ['inca_id', 'date']]
    .sort_values(by=['inca_id', 'date'])
    .drop_duplicates(subset='inca_id', keep='first')
)

### Affiliation

In [272]:
# Get Affiliation:
# For every researcher, query the API for their dated publications (at most the
# 500 with the largest `date` values) and walk the returned records from the
# last one to the first. The first record in which one of the researcher's
# Dimensions IDs appears with at least one affiliation provides the
# affiliation name and country; otherwise both stay NaN.
rsrs = list(grnts['inca_id'].drop_duplicates())

first_affiliation = []
first_country = []

for position, rsr in enumerate(rsrs, start=1):

    print("Querying {} out of {} reseachers...".format(position, len(rsrs)), end = '\r')

    rsr_pubs = pubs[pubs['inca_id']==rsr]
    dated_pubs = rsr_pubs[rsr_pubs['date'].notnull()][['pub_id', 'date']]
    recent_pub_ids = list(dated_pubs.sort_values('date')['pub_id'])[-500:]
    # Kept under its own name so the `ids` table built for the manual
    # disambiguations is not overwritten by this loop.
    rsr_dim_ids = list(rsr_pubs['rsr_id'].drop_duplicates())

    affiliation = np.nan
    country = np.nan

    # Without any dated publication there is nothing to look up: skip the call.
    if recent_pub_ids:
        time.sleep(2)  # pause between successive API calls
        quoted_ids = "\"" + "\", \"".join(recent_pub_ids) + "\""
        string = "search publications where id in [{}]".format(quoted_ids)
        string += " return publications[author_affiliations] sort by date limit 1000"
        resp = execute_query(string)

        found = False
        for record in reversed(resp['publications']):
            # Authors are read from the first element of `author_affiliations`;
            # a missing or empty field is treated as "no authors".
            author_groups = record.get('author_affiliations') or [[]]
            for elem in author_groups[0]:
                if 'researcher_id' in elem and elem['researcher_id'] in rsr_dim_ids:
                    affiliations = elem.get('affiliations') or []
                    if affiliations:
                        found = True
                        country = affiliations[0].get('country_code', np.nan)
                        affiliation = affiliations[0].get('name', np.nan)
                    # The researcher was located in this record; stop scanning
                    # its authors whether or not an affiliation was listed.
                    break
            if found:
                break

    # Exactly one entry per researcher keeps the lists aligned with `rsrs`.
    first_country.append(country)
    first_affiliation.append(affiliation)

print("\nDone !")

rsr_affl = pd.DataFrame({'inca_id':rsrs
                         , 'affiliation': first_affiliation
                         , 'country': first_country
                        })

Querying 90 out of 1001 reseachers...

KeyboardInterrupt: 

### Merge All Together

In [None]:
# Merge all together:
# Names, first publication date, affiliation and the Dimensions ID lookup are
# left-joined on the INCA ID, starting from the researcher names.
rsr_info = (
    rsr_name
    .merge(rsr_first_year, on='inca_id', how='left')
    .merge(rsr_affl, on='inca_id', how='left')
    .merge(id_lookup.rename(columns={'INCA ID':'inca_id', 'Dimensions Researcher ID':'rsr_id'}),
           on='inca_id', how='left')
)

### Add ORCID Responses

In [221]:
# INCA ORCID-responses: keep only when there was an ORCID response.
inca_orcid_responses = (
    inca_orcid_responses.loc[inca_orcid_responses['ORCID Return'].eq("YES")]
    .reset_index(drop=True)
)
conf_ids = list(inca_orcid_responses['INCA ID'])

# Flag the researchers whose INCA ID is among those responses.
rsr_info['orcid_confirmed'] = rsr_info['inca_id'].isin(conf_ids)

### ID Replacer

In [222]:
# Map disambiguated researcher IDs in `rsr_id` to their combined "&"-joined ID.
rsr_info = id_replace(rsr_info)

In [223]:
# Preview the first rows (display only). NOTE(review): the rendered table shows
# researcher names — consider clearing this output before sharing the notebook.
rsr_info.head()

Unnamed: 0,inca_id,first_name,last_name,date,rsr_id,orcid_confirmed
0,inca_1,Jérôme,ABADIE,1998-01-01,ur.0642054564.81,False
1,inca_2,Julien,ADAM,2006-01-01,ur.01177206360.47,False
2,inca_3,Antoine,ADENIS,1988-01-01,ur.01303404424.36&ur.01067706306.01,False
3,inca_4,Eric,ADRIAENSSENS,1996-01-01,ur.0673152200.72,False
4,inca_5,Martine,AGGERBECK,1978-01-01,ur.01002761217.53,False


In [224]:
# The INCA ID is not part of the exported researcher table.
rsr_info = rsr_info.drop(columns=['inca_id'])

### Export

In [None]:
# Write the researcher table (row index omitted).
rsr_info.to_csv('../data/inca_researcher_info.csv', index=False)

## CSO and Cancer Type Lookup Table

In [None]:
# Load the publication file that carries CSO and cancer-type labels, keep the
# three columns needed here, and give them short names.
inca_pubs_cso_ct = (
    pd.read_csv('../data/inca_raw/inca_pub_details_with_cso_and_cancertypes.csv', low_memory=False)
    .loc[:, ['Dimensions Publication ID', 'CSO', 'Cancer Types']]
    .rename(columns={'Dimensions Publication ID': 'pub_id', 'CSO': 'cso', 'Cancer Types': 'cancer_type'})
)

### CSO

In [None]:
# One output row per (publication, CSO code): the ';'-separated codes of each
# publication are split into individual rows and trimmed.
inca_pubs_cso = (
    inca_pubs_cso_ct.loc[inca_pubs_cso_ct['cso'].notnull(), ['pub_id', 'cso']]
    .drop_duplicates()
)
inca_pubs_cso = pd.concat([
    Series(publication_id, codes.split(';'))
    for publication_id, codes in zip(inca_pubs_cso['pub_id'], inca_pubs_cso['cso'])
]).reset_index()
inca_pubs_cso.columns = ['cso', 'pub_id']
inca_pubs_cso['cso'] = inca_pubs_cso['cso'].str.strip()
inca_pubs_cso.to_csv('../data/topic_lookups/inca_researcher_publications_cso.csv', index=False)

### Cancer Type

In [None]:
# One output row per (publication, cancer type): the ';'-separated types of
# each publication are split into individual rows and trimmed.
inca_pubs_ct = (
    inca_pubs_cso_ct.loc[inca_pubs_cso_ct['cancer_type'].notnull(), ['pub_id', 'cancer_type']]
    .drop_duplicates()
)
inca_pubs_ct = pd.concat([
    Series(publication_id, types.split(';'))
    for publication_id, types in zip(inca_pubs_ct['pub_id'], inca_pubs_ct['cancer_type'])
]).reset_index()
inca_pubs_ct.columns = ['cancer_type', 'pub_id']
inca_pubs_ct['cancer_type'] = inca_pubs_ct['cancer_type'].str.strip()
inca_pubs_ct.to_csv('../data/topic_lookups/inca_researcher_publications_cancer_type.csv', index=False)

## Sandbox