# INCA Data Cleaning and Extractions

## Python Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100
from api_query import execute_query
from api_query import pull_data

## Load In Data

In [2]:
# Load the three input tables: INCa grant details, INCa publication details,
# and the researcher ORCID-return file. All are read with low_memory=False.
_csv_options = {'low_memory': False}
inca_grnts = pd.read_csv('../data/inca_raw/inca_grants_details.csv', **_csv_options)
inca_pubs = pd.read_csv('../data/inca_raw/inca_pub_details.csv', **_csv_options)
inca_orcid_responses = pd.read_csv('../output/researcher_info/researcher_info_ORCID_returns.csv',
                                   **_csv_options)

## INCA ID - Dimensions ID Lookup

In [3]:
# Unique (INCA ID, Dimensions Researcher ID) pairs found in the publications file.
id_lookup = (
    inca_pubs.loc[:, ['INCA ID', 'Dimensions Researcher ID']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [4]:
# Attach the Dimensions researcher ID to each grant row via the INCA ID.
# Grants whose INCA ID has no match in the lookup fall back to the INCA ID
# itself, so the column is never left empty by the merge.
inca_grnts = inca_grnts.merge(id_lookup, how='left', on='INCA ID')
has_dimensions_id = inca_grnts['Dimensions Researcher ID'].notnull()
inca_grnts['Dimensions Researcher ID'] = np.where(has_dimensions_id,
                                                  inca_grnts['Dimensions Researcher ID'],
                                                  inca_grnts['INCA ID'])

## Account for Dimensions Manual Disambiguations

In [5]:
# Unique combinations of a researcher's main Dimensions ID and the (up to two)
# additional Dimensions IDs that were flagged to be combined with it.
ids = (
    inca_pubs.loc[:, ['Dimensions Researcher ID',
                      'Additional Researcher DIM ID to combine',
                      'Additional Researcher DIM ID to combine 2']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
# Build the combined identifier ("id1&id2[&id3]") for every researcher that has
# at least one additional Dimensions ID to merge.
_id_cols = ['Dimensions Researcher ID',
            'Additional Researcher DIM ID to combine',
            'Additional Researcher DIM ID to combine 2']


def _combined_id(frame):
    """Return, for each row of *frame*, its non-null IDs joined with '&'.

    Joining only the non-null values keeps a usable identifier when one of the
    ID columns is empty; plain string concatenation would yield NaN for the
    whole row as soon as any operand is NaN.
    """
    joined = ['&'.join(value for value in row if pd.notnull(value))
              for row in frame[_id_cols].values]
    # Explicit object dtype so an empty frame still gets a string-typed column.
    return pd.Series(joined, index=frame.index, dtype=object)


# Rows that carry a second additional ID.
dupls_1 = ids[ids['Additional Researcher DIM ID to combine 2'].notnull()].reset_index(drop=True)
dupls_1['new_id'] = _combined_id(dupls_1)
# Rows that carry only the first additional ID.
dupls_2 = ids[(ids['Additional Researcher DIM ID to combine 2'].isnull())
              & (ids['Additional Researcher DIM ID to combine'].notnull())].reset_index(drop=True)
dupls_2['new_id'] = _combined_id(dupls_2)
dupls = pd.concat([dupls_1, dupls_2]).reset_index(drop=True)

In [7]:
# Long-format lookup from every individual ID (column 'old_id') to the combined
# identifier it belongs to (column 'new_id'). Each of the three ID columns of
# `dupls` contributes its non-null values, in column order.
# NOTE(review): if the same ID shows up under two different combined IDs, the
# left merge in id_replace will duplicate the matching rows - confirm the
# manual disambiguations never overlap.
id_replacer = pd.concat(
    [dupls.loc[dupls[source_col].notnull(), [source_col, 'new_id']]
          .rename(columns={source_col: 'old_id'})
     for source_col in ('Dimensions Researcher ID',
                        'Additional Researcher DIM ID to combine',
                        'Additional Researcher DIM ID to combine 2')]
).reset_index(drop=True)

In [8]:
# Persist the old_id -> new_id lookup (without the index) as a CSV file.
id_replacer.to_csv('../data/id_replacer.csv', index=False)

In [9]:
def id_replace(df, replacer=None):
    """Replace researcher IDs in ``df['rsr_id']`` by their combined IDs.

    Parameters
    ----------
    df : DataFrame
        Must contain an 'rsr_id' column. It is not modified.
    replacer : DataFrame, optional
        Lookup with columns 'old_id' and 'new_id'. Defaults to the
        module-level ``id_replacer`` table.

    Returns
    -------
    DataFrame
        A new frame with the columns of ``df``. 'rsr_id' holds the matching
        'new_id' where one exists and the original value otherwise.
    """
    if replacer is None:
        replacer = id_replacer
    # Exact duplicate mapping rows would multiply the rows of `df` in the
    # left merge, so drop them first.
    mapping = replacer[['old_id', 'new_id']].drop_duplicates()
    merged = pd.merge(df, mapping, how='left', left_on='rsr_id', right_on='old_id')
    merged['rsr_id'] = np.where(merged['new_id'].notnull(), merged['new_id'], merged['rsr_id'])
    return merged.drop(['old_id', 'new_id'], axis=1)

## Clean Up Grant and Publication Files

In [10]:
# Clean Grants File
# 1. Normalise the headers: lower case, spaces replaced by underscores.
inca_grnts.rename(columns=lambda header: header.lower().replace(' ', '_'), inplace=True)
# 2. Switch to the short column names used in the rest of the notebook.
grant_column_names = {
    'dimensions_researcher_id': 'rsr_id',
    'funding_amount_($)': 'funding_amount',
    'dimensions_grant_id': 'grant_id',
    'funder': 'funder_name',
    'rcdc': 'rcdc_names',
}
inca_grnts.rename(columns=grant_column_names, inplace=True)
# 3. Remove the columns that are not carried into the cleaned grants table.
inca_grnts.drop(['prenom_port', 'nom_port', 'title',
                 'reference', 'organisme_port', 'research_org_names', 'abstract',
                 'research_org_ids', 'for'],
                axis=1, inplace=True)

In [11]:
# Clean Pubs File
# Normalise the headers: lower case, spaces replaced by underscores.
inca_pubs.columns = [x.lower().replace(' ', '_') for x in inca_pubs.columns]
# Switch to the short column names used in the rest of the notebook.
inca_pubs.rename(columns={'dimensions_researcher_id': 'rsr_id'
                          , 'dimensions_publication_id': 'pub_id'
                          , 'publication_year': 'date'
                          , 'rcdc': 'rcdc_names'
                          , 'times_cited': 'citations'
                          , 'pubmed_id': 'pmid'
                         }, inplace=True)
# Remove the columns that are not carried into the cleaned publications table.
inca_pubs.drop(['inca_id', 'prenom_port', 'nom_port', 'organisme_port',
                'additional_researcher_dim_id_to_combine', 'additional_researcher_dim_id_to_combine_2',
                'orcid', 'title', 'issue',
                'pages', 'volume', 'relative_citation_ratio',
                'altmetric', 'open_access', 'author_names', 'research_org_names',
                'research_org_ids', 'for', 'journal_id', 'journal_title',
                'publication_date'],
               axis=1, inplace=True)
# Turn the publication year into a "YYYY-01-01" date string. The year is
# stringified first; a trailing ".0" (presumably from the column being read as
# float) is stripped. The pattern is a raw string anchored at the end of the
# value, so only a trailing ".0" is removed and no invalid-escape warning is
# raised. Missing years stay NaN.
year_text = inca_pubs['date'].apply(str).replace(r'\.0$', '', regex=True)
inca_pubs['date'] = year_text.apply(lambda x: np.nan if x == "nan" else x + "-01-01")
inca_pubs['citations'] = pd.to_numeric(inca_pubs['citations'])

## Separate INCa-Funded Grants from Researcher Grants

In [12]:
# Get INCA-funded Grants from file
# A grant counts as INCa-funded when its funder is one of the three funders
# below AND its start date falls in 2007-2012 (inclusive).
inca_funders = ["French National Cancer Institute", "French Institute of Health and Medical Research"]
inca_funders += ["Ministère des Affaires sociales et de la Santé"]
start_year = pd.DatetimeIndex(inca_grnts['start_date']).year
is_inca_funded = (inca_grnts['funder_name'].isin(inca_funders)
                  & (start_year >= 2007)
                  & (start_year <= 2012))
inca_funded_grnts = inca_grnts[is_inca_funded].reset_index(drop=True)
# Everything else stays in the researcher grants table. Negating the single
# mask (instead of restating the opposite conditions) guarantees a true
# partition: a grant from one of these funders with a missing start date fails
# every year comparison and would otherwise be dropped from both tables.
inca_grnts = inca_grnts[~is_inca_funded].reset_index(drop=True)

In [13]:
# Replace the individual funder names by the single label "INCa/INSERM/DGOS".
# The old column is removed before the new one is assigned, so 'funder_name'
# ends up as the last column of the table.
inca_funded_grnts.drop('funder_name', axis=1, inplace=True)
inca_funded_grnts['funder_name'] = "INCa/INSERM/DGOS"

## Add ORCID Responses

In [14]:
# INCA ORCID-responses: keep only the rows whose 'ORCID Return' is "YES".
returned_orcid = inca_orcid_responses['ORCID Return'] == "YES"
inca_orcid_responses = inca_orcid_responses.loc[returned_orcid].reset_index(drop=True)
conf_ids = list(inca_orcid_responses['INCA ID'])

# Append ' - ORCID Confirmed' to the funder name of the INCa-funded grants whose
# INCA ID is among the confirmed ones; the other rows keep their funder name.
orcid_confirmed = inca_funded_grnts['inca_id'].isin(conf_ids)
inca_funded_grnts.loc[orcid_confirmed, 'funder_name'] = (
    inca_funded_grnts.loc[orcid_confirmed, 'funder_name'] + ' - ORCID Confirmed'
)
# The INCA ID is not needed in this table past this point.
inca_funded_grnts.drop('inca_id', axis=1, inplace=True)

## Remove RCDC Codes

In [15]:
# Keep RCDC Codes in Separate Table
# Unique (grant_id, rcdc_names) pairs for the grants that have RCDC names.
inca_funded_grnts_rcdc = inca_funded_grnts[inca_funded_grnts['rcdc_names'].notnull()].copy()
inca_funded_grnts_rcdc = inca_funded_grnts_rcdc[['grant_id', 'rcdc_names']].drop_duplicates()
# Split the ';'-delimited names into one row per (rcdc_name, grant_id).
# Building the frame from plain tuples also works when no grant has RCDC names
# (pd.concat raises on an empty list) and avoids creating one Series per row.
inca_funded_grnts_rcdc = pd.DataFrame(
    [(rcdc_name, grant_id)
     for grant_id, rcdc_names in zip(inca_funded_grnts_rcdc['grant_id'],
                                     inca_funded_grnts_rcdc['rcdc_names'])
     for rcdc_name in rcdc_names.split(';')],
    columns=['rcdc_name', 'grant_id'])
# Normalise the names: surrounding whitespace removed, upper case.
inca_funded_grnts_rcdc['rcdc_name'] = inca_funded_grnts_rcdc['rcdc_name'].str.strip().str.upper()

del inca_funded_grnts['rcdc_names']

In [16]:
# Keep RCDC Codes in Separate Table
# Unique (grant_id, rcdc_names) pairs for the grants that have RCDC names.
inca_grnts_rcdc = inca_grnts[inca_grnts['rcdc_names'].notnull()].copy()
inca_grnts_rcdc = inca_grnts_rcdc[['grant_id', 'rcdc_names']].drop_duplicates()
# Split the ';'-delimited names into one row per (rcdc_name, grant_id).
# Building the frame from plain tuples also works when no grant has RCDC names
# (pd.concat raises on an empty list) and avoids creating one Series per row.
inca_grnts_rcdc = pd.DataFrame(
    [(rcdc_name, grant_id)
     for grant_id, rcdc_names in zip(inca_grnts_rcdc['grant_id'],
                                     inca_grnts_rcdc['rcdc_names'])
     for rcdc_name in rcdc_names.split(';')],
    columns=['rcdc_name', 'grant_id'])
# Normalise the names: surrounding whitespace removed, upper case.
inca_grnts_rcdc['rcdc_name'] = inca_grnts_rcdc['rcdc_name'].str.strip().str.upper()

del inca_grnts['rcdc_names']

In [17]:
# Keep RCDC Codes in Separate Table
# Unique (pub_id, rcdc_names) pairs for the publications that have RCDC names.
inca_pubs_rcdc = inca_pubs[inca_pubs['rcdc_names'].notnull()].copy()
inca_pubs_rcdc = inca_pubs_rcdc[['pub_id', 'rcdc_names']].drop_duplicates()
# Split the ';'-delimited names into one row per (rcdc_name, pub_id).
# Building the frame from plain tuples also works when no publication has RCDC
# names (pd.concat raises on an empty list) and avoids creating one Series per
# row.
inca_pubs_rcdc = pd.DataFrame(
    [(rcdc_name, publication_id)
     for publication_id, rcdc_names in zip(inca_pubs_rcdc['pub_id'],
                                           inca_pubs_rcdc['rcdc_names'])
     for rcdc_name in rcdc_names.split(';')],
    columns=['rcdc_name', 'pub_id'])
# Normalise the names: surrounding whitespace removed, upper case.
inca_pubs_rcdc['rcdc_name'] = inca_pubs_rcdc['rcdc_name'].str.strip().str.upper()

del inca_pubs['rcdc_names']

## Pull Additional Publication Info from Dimensions API

In [18]:
# Distinct publication IDs, in order of first appearance, to send to the API.
pub_ids = list(pd.unique(inca_pubs['pub_id']))

In [19]:
# Batch-size settings passed to pull_data below. The comment above each value
# records the API limit it has to stay within (wording presumably taken from
# the API's own error messages - TODO confirm).

# Filter operator 'in' requires 0 < items < 512
max_in_items = 100

# Limit exceeds maximum allowed limit 1000
max_return = 500

# Offset cannot exceed 50000
max_overall_returns = 50000

In [20]:
# Query template: "{}" is the placeholder for the ID list (presumably filled in
# by pull_data for each batch - TODO confirm). The requested fields are id,
# author_affiliations, supporting_grant_ids, times_cited and RCDC.
# NOTE(review): a recorded run of this cell printed "RESPONSE ERROR" for one
# batch; confirm whether pull_data retries or whether that batch is missing
# from full_resp.
string = (
    "search publications where id in [{}]"
    " return publications[id+author_affiliations+supporting_grant_ids+times_cited+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=pub_ids,
    in_type='publications',
    return_type='publications',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 53700-53800/109001 publications...
RESPONSE ERROR on i=537 and j=0.

Querying: 109000-109001/109001 publications...
Done !


In [21]:
# Flatten the API response into parallel lists with one entry per
# (publication, author) pair; the lists become the columns of `pubs` below.
pub_id = []
citations = []
supporting_grants = []
nb_authors = []
author_id = []
author_country = []
author_affiliation = []
author_affiliation_id = []

for pub in full_resp:
    # The author list is read from the first element of 'author_affiliations'.
    # A missing or empty field is treated as "no authors"; such a publication
    # contributes no rows at all.
    author_groups = pub.get('author_affiliations') or [[]]
    authors = author_groups[0]
    nb = len(authors)
    for author in authors:
        # Only the first affiliation of an author is kept. Using .get with a
        # fallback covers both a missing 'affiliations' key and an empty list;
        # the previous check combined the two tests with the bitwise `|`, which
        # evaluates both sides and raised KeyError when the key was missing.
        affiliations = author.get('affiliations') or [{}]
        first_affiliation = affiliations[0]
        nb_authors.append(nb)
        author_id.append(author.get('researcher_id', np.nan))
        author_country.append(first_affiliation.get('country_code', np.nan))
        author_affiliation.append(first_affiliation.get('name', np.nan))
        author_affiliation_id.append(first_affiliation.get('id', np.nan))
        pub_id.append(pub.get('id', np.nan))
        citations.append(pub.get('times_cited', np.nan))
        # None marks "field absent"; it becomes NaN after the join below.
        supporting_grants.append(pub.get('supporting_grant_ids'))

pubs = pd.DataFrame({'pub_id':pub_id
                     , 'citations':citations
                     , 'nb_authors':nb_authors
                     , 'rsr_id':author_id
                     , 'rsr_country':author_country
                     , 'rsr_affiliation':author_affiliation
                     , 'rsr_affiliation_id':author_affiliation_id
                     , 'supporting_grants':supporting_grants
                    })
# Supporting grant IDs are stored as one ';'-joined string; publications
# without the field get NaN.
pubs['supporting_grants'] = pubs['supporting_grants'].apply(
    lambda x: ';'.join(x) if isinstance(x, list) else np.nan)

In [22]:
# Keep only the identifier and date columns of the INCa publication table and
# attach the per-author rows pulled from the API (left join on 'pub_id').
# NOTE(review): if a pub_id occurs on several rows of inca_pubs, each of them
# is matched to every author row of that publication, which yields duplicated
# rows - confirm whether the left side should be de-duplicated first.
inca_pubs = inca_pubs[['pub_id', 'doi', 'pmid', 'date']].merge(pubs, on='pub_id', how='left')

## Apply ID Replacer

In [23]:
# Map the researcher IDs of all three tables to their combined IDs.
inca_funded_grnts, inca_grnts, inca_pubs = [
    id_replace(table) for table in (inca_funded_grnts, inca_grnts, inca_pubs)
]

## Export Grant and Publication Files

In [24]:
# Write each cleaned table and its RCDC lookup to CSV, without the index.
for table, csv_path in (
        (inca_funded_grnts, '../data/inca_funded_grants.csv'),
        (inca_funded_grnts_rcdc, '../data/topic_lookups/inca_funded_grants_rcdc.csv'),
        (inca_grnts, '../data/inca_researcher_grants.csv'),
        (inca_grnts_rcdc, '../data/topic_lookups/inca_researcher_grants_rcdc.csv'),
        (inca_pubs, '../data/inca_researcher_publications.csv'),
        (inca_pubs_rcdc, '../data/topic_lookups/inca_researcher_publications_rcdc.csv'),
):
    table.to_csv(csv_path, index=False)

## Export List of Publication IDs

For further matching.

In [25]:
# Unique (researcher ID, PubMed ID) pairs for the rows that have a PubMed ID.
has_pmid = inca_pubs['pmid'].notnull()
inca_pub_ids = inca_pubs.loc[has_pmid, ['rsr_id', 'pmid']].drop_duplicates()
inca_pub_ids.to_csv('../data/inca_pub_ids.csv', index=False)

## CSO and Cancer Type Lookup Table

In [26]:
# Load the publication file that carries CSO codes and cancer types, keep the
# three columns needed for the lookups and give them short names.
inca_pubs_cso_ct = (
    pd.read_csv('../data/inca_raw/inca_pub_details_with_cso_and_cancertypes.csv', low_memory=False)
    [['Dimensions Publication ID', 'CSO', 'Cancer Types']]
    .rename(columns={'Dimensions Publication ID': 'pub_id',
                     'CSO': 'cso',
                     'Cancer Types': 'cancer_type'})
)

### CSO

In [27]:
# Unique (pub_id, cso) pairs for the publications that have CSO values.
inca_pubs_cso = inca_pubs_cso_ct[inca_pubs_cso_ct['cso'].notnull()][['pub_id', 'cso']].drop_duplicates()
# Split the ';'-delimited values into one row per (cso, pub_id). Building the
# frame from plain tuples also works when no publication has a CSO value
# (pd.concat raises on an empty list) and avoids creating one Series per row.
inca_pubs_cso = pd.DataFrame(
    [(cso_value, publication_id)
     for publication_id, cso_values in zip(inca_pubs_cso['pub_id'], inca_pubs_cso['cso'])
     for cso_value in cso_values.split(';')],
    columns=['cso', 'pub_id'])
inca_pubs_cso['cso'] = inca_pubs_cso['cso'].str.strip()
inca_pubs_cso.to_csv('../data/topic_lookups/inca_researcher_publications_cso.csv', index=False)

### Cancer Type

In [28]:
# Unique (pub_id, cancer_type) pairs for the publications that have cancer types.
inca_pubs_ct = inca_pubs_cso_ct[inca_pubs_cso_ct['cancer_type'].notnull()][['pub_id', 'cancer_type']].drop_duplicates()
# Split the ';'-delimited values into one row per (cancer_type, pub_id).
# Building the frame from plain tuples also works when no publication has a
# cancer type (pd.concat raises on an empty list) and avoids creating one
# Series per row.
inca_pubs_ct = pd.DataFrame(
    [(cancer_type, publication_id)
     for publication_id, cancer_types in zip(inca_pubs_ct['pub_id'], inca_pubs_ct['cancer_type'])
     for cancer_type in cancer_types.split(';')],
    columns=['cancer_type', 'pub_id'])
inca_pubs_ct['cancer_type'] = inca_pubs_ct['cancer_type'].str.strip()
inca_pubs_ct.to_csv('../data/topic_lookups/inca_researcher_publications_cancer_type.csv', index=False)

## Sandbox