# INCA Data Cleaning and Extractions

## Python Setup

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
from pandas import Series
from openpyxl import load_workbook
import re
pd.options.display.max_columns = 100
from api_query import execute_query
from api_query import pull_data
import time
import gender_guesser.detector as gender

## Load In Data

In [None]:
# Load the three raw inputs: grant details, publication details, and the
# researcher ORCID returns file. Files are read in the order listed.
grnts, pubs, inca_orcid_responses = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (
        '../data/dimensions_raw/inca_grants_details.csv',
        '../data/dimensions_raw/inca_pub_details.csv',
        '../output/researcher_info/researcher_info_ORCID_returns.csv',
    )
)

## INCA ID - Dimensions ID Lookup

In [None]:
# One row per distinct (INCA ID, Dimensions Researcher ID) pair found in the
# publications file.
id_lookup = (
    pubs.loc[:, ['INCA ID', 'Dimensions Researcher ID']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Attach the Dimensions researcher ID to each grant row through its INCA ID.
grnts = grnts.merge(id_lookup, on='INCA ID', how='left')
# Rows that found no Dimensions ID fall back to the INCA ID as identifier.
grnts['Dimensions Researcher ID'] = np.where(
    grnts['Dimensions Researcher ID'].isnull(),
    grnts['INCA ID'],
    grnts['Dimensions Researcher ID'],
)

## Account for Dimensions Manual Disambiguations

In [None]:
# Distinct combinations of a researcher's main Dimensions ID and the (up to
# two) additional IDs recorded alongside it in the publications file.
ids = (
    pubs.loc[:, ['Dimensions Researcher ID',
                 'Additional Researcher DIM ID to combine',
                 'Additional Researcher DIM ID to combine 2']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Rows where a second additional ID is present: the combined ID joins the
# main ID and both additional IDs with "&".
dupls_1 = ids.loc[ids['Additional Researcher DIM ID to combine 2'].notnull()].reset_index(drop=True)
dupls_1['new_id'] = (
    dupls_1['Dimensions Researcher ID'] + "&"
    + dupls_1['Additional Researcher DIM ID to combine'] + "&"
    + dupls_1['Additional Researcher DIM ID to combine 2']
)

# Rows with only the first additional ID: the combined ID joins the main ID
# and that single additional ID.
dupls_2 = ids.loc[
    ids['Additional Researcher DIM ID to combine'].notnull()
    & ids['Additional Researcher DIM ID to combine 2'].isnull()
].reset_index(drop=True)
dupls_2['new_id'] = (
    dupls_2['Dimensions Researcher ID'] + "&"
    + dupls_2['Additional Researcher DIM ID to combine']
)

# Stack both groups under a fresh 0..n-1 index.
dupls = pd.concat([dupls_1, dupls_2], ignore_index=True)

In [None]:
# Build the old-ID -> combined-ID mapping. Each of the three ID columns in
# `dupls` contributes its non-null values as `old_id`, paired with that row's
# `new_id`. The pieces are collected first and concatenated once, so the
# result always has columns ['old_id', 'new_id'] and a fresh 0..n-1 index,
# whichever pieces happen to be empty.
id_replacer = pd.concat(
    [
        dupls.loc[dupls[var].notnull(), [var, 'new_id']].rename(columns={var: 'old_id'})
        for var in ('Dimensions Researcher ID',
                    'Additional Researcher DIM ID to combine',
                    'Additional Researcher DIM ID to combine 2')
    ],
    ignore_index=True,
)

In [None]:
# Persist the old-ID -> combined-ID mapping; the DataFrame index is not written.
id_replacer.to_csv('../data/id_replacer.csv', index=False)

In [None]:
def id_replace(df, replacer=None):
    """Return a copy of *df* with `rsr_id` swapped for its combined ID.

    Parameters
    ----------
    df : DataFrame
        Must contain an `rsr_id` column. It is not modified.
    replacer : DataFrame, optional
        Mapping with `old_id` and `new_id` columns. Defaults to the
        notebook-level `id_replacer`, looked up at call time.

    Returns
    -------
    DataFrame
        Same columns as *df*. Rows whose `rsr_id` matches an `old_id` with a
        non-null `new_id` get that `new_id`; all other rows are unchanged.
    """
    if replacer is None:
        replacer = id_replacer
    # Exact duplicate mapping rows would multiply the caller's rows in the
    # left merge below, so collapse them first.
    mapping = replacer[['old_id', 'new_id']].drop_duplicates()
    merged = pd.merge(df, mapping, how='left', left_on='rsr_id', right_on='old_id')
    merged['rsr_id'] = np.where(merged['new_id'].notnull(), merged['new_id'], merged['rsr_id'])
    # Remove the helper columns introduced by the merge.
    return merged.drop(columns=['old_id', 'new_id'])

## Clean Up Grant and Publication Files

In [None]:
# Grants file clean-up: normalise the headers to lower snake_case, give the
# key columns shorter names, then discard the columns not used further on.
grnts.columns = grnts.columns.map(lambda name: name.lower().replace(' ', '_'))
grnts = grnts.rename(columns={
    'dimensions_researcher_id': 'rsr_id',
    'funding_amount_($)': 'funding_amount',
    'dimensions_grant_id': 'grant_id',
    'funder': 'funder_name',
    'prenom_port': 'first_name',
    'nom_port': 'last_name',
})
grnts = grnts.drop(columns=[
    'title', 'reference', 'organisme_port', 'research_org_names',
    'research_org_ids', 'for', 'abstract', 'rcdc',
])

## Grant Datasets

In [None]:
# Work on a separate frame that no longer carries the INCA ID column;
# `grnts` itself is left as it is.
inca_grnts = grnts.drop(columns='inca_id')

### Separate INCa-Funded Grants from Researcher Grants

In [None]:
# Funders whose grants are treated as INCa-funded.
inca_funders = [
    "French National Cancer Institute",
    "French Institute of Health and Medical Research",
    "Ministère des Affaires sociales et de la Santé",
]
# Keep grants from those funders whose start year lies in 2007-2012 inclusive.
inca_funded_grnts = inca_grnts.loc[
    lambda g: g['funder_name'].isin(inca_funders)
    & (pd.DatetimeIndex(g['start_date']).year >= 2007)
    & (pd.DatetimeIndex(g['start_date']).year <= 2012)
].reset_index(drop=True)

In [None]:
# Replace the individual funder names with one combined label; the column is
# dropped and re-added, so it ends up as the last column.
inca_funded_grnts = (
    inca_funded_grnts
    .drop(columns='funder_name')
    .assign(funder_name="INCa/INSERM/DGOS")
)

### Apply ID Replacer

In [None]:
# Swap researcher IDs for their combined IDs wherever a mapping exists.
inca_funded_grnts = inca_funded_grnts.pipe(id_replace)

### Export

In [None]:
# Write the INCa-funded grants table to disk; the DataFrame index is not written.
inca_funded_grnts.to_csv('../data/inca_funded_grants.csv', index=False)

## Additional Raw Datasets

### ORCID Confirmation Dataset

In [None]:
# Keep only the rows whose 'ORCID Return' value is exactly "YES".
inca_orcid_responses = (
    inca_orcid_responses
    .loc[lambda resp: resp['ORCID Return'] == "YES"]
    .reset_index(drop=True)
)

In [None]:
# Notebook display only: preview the first rows of the filtered ORCID responses.
inca_orcid_responses.head()

In [None]:
# Start from the ID lookup and bring in each INCA ID's ORCID return flag;
# lookup rows without a response keep a null 'ORCID Return'.
inca_orcid_responses = id_lookup.merge(
    inca_orcid_responses.loc[:, ['INCA ID', 'ORCID Return']].drop_duplicates(),
    on='INCA ID',
    how='left',
)

In [None]:
# Discard lookup rows that received no ORCID return value from the merge.
inca_orcid_responses = (
    inca_orcid_responses
    .dropna(subset=['ORCID Return'])
    .reset_index(drop=True)
)

In [None]:
# Reduce the table to (rsr_id, orcid_confirmed): flag every remaining row as
# confirmed, rename the researcher ID column, and drop the source columns.
inca_orcid_responses = (
    inca_orcid_responses
    .assign(orcid_confirmed=True)
    .rename(columns={'Dimensions Researcher ID': 'rsr_id'})
    .drop(columns=['ORCID Return', 'INCA ID'])
)

In [None]:
# Map researcher IDs to their combined IDs, then collapse rows that became
# identical as a result.
inca_orcid_responses = (
    id_replace(inca_orcid_responses)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Write the ORCID confirmation table to disk; the DataFrame index is not written.
inca_orcid_responses.to_csv('../data/inca_orcid_confirmations.csv', index=False)

## CSO and Cancer Type Datasets

In [None]:
# Publication-level CSO and cancer-type annotations: keep the three columns
# needed and give them short snake_case names.
inca_pubs_cso_ct = (
    pd.read_csv('../data/dimensions_raw/inca_pub_details_with_cso_and_cancertypes.csv', low_memory=False)
    .loc[:, ['Dimensions Publication ID', 'CSO', 'Cancer Types']]
    .rename(columns={'Dimensions Publication ID': 'pub_id', 'CSO': 'cso', 'Cancer Types': 'cancer_type'})
)

In [None]:
# Publication CSO results: drop the DOI and CSO code, rename the ID/name columns.
pubs_cso = (
    pd.read_csv('../data/dimensions_raw/results_cso.csv', low_memory=False)
    .drop(columns=['doi', 'CSO Code'])
    .rename(columns={'Dimensions Publication ID':'pub_id', 'CSO Name':'cso_name'})
)

# Publication cancer-type results: drop the DOI, rename the ID/type columns.
pubs_ct = (
    pd.read_csv('../data/dimensions_raw/results_ct.csv', low_memory=False)
    .drop(columns=['doi'])
    .rename(columns={'Dimensions Publication ID':'pub_id', 'Cancer Type':'cancer_type'})
)

# Grant CSO results: drop the CSO code and funder name, rename the CSO name column.
grants_cso = (
    pd.read_csv('../data/dimensions_raw/grant_results_cso.csv', low_memory=False)
    .drop(columns=['CSO Code', 'funder_name'])
    .rename(columns={'CSO Name':'cso_name'})
)

# Grant cancer-type results: drop the funder name and rename the type column.
grants_ct = pd.read_csv('../data/dimensions_raw/grant_results_ct.csv', low_memory=False)
del grants_ct['funder_name']
# The rename must target grants_ct. It was previously applied to grants_cso,
# which left the 'Cancer Type' column of grants_ct under its raw name.
# NOTE(review): anything reading grants_cancer_type.csv with the old
# 'Cancer Type' header needs to switch to 'cancer_type'.
grants_ct.rename(columns={'Cancer Type':'cancer_type'}, inplace=True)

In [None]:
# CSO code lookup table: expose the code under the name `cso`, as a
# whitespace-trimmed string, so it can serve as a merge key.
cso_lookup = (
    pd.read_csv('../data/cso_codes/cso_lookup.csv')
    .rename(columns={'cso_code':'cso'})
    .assign(cso=lambda lookup: lookup['cso'].astype(str).str.strip())
)

### Pub CSO

In [None]:
# Publications that carry at least one CSO code, one row per distinct
# (pub_id, cso string) pair.
inca_pubs_cso = inca_pubs_cso_ct[inca_pubs_cso_ct['cso'].notnull()][['pub_id', 'cso']].drop_duplicates()
# The `cso` string holds ';'-separated codes. Expand it to one
# (code, pub_id) row per code, trimming whitespace around each code.
# Building a flat record list avoids creating a Series per row and, unlike
# pd.concat over an empty list, does not raise when there are no rows.
inca_pubs_cso = pd.DataFrame(
    [(code.strip(), pub_id)
     for pub_id, codes in zip(inca_pubs_cso['pub_id'], inca_pubs_cso['cso'])
     for code in codes.split(';')],
    columns=['cso', 'pub_id'],
)
# Bring in the lookup columns for each code, then drop the raw code itself.
inca_pubs_cso = pd.merge(inca_pubs_cso, cso_lookup, how='left', on='cso')
del inca_pubs_cso['cso']

In [None]:
# Stack the INCa publication CSO rows on top of the general publication CSO
# rows, remove exact duplicates, and renumber the index.
pubs_cso = pd.concat([inca_pubs_cso, pubs_cso], sort=False)
pubs_cso = pubs_cso.drop_duplicates()
pubs_cso = pubs_cso.reset_index(drop=True)

In [None]:
# Write the publication -> CSO lookup to disk; the DataFrame index is not written.
pubs_cso.to_csv('../data/topic_lookups/publications_cso.csv', index=False)

### Grant CSO

In [None]:
# Write the grant -> CSO lookup to disk; the DataFrame index is not written.
grants_cso.to_csv('../data/topic_lookups/grants_cso.csv', index=False)

### Publication Cancer Type

In [None]:
# Publications that carry at least one cancer type, one row per distinct
# (pub_id, cancer_type string) pair.
inca_pubs_ct = inca_pubs_cso_ct[inca_pubs_cso_ct['cancer_type'].notnull()][['pub_id', 'cancer_type']].drop_duplicates()
# The `cancer_type` string holds ';'-separated values. Expand it to one
# (cancer_type, pub_id) row per value, trimming whitespace around each value.
# Building a flat record list avoids creating a Series per row and, unlike
# pd.concat over an empty list, does not raise when there are no rows.
inca_pubs_ct = pd.DataFrame(
    [(cancer_type.strip(), pub_id)
     for pub_id, cancer_types in zip(inca_pubs_ct['pub_id'], inca_pubs_ct['cancer_type'])
     for cancer_type in cancer_types.split(';')],
    columns=['cancer_type', 'pub_id'],
)

In [None]:
# Stack the INCa publication cancer-type rows on top of the general ones,
# remove exact duplicates, and renumber the index.
pubs_ct = pd.concat([inca_pubs_ct, pubs_ct], sort=False)
pubs_ct = pubs_ct.drop_duplicates()
pubs_ct = pubs_ct.reset_index(drop=True)

In [None]:
# Write the publication -> cancer type lookup to disk; the DataFrame index is not written.
pubs_ct.to_csv('../data/topic_lookups/publications_cancer_type.csv', index=False)

### Grant Cancer Type

In [None]:
# Write the grant -> cancer type lookup to disk; the DataFrame index is not written.
grants_ct.to_csv('../data/topic_lookups/grants_cancer_type.csv', index=False)

## Sandbox