# Data Extraction using Dimensions API

## Python Setup

In [None]:
import requests
import time
import pandas as pd
from pandas import Series
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError
from datetime import datetime
import gender_guesser.detector as gender

## Importing ID Replacer

In [None]:
# Lookup table of researcher-id replacements; id_replace() reads its `old_id` and `new_id` columns.
id_replacer = pd.read_csv('../data/id_replacer.csv')

In [None]:
def id_replace(df):
    """Swap superseded researcher ids in ``df['rsr_id']`` for their replacements.

    ``df`` is left-joined to the module-level ``id_replacer`` table on
    ``rsr_id == old_id``. Wherever a ``new_id`` is found it overwrites
    ``rsr_id``; rows without a match keep their id. The helper ``old_id`` and
    ``new_id`` columns are removed before the merged frame is returned.
    """
    merged = df.merge(id_replacer, how='left', left_on='rsr_id', right_on='old_id')
    has_replacement = merged['new_id'].notnull()
    merged['rsr_id'] = np.where(has_replacement, merged['new_id'], merged['rsr_id'])
    return merged.drop(columns=['old_id', 'new_id'])

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

The API query functions (`execute_query` and `pull_data`) are imported from the local `api_query` module:

In [None]:
from api_query import execute_query
from api_query import pull_data

### API Parameters

In [None]:
# Dimensions API limits; these values are passed to pull_data() with every query.

# The 'in' filter operator requires 0 < items < 512; 100 stays inside that bound.
max_in_items = 100

# A single query may return at most 1000 records (a larger limit is rejected by the API).
max_return = 1000

# The API rejects offsets above 50000 ("Offset cannot exceed 50000").
max_overall_returns = 50000

## Define Comparison Agencies:

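The funding agencies considered are the Wellcome Trust (UK), the Australian National Health and Medical Research Council (NHMRC), Cancer Research UK, and the US National Cancer Institute (NCI).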

In [None]:
# Comparison ("counterfactual") funders, spelled as they appear in `funders.name`.
counter_funders = [
    'Wellcome Trust',
    'National Health and Medical Research Council',
    'Cancer Research UK',
    'National Cancer Institute',
]

## Funded Grants and Researchers

### INCA

In [None]:
# Load the INCa-funded grants table from disk.
inca_funded_grnts = pd.read_csv('../data/inca_funded_grants.csv', low_memory=False)

In [None]:
# Replace hyphens in last names with spaces.
# NOTE(review): presumably to make names comparable across data sources — confirm.
inca_funded_grnts['last_name'] = inca_funded_grnts['last_name'].str.replace('-', ' ')

In [None]:
# Display the first rows as a quick sanity check.
inca_funded_grnts.head()

### Counterfactual based on Funding Agencies

In [None]:
# Query every grant that started in 2007-2012 and was awarded by one of the
# comparison funders; `{}` is the placeholder for the 'in' list.
string = (
    "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
    " return grants[id+researchers+funders+start_date+end_date+funding_usd+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=counter_funders,
    in_type='funding agencies',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

In [None]:
# Flatten the API response into one row per (researcher, comparison funder) pair.
first_name = []
last_name = []
rsr_id = []
grant_id = []
funder_name = []
start_dates = []
end_dates = []
funding_amount = []
rcdc_names = []

for grant in full_resp:

    # Give every optional grant-level field a placeholder so the lookups below cannot fail.
    grant.setdefault('RCDC', [{'id': np.nan, 'name': np.nan}])
    grant.setdefault('researchers', [{'id': np.nan, 'first_name': np.nan, 'last_name': np.nan}])
    grant.setdefault('funders', [{'id': np.nan, 'name': np.nan}])
    for field in ('start_date', 'end_date', 'funding_usd'):
        grant.setdefault(field, np.nan)

    # Normalise the researcher entries. A researcher without an id gets NaN, so
    # the row is removed by the later "no researcher info" filter. Storing
    # anything unhashable (such as the researcher dict itself) in `rsr_id`
    # would make drop_duplicates() raise.
    researchers = []
    for researcher in grant['researchers']:
        if not isinstance(researcher, dict):
            # NOTE(review): presumably a bare researcher id — confirm against the API response shape.
            researcher = {'id': researcher}
        for field in ('id', 'first_name', 'last_name'):
            researcher.setdefault(field, np.nan)
        researchers.append(researcher)
    grant['researchers'] = researchers

    # The topic string depends only on the grant, so build it once per grant.
    grant_rcdc = ";".join(str(rcdc['name']) for rcdc in grant['RCDC'])

    for researcher in researchers:
        for funder in grant['funders']:
            # Co-funders outside the comparison set (or without a name) are skipped.
            if funder.get('name') not in counter_funders:
                continue
            rsr_id.append(researcher['id'])
            first_name.append(researcher['first_name'])
            last_name.append(researcher['last_name'])
            grant_id.append(grant['id'])
            funder_name.append(funder['name'])
            start_dates.append(grant['start_date'])
            end_dates.append(grant['end_date'])
            funding_amount.append(float(grant['funding_usd']))
            rcdc_names.append(grant_rcdc)

counter_funded_grnts = pd.DataFrame({'rsr_id': rsr_id
                                     , 'first_name': first_name
                                     , 'last_name': last_name
                                     , 'grant_id': grant_id
                                     , 'funder_name': funder_name
                                     , 'start_date': start_dates
                                     , 'end_date': end_dates
                                     , 'funding_amount': funding_amount
                                     , 'rcdc_names': rcdc_names
                                     }).drop_duplicates().reset_index(drop=True)

In [None]:
# Upper-case the last names.
# NOTE(review): presumably to match the casing of the INCa table — confirm.
counter_funded_grnts['last_name'] = counter_funded_grnts['last_name'].str.upper()

In [None]:
# Number of distinct grants before the restrictions in the following cells are applied.
counter_funded_grnts['grant_id'].nunique()

In [None]:
# Apply three restrictions in sequence:
#   1. the row must identify a researcher,
#   2. the funder must be one of the comparison funders,
#   3. the grant must start in 2007-2012 (inclusive).
counter_funded_grnts = (
    counter_funded_grnts
    .loc[lambda df: df['rsr_id'].notnull()]
    .loc[lambda df: df['funder_name'].isin(counter_funders)]
    .loc[lambda df: (pd.DatetimeIndex(df['start_date']).year >= 2007)
                    & (pd.DatetimeIndex(df['start_date']).year <= 2012)]
    .reset_index(drop=True)
)

Some of the counterfactual agencies (NCI, Cancer Research UK) are cancer-specific funders. Others however, such as the Wellcome Trust and the NHMRC, fund different types of medical research. For this second category of non-cancer-specific funders, we must restrict to grants on the topic of Cancer. I use the RCDC codes to restrict to Cancer-related grants.

In [None]:
# Wellcome Trust and NHMRC are not cancer-specific, so their grants are kept
# only when at least one RCDC category mentions cancer. Grants from the other
# funders are kept unconditionally. The helper column is dropped afterwards.
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
counter_funded_grnts = (
    counter_funded_grnts
    .loc[lambda df: ~df['funder_name'].isin(general_funders)
                    | df['rcdc_names'].str.upper().str.contains("CANCER")]
    .reset_index(drop=True)
    .drop(columns='rcdc_names')
)

In [None]:
# Report how many distinct grants remain.
print("Total number of Grants after restrictions: {}".format(counter_funded_grnts['grant_id'].nunique()))

In [None]:
# Map superseded researcher ids to their replacements.
counter_funded_grnts = id_replace(counter_funded_grnts)

### Combine the two

In [None]:
# Stack the INCa and comparison-funder grants; sort=False leaves the columns unsorted.
funded_grnts = pd.concat([inca_funded_grnts, counter_funded_grnts], sort=False)

In [None]:
# Row counts per funder in the combined table.
funded_grnts['funder_name'].value_counts()

### Export

In [None]:
# Write only the columns shared by both sources, in a fixed order.
cols = ['funder_name', 'grant_id', 'rsr_id', 'first_name', 'last_name', 'start_date', 'end_date', 'funding_amount']
funded_grnts.to_csv('../data/funded_grants.csv', columns=cols, index=False)

## Pull all Grants, Publications, and Personal Information from these Researchers

In [None]:
# Build the list of researcher ids to query. Replacement ids are mapped back
# (new_id -> old_id) through id_replacer; ids without an entry are used as-is.
funded_grnts = pd.read_csv('../data/funded_grants.csv')
rsrs = funded_grnts[['rsr_id']].drop_duplicates().merge(
    id_replacer, left_on='rsr_id', right_on='new_id', how='left'
)
rsrs['id'] = np.where(rsrs['old_id'].isnull(), rsrs['rsr_id'], rsrs['old_id'])
rsrs = list(rsrs['id'].drop_duplicates())
print(len(rsrs))

### Grants

In [None]:
# Pull every grant on which any of the selected researchers appears.
string = (
    "search grants where researchers.id in [{}]"
    " return grants[id+researchers+funders+start_date+end_date+funding_usd+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

In [None]:
# Flatten the grants response into one row per (researcher, funder) pair.
rsr_id = []
grant_id = []
funder_name = []
start_dates = []
end_dates = []
funding_amount = []

for grant in full_resp:

    # Give every optional grant-level field a placeholder so the lookups below cannot fail.
    grant.setdefault('researchers', [{'id': np.nan}])
    grant.setdefault('funders', [{'id': np.nan, 'name': np.nan}])
    for field in ('start_date', 'end_date', 'funding_usd'):
        grant.setdefault(field, np.nan)

    # Build a clean researcher list rather than appending to and deleting from
    # the list while looping over it.
    researchers = []
    for researcher in grant['researchers']:
        if isinstance(researcher, dict):
            # A dict without an id cannot be attributed to anyone, so it is dropped.
            # (Using the dict itself as the id would be unhashable and break drop_duplicates().)
            if 'id' in researcher:
                researchers.append(researcher)
        else:
            # NOTE(review): presumably a bare researcher id — confirm against the API response shape.
            researchers.append({'id': researcher})
    grant['researchers'] = researchers

    for researcher in researchers:
        for funder in grant['funders']:
            rsr_id.append(researcher['id'])
            grant_id.append(grant['id'])
            # Funders without a name are recorded as NaN, like the other missing fields.
            funder_name.append(funder.get('name', np.nan))
            start_dates.append(grant['start_date'])
            end_dates.append(grant['end_date'])
            funding_amount.append(float(grant['funding_usd']))

grnts = pd.DataFrame({'rsr_id': rsr_id
                      , 'grant_id': grant_id
                      , 'funder_name': funder_name
                      , 'start_date': start_dates
                      , 'end_date': end_dates
                      , 'funding_amount': funding_amount
                      }).drop_duplicates().reset_index(drop=True)

In [None]:
# Map superseded researcher ids to their replacements.
grnts = id_replace(grnts)

In [None]:
# Summary statistics for every column, numeric or not.
grnts.describe(include='all')

In [None]:
# Save the researcher-level grants table.
grnts.to_csv('../data/researcher_grants.csv', index=False)

In [None]:
# Long-format lookup: one row per (grant id, RCDC category name).
data_id = []
rcdc_names = []
for elem in full_resp:
    elem_id = elem.get('id', np.nan)
    for rcdc in elem.get('RCDC', []):
        name = rcdc.get('name', np.nan)
        # NaN never compares equal to anything (including itself), so a
        # `!= np.nan` test is always True; pd.notnull is the reliable check.
        if pd.notnull(name):
            data_id.append(elem_id)
            rcdc_names.append(str(name))

grnts_rcdc = pd.DataFrame({'grant_id': data_id
                           , 'rcdc_name': rcdc_names}).drop_duplicates().reset_index(drop=True)

# astype(str) is a no-op for the collected names but keeps `.str` usable on an empty table.
grnts_rcdc['rcdc_name'] = grnts_rcdc['rcdc_name'].astype(str).str.strip().str.upper()

In [None]:
# Save the grant -> RCDC category lookup.
grnts_rcdc.to_csv('../data/topic_lookups/grants_rcdc.csv', index=False)

### Publications

In [None]:
# Pull every publication authored by any of the selected researchers.
string = (
    "search publications where researchers.id in [{}]"
    " return publications"
    "[id+doi+pmid+author_affiliations+date+supporting_grant_ids+times_cited+altmetric+type+journal+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='publications',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

In [None]:
# Flatten the publications response into one row per (publication, author).
pub_id = []
pmid = []
dates = []
citations = []
dois = []
supporting_grants = []
nb_authors = []
author_id = []
author_country = []
author_city = []
author_affiliation = []
author_affiliation_id = []
altmetric = []
journal = []
journal_id = []
pub_type = []

for pub in full_resp:
    # Scalar fields default to NaN when the API omits them.
    for field in ('id', 'pmid', 'date', 'times_cited', 'doi', 'type', 'altmetric'):
        pub.setdefault(field, np.nan)
    # Publications without supporting grants get the [np.nan] placeholder,
    # which a later cell recognises and turns into a plain NaN.
    pub.setdefault('supporting_grant_ids', [np.nan])
    pub.setdefault('journal', {'id': np.nan, 'title': np.nan})
    pub['journal'].setdefault('id', np.nan)
    pub['journal'].setdefault('title', np.nan)
    # A missing *or empty* author list becomes [[]] so that indexing [0] is always safe.
    if not pub.get('author_affiliations'):
        pub['author_affiliations'] = [[]]

    authors = pub['author_affiliations'][0]
    nb = len(authors)
    for author in authors:
        author.setdefault('researcher_id', np.nan)
        # Single truthiness test instead of `(key missing) | (len(...) == 0)`:
        # bitwise `|` evaluates both sides, so a missing key raised KeyError.
        if not author.get('affiliations'):
            author['affiliations'] = [{'country_code': np.nan, 'name': np.nan, 'id': np.nan, 'city': np.nan}]
        for affiliation in author['affiliations']:
            for field in ('country_code', 'name', 'id', 'city'):
                affiliation.setdefault(field, np.nan)

        # Only the first listed affiliation is recorded for each author.
        main_affiliation = author['affiliations'][0]
        nb_authors.append(float(nb))
        author_id.append(author['researcher_id'])
        author_country.append(main_affiliation['country_code'])
        author_city.append(main_affiliation['city'])
        author_affiliation.append(main_affiliation['name'])
        author_affiliation_id.append(main_affiliation['id'])
        pub_id.append(pub['id'])
        dates.append(pub['date'])
        citations.append(float(pub['times_cited']))
        dois.append(pub['doi'])
        pmid.append(pub['pmid'])
        supporting_grants.append(pub['supporting_grant_ids'])
        pub_type.append(pub['type'])
        altmetric.append(pub['altmetric'])
        journal.append(pub['journal']['title'])
        journal_id.append(pub['journal']['id'])

pubs = pd.DataFrame({'pub_id': pub_id
                     , 'pmid': pmid
                     , 'date': dates
                     , 'doi': dois
                     , 'citations': citations
                     , 'nb_authors': nb_authors
                     , 'rsr_id': author_id
                     , 'rsr_country': author_country
                     , 'rsr_city': author_city
                     , 'rsr_affiliation': author_affiliation
                     , 'rsr_affiliation_id': author_affiliation_id
                     , 'supporting_grants': supporting_grants
                     , 'pub_type': pub_type
                     , 'altmetric': altmetric
                     , 'journal': journal
                     , 'journal_id': journal_id
                     })

In [None]:
# Collapse each list of grant ids into a ';'-separated string. The [np.nan]
# placeholder used for publications without supporting grants becomes a plain NaN.
pubs['supporting_grants'] = pubs['supporting_grants'].apply(lambda x: np.nan if x == [np.nan] else  ';'.join(x))

In [None]:
# Drop exact duplicate rows and renumber the index.
pubs = pubs.drop_duplicates().reset_index(drop=True)

In [None]:
# Parse publication dates (unparseable values become NaT) and derive the year.
pubs['date'] = pd.to_datetime(pubs['date'], errors='coerce')
pubs['year'] = pubs['date'].dt.year

In [None]:
# Summary statistics for every column, numeric or not.
pubs.describe(include='all')

In [None]:
# Map superseded researcher ids to their replacements.
pubs = id_replace(pubs)

In [None]:
# Save the researcher-level publications table.
pubs.to_csv('../data/researcher_publications.csv', index=False)

In [None]:
# Distinct (researcher, PubMed id) pairs for the publications that have a PMID.
pub_ids = pubs.loc[pubs['pmid'].notnull(), ['rsr_id', 'pmid']].drop_duplicates()
pub_ids.to_csv('../data/pub_ids.csv', index=False)

In [None]:
# Long-format lookup: one row per (publication id, RCDC category name).
data_id = []
rcdc_names = []
for elem in full_resp:
    elem_id = elem.get('id', np.nan)
    for rcdc in elem.get('RCDC', []):
        name = rcdc.get('name', np.nan)
        # NaN never compares equal to anything (including itself), so a
        # `!= np.nan` test is always True; pd.notnull is the reliable check.
        if pd.notnull(name):
            data_id.append(elem_id)
            rcdc_names.append(str(name))

pubs_rcdc = pd.DataFrame({'pub_id': data_id
                          , 'rcdc_name': rcdc_names}).drop_duplicates().reset_index(drop=True)

# astype(str) is a no-op for the collected names but keeps `.str` usable on an empty table.
pubs_rcdc['rcdc_name'] = pubs_rcdc['rcdc_name'].astype(str).str.strip().str.upper()

In [None]:
# Save the publication -> RCDC category lookup.
pubs_rcdc.to_csv('../data/topic_lookups/publications_rcdc.csv', index=False)

## Personal Information

In [None]:
# Reload the funded-grants and publications tables written earlier in this notebook.
funded_grnts = pd.read_csv('../data/funded_grants.csv', low_memory=False)
pubs = pd.read_csv('../data/researcher_publications.csv', low_memory=False)

### Gender

In [None]:
# First-name based gender detector.
d = gender.Detector()

# Each detector label paired with the abbreviation stored in the output;
# the ambiguous labels ('unknown', 'andy') both collapse to UNKNOWN.
gender_abbr = pd.DataFrame(
    [
        ('male', 'M'),
        ('mostly_male', 'M'),
        ('unknown', 'UNKNOWN'),
        ('andy', 'UNKNOWN'),
        ('mostly_female', 'F'),
        ('female', 'F'),
    ],
    columns=['gender', 'gender_abbr'],
)

In [None]:
# One row per distinct (researcher id, first name, last name) combination.
rsr_gender = funded_grnts[['rsr_id', 'first_name', 'last_name']].drop_duplicates().reset_index(drop=True)

In [None]:
# Guess the gender from the first space-separated token of the first name
# (missing names are treated as ''), translate the detector label into its
# abbreviation, and keep only `rsr_id` plus the resulting `rsr_gender` column.
rsr_gender = (
    rsr_gender
    .assign(gender=rsr_gender['first_name'].fillna('').map(lambda name: d.get_gender(name.split(" ")[0])))
    .merge(gender_abbr, how='left', on='gender')
    .drop(columns=['gender', 'first_name', 'last_name'])
    .rename(columns={'gender_abbr': 'rsr_gender'})
)

In [None]:
# Share of rows per gender label.
rsr_gender['rsr_gender'].value_counts(normalize=True)

### Career Age

In [None]:
# Career start = the earlier of the first funded-grant year and the first publication year.
rsr_first_year = funded_grnts.groupby('rsr_id')['start_date'].min().reset_index()
rsr_first_year['first_grant_year'] = pd.DatetimeIndex(rsr_first_year['start_date']).year
temp = (
    pubs.groupby('rsr_id')['year'].min()
    .reset_index()
    .rename(columns={'year': 'first_pub_year'})
)
rsr_first_year = rsr_first_year.merge(temp, on='rsr_id', how='left')
# Row-wise minimum; a missing value on one side is ignored.
rsr_first_year['rsr_career_start_year'] = rsr_first_year[['first_grant_year', 'first_pub_year']].min(axis=1)

In [None]:
# Keep only the researcher id and the derived career start year.
rsr_first_year = rsr_first_year[['rsr_id', 'rsr_career_start_year']]

### Original Affiliation

Maybe do this with Grant affiliation instead of Publication?

In [None]:
# For each researcher, take the affiliation from the row with the earliest
# publication year among rows that have an affiliation name.
cols = ['rsr_id', 'rsr_affiliation', 'rsr_affiliation_id', 'rsr_country', 'rsr_city', 'year']
affiliation = (
    pubs.loc[pubs['rsr_affiliation'].notnull(), cols]
    .sort_values(['rsr_id', 'year'])
    .drop_duplicates('rsr_id', keep='first')
    .drop(columns='year')
)

### First n Years of Publications

In [None]:
# Width of the early-career window: publications up to `n` years after the career start year count as "early".
n = 1

In [None]:
# Publication topic lookups. publications_rcdc.csv is written earlier in this notebook.
# NOTE(review): publications_cso.csv is not written anywhere in this notebook — presumably produced elsewhere; confirm.
pubs_rcdc = pd.read_csv('../data/topic_lookups/publications_rcdc.csv')
pubs_cso = pd.read_csv('../data/topic_lookups/publications_cso.csv')

In [None]:
# Attach each researcher's publications, then keep those published no later
# than `n` years after the career start year.
first_pubs = (
    rsr_first_year
    .merge(pubs[['rsr_id', 'year', 'pub_id', 'citations']], how='left', on='rsr_id')
    .loc[lambda df: df['year'] <= (df['rsr_career_start_year'] + n)]
    .reset_index(drop=True)
)

In [None]:
# Per-researcher early-career totals: number of publication rows and summed citations.
first_topics = first_pubs.groupby(['rsr_id'])
first_topics = pd.DataFrame({'rsr_nb_early_pubs': first_topics['rsr_id'].count()
                             , 'rsr_nb_early_citations': first_topics['citations'].sum()}).reset_index()

#### CSO

In [None]:
# Count early publications per (researcher, CSO topic), give every topic a
# compact `cso_<k>` id (saving the name -> id mapping), and pivot to one
# column per topic id.
temp = (
    first_pubs.merge(pubs_cso, how='left', on='pub_id')
    .groupby(['rsr_id', 'cso_name'])
    .size()
    .reset_index(name='nb_pubs')
)
temp['cso_id'] = "cso_" + temp.groupby(['cso_name']).ngroup().astype(str)
temp[['cso_name', 'cso_id']].drop_duplicates().to_csv('../data/topic_lookups/cso_ids.csv', index=False)
temp = temp.pivot_table(index='rsr_id', columns='cso_id', values='nb_pubs', aggfunc='sum').reset_index()

In [None]:
# Attach the per-researcher CSO topic counts.
first_topics = pd.merge(first_topics, temp, how='left', on='rsr_id')

#### RCDC

In [None]:
# Count early publications per (researcher, RCDC topic), give every topic a
# compact `rcdc_<k>` id (saving the name -> id mapping), and pivot to one
# column per topic id.
temp = (
    first_pubs.merge(pubs_rcdc, how='left', on='pub_id')
    .groupby(['rsr_id', 'rcdc_name'])
    .size()
    .reset_index(name='nb_pubs')
)
temp['rcdc_id'] = "rcdc_" + temp.groupby(['rcdc_name']).ngroup().astype(str)
temp[['rcdc_name', 'rcdc_id']].drop_duplicates().to_csv('../data/topic_lookups/rcdc_ids.csv', index=False)
temp = temp.pivot_table(index='rsr_id', columns='rcdc_id', values='nb_pubs', aggfunc='sum').reset_index()

In [None]:
# Attach the per-researcher RCDC topic counts.
first_topics = pd.merge(first_topics, temp, how='left', on='rsr_id')

In [None]:
# (rows, columns) of the early-career feature table.
first_topics.shape

### ORCID Confirmed

In [None]:
# ORCID confirmation table; later cells merge it on `rsr_id` and read its `orcid_confirmed` column.
inca_orcid_confirmed = pd.read_csv('../data/inca_orcid_confirmations.csv')

In [None]:
# Report the number of rows in the ORCID confirmation table.
print("Number of ORCID Confirmed INCa Researchers: {}".format(inca_orcid_confirmed.shape[0]))

### Combine All and Export

In [None]:
# Start from the distinct researcher identities and left-join every
# per-researcher table on `rsr_id`, in a fixed order.
rsr_info = funded_grnts[['rsr_id', 'first_name', 'last_name']].drop_duplicates().reset_index(drop=True)
for extra in (rsr_gender, rsr_first_year, affiliation, inca_orcid_confirmed, first_topics):
    rsr_info = rsr_info.merge(extra, how='left', on='rsr_id')

In [None]:
# Researchers absent from the ORCID table have NaN after the left join; treat them as not confirmed.
# The result is assigned back to the column: an inplace fillna on `rsr_info[...]`
# acts on a temporary object and does not update the frame under pandas Copy-on-Write.
rsr_info['orcid_confirmed'] = rsr_info['orcid_confirmed'].fillna(False)

In [None]:
# Summary statistics for every column, numeric or not.
rsr_info.describe(include='all')

In [None]:
# Save the combined researcher-level table.
rsr_info.to_csv('../data/researcher_info.csv', index=False)

## Sandbox