# Counterfactual Group Identification

## Python Setup

In [1]:
import requests
import time
import pandas as pd
from pandas import Series
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError

## Importing ID Replacer

In [2]:
# Old -> new researcher ID mapping. id_replace() below joins on its `old_id`
# column and reads its `new_id` column, so both must be present in the CSV.
id_replacer = pd.read_csv('../data/id_replacer.csv')

In [3]:
def id_replace(df):
    """Return `df` with `rsr_id` values remapped through the `id_replacer` table.

    `df` is left-joined to the module-level `id_replacer` frame
    (`rsr_id` == `old_id`). Where a `new_id` was found it overwrites
    `rsr_id`; otherwise `rsr_id` is kept. The two join columns are dropped
    from the result.
    """
    merged = df.merge(id_replacer, how='left', left_on='rsr_id', right_on='old_id')
    has_replacement = merged['new_id'].notnull()
    merged['rsr_id'] = np.where(has_replacement, merged['new_id'], merged['rsr_id'])
    return merged.drop(['old_id', 'new_id'], axis=1)

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

The API Query functions are defined in the following code:

In [4]:
from api_query import execute_query
from api_query import pull_data

### API Parameters

In [5]:
# Number of items passed as `max_in_items` to pull_data().
# The original note reads like an API error message:
#   "Filter operator 'in' requires 0 < items < 512"
# 250 stays below that bound.
max_in_items = 250

# Value passed as `max_return` to pull_data().
# The original note reads like an API error message:
#   "Limit exceeds maximum allowed limit 1000"
# 500 stays below that bound.
max_return = 500

# Value passed as `max_overall_returns` to pull_data().
# The original note reads like an API error message:
#   "Offset cannot exceed 50000"
max_overall_returns = 50000

## Grant and Publication Parsing Function

In [6]:
def parse_grant(full_resp):
    """Flatten grant records into one row per (researcher, funder) pair.

    Parameters
    ----------
    full_resp : iterable of dict
        Grant records. The keys read are 'id', 'researchers', 'funders',
        'RCDC', 'start_date', 'end_date' and 'funding_usd'; any of them may
        be missing. The records are NOT modified.

    Returns
    -------
    pandas.DataFrame
        Columns: rsr_id, grant_id, funder_name, funder_id, rcdc_codes,
        rcdc_names, start_date, end_date, funding_amount.
        `rcdc_codes` / `rcdc_names` are ';'-joined strings; a grant without
        RCDC data gets the string 'nan' in both (kept as a string so that
        later `.str` filters on the column keep returning booleans).
        A grant without researchers (or without funders) still yields a row,
        with NaN in the corresponding columns. An explicitly empty
        researchers/funders list yields no rows for that grant.
    """
    columns = ['rsr_id', 'grant_id', 'funder_name', 'funder_id', 'rcdc_codes',
               'rcdc_names', 'start_date', 'end_date', 'funding_amount']
    rows = []

    for grant in full_resp:
        # Missing sections get a single NaN placeholder so the grant is kept.
        raw_researchers = grant.get('researchers')
        if raw_researchers is None:
            raw_researchers = [{'id': np.nan}]
        funder_list = grant.get('funders')
        if funder_list is None:
            funder_list = [{'id': np.nan, 'name': np.nan}]
        rcdc_list = grant.get('RCDC')
        if rcdc_list is None:
            rcdc_list = [{'id': np.nan, 'name': np.nan}]

        # Researchers are either dicts carrying an 'id' or bare identifiers.
        # Bare identifiers are wrapped as {'id': value} and, as before, placed
        # after the dict entries.
        with_id = [r for r in raw_researchers if isinstance(r, dict) and 'id' in r]
        bare = [{'id': r} for r in raw_researchers
                if not (isinstance(r, dict) and 'id' in r)]
        researchers = with_id + bare

        # The RCDC strings are identical for every row of this grant: build once.
        rcdc_codes = ";".join(str(rcdc.get('id', np.nan)) for rcdc in rcdc_list)
        rcdc_names = ";".join(str(rcdc.get('name', np.nan)) for rcdc in rcdc_list)

        grant_id = grant.get('id', np.nan)
        start_date = grant.get('start_date', np.nan)
        end_date = grant.get('end_date', np.nan)
        funding = grant.get('funding_usd', np.nan)

        for researcher in researchers:
            for funder in funder_list:
                rows.append((researcher['id'],
                             grant_id,
                             funder.get('name', np.nan),
                             funder.get('id', np.nan),
                             rcdc_codes,
                             rcdc_names,
                             start_date,
                             end_date,
                             funding))

    grnts = pd.DataFrame(rows, columns=columns)

    return grnts

In [119]:
def parse_rcdc(full_resp, data_type):
    """Build a long-format lookup of record id -> RCDC category name.

    Parameters
    ----------
    full_resp : iterable of dict
        Records with optional 'id' and 'RCDC' keys, where 'RCDC' is a list of
        dicts with a 'name' entry. The records are NOT modified.
    data_type : str
        Prefix for the id column; the column is named `data_type + '_id'`
        (e.g. 'grant' -> 'grant_id', 'pub' -> 'pub_id').

    Returns
    -------
    pandas.DataFrame
        Columns `<data_type>_id` and `rcdc_name`, one row per (record,
        category). Names are stripped and upper-cased. Records without RCDC
        data, and categories whose name is missing, contribute no rows.
        A record without 'id' gets NaN in the id column.
    """
    id_column = data_type + '_id'
    data_id = []
    rcdc_names = []

    for elem in full_resp:
        elem_id = elem.get('id', np.nan)
        for rcdc in elem.get('RCDC') or []:
            name = rcdc.get('name', np.nan)
            # `name != np.nan` is always True, so test for missing explicitly.
            if pd.isnull(name):
                continue
            data_id.append(elem_id)
            rcdc_names.append(str(name).strip().upper())

    rcdc = pd.DataFrame({id_column: data_id, 'rcdc_name': rcdc_names},
                        columns=[id_column, 'rcdc_name'])

    return rcdc

In [112]:
def parse_publication(full_resp):
    """Flatten publication records into one row per (publication, author).

    Parameters
    ----------
    full_resp : iterable of dict
        Publication records. The keys read are 'id', 'pmid', 'doi', 'date',
        'times_cited', 'supporting_grant_ids' and 'author_affiliations'
        (a list whose first element is the list of author dicts). Any of
        them may be missing. The records are NOT modified.

    Returns
    -------
    pandas.DataFrame
        Columns: pub_id, pmid, date, doi, citations, nb_authors, rsr_id,
        rsr_country, rsr_affiliation, rsr_affiliation_id, supporting_grants.
        Only the author's first affiliation is reported; an author without
        affiliations gets NaN in the three affiliation columns.
        `supporting_grants` is a ';'-joined string, or NaN when the
        publication lists none. Publications without authors produce no rows.
    """
    columns = ['pub_id', 'pmid', 'date', 'doi', 'citations', 'nb_authors',
               'rsr_id', 'rsr_country', 'rsr_affiliation', 'rsr_affiliation_id',
               'supporting_grants']
    rows = []

    for pub in full_resp:
        # Authors live in the first element of 'author_affiliations'; a
        # missing, None or empty value all mean "no authors".
        author_groups = pub.get('author_affiliations') or [[]]
        authors = author_groups[0] or []
        nb = len(authors)

        grant_ids = pub.get('supporting_grant_ids')
        supporting = np.nan if grant_ids is None else ';'.join(grant_ids)

        for author in authors:
            # `or` (not bitwise `|`) so a missing key never triggers a lookup.
            affiliations = author.get('affiliations') or [{}]
            first_affiliation = affiliations[0]
            rows.append((pub.get('id', np.nan),
                         pub.get('pmid', np.nan),
                         pub.get('date', np.nan),
                         pub.get('doi', np.nan),
                         pub.get('times_cited', np.nan),
                         nb,
                         author.get('researcher_id', np.nan),
                         first_affiliation.get('country_code', np.nan),
                         first_affiliation.get('name', np.nan),
                         first_affiliation.get('id', np.nan),
                         supporting))

    pubs = pd.DataFrame(rows, columns=columns)

    return pubs

## Counterfactual based on Funding Agencies

### Define Comparison Agencies:

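The funding agencies considered are the Wellcome Trust (UK), the Australian National Health and Medical Research Council (NHMRC), Cancer Research UK, and the US National Cancer Institute (NCI).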

In [8]:
# Comparison (counterfactual) funding agencies, spelled exactly as they are
# matched against `funders.name` in the grant query.
funders = [
    'Wellcome Trust',
    'National Health and Medical Research Council',
    'Cancer Research UK',
    'National Cancer Institute',
]

### Pull all Researchers Funded by these Agencies

In [None]:
# Grants from the comparison funders with a start year between 2007 and 2012.
# The `{}` placeholder is presumably filled by pull_data() with batches of
# `in_list` -- confirm against api_query.
string = (
    "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
    " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
)
full_resp = pull_data(
    string=string,
    in_list=funders,
    in_type='funding agencies',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

In [None]:
# Flatten the raw grant records to one row per (researcher, funder) pair,
# then drop exact duplicate rows and renumber the index.
funded_grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

In [None]:
# Keep only rows that identify a researcher.
funded_grnts = funded_grnts[funded_grnts['rsr_id'].notnull()]

# A grant can list several funders; keep only the comparison agencies.
funded_grnts = funded_grnts[funded_grnts['funder_name'].isin(funders)]

# Keep grants whose start date falls in 2007-2012 (inclusive).
# Rows with a missing start date fail both comparisons and are dropped.
start_year = pd.DatetimeIndex(funded_grnts['start_date']).year
in_window = (start_year >= 2007) & (start_year <= 2012)
funded_grnts = funded_grnts[in_window].reset_index(drop=True)

Some of the counterfactual agencies (NCI, Cancer Research UK) are cancer-specific funders. Others however, such as the Wellcome Trust and the NHMRC, fund different types of medical research. For this second category of non-cancer-specific funders, we must restrict to grants on the topic of Cancer. I use the RCDC codes to restrict to Cancer-related grants.

In [None]:
# For non-cancer-specific funders, restrict to Cancer Grants:
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
funded_grnts = funded_grnts[(funded_grnts['funder_name'].isin(general_funders)==False)
                            |(funded_grnts['rcdc_names'].str.upper().str.contains("CANCER"))]

In [None]:
# Sanity check: number of remaining (researcher, grant) rows per funder.
funded_grnts['funder_name'].value_counts()

In [None]:
# Preview the first rows of the filtered grants table.
funded_grnts.head()

In [None]:
# # Keep RCDC Codes in Separate Table
# funded_grnts_rcdc = funded_grnts[funded_grnts['rcdc_names'].notnull()][['grant_id', 'rcdc_names']].drop_duplicates()
# funded_grnts_rcdc = pd.concat([Series(row['grant_id'], row['rcdc_names'].split(';'))
#                                for _, row in funded_grnts_rcdc.iterrows()]).reset_index()
# funded_grnts_rcdc.columns = ['rcdc_name', 'grant_id']
# funded_grnts_rcdc['rcdc_name'] = funded_grnts_rcdc['rcdc_name'].str.strip()

# del funded_grnts['rcdc_codes'], funded_grnts['rcdc_names']

In [None]:
# Remap researcher IDs listed in the id_replacer table (old_id -> new_id).
funded_grnts = id_replace(funded_grnts)

In [None]:
funded_grnts.to_csv('../data/counterfactual_funded_grants.csv', index=False)

# `funded_grnts_rcdc` is not assigned by any active cell (the cell that built
# it is commented out), so writing it raised NameError on a fresh run.
# Build the grant -> RCDC category lookup here from the ';'-joined
# `rcdc_names` column. Layout follows parse_rcdc(): columns
# (grant_id, rcdc_name), names stripped and upper-cased. The 'nan' string
# that parse_grant() writes for grants without RCDC data is skipped.
rcdc_pairs = funded_grnts[['grant_id', 'rcdc_names']].drop_duplicates()
funded_grnts_rcdc = pd.DataFrame(
    [(grant_id, name.strip().upper())
     for grant_id, names in zip(rcdc_pairs['grant_id'], rcdc_pairs['rcdc_names'])
     if isinstance(names, str)
     for name in names.split(';')
     if name.strip() and name.strip().upper() != 'NAN'],
    columns=['grant_id', 'rcdc_name'],
).drop_duplicates().reset_index(drop=True)

funded_grnts_rcdc.to_csv('../data/topic_lookups/counterfactual_funded_grants_rcdc.csv', index=False)

### Pull all Grants and Publications from these Researchers

In [9]:
# Reload the saved funded-grants table so this section can run without
# repeating the API pull above.
funded_grnts = pd.read_csv('../data/counterfactual_funded_grants.csv')
# Distinct researcher IDs, in order of first appearance.
rsrs = list(funded_grnts['rsr_id'].drop_duplicates())

#### Grants

In [None]:
# Every grant (any funder, any year) held by the researchers in `rsrs`.
string = (
    "search grants where researchers.id in [{}]"
    " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

In [None]:
# One row per (researcher, funder) pair for each grant; exact duplicates removed.
grnts = parse_grant(full_resp)
grnts = grnts.drop_duplicates().reset_index(drop=True)

# Long-format grant_id -> RCDC category lookup built from the same response.
grnts_rcdc = parse_rcdc(full_resp, data_type='grant')

In [None]:
# Summary statistics for every column (numeric and non-numeric).
grnts.describe(include='all')

In [None]:
# Do ID Replace
for i in range(id_replacer.shape[0]):
    grnts['rsr_id'] = grnts['rsr_id'].apply(lambda x: id_replace(x, i))

In [None]:
# Persist the researcher-level grants table and its RCDC lookup (no index column).
grnts.to_csv('../data/counterfactual_researcher_grants.csv', index=False)
grnts_rcdc.to_csv('../data/topic_lookups/counterfactual_researcher_grants_rcdc.csv', index=False)

#### Publications

In [113]:
# Every publication authored by the researchers in `rsrs`.
string = (
    "search publications where researchers.id in [{}]"
    " return publications[id+doi+pmid+author_affiliations+date+supporting_grant_ids+times_cited+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='publications',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 250-500/13027 researchers...
RESPONSE ERROR on i=1 and j=14.

Querying: 2000-2250/13027 researchers...
RESPONSE ERROR on i=8 and j=14.

Querying: 2250-2500/13027 researchers...
RESPONSE ERROR on i=9 and j=13.

Querying: 4000-4250/13027 researchers...
RESPONSE ERROR on i=16 and j=15.

Querying: 4250-4500/13027 researchers...
RESPONSE ERROR on i=17 and j=17.

Querying: 6500-6750/13027 researchers...
RESPONSE ERROR on i=26 and j=14.

Querying: 7000-7250/13027 researchers...
RESPONSE ERROR on i=28 and j=34.

Querying: 8500-8750/13027 researchers...
RESPONSE ERROR on i=34 and j=11.


RESPONSE ERROR on i=34 and j=12.

Querying: 9000-9250/13027 researchers...
RESPONSE ERROR on i=36 and j=21.

Querying: 9250-9500/13027 researchers...
RESPONSE ERROR on i=37 and j=13.

Querying: 10250-10500/13027 researchers...
RESPONSE ERROR on i=41 and j=11.

Querying: 11000-11250/13027 researchers...
RESPONSE ERROR on i=44 and j=8.

Querying: 11500-11750/13027 researchers...
RESPONSE ERROR on i=46 a

In [108]:
# One row per (publication, author); exact duplicate rows removed.
pubs = parse_publication(full_resp)
pubs = pubs.drop_duplicates().reset_index(drop=True)

# Long-format pub_id -> RCDC category lookup built from the same response.
pubs_rcdc = parse_rcdc(full_resp, data_type='pub')

In [None]:
# Summary statistics for every column (numeric and non-numeric).
pubs.describe(include='all')

In [125]:
# Do ID Replace
# id_replace() left-merges on `old_id`, substitutes `new_id` where one exists
# and drops both helper columns, so the shared helper is used here instead of
# a hand-copied version of the same steps.
pubs = id_replace(pubs)

In [126]:
# Publication IDs:
pub_ids = pubs[pubs['pmid'].notnull()][['rsr_id', 'pmid']].drop_duplicates()

In [127]:
# Persist the publications table, its RCDC lookup, and the
# researcher -> PMID pairs (no index column in any file).
pubs.to_csv('../data/counterfactual_researcher_publications.csv', index=False)
pubs_rcdc.to_csv('../data/topic_lookups/counterfactual_researcher_publications_rcdc.csv', index=False)
pub_ids.to_csv('../data/counterfactual_pub_ids.csv', index=False)

## Sandbox