# Counterfactual Group Identification

## Python Setup

In [37]:
import requests
import time
import pandas as pd
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

### Login Information

Fetch my password from a separate .txt file:

In [38]:
# Load the API password from ../data/password.txt so it is not written in the notebook itself.
# strip() drops surrounding whitespace such as a trailing newline left by an editor.
with open('../data/password.txt', 'r') as myfile:
    password = myfile.read().strip()

In [39]:
# Credentials payload posted as JSON to the auth endpoint by execute_query().
# NOTE(review): the account name is hardcoded here; consider loading it the same way
# as the password (separate file or environment variable) before sharing the notebook.
login = {
    'username': 'nj995@nyu.edu',
    'password': password
}

### API Parameters

In [40]:
def execute_query(string):
    """Authenticate against the Dimensions API and run a single DSL query.

    A new token is requested on every call using the module-level ``login``
    credentials; the query is then posted to the DSL endpoint.

    Parameters
    ----------
    string : str or bytes
        Complete DSL query text. ``str`` input is sent UTF-8 encoded.

    Returns
    -------
    dict or str
        The decoded JSON body of the DSL response, or the sentinel string
        ``"RESPONSE ERROR"`` when the body cannot be decoded as JSON.

    Raises
    ------
    requests.HTTPError
        If the authentication request comes back with an error status.
    """
    # Send credentials to login url to retrieve token.
    auth = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
    auth.raise_for_status()

    # Create http header using the generated token.
    headers = {'Authorization': "JWT " + auth.json()['token']}

    # A str body would be encoded as ISO-8859-1 by the HTTP layer and fail on
    # characters outside that range, so encode explicitly.
    payload = string.encode('utf-8') if isinstance(string, str) else string

    # Execute DSL query.
    resp = requests.post('https://app.dimensions.ai/api/dsl.json', data=payload, headers=headers)

    try:
        return resp.json()
    except ValueError:
        # json.JSONDecodeError, simplejson's decode error and requests' own
        # JSONDecodeError are all ValueError subclasses; catching the base
        # class keeps the sentinel contract regardless of which one is raised.
        return "RESPONSE ERROR"

In [41]:
# API limit: the 'in' filter operator requires 0 < items < 512.
# Batches of IDs sent in one query are kept at 500 to stay under that bound.
max_in_items = 500

# API limit: a query's `limit` may not exceed 1000, so this is the page size used.
max_return = 1000

# API limit: the `skip` offset cannot exceed 50000, which caps how many records
# can be paged through for any single query.
max_overall_returns = 50000

## Pulling Comparison Group Function

In [42]:
def pull_data(string, in_list, in_type, return_type, 
              max_in_items=max_in_items, max_return=max_return, max_overall_returns=max_overall_returns):
    """Run a DSL query over a long list of filter values, batching and paging as needed.

    ``in_list`` is split into batches of at most ``max_in_items`` values. Each
    batch is quoted, comma-joined and substituted into the ``{}`` placeholder of
    ``string``; the resulting query is then paged through with ``limit``/``skip``
    until a short page is returned or the offset cap is reached.

    Parameters
    ----------
    string : str
        DSL query template containing one ``{}`` placeholder for the quoted values.
    in_list : sequence of str
        Values for the ``in [...]`` filter (a list or a pandas Series both work).
    in_type : str
        Label for the values, used only in progress messages.
    return_type : str
        Key of the result list in each response (e.g. ``'grants'``).
    max_in_items : int
        Maximum number of values per query.
    max_return : int
        Page size (``limit``).
    max_overall_returns : int
        Cap on how far ``skip`` may page for one batch.

    Returns
    -------
    list
        All records collected across batches and pages, in retrieval order.

    Notes
    -----
    A page whose response could not be decoded is reported and skipped; paging
    continues with the next offset. Two seconds are slept after every request.
    """
    # Materialise once so slicing is positional whatever container was passed.
    in_list = list(in_list)
    n_items = len(in_list)
    full_resp = []

    for i in range(math.ceil(n_items / max_in_items)):
        min_i, max_i = i * max_in_items, min((i + 1) * max_in_items, n_items)
        print('Querying: {}-{}/{} {}...'.format(min_i, max_i, n_items, in_type), end = '\r')

        in_t = in_list[min_i:max_i]
        string_t = "\"" + "\", \"".join(in_t) + "\""
        query = string.format(string_t)

        j = 0
        pulled = 0          # records actually retrieved for this batch
        total_count = None  # total reported by the last successful response
        loop = True
        while loop:
            query_t = query + " limit {} skip {}".format(max_return, max_return * j)
            resp = execute_query(query_t)
            if resp == "RESPONSE ERROR":
                print("\nRESPONSE ERROR on i={} and j={}.\n".format(i, j))
            else:
                page = resp[return_type]
                full_resp.extend(page)
                pulled += len(page)
                total_count = resp['_stats']['total_count']

                # A short page means the result set is exhausted.
                if len(page) < max_return:
                    loop = False
            j += 1

            # Stop before the next page would go past the offset cap.
            if max_return * (j + 1) > max_overall_returns:
                loop = False

            time.sleep(2)

        # Only consult the stats of a successful response: the last response may
        # have been the error sentinel, which cannot be indexed.
        if total_count is not None and total_count >= max_overall_returns:
            print("\nATTENTION! {} {} overall, pulled only {}.\n".format(total_count, return_type, pulled))

    print("\nDone !")

    return full_resp

## Grant and Publication Parsing Function

In [43]:
def parse_grant(full_resp):
    """Flatten grant records into one row per (researcher, funder) pair.

    Parameters
    ----------
    full_resp : iterable of dict
        Grant records as returned by ``pull_data``. The records are only read;
        they are not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``rsr_id``, ``grant_id``, ``funder_name``, ``funder_id``,
        ``rcdc_codes``, ``rcdc_names``, ``start_date``, ``end_date`` and
        ``funding_amount``.

    Notes
    -----
    * Missing ``researchers``, ``funders``, ``RCDC``, ``start_date`` and
      ``end_date`` fields are filled with NaN placeholders.
    * Researcher entries that are dicts with an ``'id'`` key come first; any
      other entry (e.g. a bare ID string) is used as the ID itself and follows.
    * RCDC ids and names are each joined with ``"; "``.
    * ``funding_amount`` falls back to the record's ``funding_usd`` field when
      ``funding_amount`` is absent, since the grant queries in this notebook
      request ``funding_usd``.
    """
    rsr_id = []
    grant_id = []
    funder_name = []
    funder_id = []
    rcdc_codes = []
    rcdc_names = []
    start_dates = []
    end_dates = []
    funding_amount = []

    for grant in full_resp:
        researchers = grant.get('researchers', [{'id': np.nan}])
        keyed_ids = [r['id'] for r in researchers if isinstance(r, dict) and 'id' in r]
        bare_ids = [r for r in researchers if not (isinstance(r, dict) and 'id' in r)]
        researcher_ids = keyed_ids + bare_ids

        funders = grant.get('funders', [{'id': np.nan, 'name': np.nan}])

        # The RCDC summary is the same for every row of this grant: build it once.
        rcdc = grant.get('RCDC', [{'id': np.nan, 'name': np.nan}])
        codes = "; ".join(str(entry['id']) for entry in rcdc)
        names = "; ".join(str(entry['name']) for entry in rcdc)

        start = grant.get('start_date', np.nan)
        end = grant.get('end_date', np.nan)
        if 'funding_amount' in grant:
            amount = grant['funding_amount']
        else:
            amount = grant.get('funding_usd', np.nan)

        for rid in researcher_ids:
            for funder in funders:
                rsr_id.append(rid)
                grant_id.append(grant['id'])
                funder_id.append(funder['id'])
                funder_name.append(funder['name'])
                start_dates.append(start)
                end_dates.append(end)
                funding_amount.append(amount)
                rcdc_codes.append(codes)
                rcdc_names.append(names)

    grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id
                          , 'funder_name':funder_name, 'funder_id':funder_id
                          , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                          , 'start_date':start_dates, 'end_date':end_dates
                          , 'funding_amount':funding_amount
                         })

    return grnts

In [44]:
def parse_publication(full_resp):
    """Flatten publication records into one row per (publication, researcher) pair.

    Parameters
    ----------
    full_resp : iterable of dict
        Publication records as returned by ``pull_data``. The records are only
        read; they are not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``rsr_id``, ``pub_id``, ``pmid``, ``pmcid``, ``rcdc_codes``,
        ``rcdc_names``, ``date``, ``citations``, ``doi`` and
        ``supporting_grants``.

    Notes
    -----
    * Missing scalar fields become NaN; a missing ``researchers`` or ``RCDC``
      field is replaced by a single NaN placeholder entry.
    * Researcher entries that are dicts with an ``'id'`` key come first; any
      other entry (e.g. a bare ID string) is used as the ID itself and follows.
    * RCDC ids/names and supporting grant ids are each joined with ``"; "``;
      ``supporting_grants`` is NaN when the record has no
      ``supporting_grant_ids`` field.
    """
    rsr_id = []
    pub_id = []
    pmid = []
    pmcid = []
    rcdc_codes = []
    rcdc_names = []
    dates = []
    citations = []
    dois = []
    supporting_grants = []

    for pub in full_resp:
        researchers = pub.get('researchers', [{'id': np.nan}])
        keyed_ids = [r['id'] for r in researchers if isinstance(r, dict) and 'id' in r]
        bare_ids = [r for r in researchers if not (isinstance(r, dict) and 'id' in r)]
        researcher_ids = keyed_ids + bare_ids

        # Per-publication values are identical on every researcher row: build once.
        rcdc = pub.get('RCDC', [{'id': np.nan, 'name': np.nan}])
        codes = "; ".join(str(entry['id']) for entry in rcdc)
        names = "; ".join(str(entry['name']) for entry in rcdc)

        grant_ids = pub.get('supporting_grant_ids', [np.nan])
        # [np.nan] is the "field absent" placeholder and maps to a NaN cell.
        grants_cell = np.nan if grant_ids == [np.nan] else '; '.join(grant_ids)

        this_id = pub.get('id', np.nan)
        this_date = pub.get('date', np.nan)
        this_cited = pub.get('times_cited', np.nan)
        this_doi = pub.get('doi', np.nan)
        this_pmid = pub.get('pmid', np.nan)
        this_pmcid = pub.get('pmcid', np.nan)

        for rid in researcher_ids:
            rsr_id.append(rid)
            pub_id.append(this_id)
            dates.append(this_date)
            citations.append(this_cited)
            dois.append(this_doi)
            pmid.append(this_pmid)
            pmcid.append(this_pmcid)
            supporting_grants.append(grants_cell)
            rcdc_codes.append(codes)
            rcdc_names.append(names)

    pubs = pd.DataFrame({'rsr_id':rsr_id
                         , 'pub_id':pub_id
                         , 'pmid': pmid
                         , 'pmcid': pmcid
                         , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                         , 'date':dates, 'citations':citations, 'doi':dois
                         , 'supporting_grants':supporting_grants
                         })

    return pubs

## Method 1: Pulling all Researchers who Collaborated on Grants with INCA-Funded Researchers

### All Researchers on Grants Awarded to INCA Researchers

In [45]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Grants Awarded to these Researchers

In [46]:
# string = "search grants where researchers.id in [{}] return grants[id+researchers+RCDC]"

In [47]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

In [48]:
# rsr_id = []
# grant_id = []
# rcdc_codes = []
# rcdc_names = []
# for grant in full_resp:
#     for researcher in grant['researchers']:
#         rsr_id.append(researcher['id'])
#         grant_id.append(grant['id'])
#         rcdc_codes_t = []
#         rcdc_names_t = []
#         if 'RCDC' in grant:
#             for rcdc in grant['RCDC']:
#                 rcdc_codes_t.append(str(rcdc['id']))
#                 rcdc_names_t.append(rcdc['name'])
#             rcdc_codes.append("; ".join(rcdc_codes_t))
#             rcdc_names.append("; ".join(rcdc_names_t))
#         else:
#             rcdc_codes.append("")
#             rcdc_names.append("")

In [49]:
# collab_grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id, 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names})
# collab_rsrs = collab_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()

### Analysis of Results

In [50]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # INCA-funded rsrs who have grants with the API:
# # rsrs_inca_t = inca_rsrs_grants['rsr_id'].drop_duplicates().reset_index(drop = True)
# # print("{} INCA-funded researchers appear to have grants with the Dimensions API".format(len(rsrs_inca_t)))

# # rsrs who are on grants where with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in grants with at least one INCA-funded researcher".format(len(rsrs_all)))

In [51]:
# rsrs_comp_1 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_1)))

## Method 2: Pulling all Researchers who Co-Authored with INCA-Funded Researchers

### All Researchers on Publications Co-Authored with INCA Researchers

In [52]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Publications by these Researchers

In [53]:
# string = "search publications where researchers.id in [{}] return publications[id+researchers]"

In [54]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

In [55]:
# rsr_id = []
# pub_id = []
# for pub in full_resp:
#     for researcher in pub['researchers']:
#         rsr_id.append(researcher['id'])
#         pub_id.append(pub['id'])

In [56]:
# collab_pubs = pd.DataFrame({'rsr_id':rsr_id, 'publication_id':pub_id})
# collab_rsrs = collab_pubs.groupby('rsr_id')['publication_id'].nunique().reset_index()

### Analysis of Results

In [57]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # rsrs who are on publications with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in publications with at least one INCA-funded researcher".format(len(rsrs_all)))

In [58]:
# rsrs_comp_2 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_2)))

## Method 3: Counterfactual based on Funding Agencies

### Define Comparison Agencies:

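Funding agencies considered: the Wellcome Trust (UK), the National Health and Medical Research Council (NHMRC, Australia), Cancer Research UK, and the National Cancer Institute (part of the US NIH). The `funders` list in the next cell is the exact set used in the queries.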

In [59]:
# Comparison funding agencies, spelled as they are matched against `funders.name` in the grant query.
funders = ['Wellcome Trust', 'National Health and Medical Research Council', 'Cancer Research UK', 
           'National Cancer Institute']

### Pull all Researchers Funded by these Agencies

In [60]:
# string = "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
# string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
# full_resp = pull_data(string=string, in_list=funders, in_type='funding agencies', return_type='grants')

In [61]:
# fundr_grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

In [62]:
# # Remove all other funders that might have appeared
# fundr_grnts = fundr_grnts[(fundr_grnts['funder_name'].isin(funders))]

# # For non-cancer-specific funders, restrict to Cancer Grants:
# general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
# fundr_grnts = fundr_grnts[(fundr_grnts['funder_name'].isin(general_funders)==False)
#                           |(fundr_grnts['rcdc_names'].str.contains("Cancer"))]

# # Restrict to grants funded between 2007 and 2012:
# fundr_grnts = fundr_grnts[(pd.DatetimeIndex(fundr_grnts['start_date']).year>=2007)
#                           & (pd.DatetimeIndex(fundr_grnts['start_date']).year<=2012)].reset_index(drop=True)

In [63]:
# fundr_grnts['funder_name'].value_counts()

In [64]:
# fundr_grnts.to_csv('../data/counterfactual/counterfactual_funded_grants.csv', index=False)

### Pull all Grants and Publications from these Researchers

In [65]:
# Reload the saved counterfactual grants and keep each non-null researcher ID once.
fundr_grnts = pd.read_csv('../data/counterfactual/counterfactual_funded_grants.csv')
rsrs = fundr_grnts['rsr_id'].dropna().drop_duplicates()

#### Grants

In [66]:
# string = "search grants where researchers.id in [{}]"
# string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

In [67]:
# grnts = parse_grant(full_resp)
# grnts = grnts.drop_duplicates().reset_index(drop=True)

In [68]:
# grnts.describe(include='all')

In [69]:
# grnts.to_csv('../data/counterfactual/counterfactual_researcher_grants.csv', index=False)

#### Publications

In [70]:
# Query template: the {} placeholder receives each batch of quoted researcher IDs.
string = (
    "search publications where researchers.id in [{}]"
    " return publications[id+doi+pmid+pmcid+researchers+date+supporting_grant_ids+times_cited+RCDC]"
)
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

Querying: 0-500/13027 researchers...

KeyboardInterrupt: 

In [35]:
# Flatten the raw responses, then drop exact duplicate rows and renumber the index.
pubs = parse_publication(full_resp).drop_duplicates().reset_index(drop=True)

In [36]:
# Summary statistics for every column (include='all' covers the non-numeric ones too).
pubs.describe(include='all')

Unnamed: 0,rsr_id,pub_id,rcdc_codes,rcdc_names,date,citations,doi,supporting_grants
count,6864351,6864351,6864351.0,6864351.0,6864236,6864351.0,6571293,3355994
unique,1065311,1085784,263320.0,263320.0,12150,,1024681,303636
top,ur.012724545020.23,pub.1054508044,,,2011-10-24,,10.1007/bf03375463,grant.2438826
freq,2694,2125,2030551.0,2030551.0,93168,,2125,26637
mean,,,,,,54.03595,,
std,,,,,,190.4285,,
min,,,,,,0.0,,
25%,,,,,,2.0,,
50%,,,,,,15.0,,
75%,,,,,,47.0,,


In [37]:
# Save the researcher-level publication table; index=False leaves the row index out of the file.
pubs.to_csv('../data/counterfactual/counterfactual_researcher_publications.csv', index=False)

## Sandbox