# Counterfactual Group Identification

## Python Setup

In [11]:
import requests
import time
import pandas as pd
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

### Login Information

Fetch my password from a separate .txt file:

In [12]:
with open('../data/password.txt', 'r') as myfile:
    password = myfile.read().strip()

In [13]:
login = {
    'username': 'nj995@nyu.edu',
    'password': password
}

### API Parameters

In [14]:
def execute_query(string):
    # Send credentials to login url to retrieve token.
    resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
    resp.raise_for_status()

    # Create http header using the generated token.
    headers = {'Authorization': "JWT " + resp.json()['token']}   
    
    # Execute DSL query.    
    resp = requests.post('https://app.dimensions.ai/api/dsl.json', data=string, headers=headers)
    
    try:
        resp = resp.json()
    except JSONDecodeError:
        resp = "RESPONSE ERROR"
    
    return resp

In [15]:
# Filter operator 'in' requires 0 < items < 512
max_in_items = 100

# Limit exceeds maximum allowed limit 1000
max_return = 500

# Offset cannot exceed 50000
max_overall_returns = 50000

## Pulling Comparison Group Function

In [16]:
def pull_data(string, in_list, in_type, return_type, 
              max_in_items=max_in_items, max_return=max_return, max_overall_returns=max_overall_returns):
    
    full_resp = []

    for i in range(math.ceil(len(in_list)/max_in_items)):
        min_i, max_i = i*max_in_items, min((i+1)*max_in_items, len(in_list))
        print('Querying: {}-{}/{} {}...'.format(min_i, max_i, len(in_list), in_type), end = '\r')

        in_t = in_list[min_i:max_i]
        string_t = "\"" + "\", \"".join(in_t) + "\""
        query = string.format(string_t)

        j = 0
        loop = True
        while loop == True:
            query_t = query + " limit {} skip {}".format(max_return, max_return*j)
            resp = execute_query(query_t)
            if resp == "RESPONSE ERROR":
                print("\nRESPONSE ERROR on i={} and j={}.\n".format(i, j))
            else:
                full_resp.extend(resp[return_type])

                if len(resp[return_type])<max_return:
                    loop = False
            j += 1
            
            if max_return*(j+1)>max_overall_returns:
                loop = False
            
            time.sleep(2)
        
        count = resp['_stats']['total_count']        
        if resp['_stats']['total_count']>=max_overall_returns:
            print("\nATTENTION! {} {} overall, pulled only {}.\n".format(count, return_type, max_return*j-1))

    print("\nDone !")
        
    return full_resp

## Grant and Publication Parsing Function

In [17]:
def parse_grant(full_resp):
    rsr_id = []
    grant_id = []
    funder_name = []
    funder_id = []
    rcdc_codes = []
    rcdc_names = []
    start_dates = []
    end_dates = []
    funding_amount = []
    for grant in full_resp:
        if 'RCDC' not in grant:
            grant['RCDC'] = [{'id':np.nan, 'name':np.nan}]
        if 'researchers' not in grant:
            grant['researchers'] = [{'id':np.nan}]
        for researcher in grant['researchers']: 
            if 'id' not in researcher:
                tmp = researcher
                grant['researchers'].append({'id':tmp})
        if 'start_date' not in grant:
            grant['start_date'] = np.nan
        if 'end_date' not in grant:
            grant['end_date'] = np.nan
        if 'funding_amount' not in grant:
            grant['funding_amount'] = np.nan
        for i in range(len(grant['researchers'])-1, -1, -1):
            if 'id' not in grant['researchers'][i]:
                del grant['researchers'][i]
        if 'funders' not in grant:
            grant['funders'] = [{'id':np.nan,'name':np.nan}]
        for researcher in grant['researchers']:
            for funder in grant['funders']:
                rsr_id.append(researcher['id'])
                grant_id.append(grant['id'])
                funder_id.append(funder['id'])
                funder_name.append(funder['name'])
                start_dates.append(grant['start_date'])
                end_dates.append(grant['end_date'])
                funding_amount.append(grant['funding_amount'])
                rcdc_codes_t = []
                rcdc_names_t = []
                for rcdc in grant['RCDC']:
                    rcdc_codes_t.append(str(rcdc['id']))
                    rcdc_names_t.append(str(rcdc['name']))
                rcdc_codes.append("; ".join(rcdc_codes_t))
                rcdc_names.append("; ".join(rcdc_names_t))
                
    grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id
                          , 'funder_name':funder_name, 'funder_id':funder_id
                          , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                          , 'start_date':start_dates, 'end_date':end_dates
                          , 'funding_amount':funding_amount
                         })

    return grnts

In [18]:
def parse_publication(full_resp):
    rsr_id = []
    pub_id = []
    rcdc_codes = []
    rcdc_names = []
    dates = []
    citations = []
    dois = []
    supporting_grants = []
    for pub in full_resp:
        if 'id' not in pub:
            pub['id'] = np.nan
        if 'RCDC' not in pub:
            pub['RCDC'] = [{'id':np.nan, 'name':np.nan}]
        if 'researchers' not in pub:
            pub['researchers'] = [{'id':np.nan}]
        for researcher in pub['researchers']: 
            if 'id' not in researcher:
                tmp = researcher
                pub['researchers'].append({'id':tmp})
        if 'date' not in pub:
            pub['date'] = np.nan
        if 'times_cited' not in pub:
            pub['times_cited'] = np.nan
        if 'doi' not in pub:
            pub['doi'] = np.nan
        if 'supporting_grant_ids' not in pub:
            pub['supporting_grant_ids'] = [np.nan]
        for i in range(len(pub['researchers'])-1, -1, -1):
            if 'id' not in pub['researchers'][i]:
                del pub['researchers'][i]
        for researcher in pub['researchers']:
            rsr_id.append(researcher['id'])
            pub_id.append(pub['id'])
            dates.append(pub['date'])
            citations.append(pub['times_cited'])
            dois.append(pub['doi'])
            supporting_grants.append(pub['supporting_grant_ids'])
            rcdc_codes_t = []
            rcdc_names_t = []
            for rcdc in pub['RCDC']:
                rcdc_codes_t.append(str(rcdc['id']))
                rcdc_names_t.append(str(rcdc['name']))
            rcdc_codes.append("; ".join(rcdc_codes_t))
            rcdc_names.append("; ".join(rcdc_names_t))
                
    pubs = pd.DataFrame({'rsr_id':rsr_id, 'pub_id':pub_id
                         , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                         , 'date':dates, 'citations':citations, 'doi':dois
                         , 'supporting_grants':supporting_grants
                         })
    pubs['supporting_grants'] = pubs['supporting_grants'].apply(lambda x: np.nan if x == [np.nan] else  '; '.join(x))

    return pubs

## Method 1: Pulling all Researchers who Collaborated on Grants with INCA-Funded Researchers

### All Researchers on Grants Awarded to INCA Researchers

In [None]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Grants Awarded by these Researchers

In [None]:
# string = "search grants where researchers.id in [{}] return grants[id+researchers+RCDC]"

In [None]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

In [None]:
# rsr_id = []
# grant_id = []
# rcdc_codes = []
# rcdc_names = []
# for grant in full_resp:
#     for researcher in grant['researchers']:
#         rsr_id.append(researcher['id'])
#         grant_id.append(grant['id'])
#         rcdc_codes_t = []
#         rcdc_names_t = []
#         if 'RCDC' in grant:
#             for rcdc in grant['RCDC']:
#                 rcdc_codes_t.append(str(rcdc['id']))
#                 rcdc_names_t.append(rcdc['name'])
#             rcdc_codes.append("; ".join(rcdc_codes_t))
#             rcdc_names.append("; ".join(rcdc_names_t))
#         else:
#             rcdc_codes.append("")
#             rcdc_names.append("")

In [None]:
# collab_grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id, 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names})
# collab_rsrs = collab_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()

### Analysis of Results

In [None]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # INCA-funded rsrs who have grants with the API:
# # rsrs_inca_t = inca_rsrs_grants['rsr_id'].drop_duplicates().reset_index(drop = True)
# # print("{} INCA-funded researchers appear to have grants with the Dimensions API".format(len(rsrs_inca_t)))

# # rsrs who are on grants where with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in grants with at least one INCA-funded researcher".format(len(rsrs_all)))

In [None]:
# rsrs_comp_1 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_1)))

## Method 2: Pulling all Researchers who Co-Authored with INCA-Funded Researchers

### All Researchers on Grants Awarded to INCA Researchers

In [None]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Publications by these Researchers

In [None]:
# string = "search publications where researchers.id in [{}] return publications[id+researchers]"

In [None]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

In [None]:
# rsr_id = []
# pub_id = []
# for pub in full_resp:
#     for researcher in pub['researchers']:
#         rsr_id.append(researcher['id'])
#         pub_id.append(pub['id'])

In [None]:
# collab_pubs = pd.DataFrame({'rsr_id':rsr_id, 'publication_id':pub_id})
# collab_rsrs = collab_pubs.groupby('rsr_id')['publication_id'].nunique().reset_index()

### Analysis of Results

In [None]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # rsrs who are on publications with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in publications with at least one INCA-funded researcher".format(len(rsrs_all)))

In [None]:
# rsrs_comp_2 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_2)))

## Method 3: Counterfactual based on Funding Agencies

### Define Comparison Agencies:

Funding Agencies considered are NIH Wellcome Trust, Australian  NRMC, MRC in UK

In [19]:
funders = ['Wellcome Trust', 'National Health and Medical Research Council', 'Cancer Research UK', 
           'National Cancer Institute']

### Pull all Researchers Funded by these Agencies

In [None]:
string = "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
full_resp = pull_data(string=string, in_list=funders, in_type='funding agencies', return_type='grants')

In [None]:
fundr_grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

In [None]:
# Remove all other funders that might have appeared
fundr_grnts = fundr_grnts[(fundr_grnts['funder_name'].isin(funders))]

# For non-cancer-specific funders, resstrict to Cancer Grants:
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
fundr_grnts = fundr_grnts[(fundr_grnts['funder_name'].isin(general_funders)==False)
                          |(fundr_grnts['rcdc_names'].str.contains("Cancer"))]

# Restrict to grants funded between 2007 and 2012:
fundr_grnts = fundr_grnts[(pd.DatetimeIndex(fundr_grnts['start_date']).year>=2007)
                          & (pd.DatetimeIndex(fundr_grnts['start_date']).year<=2012)].reset_index(drop=True)

In [None]:
fundr_grnts['funder_name'].value_counts()

In [None]:
fundr_grnts.to_csv('../data/counterfactual/counterfactual_funded_grants.csv', index=False)

### Pull all Grants and Publications from these Researchers

In [20]:
fundr_grnts = pd.read_csv('../data/counterfactual/counterfactual_funded_grants.csv')
rsrs = fundr_grnts[fundr_grnts['rsr_id'].notnull()]['rsr_id'].drop_duplicates()

#### Grants

In [None]:
string = "search grants where researchers.id in [{}]"
string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

In [None]:
grnts = parse_grant(full_resp)
grnts = grnts.drop_duplicates().reset_index(drop=True)

In [None]:
grnts.describe(include='all')

In [None]:
grnts.to_csv('../data/counterfactual/counterfactual_researcher_grants.csv', index=False)

#### Publications

In [21]:
string = "search publications where researchers.id in [{}]"
string += " return publications[id+researchers+date+doi+supporting_grant_ids+times_cited+RCDC]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

Querying: 400-500/13027 researchers...
RESPONSE ERROR on i=4 and j=5.

Querying: 2000-2100/13027 researchers...
RESPONSE ERROR on i=20 and j=6.

Querying: 2200-2300/13027 researchers...
RESPONSE ERROR on i=22 and j=4.

Querying: 4200-4300/13027 researchers...
RESPONSE ERROR on i=42 and j=6.

Querying: 4300-4400/13027 researchers...
RESPONSE ERROR on i=43 and j=8.

Querying: 4500-4600/13027 researchers...

SSLError: HTTPSConnectionPool(host='app.dimensions.ai', port=443): Max retries exceeded with url: /api/auth.json (Caused by SSLError(SSLError("bad handshake: SysCallError(54, 'ECONNRESET')",),))

In [None]:
pubs = parse_publication(full_resp)
pubs = pubs.drop_duplicates().reset_index(drop=True)

In [None]:
pubs.describe(include='all')

In [None]:
pubs.to_csv('../data/counterfactual/counterfactual_researcher_publications.csv', index=False)

## Sandbox

In [97]:
import requests

#   The credentials to be used
login = {
    'username': 'nj995@nyu.edu',
    'password': password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
resp.raise_for_status()

#   Create http header using the generated token.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}


string = 'search publications where researchers.id in ["ur.01202737463.96", "ur.015674223332.58", "ur.013010145372.31", "ur.011313360332.51", "ur.01141226360.54", "ur.01112156155.34", "ur.0605030736.29", "ur.014301262332.40", "ur.0600074022.23", "ur.01156201704.01", "ur.01353745746.59", "ur.0604131004.05", "ur.0751761400.00", "ur.01015475066.34", "ur.01261146267.06", "ur.0711513572.31", "ur.0772354011.41", "ur.01014205075.45", "ur.01105337344.26", "ur.01256356077.11", "ur.01152212773.18", "ur.011402350541.85", "ur.010371661517.51", "ur.0672152455.86", "ur.0732472611.02", "ur.01116371135.90", "ur.01345516513.05", "ur.01064544102.71", "ur.0724620037.24", "ur.014251473547.60", "ur.01007427161.08", "ur.01236422707.67", "ur.01054322061.75", "ur.01305325772.60", "ur.013640560135.92", "ur.0777742233.90", "ur.013765300377.32", "ur.0666436611.32", "ur.01005132353.24", "ur.01247326511.02", "ur.0764171720.06", "ur.01203545107.19", "ur.0644467574.15", "ur.0760330620.74", "ur.01217272123.47", "ur.01177604614.47", "ur.015402573057.95", "ur.0673414137.91", "ur.01314074042.08", "ur.01316543003.47", "ur.0735524505.51", "ur.013237704577.12", "ur.0713342226.14", "ur.01332331073.28", "ur.0636726033.44", "ur.014135501347.25", "ur.01066132744.05", "ur.0747742220.33", "ur.01105375745.68", "ur.01024221070.23", "ur.0614762647.80", "ur.01177033022.74", "ur.01367601343.61", "ur.0631021217.43", "ur.01222740527.10", "ur.01306134155.52", "ur.0676201107.20", "ur.0620754753.04", "ur.0611157352.25", "ur.01152551730.11", "ur.01137652666.28", "ur.01356062255.31", "ur.01327063407.67", "ur.0657272552.37", "ur.014344631152.09", "ur.010131650267.18", "ur.0617234032.33", "ur.011275031217.12", "ur.01005154541.22", "ur.011436500207.26", "ur.01325041140.60", "ur.0651202231.24", "ur.01200371710.83", "ur.0626772374.33", "ur.07462720232.71", "ur.011650443431.18", "ur.01162310775.46", "ur.01006303753.26", "ur.01161510677.60", "ur.0641135726.40", "ur.01054264565.20", "ur.01301043554.11", "ur.0714770166.98", "ur.0742737707.24", "ur.01323024461.91", "ur.01054277520.73", "ur.0606644173.97", "ur.010460364674.11", "ur.01321750743.70", "ur.01311723006.34", "ur.0737601773.45", "ur.0737707452.39", "ur.01240467104.91", "ur.01306204206.71", "ur.01053465175.96", "ur.013613621777.30", "ur.01165147003.72", "ur.015214035357.43", "ur.012607214677.06", "ur.0627545135.27", "ur.0737550317.08", "ur.0733327250.37", "ur.01162111326.55", "ur.01122514433.47", "ur.01342300235.56", "ur.0757064066.46", "ur.016170013305.12", "ur.014210055537.45", "ur.01117075632.95", "ur.0576012305.16", "ur.0653312607.36", "ur.012706321057.78", "ur.01155232405.71", "ur.01366501232.37", "ur.01051150324.42", "ur.0760217130.82", "ur.01117465373.03", "ur.01312141733.72", "ur.0732625222.53", "ur.01072746376.40", "ur.01154461541.62", "ur.011062775537.73", "ur.01000615241.08", "ur.01016023671.74", "ur.01335506047.63", "ur.01173223632.99", "ur.012404244027.38", "ur.01036361256.24", "ur.01257242276.14", "ur.0731677142.47", "ur.01162104421.14", "ur.01173124211.89", "ur.01212722003.04", "ur.01314252321.11", "ur.0627174752.40", "ur.01051272131.44", "ur.01265573603.85", "ur.01355341014.38", "ur.015651250037.98", "ur.01221150143.06", "ur.013773340007.18", "ur.016157562137.30", "ur.01033627435.43", "ur.0575614052.26", "ur.0674066744.08", "ur.01345753154.94", "ur.01024724414.35", "ur.01036302714.52", "ur.01151576226.01", "ur.01370235414.07", "ur.0667714372.65", "ur.013111646177.43", "ur.01106612172.87", "ur.0734763244.08", "ur.0617760334.42", "ur.0777715440.15", "ur.01205717204.52", "ur.01322145604.54", "ur.01303264045.28", "ur.0674367554.40", "ur.01036476201.15", "ur.0732003253.57", "ur.01343474566.01", "ur.01261475636.05", "ur.011105234537.67", "ur.01342355762.29", "ur.01314172663.75", "ur.01040302023.28", "ur.0773432270.25", "ur.01001013200.18", "ur.01301442767.61", "ur.055716301.40", "ur.0701011757.22", "ur.01035174633.79", "ur.0634230053.54", "ur.0670205103.94", "ur.016564157752.80", "ur.014755714717.38", "ur.01261654265.12", "ur.01273465550.17", "ur.01320526713.47", "ur.01176711510.97", "ur.0761406544.55", "ur.011675057424.25", "ur.01045144766.67", "ur.01351460210.76", "ur.0636701477.86", "ur.0577057764.60", "ur.01075301637.18", "ur.01126153450.53", "ur.0756442603.68", "ur.01333255235.11", "ur.0745271445.97", "ur.01361667651.18", "ur.01346675446.22", "ur.0603240061.97", "ur.0611763734.56", "ur.0672335566.45", "ur.01045660030.75", "ur.01012477034.17", "ur.01355745464.73", "ur.0754734542.52", "ur.01214473752.81", "ur.01224647022.55", "ur.0601227214.26", "ur.01170627620.60", "ur.01304723307.97", "ur.01303346104.21", "ur.0773300443.74", "ur.01131061052.70", "ur.011675663547.74", "ur.0657552271.41", "ur.01242574260.49", "ur.01122614676.52", "ur.01075424275.69", "ur.01012225354.94", "ur.015001266322.20", "ur.01350551144.97", "ur.0663550706.74", "ur.01322660063.23", "ur.0744336325.10", "ur.01314621607.24", "ur.01255100412.07", "ur.0655145416.09", "ur.01236415450.50", "ur.01120775141.31", "ur.01241555620.25", "ur.01202316744.36", "ur.01250302263.06", "ur.0666244325.76", "ur.0627014214.42", "ur.013072654327.44", "ur.0667575342.85", "ur.0630052062.43", "ur.0631633437.74", "ur.01160326560.69", "ur.01077675340.04", "ur.01250205034.43", "ur.016074256064.51", "ur.0603254777.44"] return publications[id+researchers+date+doi+supporting_grant_ids+times_cited+RCDC] limit 1000 skip 7000'

#   Execute DSL query.
resp = requests.post(
    'https://app.dimensions.ai/api/dsl.json',
    data=string,
    headers=headers)

#   Display raw result
# print(resp.json())

In [98]:
resp

<Response [502]>