# Counterfactual Group Identification

## Python Setup

In [1]:
import requests
import time
import pandas as pd
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError

## Connecting to API

### Login Information

Fetch my password from a separate .txt file:

In [2]:
with open('../data/password.txt', 'r') as myfile:
    password = myfile.read().strip()

In [3]:
login = {
    'username': 'nj995@nyu.edu',
    'password': password
}

### API Parameters

In [4]:
def execute_query(string):
    # Send credentials to login url to retrieve token.
    resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
    resp.raise_for_status()

    # Create http header using the generated token.
    headers = {'Authorization': "JWT " + resp.json()['token']}   
    
    # Execute DSL query.    
    resp = requests.post('https://app.dimensions.ai/api/dsl.json', data=string, headers=headers)
    
    try:
        resp = resp.json()
    except JSONDecodeError:
        resp = "RESPONSE ERROR"
    
    return resp

In [5]:
# Filter operator 'in' requires 0 < items < 512
max_in_items = 250

# Limit exceeds maximum allowed limit 1000
max_return = 1000

# Offset cannot exceed 50000
max_overall_returns = 50000

## Pulling Comparison Group Function

In [6]:
def pull_data(string, in_list, in_type, return_type, max_in_items=max_in_items, max_return=max_return):
    
    full_resp = []

    for i in range(math.ceil(len(in_list)/max_in_items)):
        min_i, max_i = i*max_in_items, min((i+1)*max_in_items, len(in_list))
        print('Querying: {}-{}/{} {}...'.format(min_i, max_i, len(in_list), in_type), end = '\r')

        in_t = in_list[min_i:max_i]
        string_t = "\"" + "\", \"".join(in_t) + "\""
        query = string.format(string_t)

        j = 0
        loop = True
        while loop == True:
            query_t = query + " limit {} skip {}".format(max_return, max_return*j)
            resp = execute_query(query_t)
            if resp == "RESPONSE ERROR":
                print("\nRESPONSE ERROR on i={} and j={}.\n".format(i, j))
            else:
                full_resp.extend(resp[return_type])

                if len(resp[return_type])<max_return:
                    loop = False
            j += 1
            
            if max_return*(j+1)>max_overall_returns:
                loop = False
            
            time.sleep(1)
        
        count = resp['_stats']['total_count']        
        if resp['_stats']['total_count']>=max_overall_returns:
            print("\nATTENTION! {} {} overall, pulled only {}.\n".format(count, return_type, max_return*j-1))

    print("\nDone !")
        
    return full_resp

## Grant and Publication Parsing Function

In [7]:
def parse_grant(full_resp):
    rsr_id = []
    grant_id = []
    funder_name = []
    funder_id = []
    rcdc_codes = []
    rcdc_names = []
    start_dates = []
    end_dates = []
    funding_amount = []
    for grant in full_resp:
        if 'RCDC' not in grant:
            grant['RCDC'] = [{'id':np.nan, 'name':np.nan}]
        if 'researchers' not in grant:
            grant['researchers'] = [{'id':np.nan}]
        for researcher in grant['researchers']: 
            if 'id' not in researcher:
                tmp = researcher
                grant['researchers'].append({'id':tmp})
        if 'start_date' not in grant:
            grant['start_date'] = np.nan
        if 'end_date' not in grant:
            grant['end_date'] = np.nan
        if 'funding_amount' not in grant:
            grant['funding_amount'] = np.nan
        for i in range(len(grant['researchers'])-1, -1, -1):
            if 'id' not in grant['researchers'][i]:
                del grant['researchers'][i]
        if 'funders' not in grant:
            grant['funders'] = [{'id':np.nan,'name':np.nan}]
        for researcher in grant['researchers']:
            for funder in grant['funders']:
                rsr_id.append(researcher['id'])
                grant_id.append(grant['id'])
                funder_id.append(funder['id'])
                funder_name.append(funder['name'])
                start_dates.append(grant['start_date'])
                end_dates.append(grant['end_date'])
                funding_amount.append(grant['funding_amount'])
                rcdc_codes_t = []
                rcdc_names_t = []
                for rcdc in grant['RCDC']:
                    rcdc_codes_t.append(str(rcdc['id']))
                    rcdc_names_t.append(str(rcdc['name']))
                rcdc_codes.append("; ".join(rcdc_codes_t))
                rcdc_names.append("; ".join(rcdc_names_t))
                
    grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id
                          , 'funder_name':funder_name, 'funder_id':funder_id
                          , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                          , 'start_date':start_dates, 'end_date':end_dates
                          , 'funding_amount':funding_amount
                         })

    return grnts

In [8]:
def parse_publication(full_resp):
    rsr_id = []
    pub_id = []
    rcdc_codes = []
    rcdc_names = []
    dates = []
    citations = []
    dois = []
    supporting_grants = []
    for pub in full_resp:
        if 'id' not in pub:
            pub['id'] = np.nan
        if 'RCDC' not in pub:
            pub['RCDC'] = [{'id':np.nan, 'name':np.nan}]
        if 'researchers' not in pub:
            pub['researchers'] = [{'id':np.nan}]
        for researcher in pub['researchers']: 
            if 'id' not in researcher:
                tmp = researcher
                pub['researchers'].append({'id':tmp})
        if 'date' not in pub:
            pub['date'] = np.nan
        if 'times_cited' not in pub:
            pub['times_cited'] = np.nan
        if 'doi' not in pub:
            pub['doi'] = np.nan
        if 'supporting_grant_ids' not in pub:
            pub['supporting_grant_ids'] = [np.nan]
        for i in range(len(pub['researchers'])-1, -1, -1):
            if 'id' not in pub['researchers'][i]:
                del pub['researchers'][i]
        for researcher in pub['researchers']:
            rsr_id.append(researcher['id'])
            pub_id.append(pub['id'])
            dates.append(pub['date'])
            citations.append(pub['times_cited'])
            dois.append(pub['doi'])
            supporting_grants.append(pub['supporting_grant_ids'])
            rcdc_codes_t = []
            rcdc_names_t = []
            for rcdc in pub['RCDC']:
                rcdc_codes_t.append(str(rcdc['id']))
                rcdc_names_t.append(str(rcdc['name']))
            rcdc_codes.append("; ".join(rcdc_codes_t))
            rcdc_names.append("; ".join(rcdc_names_t))
                
    pubs = pd.DataFrame({'rsr_id':rsr_id, 'pub_id':pub_id
                         , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names
                         , 'date':dates, 'citations':citations, 'doi':dois
                         , 'supporting_grants':supporting_grants
                         })
    pubs['supporting_grants'] = pubs['supporting_grants'].apply(lambda x: np.nan if x == [np.nan] else  '; '.join(x))

    return pubs

## Method 1: Pulling all Researchers who Collaborated on Grants with INCA-Funded Researchers

### All Researchers on Grants Awarded to INCA Researchers

In [9]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Grants Awarded by these Researchers

In [10]:
# string = "search grants where researchers.id in [{}] return grants[id+researchers+RCDC]"

In [11]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

In [12]:
# rsr_id = []
# grant_id = []
# rcdc_codes = []
# rcdc_names = []
# for grant in full_resp:
#     for researcher in grant['researchers']:
#         rsr_id.append(researcher['id'])
#         grant_id.append(grant['id'])
#         rcdc_codes_t = []
#         rcdc_names_t = []
#         if 'RCDC' in grant:
#             for rcdc in grant['RCDC']:
#                 rcdc_codes_t.append(str(rcdc['id']))
#                 rcdc_names_t.append(rcdc['name'])
#             rcdc_codes.append("; ".join(rcdc_codes_t))
#             rcdc_names.append("; ".join(rcdc_names_t))
#         else:
#             rcdc_codes.append("")
#             rcdc_names.append("")

In [13]:
# collab_grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id, 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names})
# collab_rsrs = collab_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()

### Analysis of Results

In [14]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # INCA-funded rsrs who have grants with the API:
# # rsrs_inca_t = inca_rsrs_grants['rsr_id'].drop_duplicates().reset_index(drop = True)
# # print("{} INCA-funded researchers appear to have grants with the Dimensions API".format(len(rsrs_inca_t)))

# # rsrs who are on grants where with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in grants with at least one INCA-funded researcher".format(len(rsrs_all)))

In [15]:
# rsrs_comp_1 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_1)))

## Method 2: Pulling all Researchers who Co-Authored with INCA-Funded Researchers

### All Researchers on Grants Awarded to INCA Researchers

In [16]:
# rsr_info = pd.read_csv('../data/researchers.csv')
# rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

### Pull all Publications by these Researchers

In [17]:
# string = "search publications where researchers.id in [{}] return publications[id+researchers]"

In [18]:
# full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

In [19]:
# rsr_id = []
# pub_id = []
# for pub in full_resp:
#     for researcher in pub['researchers']:
#         rsr_id.append(researcher['id'])
#         pub_id.append(pub['id'])

In [20]:
# collab_pubs = pd.DataFrame({'rsr_id':rsr_id, 'publication_id':pub_id})
# collab_rsrs = collab_pubs.groupby('rsr_id')['publication_id'].nunique().reset_index()

### Analysis of Results

In [21]:
# # INCA-funded rsrs:
# rsrs_inca = rsrs
# print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# # rsrs who are on publications with >1 INCA-funded rsrs
# rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} researchers participated in publications with at least one INCA-funded researcher".format(len(rsrs_all)))

In [22]:
# rsrs_comp_2 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# # Number of comparison rsrs identified:
# print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_2)))

## Method 3: Counterfactual based on Funding Agencies

### Define Comparison Agencies:

Funding Agencies considered are NIH Wellcome Trust, Australian  NRMC, MRC in UK

In [23]:
funders = ['Wellcome Trust', 'National Health and Medical Research Council', 'Cancer Research UK', 
           'National Cancer Institute']

### Pull all Researchers Funded by these Agencies

In [24]:
string = "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_amount]"
full_resp = pull_data(string=string, in_list=funders, in_type='funding agencies', return_type='grants')

Querying: 0-4/4 funding agencies...
Done !


In [25]:
fundr_grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

In [26]:
# Remove all other funders that might have appeared
fundr_grnts[(fundr_grnts['funder_name'].isin(funders))]

# For non-cancer-specific funders, resstrict to Cancer Grants:
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
fundr_grnts = fundr_grnts[(fundr_grnts['funder_name'].isin(general_funders)==False)
                          |(fundr_grnts['rcdc_names'].str.contains("Cancer"))]

# Restrict to grants funded between 2007 and 2012:
fundr_grnts = fundr_grnts[(pd.DatetimeIndex(fundr_grnts['start_date']).year>=2007)
                          & (pd.DatetimeIndex(fundr_grnts['start_date']).year<=2012)].reset_index(drop=True)

In [27]:
fundr_grnts.to_csv('../data/counterfactual/counterfactual_funded_grants.csv', index=False)

### Pull all Grants and Publications from these Researchers

In [28]:
fundr_grnts = pd.read_csv('../data/counterfactual/counterfactual_funded_grants.csv')
rsrs = fundr_grnts[fundr_grnts['rsr_id'].notnull()]['rsr_id'].drop_duplicates()

#### Grants

In [29]:
string = "search grants where researchers.id in [{}]"
string += " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_amount]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

Querying: 12000-12165/12165 researchers...
Done !


In [30]:
grnts = parse_grant(full_resp)
grnts = grnts.drop_duplicates().reset_index(drop=True)

In [31]:
grnts.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,rsr_id,start_date
count,110731,111688,111688,105604.0,111688,111688.0,111688.0,111688,111411
unique,3026,201,200,,65766,24308.0,24308.0,37112,5143
top,2017-01-01,grid.48336.3a,National Cancer Institute,,grant.3575135,,,ur.01117731572.33,2009-01-01
freq,2390,28046,28046,,50,19453.0,19453.0,384,3091
mean,,,,1417444.0,,,,,
std,,,,4695819.0,,,,,
min,,,,0.0,,,,,
25%,,,,203055.5,,,,,
50%,,,,489364.0,,,,,
75%,,,,1359881.0,,,,,


In [32]:
grnts.to_csv('../data/counterfactual/counterfactual_researcher_grants.csv', index=False)

#### Publications

In [33]:
string = "search publications where researchers.id in [{}]"
string += " return publications[id+researchers+date+doi+supporting_grant_ids+times_cited+RCDC]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

Querying: 250-500/12165 researchers...
RESPONSE ERROR on i=1 and j=6.

Querying: 1750-2000/12165 researchers...
RESPONSE ERROR on i=7 and j=8.

Querying: 2250-2500/12165 researchers...
RESPONSE ERROR on i=9 and j=6.

Querying: 3750-4000/12165 researchers...
RESPONSE ERROR on i=15 and j=7.

Querying: 4000-4250/12165 researchers...
RESPONSE ERROR on i=16 and j=7.

Querying: 6000-6250/12165 researchers...
RESPONSE ERROR on i=24 and j=6.

Querying: 7750-8000/12165 researchers...
RESPONSE ERROR on i=31 and j=5.


RESPONSE ERROR on i=31 and j=6.

Querying: 8250-8500/12165 researchers...
RESPONSE ERROR on i=33 and j=5.

Querying: 8750-9000/12165 researchers...
RESPONSE ERROR on i=35 and j=13.

Querying: 9500-9750/12165 researchers...
RESPONSE ERROR on i=38 and j=5.

Querying: 10250-10500/12165 researchers...
RESPONSE ERROR on i=41 and j=3.

Querying: 10500-10750/12165 researchers...
RESPONSE ERROR on i=42 and j=4.

Querying: 12000-12165/12165 researchers...
Done !


In [34]:
pubs = parse_publication(full_resp)
pubs = pubs.drop_duplicates().reset_index(drop=True)

In [35]:
pubs.head()

Unnamed: 0,citations,date,doi,pub_id,rcdc_codes,rcdc_names,rsr_id,supporting_grants
0,0,2018-12-02,10.1038/s41598-018-22599-w,pub.1101436524,338; 363; 337; 526,Biotechnology; Human Genome; Bioengineering; G...,ur.0644660213.73,grant.2440602
1,0,2018-12-02,10.1038/s41598-018-22599-w,pub.1101436524,338; 363; 337; 526,Biotechnology; Human Genome; Bioengineering; G...,ur.01244615341.63,grant.2440602
2,0,2018-12-02,10.1038/s41598-018-22599-w,pub.1101436524,338; 363; 337; 526,Biotechnology; Human Genome; Bioengineering; G...,ur.0655465515.31,grant.2440602
3,0,2018-12-02,10.1038/s41598-018-22599-w,pub.1101436524,338; 363; 337; 526,Biotechnology; Human Genome; Bioengineering; G...,ur.01020001441.06,grant.2440602
4,0,2018-12-02,10.1038/s41598-018-22599-w,pub.1101436524,338; 363; 337; 526,Biotechnology; Human Genome; Bioengineering; G...,ur.01214442221.77,grant.2440602


In [36]:
pubs.describe(include='all')

Unnamed: 0,citations,date,doi,pub_id,rcdc_codes,rcdc_names,rsr_id,supporting_grants
count,6015826.0,6015717,5760441,6015826,6015826.0,6015826.0,6015826,2899105
unique,,11607,887282,940153,214263.0,214263.0,966489,261864
top,,2011-08-02,10.1080/15548627.2015.1100356,pub.1058406632,,,ur.012724545020.23,grant.2438826
freq,,81960,1554,1554,1776352.0,1776352.0,2370,22545
mean,53.15537,,,,,,,
std,186.8054,,,,,,,
min,0.0,,,,,,,
25%,2.0,,,,,,,
50%,15.0,,,,,,,
75%,46.0,,,,,,,


In [37]:
pubs.to_csv('../data/counterfactual/counterfactual_researcher_publications.csv', index=False)

## Sandbox