# Counterfactual Group Identification

## Python Setup

In [195]:
import requests
import time
import pandas as pd
pd.options.display.max_rows = 100
import math
import numpy as np

## Connecting to API

### Login Information

Fetch my password from a separate .txt file:

In [3]:
with open('../data/password.txt', 'r') as myfile:
    password = myfile.read().strip()

In [4]:
login = {
    'username': 'nj995@nyu.edu',
    'password': password
}

### API Parameters

In [6]:
def execute_query(string):
    # Send credentials to login url to retrieve token.
    resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
    resp.raise_for_status()

    # Create http header using the generated token.
    headers = {'Authorization': "JWT " + resp.json()['token']}   
    
    # Execute DSL query.    
    resp = requests.post('https://app.dimensions.ai/api/dsl.json', data= string, headers=headers)
    resp = resp.json()
    return resp

In [5]:
# Filter operator 'in' requires 0 < items < 512
max_in_items = 512

# Limit exceeds maximum allowed limit 1000
max_return = 1000

## Pulling Comparison Group

In [154]:
def pull_data(string, in_list, in_type, return_type, max_in_items=max_in_items, max_return=max_return):
    
    full_resp = []

    for i in range(math.ceil(len(in_list)/max_in_items)):
        min_i, max_i = i*max_in_items, min((i+1)*max_in_items, len(in_list))
        print('Querying: {}-{}/{} {}'.format(min_i, max_i, len(in_list), in_type), end = '\r')

        in_t = in_list[min_i:max_i]
        string_t = "\"" + "\", \"".join(in_t) + "\""
        query = string.format(string_t)

        j = 0
        loop = True
        while loop == True:
            query_t = query + " limit {} skip {}".format(max_return, max_return*j)
            resp = execute_query(query_t)
            full_resp.extend(resp[return_type])

            if len(resp[return_type])<max_return:
                loop = False
            j += 1
            time.sleep(1)
        loop = False
            
    print("\nDone!")
    return full_resp

### Method 1: Pulling all Researchers who Collaborated on Grants with INCA-Funded Researchers

#### All Researchers on Grants Awarded to INCA Researchers

In [155]:
rsr_info = pd.read_csv('../data/researchers.csv')
rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

#### Pull all Grants Awarded by these Researchers

In [146]:
string = "search grants where researchers.id in [{}] return grants[id+researchers+RCDC]"

In [147]:
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants')

Querying: 1024-1069/1069 researchers
Done!


In [127]:
rsr_id = []
grant_id = []
rcdc_codes = []
rcdc_names = []
for grant in full_resp:
    for researcher in grant['researchers']:
        rsr_id.append(researcher['id'])
        grant_id.append(grant['id'])
        rcdc_codes_t = []
        rcdc_names_t = []
        if 'RCDC' in grant:
            for rcdc in grant['RCDC']:
                rcdc_codes_t.append(str(rcdc['id']))
                rcdc_names_t.append(rcdc['name'])
            rcdc_codes.append("; ".join(rcdc_codes_t))
            rcdc_names.append("; ".join(rcdc_names_t))
        else:
            rcdc_codes.append("")
            rcdc_names.append("")

In [132]:
collab_grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id, 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names})
collab_rsrs = collab_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()

#### Analysis of Results

In [136]:
# INCA-funded rsrs:
rsrs_inca = rsrs
print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# INCA-funded rsrs who have grants with the API:
# rsrs_inca_t = inca_rsrs_grants['rsr_id'].drop_duplicates().reset_index(drop = True)
# print("{} INCA-funded researchers appear to have grants with the Dimensions API".format(len(rsrs_inca_t)))

# rsrs who are on grants where with >1 INCA-funded rsrs
rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
print("{} researchers participated in grants with at least one INCA-funded researcher".format(len(rsrs_all)))

1069 INCA-funded researchers (with Dimensions IDs)
588 researchers participated in grants with at least one INCA-funded researcher


In [137]:
rsrs_comp_1 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# Number of comparison rsrs identified:
print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_1)))

Size of comparison group identified: 379 researchers


### Method 2: Pulling all Researchers who Co-Authored with INCA-Funded Researchers

#### All Researchers on Grants Awarded to INCA Researchers

In [156]:
rsr_info = pd.read_csv('../data/researchers.csv')
rsrs = list(rsr_info[rsr_info['id'].notnull()]['id'].drop_duplicates().reset_index(drop = True))

#### Pull all Grants Awarded by these Researchers

In [157]:
string = "search publications where researchers.id in [{}] return publications[id+researchers]"

In [159]:
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications')

Querying: 1024-1069/1069 researchers
Done!


In [162]:
rsr_id = []
pub_id = []
for pub in full_resp:
    for researcher in pub['researchers']:
        rsr_id.append(researcher['id'])
        pub_id.append(pub['id'])

In [164]:
collab_pubs = pd.DataFrame({'rsr_id':rsr_id, 'publication_id':pub_id})
collab_rsrs = collab_pubs.groupby('rsr_id')['publication_id'].nunique().reset_index()

#### Analysis of Results

In [165]:
# INCA-funded rsrs:
rsrs_inca = rsrs
print("{} INCA-funded researchers (with Dimensions IDs)".format(len(rsrs_inca)))

# rsrs who are on publications with >1 INCA-funded rsrs
rsrs_all = collab_rsrs['rsr_id'].drop_duplicates().reset_index(drop = True)
print("{} researchers participated in publications with at least one INCA-funded researcher".format(len(rsrs_all)))

1069 INCA-funded researchers (with Dimensions IDs)
164485 researchers participated in publications with at least one INCA-funded researcher


In [166]:
rsrs_comp_2 = rsrs_all[rsrs_all.apply(lambda x: x not in list(rsrs_inca))].reset_index(drop = True)

# Number of comparison rsrs identified:
print("Size of comparison group identified: {} researchers".format(len(rsrs_comp_2)))

Size of comparison group identified: 163494 researchers


### Method 3: Counterfactual based on Funding Agencies

#### Define Comparison Agencies:

Funding Agencies considered are NIH Wellcome Trust, Australian  NRMC, MRC in UK

In [180]:
funders = ['MRC', 'Wellcome Trust', 'Australian Research Council'
           , 'National Health and Medical Research Council', 'Cancer Research UK']

#### Pull all Researchers Funded by these Agencies

In [196]:
string = "search grants where funding_org_name in [{}] return grants[id+researchers+RCDC+funders]"

In [197]:
full_resp = pull_data(string=string, in_list=funders, in_type='funding agencies', return_type='grants')

Querying: 0-5/5 funding agencies
Done!


In [216]:
rsr_id = []
grant_id = []
funder_name = []
funder_id = []
rcdc_codes = []
rcdc_names = []
for grant in full_resp:
    if 'RCDC' not in grant:
        grant['RCDC'] = [{'id':np.nan, 'name':np.nan}]
    if 'researcher' not in grant:
        grant['researchers'] = [{'first_name':np.nan,'id':np.nan,'last_name':np.nan}]
    if 'funders' not in grant:
        grant['funders'] = [{'acronym': '','country_name':np.nan,'id':np.nan,'name':np.nan}]
    for researcher in grant['researchers']:
        for funder in grant['funders']:
            rsr_id.append(researcher['id'])
            grant_id.append(grant['id'])
            funder_id.append(funder['id'])
            funder_name.append(funder['name'])
            rcdc_codes_t = []
            rcdc_names_t = []
            for rcdc in grant['RCDC']:
                rcdc_codes_t.append(str(rcdc['id']))
                rcdc_names_t.append(str(rcdc['name']))
            rcdc_codes.append("; ".join(rcdc_codes_t))
            rcdc_names.append("; ".join(rcdc_names_t))

In [225]:
fundr_grnts = pd.DataFrame({'rsr_id':rsr_id, 'grant_id':grant_id
                            , 'funder_name':funder_name, 'funder_id':funder_id
                            , 'rcdc_codes':rcdc_codes, 'rcdc_names':rcdc_names})
fundr_rsrs = fundr_grnts.groupby('rsr_id')['grant_id'].nunique().reset_index()

In [222]:
fundr_grnts['funder_name'].value_counts()

Australian Research Council                     25415
National Health and Medical Research Council    25400
Wellcome Trust                                  24264
Medical Research Council                         8313
Cancer Research UK                               4499
Name: funder_name, dtype: int64

## Sandbox

In [176]:
grnt_dtl = pd.read_csv('../data/inca_grants_details.csv', low_memory=False)

In [177]:
grnt_dtl['Funder'].value_counts()

French National Cancer Institute                                                      848
Ministère des Affaires sociales et de la Santé                                        524
French Institute of Health and Medical Research                                       162
French National Research Agency                                                       122
Swiss National Science Foundation                                                      42
European Research Council                                                              18
Worldwide Cancer Research                                                              14
German Research Foundation                                                             14
Canadian Institutes of Health Research                                                 12
Human Frontiers Science Program                                                         8
NATIONAL CANCER INSTITUTE                                                               6
NATIONAL I

In [179]:
grnt_dtl[grnt_dtl['Funder']=="National Health and Medical Research Council"]

Unnamed: 0,INCA ID,prenom_port,nom_port,organisme_port,Dimensions Grant ID,Title,Abstract,Funder,Reference,Research Org Names,Research Org IDs,FOR,RCDC,Funding Amount ($),Start Date,End Date
821,inca_577,Antoine,GESSAIN,Institut Pasteur de Paris,grant.6721777,Infection with the Human T cell Lymphotropic V...,The Human T Lymphotropic Virus type 1 is endem...,National Health and Medical Research Council,1088517,Baker IDI Heart and Diabetes Institute,grid.1051.5,1108 Medical Microbiology,Rare Diseases;Infectious Diseases,677887.0,2015-01-01,2018-01-01


In [223]:
string = "search grants where funding_org_name in [\"Cancer Research UK\"] return grants[id+researchers+RCDC+funders]"