# Counterfactual Group Identification

## Python Setup

In [199]:
import requests
import time
import pandas as pd
from pandas import Series
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError
from datetime import datetime
import gender_guesser.detector as gender

## Importing ID Replacer

In [2]:
# Lookup table with 'old_id' and 'new_id' columns; id_replace() joins it on
# rsr_id == old_id and substitutes new_id wherever a match exists.
id_replacer = pd.read_csv('../data/id_replacer.csv')

In [3]:
def id_replace(df):
    """Return a copy of `df` whose `rsr_id` values are swapped for their replacement IDs.

    The module-level `id_replacer` table is left-joined on
    `rsr_id` == `old_id`. Rows that found a non-null `new_id` take it as their
    `rsr_id`; every other row keeps the value it had. The two helper columns
    brought in by the join are dropped before returning.
    """
    merged = pd.merge(df, id_replacer, how='left', left_on='rsr_id', right_on='old_id')
    has_replacement = merged['new_id'].notnull()
    merged['rsr_id'] = np.where(has_replacement, merged['new_id'], merged['rsr_id'])
    return merged.drop(['old_id', 'new_id'], axis=1)

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

The API Query functions are defined in the following code:

In [4]:
from api_query import execute_query
from api_query import pull_data

### API Parameters

In [5]:
# Size of each batch passed to an 'in' filter (handed to pull_data as max_in_items).
# The API requires 0 < items < 512; 100 stays well below that limit.
max_in_items = 100

# Records requested per query (handed to pull_data as max_return).
# The API rejects a limit above 1000.
max_return = 1000

# Ceiling on paging within one query (handed to pull_data as max_overall_returns).
# The API rejects an offset above 50000.
max_overall_returns = 50000

## Grant and Publication Parsing Function

In [6]:
def _normalize_researchers(researchers):
    """Return the researcher entries as dicts that all carry an 'id' key."""
    identified = []
    recovered = []
    for entry in researchers:
        if isinstance(entry, dict):
            if 'id' in entry:
                identified.append(entry)
            else:
                # A record without an ID cannot be attributed to anyone.
                recovered.append({'id': np.nan})
        else:
            # Bare value (normally the ID string itself) instead of a record.
            # Checked by type: "'id' in entry" would be a substring test here.
            recovered.append({'id': entry})
    # Records that already had an ID come first, wrapped values follow.
    return identified + recovered


def parse_grant(full_resp):
    """Flatten a list of grant records into one row per (researcher, funder).

    Parameters
    ----------
    full_resp : list of dict
        Grant records. 'id' is required on every grant that produces a row;
        'researchers', 'funders', 'RCDC', 'start_date', 'end_date' and
        'funding_usd' are optional. The records are not modified.

    Returns
    -------
    pandas.DataFrame
        Columns: rsr_id, grant_id, funder_name, funder_id, rcdc_codes,
        rcdc_names, start_date, end_date, funding_amount.

    Notes
    -----
    * A grant without 'researchers' (or without 'funders') still yields rows,
      with NaN in the corresponding columns. An empty list yields no rows.
    * Researchers may be given as dicts or as bare ID values; both are accepted.
    * The RCDC ids and names of a grant are joined with ';'. A grant without
      'RCDC' gets the string 'nan' in both columns.
    """
    columns = ['rsr_id', 'grant_id', 'funder_name', 'funder_id', 'rcdc_codes',
               'rcdc_names', 'start_date', 'end_date', 'funding_amount']
    placeholder = [{'id': np.nan, 'name': np.nan}]

    rows = []
    for grant in full_resp:
        researchers = _normalize_researchers(grant.get('researchers', [{'id': np.nan}]))
        grant_funders = grant.get('funders', placeholder)

        # Grant-level values are the same for every researcher/funder pair,
        # so they are computed once per grant.
        topics = grant.get('RCDC', placeholder)
        rcdc_codes = ";".join(str(topic.get('id', np.nan)) for topic in topics)
        rcdc_names = ";".join(str(topic.get('name', np.nan)) for topic in topics)
        start_date = grant.get('start_date', np.nan)
        end_date = grant.get('end_date', np.nan)
        funding_amount = float(grant.get('funding_usd', np.nan))

        for researcher in researchers:
            for funder in grant_funders:
                rows.append((researcher['id']
                             , grant['id']
                             , funder.get('name', np.nan)
                             , funder.get('id', np.nan)
                             , rcdc_codes
                             , rcdc_names
                             , start_date
                             , end_date
                             , funding_amount))

    return pd.DataFrame(rows, columns=columns)

In [7]:
def parse_publication(full_resp):
    """Flatten a list of publication records into one row per listed author.

    Parameters
    ----------
    full_resp : list of dict
        Publication records. Every field is optional. Authors are read from
        the first element of 'author_affiliations'. The records are not
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns: pub_id, pmid, date, doi, citations, nb_authors, rsr_id,
        rsr_country, rsr_affiliation, rsr_affiliation_id, supporting_grants.

    Notes
    -----
    * Only the first affiliation of each author is kept. An author with no
      affiliation gets NaN in the three affiliation columns.
    * `nb_authors` is the number of authors listed on the publication.
    * `supporting_grants` is the ';'-joined list of supporting grant IDs, or
      NaN when the publication carries no 'supporting_grant_ids' field.
    * A publication with no author list produces no rows.
    """
    columns = ['pub_id', 'pmid', 'date', 'doi', 'citations', 'nb_authors', 'rsr_id',
               'rsr_country', 'rsr_affiliation', 'rsr_affiliation_id', 'supporting_grants']

    rows = []
    for pub in full_resp:
        # A missing or empty 'author_affiliations' is treated as "no authors".
        author_groups = pub.get('author_affiliations') or [[]]
        authors = author_groups[0]

        grant_ids = pub.get('supporting_grant_ids')
        supporting_grants = np.nan if grant_ids is None else ';'.join(grant_ids)

        nb_authors = float(len(authors))
        citations = float(pub.get('times_cited', np.nan))

        for author in authors:
            # `or` short-circuits, so a missing key and an empty list are
            # both covered without touching author['affiliations'].
            affiliations = author.get('affiliations') or [{}]
            first_affiliation = affiliations[0]
            rows.append((pub.get('id', np.nan)
                         , pub.get('pmid', np.nan)
                         , pub.get('date', np.nan)
                         , pub.get('doi', np.nan)
                         , citations
                         , nb_authors
                         , author.get('researcher_id', np.nan)
                         , first_affiliation.get('country_code', np.nan)
                         , first_affiliation.get('name', np.nan)
                         , first_affiliation.get('id', np.nan)
                         , supporting_grants))

    return pd.DataFrame(rows, columns=columns)

In [8]:
def parse_rcdc(full_resp, data_type):
    """Build a long lookup table of RCDC topic names per record.

    Parameters
    ----------
    full_resp : list of dict
        Grant or publication records; 'id' and 'RCDC' are optional. The
        records are not modified.
    data_type : str
        Prefix of the ID column, e.g. 'grant' gives a 'grant_id' column.

    Returns
    -------
    pandas.DataFrame
        Columns: '<data_type>_id' and 'rcdc_name', one row per (record,
        topic). Topic names are stripped and upper-cased. Records without
        RCDC topics, and topics without a name, contribute no rows.
    """
    data_id = []
    rcdc_names = []
    for elem in full_resp:
        elem_id = elem.get('id', np.nan)
        for rcdc in elem.get('RCDC', []):
            name = rcdc.get('name')
            # NaN never compares equal to anything, so missing names have to
            # be detected with isnull rather than with != np.nan.
            if pd.isnull(name):
                continue
            data_id.append(elem_id)
            rcdc_names.append(str(name).strip().upper())

    return pd.DataFrame({data_type+'_id': data_id
                         , 'rcdc_name': rcdc_names})

## Counterfactual based on Funding Agencies

### Define Comparison Agencies:

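The funding agencies considered are the Wellcome Trust, the Australian National Health and Medical Research Council (NHMRC), Cancer Research UK, and the US National Cancer Institute (NCI).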

In [9]:
# Names of the comparison funders, as used in the funders.name filter below.
funders = [
    'Wellcome Trust',
    'National Health and Medical Research Council',
    'Cancer Research UK',
    'National Cancer Institute',
]

### Pull all Researchers Funded by these Agencies

In [10]:
# Grants from the comparison funders with a start year in 2007-2012.
# The {} placeholder is presumably filled by pull_data with batches of in_list.
string = (
    "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
    " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
)
full_resp = pull_data(
    string=string,
    in_list=funders,
    in_type='funding agencies',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 0-4/4 funding agencies...
Done !


In [13]:
funded_grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

funded_grnts_rcdc = parse_rcdc(full_resp, data_type='grant')

In [14]:
# Keep only rows that name a researcher.
has_researcher = funded_grnts['rsr_id'].notnull()
funded_grnts = funded_grnts[has_researcher]

# Keep only rows for the comparison funders: parse_grant emits one row per
# funder of a grant, so co-funders outside the list can appear.
from_comparison_funder = funded_grnts['funder_name'].isin(funders)
funded_grnts = funded_grnts[from_comparison_funder]

# Keep only grants whose start date falls in 2007-2012 (inclusive).
start_year = pd.DatetimeIndex(funded_grnts['start_date']).year
in_window = (start_year >= 2007) & (start_year <= 2012)
funded_grnts = funded_grnts[in_window].reset_index(drop=True)

Some of the counterfactual agencies (NCI, Cancer Research UK) are cancer-specific funders. Others however, such as the Wellcome Trust and the NHMRC, fund different types of medical research. For this second category of non-cancer-specific funders, we must restrict to grants on the topic of Cancer. I use the RCDC codes to restrict to Cancer-related grants.

In [15]:
# For non-cancer-specific funders, restrict to Cancer Grants:
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
is_general_funder = funded_grnts['funder_name'].isin(general_funders)
# A grant counts as cancer-related when any of its RCDC names contains "cancer".
# na=False keeps the mask strictly boolean if rcdc_names has missing values
# (e.g. after the table has been reloaded from CSV).
is_cancer_grant = funded_grnts['rcdc_names'].str.upper().str.contains("CANCER", na=False)
funded_grnts = funded_grnts[~is_general_funder | is_cancer_grant].reset_index(drop=True)

In [16]:
funded_grnts['funder_name'].value_counts()

National Cancer Institute                       14682
National Health and Medical Research Council     3112
Cancer Research UK                               2290
Wellcome Trust                                    416
Name: funder_name, dtype: int64

In [17]:
funded_grnts.head()

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,rsr_id,start_date
0,2018-11-30,grid.48336.3a,National Cancer Institute,3181288.0,grant.2482176,503;546;337;316;507,Cancer;Patient Safety;Bioengineering;Breast Ca...,ur.015107420202.13,2012-12-31
1,2018-11-30,grid.48336.3a,National Cancer Institute,3181288.0,grant.2482176,503;546;337;316;507,Cancer;Patient Safety;Bioengineering;Breast Ca...,ur.0755627762.12,2012-12-31
2,2018-11-30,grid.48336.3a,National Cancer Institute,3181288.0,grant.2482176,503;546;337;316;507,Cancer;Patient Safety;Bioengineering;Breast Ca...,ur.016035601337.41,2012-12-31
3,2017-11-30,grid.48336.3a,National Cancer Institute,896400.0,grant.2411281,526;414;344;503;507,Genetics;Colo-Rectal Cancer;Digestive Diseases...,ur.0671420757.16,2012-12-18
4,2013-12-16,grid.48336.3a,National Cancer Institute,45010.0,grant.2344785,,,ur.01363472252.68,2012-12-17


In [18]:
funded_grnts = id_replace(funded_grnts)

In [19]:
funded_grnts.to_csv('../data/counterfactual_funded_grants.csv', index=False)
funded_grnts_rcdc.to_csv('../data/topic_lookups/counterfactual_funded_grants_rcdc.csv', index=False)

### Pull all Grants, Publications, and Personal Information from these Researchers

In [10]:
# Reload the saved grant table and list each funded researcher once.
funded_grnts = pd.read_csv('../data/counterfactual_funded_grants.csv')
unique_rsr_ids = funded_grnts['rsr_id'].drop_duplicates()
rsrs = unique_rsr_ids.tolist()
print(len(rsrs))

13026


#### Grants

In [23]:
# Every grant (any funder, any year) attached to one of the funded researchers.
string = (
    "search grants where researchers.id in [{}]"
    " return grants[id+researchers+RCDC+funders+start_date+end_date+funding_usd]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='grants',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 13000-13026/13026 researchers...
Done !


In [24]:
grnts = parse_grant(full_resp).drop_duplicates().reset_index(drop=True)

grnts_rcdc = parse_rcdc(full_resp, data_type='grant')

In [25]:
grnts.describe(include='all')

Unnamed: 0,end_date,funder_id,funder_name,funding_amount,grant_id,rcdc_codes,rcdc_names,rsr_id,start_date
count,170924,171959,171959,165659.0,171959,171959.0,171959.0,171959,171671
unique,3172,199,198,,71803,27032.0,27032.0,77588,5442
top,2008-05-31,grid.48336.3a,National Cancer Institute,,grant.2439890,,,ur.01117731572.33,1977-12-01
freq,3944,40543,40543,,823,35243.0,35243.0,384,3935
mean,,,,10044510.0,,,,,
std,,,,21950740.0,,,,,
min,,,,0.0,,,,,
25%,,,,312782.0,,,,,
50%,,,,1113952.0,,,,,
75%,,,,11230670.0,,,,,


In [26]:
grnts = id_replace(grnts)

In [27]:
grnts.to_csv('../data/counterfactual_researcher_grants.csv', index=False)
grnts_rcdc.to_csv('../data/topic_lookups/counterfactual_researcher_grants_rcdc.csv', index=False)

#### Publications

In [28]:
# Every publication attached to one of the funded researchers; RCDC is
# requested as well so the same response can feed parse_rcdc.
string = (
    "search publications where researchers.id in [{}]"
    " return publications[id+doi+pmid+author_affiliations+date+supporting_grant_ids+times_cited+RCDC]"
)
full_resp = pull_data(
    string=string,
    in_list=rsrs,
    in_type='researchers',
    return_type='publications',
    max_in_items=max_in_items,
    max_return=max_return,
    max_overall_returns=max_overall_returns,
)

Querying: 400-500/13026 researchers...
RESPONSE ERROR on i=4 and j=2.

Querying: 1900-2000/13026 researchers...
RESPONSE ERROR on i=19 and j=2.

Querying: 2500-2600/13026 researchers...
RESPONSE ERROR on i=25 and j=2.

Querying: 4200-4300/13026 researchers...
RESPONSE ERROR on i=42 and j=3.

Querying: 4300-4400/13026 researchers...
RESPONSE ERROR on i=43 and j=4.

Querying: 6500-6600/13026 researchers...
RESPONSE ERROR on i=65 and j=3.

Querying: 7000-7100/13026 researchers...
RESPONSE ERROR on i=70 and j=6.

Querying: 8500-8600/13026 researchers...
RESPONSE ERROR on i=85 and j=2.

Querying: 9000-9100/13026 researchers...
RESPONSE ERROR on i=90 and j=2.

Querying: 9600-9700/13026 researchers...
RESPONSE ERROR on i=96 and j=3.

Querying: 10300-10400/13026 researchers...
RESPONSE ERROR on i=103 and j=2.

Querying: 11200-11300/13026 researchers...
RESPONSE ERROR on i=112 and j=1.

Querying: 11500-11600/13026 researchers...
RESPONSE ERROR on i=115 and j=1.

Querying: 13000-13026/13026 rese

In [29]:
pubs = parse_publication(full_resp).drop_duplicates().reset_index(drop=True)

pubs_rcdc = parse_rcdc(full_resp, data_type='pub')

In [30]:
pubs.describe(include='all')

Unnamed: 0,citations,date,doi,nb_authors,pmid,pub_id,rsr_affiliation,rsr_affiliation_id,rsr_country,rsr_id,supporting_grants
count,7221031.0,7220911,6909608,7221031.0,5564862.0,7221031,5511771,4418433,4418433,6842206,3508539
unique,,12114,1021104,,821223.0,1082124,413614,18576,174,1063280,302713
top,,2011-11-02,10.1007/bf03375463,,27770180.0,pub.1054508044,The University of Texas MD Anderson Cancer Center,grid.240145.6,US,ur.012724545020.23,grant.2438826
freq,,94839,3268,,3268.0,3268,118493,118493,2899045,2565,27455
mean,54.16,,,22.22274,,,,,,,
std,191.4654,,,148.4193,,,,,,,
min,0.0,,,1.0,,,,,,,
25%,2.0,,,6.0,,,,,,,
50%,15.0,,,8.0,,,,,,,
75%,47.0,,,13.0,,,,,,,


In [31]:
pubs = id_replace(pubs)

In [32]:
# Publication IDs: distinct (researcher, pmid) pairs, skipping rows without a pmid.
has_pmid = pubs['pmid'].notnull()
pub_ids = pubs.loc[has_pmid, ['rsr_id', 'pmid']].drop_duplicates()

In [33]:
pubs.to_csv('../data/counterfactual_researcher_publications.csv', index=False)
pubs_rcdc.to_csv('../data/topic_lookups/counterfactual_researcher_publications_rcdc.csv', index=False)
pub_ids.to_csv('../data/counterfactual_pub_ids.csv', index=False)

#### Personal Information

In [275]:
# Per-researcher results. All five lists receive exactly one value per
# researcher, appended together at the end of each iteration, so they stay
# the same length even if the loop is interrupted part-way through.
first_name = []
last_name = []
first_affiliation = []
first_country = []
first_pub_year = []

for position, rsr in enumerate(rsrs, start=1):

    print("Querying {} out of {} reseachers...".format(position, len(rsrs)), end = '\r')

    # Values for this researcher; NaN unless the API provides something.
    rsr_first_name = np.nan
    rsr_last_name = np.nan
    rsr_affiliation = np.nan
    rsr_country = np.nan
    rsr_pub_year = np.nan

    # Pull Name:
    time.sleep(2)
    string = "search grants where researchers.id in [\"{}\"]".format(rsr)
    string += " return researchers"
    resp = execute_query(string)
    for elem in resp['researchers']:
        if elem['id'] == rsr:
            rsr_first_name = elem.get('first_name', np.nan)
            rsr_last_name = elem.get('last_name', np.nan)
            break

    # Number of publications:
    string = "search publications where researchers.id in [\"{}\"] and date is not empty".format(rsr)
    string += " return publications sort by date limit 1"
    resp = execute_query(string)
    nb_pubs = resp['_stats']['total_count']

    # Pull First Country, Affiliation, and Pub Date:
    time.sleep(2)
    if nb_pubs != 0:
        # Request the last page (up to 500 records) of the date-sorted results.
        # NOTE(review): the code treats the last record of that page as the
        # earliest publication, i.e. it assumes newest-first ordering — confirm.
        string = "search publications where researchers.id in [\"{}\"] and date is not empty".format(rsr)
        string += " return publications[author_affiliations+date] sort by date"
        string += " limit 500 skip {}".format(max(nb_pubs-500, 0))
        resp = execute_query(string)
        publications = resp['publications']
        if publications:
            rsr_pub_year = datetime.strptime(publications[-1]['date'], '%Y-%m-%d').year

        # Walk from the last record backwards and stop at the first publication
        # on which this researcher is listed with at least one affiliation.
        for pub in reversed(publications):
            author_groups = pub.get('author_affiliations') or [[]]
            author = next((elem for elem in author_groups[0]
                           if elem.get('researcher_id') == rsr), None)
            if author is None:
                continue
            affiliations = author.get('affiliations') or []
            if affiliations:
                rsr_country = affiliations[0].get('country_code', np.nan)
                rsr_affiliation = affiliations[0].get('name', np.nan)
                break

    first_name.append(rsr_first_name)
    last_name.append(rsr_last_name)
    first_affiliation.append(rsr_affiliation)
    first_country.append(rsr_country)
    first_pub_year.append(rsr_pub_year)

print("\nDone !")

rsr_info = pd.DataFrame({'rsr_id':rsrs
                         , 'first_name':first_name
                         , 'last_name': last_name
                         , 'affiliation': first_affiliation
                         , 'country': first_country
                         , 'career_start': first_pub_year
                        })

Querying 59 out of 13026 reseachers...

KeyboardInterrupt: 

In [277]:
# Build the table from whatever the (possibly interrupted) loop collected.
# Each list holds one value per researcher in rsrs order, so trimming all of
# them to the shortest length keeps only fully processed researchers.
n_done = min(len(first_name), len(last_name), len(first_affiliation),
             len(first_country), len(first_pub_year))
rsr_info = pd.DataFrame({'rsr_id':rsrs[:n_done]
                         , 'first_name':first_name[:n_done]
                         , 'last_name': last_name[:n_done]
                         , 'affiliation': first_affiliation[:n_done]
                         , 'country': first_country[:n_done]
                         , 'career_start': first_pub_year[:n_done]
                        })

In [265]:
rsr_info

Unnamed: 0,affiliation,career_start,country,first_name,last_name,rsr_id
0,Kansas State University,1975,US,Linda G,Shapiro,ur.015107420202.13
1,Maine Medical Center,1986,US,Donald L,Weaver,ur.0755627762.12


In [None]:
# Gender Imputation

# Detector whose get_gender() is applied to each first name below.
d = gender.Detector()

# Gender Abbreviation Table
# Collapses the detector's labels into M / F / UNKNOWN.
# NOTE(review): 'andy' is presumably gender_guesser's label for androgynous
# names — confirm against the library documentation.
gender_abbr = pd.DataFrame({'gender': ['male', 'mostly_male', 'unknown', 'andy', 'mostly_female', 'female']
                            , 'gender_abbr': ['M', 'M', 'UNKNOWN', 'UNKNOWN', 'F', 'F']})

In [None]:
# first_name values can include a middle initial (e.g. "Linda G"), which the
# detector does not recognise as a name. Only the leading given name is looked
# up; missing or blank names are labelled 'unknown' directly.
rsr_info['gender'] = rsr_info['first_name'].apply(
    lambda x: d.get_gender(x.split()[0]) if isinstance(x, str) and x.strip() else 'unknown')
# Translate the detector label into its abbreviation and keep it as 'gender'.
rsr_info = pd.merge(rsr_info, gender_abbr, how='left', on='gender')
del rsr_info['gender']
rsr_info.rename(columns={'gender_abbr':'gender'}, inplace=True)

In [None]:
rsr_info['gender'].value_counts(normalize=True)

In [None]:
rsr_info.to_csv('../data/counterfactual_researcher_info.csv', index=False)

## Sandbox