# Data Extraction using Dimensions API

## Python Setup

In [184]:
import requests
import time
import pandas as pd
from pandas import Series
pd.options.display.max_rows = 100
import math
import numpy as np
from json import JSONDecodeError
from datetime import datetime
import gender_guesser.detector as gender

## Importing ID Replacer

In [185]:
id_replacer = pd.read_csv('../data/id_replacer.csv')

In [186]:
def id_replace(df):
    df = pd.merge(df, id_replacer, how='left', left_on='rsr_id', right_on='old_id')
    df['rsr_id'] = np.where(df['new_id'].notnull(), df['new_id'], df['rsr_id'])
    del df['old_id'], df['new_id']
    return df

## Connecting to API

For full documentation: https://docs.dimensions.ai/dsl/1.8.0/

The API Query functions are defined in the following code:

In [187]:
from api_query import execute_query
from api_query import pull_data

### API Parameters

In [188]:
# Filter operator 'in' requires 0 < items < 512
max_in_items = 100

# Limit exceeds maximum allowed limit 1000
max_return = 1000

# Offset cannot exceed 50000
max_overall_returns = 50000

## Define Comparison Agencies:

Funding Agencies considered are NIH Wellcome Trust, Australian  NRMC, MRC in UK

In [189]:
counter_funders = ['Wellcome Trust', 'National Health and Medical Research Council', 'Cancer Research UK', 
                   'National Cancer Institute']

## Funded Grants and Researchers

### INCA

In [190]:
inca_funded_grnts = pd.read_csv('../data/inca_funded_grants.csv', low_memory=False)

In [191]:
inca_funded_grnts['last_name'] = inca_funded_grnts['last_name'].str.replace('-', ' ')

In [192]:
inca_funded_grnts.head()

Unnamed: 0,first_name,last_name,grant_id,funding_amount,start_date,end_date,rsr_id,funder_name
0,Jérôme,ABADIE,grant.7426242,65061.0,2011-12-13,2013-06-12,ur.0642054564.81,INCa/INSERM/DGOS
1,Julien,ADAM,grant.7426178,240110.0,2011-11-01,2014-11-01,ur.01177206360.47,INCa/INSERM/DGOS
2,Antoine,ADENIS,grant.7154464,248109.0,2010-06-01,2013-06-01,ur.01303404424.36&ur.01067706306.01,INCa/INSERM/DGOS
3,Eric,ADRIAENSSENS,grant.7154483,400306.0,2010-11-04,2013-11-04,ur.0673152200.72,INCa/INSERM/DGOS
4,Eric,ADRIAENSSENS,grant.7154359,140617.0,2007-12-18,2010-06-30,ur.0673152200.72,INCa/INSERM/DGOS


### Counterfactual based on Funding Agencies

In [193]:
string = "search grants where funders.name in [{}] and start_year>=2007 and start_year<=2012"
string += " return grants[id+researchers+funders+start_date+end_date+funding_usd+RCDC]"
full_resp = pull_data(string=string, in_list=counter_funders, in_type='funding agencies', return_type='grants',
                      max_in_items=max_in_items, max_return=max_return, max_overall_returns=max_overall_returns)

Querying: 0-4/4 funding agencies...
Done !


In [194]:
first_name = []
last_name = []
rsr_id = []
grant_id = []
funder_name = []
start_dates = []
end_dates = []
funding_amount = []
rcdc_names = []

for grant in full_resp:

    if 'RCDC' not in grant:
        grant['RCDC'] = [{'id':np.nan, 'name':np.nan}]
    if 'researchers' not in grant:
        grant['researchers'] = [{'id':np.nan, 'first_name':np.nan, 'last_name': np.nan}]
    for researcher in grant['researchers']:
        if 'id' not in researcher:
            tmp = researcher
            researcher['id']=tmp
        if 'first_name' not in researcher:
            researcher['first_name']=np.nan
        if 'last_name' not in researcher:
            researcher['last_name']=np.nan
    if 'start_date' not in grant:
        grant['start_date'] = np.nan
    if 'end_date' not in grant:
        grant['end_date'] = np.nan
    if 'funding_usd' not in grant:
        grant['funding_usd'] = np.nan
    for i in range(len(grant['researchers'])-1, -1, -1):
        if 'id' not in grant['researchers'][i]:
            del grant['researchers'][i]
    if 'funders' not in grant:
        grant['funders'] = [{'id':np.nan,'name':np.nan}]
    for researcher in grant['researchers']:
        for funder in grant['funders']:
            if funder['name'] not in counter_funders:
                continue
            rsr_id.append(researcher['id'])
            first_name.append(researcher['first_name'])
            last_name.append(researcher['last_name'])
            grant_id.append(grant['id'])
            funder_name.append(funder['name'])
            start_dates.append(grant['start_date'])
            end_dates.append(grant['end_date'])
            funding_amount.append(float(grant['funding_usd']))
            rcdc_names_t = []
            for rcdc in grant['RCDC']:
                rcdc_names_t.append(str(rcdc['name']))
            rcdc_names.append(";".join(rcdc_names_t))

counter_funded_grnts = pd.DataFrame({'rsr_id':rsr_id
                                        , 'first_name':first_name
                                        , 'last_name':last_name
                                        , 'grant_id':grant_id
                                        , 'funder_name':funder_name
                                        , 'start_date':start_dates
                                        , 'end_date':end_dates
                                        , 'funding_amount':funding_amount
                                     , 'rcdc_names': rcdc_names
                                    }).drop_duplicates().reset_index(drop=True)

In [195]:
counter_funded_grnts['last_name'] = counter_funded_grnts['last_name'].str.upper()

In [196]:
counter_funded_grnts['grant_id'].nunique()

30003

In [197]:
# Remove all grants with no researcher info
counter_funded_grnts = counter_funded_grnts[counter_funded_grnts['rsr_id'].notnull()]

# Remove all other funders that might have appeared
counter_funded_grnts = counter_funded_grnts[(counter_funded_grnts['funder_name'].isin(counter_funders))]

# Restrict to grants funded between 2007 and 2012:
counter_funded_grnts = counter_funded_grnts[(pd.DatetimeIndex(counter_funded_grnts['start_date']).year>=2007)
                          & (pd.DatetimeIndex(counter_funded_grnts['start_date']).year<=2012)].reset_index(drop=True)

Some of the counterfactual agencies (NCI, Cancer Research UK) are cancer-specific funders. Others however, such as the Wellcome Trust and the NHMRC, fund different types of medical research. For this second category of non-cancer-specific funders, we must restrict to grants on the topic of Cancer. I use the RCDC codes to restrict to Cancer-related grants.

In [198]:
# For non-cancer-specific funders, restrict to Cancer Grants:
general_funders = ['Wellcome Trust', 'National Health and Medical Research Council']
counter_funded_grnts = counter_funded_grnts[(counter_funded_grnts['funder_name'].isin(general_funders)==False)
                                            |(counter_funded_grnts['rcdc_names'].str.upper().str.contains("CANCER"))
                                           ].reset_index(drop=True)
del counter_funded_grnts['rcdc_names']

In [199]:
print("Total number of Grants after restrictions: {}".format(counter_funded_grnts['grant_id'].nunique()))

Total number of Grants after restrictions: 15607


In [200]:
counter_funded_grnts = id_replace(counter_funded_grnts)

### Combine the two

In [201]:
funded_grnts = pd.concat([inca_funded_grnts, counter_funded_grnts], sort=False)

In [202]:
funded_grnts['funder_name'].value_counts()

National Cancer Institute                       14682
National Health and Medical Research Council     3112
Cancer Research UK                               2290
INCa/INSERM/DGOS                                 1502
Wellcome Trust                                    416
Name: funder_name, dtype: int64

### Export

In [203]:
cols = ['funder_name', 'grant_id', 'rsr_id', 'first_name', 'last_name', 'start_date', 'end_date', 'funding_amount']
funded_grnts[cols].to_csv('../data/funded_grants.csv', index=False)

## Pull all Grants, Publications, and Personal Information from these Researchers

In [102]:
funded_grnts = pd.read_csv('../data/funded_grants.csv')
rsrs = pd.merge(funded_grnts[['rsr_id']].drop_duplicates(), id_replacer, 
                left_on='rsr_id', right_on='new_id', how='left')
rsrs['id'] = np.where(rsrs['old_id'].notnull(), rsrs['old_id'], rsrs['rsr_id'])
rsrs = list(rsrs['id'].drop_duplicates())
print(len(rsrs))

14108


### Grants

In [103]:
string = "search grants where researchers.id in [{}]"
string += " return grants[id+researchers+funders+start_date+end_date+funding_usd+RCDC]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='grants'
                      , max_in_items=max_in_items, max_return=max_return, max_overall_returns=max_overall_returns)

Querying: 14100-14108/14108 researchers...
Done !


In [105]:
rsr_id = []
grant_id = []
funder_name = []
start_dates = []
end_dates = []
funding_amount = []

for grant in full_resp:

    if 'researchers' not in grant:
        grant['researchers'] = [{'id':np.nan}]
    for researcher in grant['researchers']: 
        if 'id' not in researcher:
            tmp = researcher
            grant['researchers'].append({'id':tmp})
    if 'start_date' not in grant:
        grant['start_date'] = np.nan
    if 'end_date' not in grant:
        grant['end_date'] = np.nan
    if 'funding_usd' not in grant:
        grant['funding_usd'] = np.nan
    for i in range(len(grant['researchers'])-1, -1, -1):
        if 'id' not in grant['researchers'][i]:
            del grant['researchers'][i]
    if 'funders' not in grant:
        grant['funders'] = [{'id':np.nan,'name':np.nan}]
    for researcher in grant['researchers']:
        for funder in grant['funders']:
            rsr_id.append(researcher['id'])
            grant_id.append(grant['id'])
            funder_name.append(funder['name'])
            start_dates.append(grant['start_date'])
            end_dates.append(grant['end_date'])
            funding_amount.append(float(grant['funding_usd']))
            
grnts = pd.DataFrame({'rsr_id':rsr_id
                      , 'grant_id':grant_id
                      , 'funder_name':funder_name
                      , 'start_date':start_dates
                      , 'end_date':end_dates
                      , 'funding_amount':funding_amount
                     }).drop_duplicates().reset_index(drop=True)

In [106]:
grnts = id_replace(grnts)

In [107]:
grnts.describe(include='all')

Unnamed: 0,rsr_id,grant_id,funder_name,start_date,end_date,funding_amount
count,173553,173553,173553,173234,172453,167175.0
unique,78499,72719,199,5528,3301,
top,ur.01117731572.33,grant.2439890,National Cancer Institute,1977-12-01,2008-05-31,
freq,384,823,40552,3935,3944,
mean,,,,,,9958314.0
std,,,,,,21869690.0
min,,,,,,0.0
25%,,,,,,310999.5
50%,,,,,,1082538.0
75%,,,,,,11008410.0


In [108]:
grnts.to_csv('../data/researcher_grants.csv', index=False)

In [109]:
data_id = []
rcdc_names = []
for elem in full_resp:
    if 'id' not in elem:
        elem['id'] = np.nan
    if 'RCDC' not in elem:
        elem['RCDC'] = [{'id':np.nan, 'name':np.nan}]
    for rcdc in elem['RCDC']:
        if rcdc['name'] != np.nan:
            data_id.append(elem['id'])
            rcdc_names.append(str(rcdc['name']))

grnts_rcdc = pd.DataFrame({'grant_id':data_id
                           , 'rcdc_name':rcdc_names}).drop_duplicates().reset_index(drop=True)

grnts_rcdc['rcdc_name'] = grnts_rcdc['rcdc_name'].str.strip().str.upper()

In [111]:
grnts_rcdc.to_csv('../data/topic_lookups/grants_rcdc.csv', index=False)

### Publications

In [139]:
string = "search publications where researchers.id in [{}]"
string += " return publications"
string += "[id+doi+pmid+author_affiliations+date+supporting_grant_ids+times_cited+altmetric+type+journal+RCDC]"
full_resp = pull_data(string=string, in_list=rsrs, in_type='researchers', return_type='publications', 
                      max_in_items=max_in_items, max_return=max_return, max_overall_returns=max_overall_returns)

Querying: 1000-1100/14108 researchers...
RESPONSE ERROR on i=10 and j=3.

Querying: 1500-1600/14108 researchers...
RESPONSE ERROR on i=15 and j=2.

Querying: 3000-3100/14108 researchers...
RESPONSE ERROR on i=30 and j=3.

Querying: 3600-3700/14108 researchers...
RESPONSE ERROR on i=36 and j=2.

Querying: 5200-5300/14108 researchers...
RESPONSE ERROR on i=52 and j=3.

Querying: 5300-5400/14108 researchers...
RESPONSE ERROR on i=53 and j=4.

Querying: 7600-7700/14108 researchers...
RESPONSE ERROR on i=76 and j=3.

Querying: 8100-8200/14108 researchers...
RESPONSE ERROR on i=81 and j=7.

Querying: 9600-9700/14108 researchers...
RESPONSE ERROR on i=96 and j=2.

Querying: 10100-10200/14108 researchers...
RESPONSE ERROR on i=101 and j=2.

Querying: 10800-10900/14108 researchers...
RESPONSE ERROR on i=108 and j=5.

Querying: 11400-11500/14108 researchers...
RESPONSE ERROR on i=114 and j=2.

Querying: 12200-12300/14108 researchers...
RESPONSE ERROR on i=122 and j=1.

Querying: 12600-12700/1410

In [158]:
pub_id = []
pmid = []
dates = []
citations = []
dois = []
supporting_grants = []
nb_authors = []
author_id = []
author_country = []
author_city = []
author_affiliation = []
author_affiliation_id = []
altmetric = []
journal = []
journal_id = []
pub_type = []

for pub in full_resp:
    if 'id' not in pub:
        pub['id'] = np.nan
    if 'pmid' not in pub:
        pub['pmid'] = np.nan
    if 'author_affiliations' not in pub:
        pub['author_affiliations'] = [[]]
    for author in pub['author_affiliations'][0]: 
        if 'researcher_id' not in author:
            author['researcher_id'] = np.nan
        if ('affiliations' not in author)  | (len(author['affiliations'])==0):
            author['affiliations'] = [{'country_code':np.nan, 'name': np.nan, 'id':np.nan, 'city':np.nan}]
        for affiliation in author['affiliations']:
            if 'country_code' not in affiliation:
                affiliation['country_code']=np.nan
            if 'name' not in affiliation:
                affiliation['name']=np.nan
            if 'id' not in affiliation:
                affiliation['id']=np.nan
            if 'city' not in affiliation:
                affiliation['city']=np.nan
    if 'journal' not in pub:
        pub['journal'] = {'id':np.nan, 'title':np.nan}
    if 'id' not in pub['journal']:
        pub['journal']['id'] = np.nan
    if 'title' not in pub['journal']:
        pub['journal']['title'] = np.nan
    if 'date' not in pub:
        pub['date'] = np.nan
    if 'times_cited' not in pub:
        pub['times_cited'] = np.nan
    if 'doi' not in pub:
        pub['doi'] = np.nan
    if 'type' not in pub:
        pub['type'] = np.nan
    if 'altmetric' not in pub:
        pub['altmetric'] = np.nan
    if 'supporting_grant_ids' not in pub:
        pub['supporting_grant_ids'] = [np.nan]
    nb = len(pub['author_affiliations'][0])
    for author in pub['author_affiliations'][0]:
        nb_authors.append(float(nb))
        author_id.append(author['researcher_id'])
        author_country.append(author['affiliations'][0]['country_code'])
        author_city.append(author['affiliations'][0]['city'])
        author_affiliation.append(author['affiliations'][0]['name'])
        author_affiliation_id.append(author['affiliations'][0]['id'])
        pub_id.append(pub['id'])
        dates.append(pub['date'])
        citations.append(float(pub['times_cited']))
        dois.append(pub['doi'])
        pmid.append(pub['pmid'])
        supporting_grants.append(pub['supporting_grant_ids'])
        pub_type.append(pub['type'])
        altmetric.append(pub['altmetric'])
        journal.append(pub['journal']['title'])
        journal_id.append(pub['journal']['id'])

pubs = pd.DataFrame({'pub_id':pub_id
                     , 'pmid': pmid
                     , 'date':dates
                     , 'doi':dois
                     , 'citations':citations
                     , 'nb_authors':nb_authors
                     , 'rsr_id':author_id
                     , 'rsr_country':author_country
                     , 'rsr_city':author_city
                     , 'rsr_affiliation':author_affiliation
                     , 'rsr_affiliation_id':author_affiliation_id
                     , 'supporting_grants':supporting_grants
                     , 'pub_type':pub_type
                     , 'altmetric':altmetric
                     , 'journal':journal
                     , 'journal_id':journal_id
                    })

In [162]:
pubs['supporting_grants'] = pubs['supporting_grants'].apply(lambda x: np.nan if x == [np.nan] else  ';'.join(x))

In [None]:
pubs = pubs.drop_duplicates().reset_index(drop=True)

In [163]:
pubs['date'] = pd.to_datetime(pubs['date'], errors='coerce')
pubs['year'] = pd.DatetimeIndex(pubs['date']).year

In [164]:
pubs.describe(include='all')

Unnamed: 0,pub_id,pmid,date,doi,citations,nb_authors,rsr_id,rsr_country,rsr_city,rsr_affiliation,rsr_affiliation_id,supporting_grants,pub_type,altmetric,journal,journal_id,year
count,8014661,6127441.0,8014524,7646173,8014661.0,8014661.0,7594998,4819216,4819216,6111156,4819216,3552117,8014661,3300813.0,7564608,7564608,8014524.0
unique,1181135,891240.0,12476,1110028,,,1153185,176,4543,494342,19442,305095,5,,12876,12981,
top,pub.1054508044,27770180.0,2011-11-14 00:00:00,10.1007/bf03375463,,,ur.012724545020.23,US,Houston,The University of Texas MD Anderson Cancer Center,grid.240145.6,grant.2438826,article,,Cancer Research,jour.1319913,
freq,3268,3268.0,101082,3268,,,2556,2912477,175300,119466,119466,28440,7542793,,296104,296104,
first,,,1949-01-25 00:00:00,,,,,,,,,,,,,,
last,,,2018-12-31 00:00:00,,,,,,,,,,,,,,
mean,,,,,52.00107,24.11288,,,,,,,,23.34145,,,2008.088
std,,,,,185.2514,168.0416,,,,,,,,107.9216,,,8.427831
min,,,,,0.0,1.0,,,,,,,,0.0,,,1949.0
25%,,,,,2.0,6.0,,,,,,,,2.0,,,2004.0


In [165]:
pubs = id_replace(pubs)

In [166]:
pubs.to_csv('../data/researcher_publications.csv', index=False)

In [170]:
# Publication IDs:
pub_ids = pubs[pubs['pmid'].notnull()][['rsr_id', 'pmid']].drop_duplicates()
pub_ids.to_csv('../data/pub_ids.csv', index=False)

In [171]:
data_id = []
rcdc_names = []
for elem in full_resp:
    if 'id' not in elem:
        elem['id'] = np.nan
    if 'RCDC' not in elem:
        elem['RCDC'] = [{'id':np.nan, 'name':np.nan}]
    for rcdc in elem['RCDC']:
        if rcdc['name'] != np.nan:
            data_id.append(elem['id'])
            rcdc_names.append(str(rcdc['name']))

pubs_rcdc = pd.DataFrame({'pub_id':data_id
                           , 'rcdc_name':rcdc_names}).drop_duplicates().reset_index(drop=True)

pubs_rcdc['rcdc_name'] = pubs_rcdc['rcdc_name'].str.strip().str.upper()

In [172]:
pubs_rcdc.to_csv('../data/topic_lookups/publications_rcdc.csv', index=False)

## Personal Information

In [204]:
funded_grnts = pd.read_csv('../data/funded_grants.csv', low_memory=False)
pubs = pd.read_csv('../data/researcher_publications.csv', low_memory=False)

### Gender

In [205]:
# Gender Imputer
d = gender.Detector()

# Gender Abbreviation Table
gender_abbr = pd.DataFrame({'gender': ['male', 'mostly_male', 'unknown', 'andy', 'mostly_female', 'female']
                            , 'gender_abbr': ['M', 'M', 'UNKNOWN', 'UNKNOWN', 'F', 'F']})

In [206]:
rsr_gender = funded_grnts[['rsr_id', 'first_name', 'last_name']].drop_duplicates().reset_index(drop=True)

In [207]:
rsr_gender['gender_name'] = rsr_gender['first_name'].fillna('')
rsr_gender['gender_name'] = rsr_gender['gender_name'].apply(lambda x: x.split(" ")[0])
rsr_gender['gender'] = rsr_gender['gender_name'].apply(lambda x: d.get_gender(x))
rsr_gender = pd.merge(rsr_gender, gender_abbr, how='left', on='gender')
del rsr_gender['gender'], rsr_gender['gender_name'], rsr_gender['first_name'], rsr_gender['last_name']
rsr_gender.rename(columns={'gender_abbr':'rsr_gender'}, inplace=True)

In [208]:
rsr_gender['rsr_gender'].value_counts(normalize=True)

M          0.551271
F          0.291774
UNKNOWN    0.156955
Name: rsr_gender, dtype: float64

### Career Age

In [235]:
rsr_first_year = funded_grnts[['rsr_id', 'start_date']].groupby('rsr_id')['start_date'].min().reset_index()
rsr_first_year['first_grant_year'] = pd.DatetimeIndex(rsr_first_year['start_date']).year
temp = pubs[['rsr_id', 'year']].groupby('rsr_id')['year'].min().reset_index()
temp.rename(columns={'year':'first_pub_year'}, inplace=True)
rsr_first_year = pd.merge(rsr_first_year, temp, on='rsr_id', how='left')
rsr_first_year['rsr_career_start_year'] = rsr_first_year[['first_grant_year','first_pub_year']].min(axis=1)

In [239]:
rsr_first_year = rsr_first_year[['rsr_id', 'rsr_career_start_year']]

### Original Affiliation

Maybe do this with Grant affiliation instead of Publication?

In [210]:
cols = ['rsr_id', 'rsr_affiliation', 'rsr_affiliation_id', 'rsr_country', 'rsr_city', 'year']
affiliation = pubs[pubs['rsr_affiliation'].notnull()][cols]
affiliation = affiliation.sort_values(['rsr_id', 'year']).drop_duplicates('rsr_id', keep='first')
del affiliation['year']

### First n Years of Publications

In [262]:
n = 1

In [263]:
pubs_rcdc = pd.read_csv('../data/topic_lookups/publications_rcdc.csv')
pubs_cso = pd.read_csv('../data/topic_lookups/publications_cso.csv')

In [287]:
first_pubs = pd.merge(rsr_first_year, pubs[['rsr_id', 'year', 'pub_id', 'citations']], how='left', on='rsr_id')
first_pubs = first_pubs[first_pubs['year']<=(first_pubs['rsr_career_start_year']+n)].reset_index(drop=True)

In [288]:
first_topics = first_pubs.groupby(['rsr_id'])
first_topics = pd.DataFrame({'rsr_nb_early_pubs': first_topics['rsr_id'].count()
                             , 'rsr_nb_early_citations': first_topics['citations'].sum()}).reset_index()

#### CSO

In [290]:
temp = pd.merge(first_pubs, pubs_cso, how='left', on='pub_id')
temp = temp.groupby(['rsr_id', 'cso_name']).size().reset_index()
temp['cso_id'] = "cso_"+temp.groupby(['cso_name']).ngroup().astype(str)
temp[['cso_name', 'cso_id']].drop_duplicates().to_csv('../data/topic_lookups/cso_ids.csv', index=False)
temp = pd.pivot_table(temp, index='rsr_id', columns='cso_id', values=0, aggfunc='sum').reset_index()

In [291]:
first_topics = pd.merge(first_topics, temp, how='left', on='rsr_id')

#### RCDC

In [292]:
temp = pd.merge(first_pubs, pubs_rcdc, how='left', on='pub_id')
temp = temp.groupby(['rsr_id', 'rcdc_name']).size().reset_index()
temp['rcdc_id'] = "rcdc_"+temp.groupby(['rcdc_name']).ngroup().astype(str)
temp[['rcdc_name', 'rcdc_id']].drop_duplicates().to_csv('../data/topic_lookups/rcdc_ids.csv', index=False)
temp = pd.pivot_table(temp, index='rsr_id', columns='rcdc_id', values=0, aggfunc='sum').reset_index()

In [293]:
first_topics = pd.merge(first_topics, temp, how='left', on='rsr_id')

In [299]:
first_topics.shape

(11897, 272)

### ORCID Confirmed

In [294]:
inca_orcid_confirmed = pd.read_csv('../data/inca_orcid_confirmations.csv')

In [295]:
print("Number of ORCID Confirmed INCa Researchers: {}".format(inca_orcid_confirmed.shape[0]))

Number of ORCID Confirmed INCa Researchers: 173


### Combine All and Export

In [303]:
rsr_info = pd.merge(funded_grnts[['rsr_id', 'first_name', 'last_name']].drop_duplicates().reset_index(drop=True),
                    rsr_gender, how='left', on='rsr_id')
rsr_info = pd.merge(rsr_info, rsr_first_year, how='left', on='rsr_id')
rsr_info = pd.merge(rsr_info, affiliation, how='left', on='rsr_id')
rsr_info = pd.merge(rsr_info, inca_orcid_confirmed, how='left', on='rsr_id')
rsr_info = pd.merge(rsr_info, first_topics, how='left', on='rsr_id')

In [304]:
rsr_info['orcid_confirmed'].fillna(False, inplace=True)

In [305]:
rsr_info.describe(include='all')

Unnamed: 0,rsr_id,first_name,last_name,rsr_gender,rsr_career_start_year,rsr_affiliation,rsr_affiliation_id,rsr_country,rsr_city,orcid_confirmed,...,rcdc_90,rcdc_91,rcdc_92,rcdc_93,rcdc_94,rcdc_95,rcdc_96,rcdc_97,rcdc_98,rcdc_99
count,14004,13978,14004,14004,14004.0,12173,9804,9804,9804,14004,...,8.0,332.0,329.0,121.0,1231.0,107.0,50.0,30.0,458.0,14.0
unique,14004,7683,9132,3,,4030,1709,60,727,2,...,,,,,,,,,,
top,ur.01163565663.79,David,WANG,M,,Harvard University,grid.38142.3c,US,Cambridge,False,...,,,,,,,,,,
freq,1,142,91,7720,,197,197,6492,414,13834,...,,,,,,,,,,
mean,,,,,1996.344045,,,,,,...,1.0,1.454819,1.407295,1.487603,1.706742,1.794393,1.7,1.666667,1.539301,1.071429
std,,,,,10.437329,,,,,,...,0.0,0.859147,0.89276,0.837812,1.360635,1.138837,1.147313,0.994236,1.06649,0.267261
min,,,,,1949.0,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,,,,,1989.0,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,,,,,1997.0,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,,,,,2005.0,,,,,,...,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0


In [306]:
rsr_info.to_csv('../data/researcher_info.csv', index=False)

## Sandbox