In [4]:
import os
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd

In [4]:
# Download the NHMRC funding data if it is not already available locally.
source = 'https://nhmrc.gov.au/sites/default/files/documents/attachments/all-grants-2000-2016.xlsx'
nhmrc_xlsx = 'DATA/NHMRC/all-grants-2000-2016.xlsx'

if not os.path.exists(nhmrc_xlsx):
    os.makedirs(os.path.dirname(nhmrc_xlsx), exist_ok=True)
    # Fetch to a temporary name and rename only once the transfer has
    # finished. Writing straight to the final path would let an interrupted
    # download leave a truncated workbook behind, and the existence check
    # above would then skip the download on every later run.
    partial_xlsx = nhmrc_xlsx + '.part'
    try:
        urllib.request.urlretrieve(source, partial_xlsx)
        os.replace(partial_xlsx, nhmrc_xlsx)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_xlsx):
            os.remove(partial_xlsx)

In [2]:
# Data sourced from the NHMRC website on 20/10/2018.
# sheet_name=1 selects the workbook's second sheet (pandas counts from 0);
# skiprows=3 skips the first three rows of that sheet before the header row.
workbook_path = 'DATA/NHMRC/all-grants-2000-2016.xlsx'
nhmrc_df = pd.read_excel(workbook_path, sheet_name=1, skiprows=3)

In [3]:
# Distinct 'CIA NAME' values, wrapped back into a Series so that the
# vectorised .str accessor can be used on them.
cia_names = nhmrc_df['CIA NAME']
unique_CI = pd.Series(cia_names.unique())

# Display every distinct first whitespace-separated token of the names; this
# is where titles (and non-person entries) show up.
first_tokens = unique_CI.str.split(expand=True)[0]
first_tokens.unique()

array(['Prof', 'Dr', 'Ms', 'Mr', 'A/Pr', 'E/Pr', 'Miss', 'Mrs',
       'Mary-Anne', 'Stacey', 'Lee', 'Victoria', 'Suzanne', 'Peter',
       'Jennifer', 'Sarah', 'Jay', 'Kip', 'David', 'Nikki', 'Allan',
       'Yufeng', 'Srinivas', 'Heidi', 'Rosalind', 'Mingming', 'Oscar',
       'Sri', 'Kerryann', 'Martin', 'Hansoo', 'Leon', 'Alexander',
       'Kamalini', 'Daniela', 'Rita', 'Kristy', 'Jonathan', 'Dana',
       'Sharon', 'Andrew', 'Janine', 'Jan', 'Douglas', 'Cherise',
       'University', 'Kerryn', 'Maria', 'Natalie', 'Louise', 'Jane',
       'Rosalba', 'Nita', 'Lisa', 'Murdoch', 'Sandra', 'Robert', 'QIM',
       'Cate', 'Anne', 'Ralph', 'Sianna', 'RAOBio@rgms', 'ECU', 'Susan',
       'Joanne', 'Liesel', 'Craig', 'Katie', 'Stefan', 'Yvonne', 'Rose',
       'Anna', 'Nick', 'RAO', 'Nadine', 'Julie', 'Primary', 'Deepa',
       'Nigel', 'Carol', 'Gerald', 'Tammy', 'Kristal', 'Leanne',
       'Frances', 'Christopher', 'Zhitao', 'Kevin', 'Linda'], dtype=object)

To start with, I want to generate a search string for each unique chief investigator (CI) in the NHMRC dataset, consisting of just the person's name. Because I intend to search PubMed by author, I need to exclude grants that were recorded against an institution rather than an individual.

First, I want to remove all titles from the name series. From the output above, the titles contained in the data set are:
'Prof', 'Dr', 'Ms', 'Mr', 'A/Pr', 'E/Pr', 'Miss', 'Mrs', 

In addition to strings that are obviously titles, there are also these suspect strings:
'University', 'QIM', 'RAOBio@rgms', 'ECU', 'RAO', 'Primary'
I want to first select the rows containing these strings to get a better idea of what filtering to apply!

In [4]:
# Leading tokens that mark a 'CIA NAME' entry as an institution/office rather
# than a person. Each alternative is anchored to the start of the string.
institution_prefixes = ('University', 'QIM', 'RAOBio@rgms', 'ECU', 'RAO', 'Primary')
bad_names = '|'.join(f'^{prefix}' for prefix in institution_prefixes)

It seems a few grants were entered for institutions rather than for individuals... I think I'll just filter these out; there are, after all, only 6 of them.

In [5]:
# Titles observed as the first token of 'CIA NAME' in the NHMRC data.
titles = ('Prof', 'Dr', 'Ms', 'Mr', 'A/Pr', 'E/Pr', 'Miss', 'Mrs')

# Regex that matches a leading title *as a whole token* plus the whitespace
# that follows it. Requiring the trailing whitespace matters for two reasons:
#   * 'Mr' can no longer match the front of 'Mrs' (which used to leave
#     's Jane Doe' behind) or of an untitled name such as 'Drew';
#   * the separating space is removed together with the title, so stripped
#     names do not start with a stray blank.
# None of the titles contains a regex metacharacter, so they are joined as-is.
title_pat = '^(?:' + '|'.join(titles) + r')\s+'
title_pat

'^(?:Prof|Dr|Ms|Mr|A/Pr|E/Pr|Miss|Mrs)\\s+'

In [6]:
# Row mask for the grants to search on, built from four independent conditions.
is_open = nhmrc_df['STATUS'].eq('Open')                            # open funding only
is_person = ~nhmrc_df['CIA NAME'].str.contains(bad_names)          # drop institutional entries
starts_after_2013 = nhmrc_df['START YR'].gt(2013)
is_project = nhmrc_df['GRANT SUB TYPE'].str.contains('Project')

filters = is_open & is_person & starts_after_2013 & is_project

In [7]:
def search_string(S):
    """Build the percent-encoded PubMed query term for one search row.

    Parameters
    ----------
    S : object exposing ``CI``, ``YR`` and ``INS`` attributes
        ``CI`` is searched in the [Author] field, ``YR`` is the lower bound
        of the [Date - Publication] range (the upper bound is fixed at
        "3000"), and ``INS`` is searched in the [Affiliation] field.

    Returns
    -------
    str
        The query encoded with ``urllib.parse.quote``.
    """
    author_clause = f'({S.CI}[Author])'
    date_clause = f'("{S.YR}"[Date - Publication] : "3000"[Date - Publication])'
    affiliation_clause = f'{S.INS}[Affiliation]'
    query = f'({author_clause} AND {date_clause}) AND {affiliation_clause}'
    return urllib.parse.quote(query)

In [8]:
# Build the search table: one row per grant that passes `filters`, keeping
# only the investigator name, administering institution and start year under
# the short column names that `search_string` reads (CI, INS, YR).
search_df = (
    nhmrc_df.loc[filters, ['CIA NAME', 'ADMINISTERING INSTITUTION', 'START YR']]
    .rename(columns={'CIA NAME': 'CI',
                     'ADMINISTERING INSTITUTION': 'INS',
                     'START YR': 'YR'})
)

# Strip the leading title from each name.
# regex=True is passed explicitly: `title_pat` is a regular expression, and
# relying on the default would treat it as a literal string on pandas versions
# where Series.str.replace defaults to regex=False (nothing would be removed).
# .str.strip() drops any whitespace left where the title used to be, so the
# generated query does not begin with a blank.
search_df['CI'] = search_df['CI'].str.replace(title_pat, '', regex=True).str.strip()

# One URL-encoded PubMed query per row.
search_df['search_string'] = search_df.apply(search_string, raw=False, axis=1)

In [11]:
# Persist the generated searches next to the source workbook.
searches_csv = 'DATA/NHMRC/searches.csv'
search_df.to_csv(searches_csv)

-----------------------------------------

https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/

In [17]:
# Example E-utilities esearch request for a single investigator.
req = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%28%28%20Giuseppe%20Verdile%5BAuthor%5D%29%20AND%20%28%222014%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D%29%29%20AND%20Curtin%20University%20of%20Technology%5BAffiliation%5D'

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read, instead of being left open.
with urllib.request.urlopen(req) as response:
    print(response.read())

b'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>4</Count><RetMax>4</RetMax><RetStart>0</RetStart><IdList>\n<Id>29562546</Id>\n<Id>28655134</Id>\n<Id>28387666</Id>\n<Id>27031482</Id>\n</IdList><TranslationSet><Translation>     <From>Giuseppe Verdile[Author]</From>     <To>Verdile, Giuseppe[Full Author Name]</To>    </Translation></TranslationSet><TranslationStack>   <TermSet>    <Term>Verdile, Giuseppe[Full Author Name]</Term>    <Field>Full Author Name</Field>    <Count>56</Count>    <Explode>N</Explode>   </TermSet>   <TermSet>    <Term>"2014"[PDAT]</Term>    <Field>PDAT</Field>    <Count>0</Count>    <Explode>N</Explode>   </TermSet>   <TermSet>    <Term>"3000"[PDAT]</Term>    <Field>PDAT</Field>    <Count>0</Count>    <Explode>N</Explode>   </TermSet>   <OP>RANGE</OP>   <OP>GROUP</OP>   <OP>AND</OP>   <OP>GROUP</OP>   <TermSet>    <Te

In [None]:
# Scratch cell (no execution count recorded): the same encoded query term
# written against two different endpoints, kept for reference.
# 1) E-utilities esearch endpoint:
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%28%28%20Giuseppe%20Verdile%5BAuthor%5D%29%20AND%20%28%222014%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D%29%29%20AND%20Curtin%20University%20of%20Technology%5BAffiliation%5D'

# 2) PubMed website search URL carrying the identical term= value:
'https://www.ncbi.nlm.nih.gov/pubmed?term=%28%28%20Giuseppe%20Verdile%5BAuthor%5D%29%20AND%20%28%222014%22%5BDate%20-%20Publication%5D%20%3A%20%223000%22%5BDate%20-%20Publication%5D%29%29%20AND%20Curtin%20University%20of%20Technology%5BAffiliation%5D'

In [9]:
# NOTE(review): `search_name` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — confirm where it comes from.
# The loop body is empty; it only walks the (label, value) pairs.
# .items() replaces .iteritems(), which newer pandas releases no longer provide.
for ix in search_name.items():
    pass

In [19]:
# Stack the two Series as rows labelled CI_NAME / INSTITUTE, then transpose so
# that they become the columns of the resulting frame.
# NOTE(review): this rebinds `search_df`, replacing the search table built earlier.
name_and_institute_rows = pd.DataFrame(
    [search_name, nhmrc_df['ADMINISTERING INSTITUTION']],
    index=['CI_NAME', 'INSTITUTE'],
)
search_df = name_and_institute_rows.transpose()

In [20]:
# Preview the first five rows of the table.
search_df.head(5)

Unnamed: 0,CI_NAME,INSTITUTE
0,McLean A,University of Adelaide
1,Young R,Austin Hospital Medical Research Foundation
2,Anderson I,Victorian Aboriginal Health Service
3,Kay K,University of New South Wales
4,Woolcock G,La Trobe University


In [30]:
# Administering institution of every grant, as a Series.
# NOTE(review): the output saved beneath this cell (a count and a boolean
# 'GRANT SUB TYPE' mask) cannot come from this assignment; it is stale.
institute = nhmrc_df.loc[:, 'ADMINISTERING INSTITUTION']

4363

0        False
1        False
2         True
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12        True
13        True
14        True
15        True
16       False
17        True
18        True
19        True
20        True
21       False
22        True
23        True
24        True
25       False
26        True
27        True
28       False
29       False
         ...  
20982    False
20983    False
20984    False
20985    False
20986    False
20987    False
20988    False
20989    False
20990    False
20991    False
20992    False
20993    False
20994    False
20995    False
20996    False
20997    False
20998    False
20999    False
21000    False
21001    False
21002    False
21003    False
21004    False
21005    False
21006    False
21007    False
21008    False
21009    False
21010    False
21011    False
Name: GRANT SUB TYPE, Length: 21012, dtype: bool

In [13]:
# Start-anchored alternation over every entry of `checks`.
# NOTE(review): `checks` is not defined in any cell visible in this notebook.
'^' + '|^'.join(checks)

'^University|^QIM|^RAOBio@rgms|^ECU|^RAO|^Primary'

In [8]:
# FIXME(review): incomplete expression — the object that `.str` should be
# accessed on is missing, so this cell is a syntax error as written. The saved
# output below still shows names with their titles, so it was not produced by
# this replace call; restore the receiver or delete the cell.
.str.replace(title_pat, '')

Unnamed: 0,GRANT ID,APPLICATION YEAR,CIA NAME,GRANT SUB TYPE,GRANT TITLE,ADMINISTERING INSTITUTION,STATE,SECTOR,STATUS,START YR,END YR,BUDGET TOTAL,BROAD RESEARCH AREA,FIELD OF RESEARCH,KEYWORDS,MEDIA SUMMARY,ACHIEVEMENTS,EXPECTED FUTURE OUTCOMES,Unnamed: 18
0,943301,1993,Prof Allan McLean,Research Unit Grant,Road Accident Research Unit,University of Adelaide,SA,University,Closed,1994,2000,44441.19,Not Applicable,Environmental and Occupational Health and Safety,| biomechanics | brain injury | impact | phys...,Media Summary not available,,,.
1,947582,1993,Dr Richard Young,Dora Lush Biomedical Postgraduate Scholarship,Altered gene expression in cardiac hypertrophy,Austin Hospital Medical Research Foundation,VIC,Hospital,Closed,1996,2000,695.95,Basic Science,Cardiology (incl. Cardiovascular Diseases),| myocardial infarctio | contractile proteins...,Media Summary not available,,,.
2,956084,1994,Prof Ian Anderson,CARG Project Grant,Developing a HIV and HepC Risk Reduction Progr...,Victorian Aboriginal Health Service,VIC,Government,Closed,1995,2001,76978.62,Basic Science,Medical Virology,,Media Summary not available,,,.
3,956510,1994,Ms Kathleen Kay,CARG Scholarship,World Health Organisation's AIDS policy histor...,University of New South Wales,NSW,University,Closed,1995,2000,3527.39,Public Health,Medical Virology,,Media Summary not available,,,.
4,956545,1994,Mr GWE Woolcock,CARG Scholarship,New social movements and HIV/AIDS: reconceptua...,La Trobe University,VIC,University,Closed,1996,2001,249.0,Public Health,Medical Virology,,Media Summary not available,,,.
