Merged
Binary file added img/favicons/nih_favicon.ico
Binary file not shown.
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,3 +25,4 @@ strict-rfc3339==0.5
xmltodict==0.9.2
SPARQLWrapper==1.6.4
django-robots==1.1
beautifulsoup4==4.4.1
253 changes: 253 additions & 0 deletions scrapi/harvesters/nih.py
@@ -0,0 +1,253 @@
"""
Harvester for NIH.gov Research Portal Online Reporting Tools (RePORTER) for the SHARE Notification Service

Getting weekly summary from ExPORTER on nih.gov, parse XML and normalize the data
An example file: http://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2015_035.zip

Note: This harvester assigns incorrect dates to some xml files due to an inconsistency in the numbering of week of the
month in the project file names. It is guaranteed that all data are harvested in a sufficiently long time frame
(>1 month).
"""

from __future__ import unicode_literals


import logging
import re

from bs4 import BeautifulSoup
from datetime import date, timedelta
from dateutil.parser import parse
from io import BytesIO
from lxml import etree
from zipfile import ZipFile

from scrapi import requests
from scrapi import settings
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
from scrapi.base.schemas import default_name_parser
from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter


logger = logging.getLogger(__name__)


def daterange(start_date, end_date):
    """
    Yield every date from start_date up to, but not including, end_date
    """
    for ordinal in range(start_date.toordinal(), end_date.toordinal()):
        yield date.fromordinal(ordinal)


def get_days_of_week(start_date, end_date, day_of_week):
    """
    Shift start_date and end_date to the requested day of their respective weeks,
    then yield every date falling on that weekday in the resulting inclusive range
    """
    start_date = start_date - timedelta(days=(start_date.weekday() - day_of_week))
    end_date = end_date - timedelta(days=(end_date.weekday() - day_of_week))

    for ordinal in range(start_date.toordinal(), end_date.toordinal() + 1):
        if date.fromordinal(ordinal).weekday() == day_of_week:
            yield date.fromordinal(ordinal)
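# Worked example (day_of_week=0, i.e. Monday): get_days_of_week(date(2015, 9, 1), date(2015, 9, 14), 0)
# shifts 2015-09-01 (a Tuesday) back to Monday 2015-08-31 and yields the Mondays
# 2015-08-31, 2015-09-07 and 2015-09-14.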


def get_fiscal_year(mydate=None):
    """
    Return the fiscal year of mydate (today by default). Each US federal fiscal year
    starts on October 1 and is numbered by the calendar year in which it ends
    """
    mydate = mydate or date.today()  # avoid a default argument evaluated once at import time
    if mydate.month < 10:
        return mydate.year
    else:
        return mydate.year + 1
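# Example: get_fiscal_year(date(2015, 9, 30)) == 2015, while
# get_fiscal_year(date(2015, 10, 1)) == 2016 (FY2016 starts on 1 October 2015).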


def get_fiscal_years(dates):
    """
    Given a range of dates, get unique fiscal years
    """
    return tuple(set(map(get_fiscal_year, dates)))


def parse_month_column(month_column, day_of_week):
    """
    Given a month column string, return the date of a day (Monday by default) of that week
    An example of a month column string: September, 2015 - WEEK 1
    """
    month_year, week = [x.strip() for x in month_column.split('-')]
    first_day = parse('1 ' + month_year)
    # step back to the last requested weekday strictly before the first of the month
    offset = first_day.weekday() - day_of_week
    if offset <= 0:
        offset += 7
    first_day -= timedelta(days=offset)
    # capture the trailing week number
    week = int(re.search(r'([0-9]+)\s*$', week).group(1))
    return (first_day + timedelta(weeks=week)).date()
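# Worked example (day_of_week=0, Monday): for "September, 2015 - WEEK 1",
# 1 September 2015 falls on a Tuesday, so first_day steps back to Monday 2015-08-31;
# WEEK 1 then adds one week, returning date(2015, 9, 7).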


def parse_row(row, day_of_week):
    """
    Take a row of the ExPORTER table and return the date of a day (Monday by default)
    of that week, the fiscal year, and the url of the xml file
    To keep the format consistent, rows covering a whole previous fiscal year return
    None as the date
    """
    # cell text contains no markup, so a plain strip() suffices
    row_text = [cell.text.strip() for cell in row]
    month_column = row_text[1]
    fiscal_year = int(row_text[2])
    url = row[3].find('a').get('href')

    if month_column.lower() == u"all":
        return (None, fiscal_year, url)
    elif re.match('[A-Za-z]*, [0-9]{4} - .*', month_column):
        row_date = parse_month_column(month_column, day_of_week)  # avoid shadowing datetime.date
        return (row_date, fiscal_year, url)
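# Illustrative example (hypothetical cell values): a row whose text cells include
# "September, 2015 - WEEK 1" and "2015", with a link to 'XMLData/final/RePORTER_PRJ_X_FY2015_035.zip',
# parses to (date(2015, 9, 7), 2015, 'XMLData/final/RePORTER_PRJ_X_FY2015_035.zip');
# a row whose month column is "ALL" (a whole-fiscal-year file) parses to (None, 2015, url).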


def parse_rows(rows, day_of_week):
    """
    A generator to parse all the rows
    """
    for row in rows:
        yield parse_row(row('td'), day_of_week)


def construct_urls(base_url, start_date, end_date, rows, day_of_week=0):
    """
    Given a date range, construct the URLs of the corresponding XML files
    """
    dates = list(get_days_of_week(start_date, end_date, day_of_week))
    fiscal_years = get_fiscal_years(dates)
    for data in parse_rows(rows, day_of_week):
        if data[0] in dates or (data[0] is None and data[1] in fiscal_years):
            yield "".join([base_url, data[2]])


def get_xml_files(urls):
    """
    Download each zip archive and yield the contents of the XML file inside
    """
    for zip_url in urls:
        data = requests.get(zip_url)
        zipfile = ZipFile(BytesIO(data.content))
        with zipfile.open(zipfile.namelist()[0], 'r') as f:
            yield f.read()


def xml_records(files):
    """
    Yield every <row> record from each XML file
    """
    for xml_file in files:
        records = etree.XML(xml_file).xpath('row')
        for record in records:
            yield record


def add_affiliation(name, org_name):
    name['affiliation'] = [{'name': org_name.text}]
    return name


def nih_name_parser(names, org_name):
    """
    Takes a list of names and organization names, and attempts to parse them
    """
    names = default_name_parser(names)
    return list(map(add_affiliation, names, org_name))


class NIHHarvesters(XMLHarvester):

    short_name = 'nih'
    long_name = 'NIH Research Portal Online Reporting Tools'
    url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'
    project_base_url = 'https://projectreporter.nih.gov/project_info_description.cfm?aid={}'
    foa_base_url = 'http://grants.nih.gov/grants/guide/pa-files/{}.html'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    @property
    def schema(self):
        return {
            "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
            "uris": {
                "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)),
                "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                                   self.construct_descriptor_uris)
            },
            "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)),
            "title": ('//PROJECT_TITLE/node()', single_result),
            "tags": '//PROJECT_TERMSX/TERM/node()',
            "otherProperties": build_properties(
                ("applicationID", "//APPLICATION_ID/node()"),
Contributor: Can this turn into a URL?

                ('activity', '//ACTIVITY/node()'),
                ('administeringIC', '//ADMINISTERING_IC/node()'),
                ('arraFunded', '//ARRA_FUNDED/node()'),
Contributor: what is arra?

                ('budgetStart', '//BUDGET_START/node()'),
                ('budgetEnd', '//BUDGET_END/node()'),
                ('FOANumber', '//FOA_NUMBER/node()'),
Contributor: Uri?

                ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
Contributor: Uri?

                ('fundingICs', '//FUNDING_ICs/node()'),
Contributor: Uris?

                ('fiscalYear', '//FY/node()'),
                ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
                ('organizationCity', '//ORG_CITY/node()'),
                ('organizationCountry', '//ORG_COUNTRY/node()'),
                ('organizationDistrict', '//ORG_DISTRICT/node()'),
                ('organizationDUNS', '//ORG_DUNS/node()'),
                ('organizationDept', '//ORG_DEPT/node()'),
                ('organizationFIPS', '//ORG_FIPS/node()'),
                ('organizationState', '//ORG_STATE/node()'),
                ('organizationZipcode', '//ORG_ZIPCODE/node()'),
                ('ICName', '//IC_NAME/node()'),
                ('organizationName', '//ORG_NAME/node()'),
                ('projectStart', '//PROJECT_START/node()'),
                ('projectEnd', '//PROJECT_END/node()'),
                ('PHR', '//PHR/node()'),
                ('serialNumber', '//SERIAL_NUMBER/node()'),
Contributor: Uri?

                ('studySection', '//STUDY_SECTION/node()'),
                ('studySectionName', '//STUDY_SECTION_NAME/node()'),
                ('supportYear', '//SUPPORT_YEAR/node()'),
                ('suffix', '//SUFFIX/node()'),
                ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
Contributor: Uri?

                ('totalCost', '//TOTAL_COST/node()'),
                ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
                ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
Contributor: Uri?

                ('CFDACode', '//CFDA_CODE/node()'),
                ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
                ('edInstType', '//ED_INST_TYPE/node()'),
                ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
                ('fundingMechanism', '//FUNDING_MECHANISM/node()')
            )
        }

    def construct_project_url(self, application_id):
        return self.project_base_url.format(application_id)

    def construct_descriptor_uris(self, application_id, foa_number):
        return [
            self.project_base_url.format(application_id[0]) if application_id else None,
            self.foa_base_url.format(foa_number[0] if foa_number else None)
        ]

    namespaces = {'xsi': "http://www.w3.org/2001/XMLSchema-instance"}

    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = list(construct_urls(base_url, start_date, end_date, rows))

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
2,418 changes: 2,418 additions & 0 deletions tests/vcr/nih.yaml

Large diffs are not rendered by default.