add nih harvester #405

Changes from all commits

@@ -25,3 +25,4 @@ strict-rfc3339==0.5
 xmltodict==0.9.2
 SPARQLWrapper==1.6.4
 django-robots==1.1
+beautifulsoup4==4.4.1

@@ -0,0 +1,253 @@ (new file)

| """ | ||
| Harvester for NIH.gov Research Portal Online Reporting Tools (RePORTER) for the SHARE Notification Service | ||
|
|
||
| Getting weekly summary from ExPORTER on nih.gov, parse XML and normalize the data | ||
| An example file: http://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2015_035.zip | ||
|
|
||
| Note: This harvester assigns incorrect dates to some xml files due to an inconsistency in the numbering of week of the | ||
| month in the project file names. It is guaranteed that all data are harvested in a sufficiently long time frame | ||
| (>1 month). | ||
| """ | ||
|
|
||
| from __future__ import unicode_literals | ||
|
|
||
|
|
||
| import logging | ||
| import re | ||
|
|
||
| from bs4 import BeautifulSoup | ||
| from datetime import date, timedelta | ||
| from dateutil.parser import parse | ||
| from io import BytesIO | ||
| from lxml import etree | ||
| from zipfile import ZipFile | ||
|
|
||
| from scrapi import requests | ||
| from scrapi import settings | ||
| from scrapi.base import XMLHarvester | ||
| from scrapi.util import copy_to_unicode | ||
| from scrapi.linter.document import RawDocument | ||
| from scrapi.base.schemas import default_name_parser | ||
| from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter | ||
|
|
||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def daterange(start_date, end_date):
    """
    Get all the dates between the start_date and the end_date
    """
    for ordinal in range(start_date.toordinal(), end_date.toordinal()):
        yield date.fromordinal(ordinal)
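
A usage sketch for illustration (not part of the PR's file): the range is end-exclusive, because range() stops one ordinal short of end_date.

    >>> from datetime import date
    >>> list(daterange(date(2015, 9, 1), date(2015, 9, 4)))
    [datetime.date(2015, 9, 1), datetime.date(2015, 9, 2), datetime.date(2015, 9, 3)]
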
def get_days_of_week(start_date, end_date, day_of_week):
    """
    First convert start_date and end_date to have the day of week we require.
    Then get all the dates of the specified day of week between the start_date and end_date.
    """
    start_date = start_date - timedelta(days=(start_date.weekday() - day_of_week))
    end_date = end_date - timedelta(days=(end_date.weekday() - day_of_week))

    for ordinal in range(start_date.toordinal(), end_date.toordinal() + 1):
        if date.fromordinal(ordinal).weekday() == day_of_week:
            yield date.fromordinal(ordinal)
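
A sketch of the weekday snapping (day_of_week=0 means Monday, per datetime.weekday()): both endpoints are shifted onto the requested weekday first, so the first yielded date can fall before start_date.

    >>> from datetime import date
    >>> list(get_days_of_week(date(2015, 9, 1), date(2015, 9, 16), 0))
    [datetime.date(2015, 8, 31), datetime.date(2015, 9, 7), datetime.date(2015, 9, 14)]
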
def get_fiscal_year(mydate=date.today()):
    """
    Return the current fiscal year. Each fiscal year starts on October 1
    """
    if mydate.month < 10:
        return mydate.year
    else:
        return mydate.year + 1
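
The NIH fiscal year is named for the calendar year in which it ends, so October-December dates map to the next year's number. One caveat worth noting: the mydate=date.today() default is evaluated once at import time, so a long-running process should pass mydate explicitly.

    >>> from datetime import date
    >>> get_fiscal_year(date(2015, 9, 30))
    2015
    >>> get_fiscal_year(date(2015, 10, 1))
    2016
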
def get_fiscal_years(dates):
    """
    Given a range of dates, get unique fiscal years
    """
    return tuple(set(map(get_fiscal_year, dates)))

def parse_month_column(month_column, day_of_week):
    """
    Given a month column string, return the date of a day (Monday by default) of that week
    An example of a month column string: September, 2015 - WEEK 1
    """
    month_year, week = iter(map(lambda x: x.strip(), month_column.split('-')))
    first_day = parse('1 ' + month_year)
    first_day -= timedelta(days=(first_day.weekday() - day_of_week + 7 * (1 if first_day.weekday() - day_of_week <= 0 else 0)))
    week = int(re.search('.*([0-9]{1,2})', week).group(1))
    mydate = first_day + timedelta(week * 7)
    return mydate.date()
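
A worked example using the docstring's sample string: parse('1 September, 2015') is Tuesday, September 1; the anchor moves back to Monday, August 31; "WEEK 1" then adds one 7-day step (the module docstring notes that this week numbering is not always consistent across files).

    >>> parse_month_column('September, 2015 - WEEK 1', 0)
    datetime.date(2015, 9, 7)
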
def parse_row(row, day_of_week):
    """
    Get a row of the ExPORTER table, return the date of a day (Monday by default) of that week, the fiscal year,
    and the url of the xml file
    To keep the format consistent, if the record is from previous fiscal years, None is returned
    """
    row_text = list(map(lambda x: x.text.strip('\t').strip('\n').strip('\r').strip('<td>').strip('</td>'), row))
    row_text = list(map(lambda x: x.strip(), row_text))
    month_column = row_text[1]
    fiscal_year = int(row_text[2])
    url = row[3].find('a').get('href')

    if month_column.lower() == u"all":
        return (None, fiscal_year, url)
    elif re.match('[A-Za-z]*, [0-9]{4} - .*', month_column):
        date = parse_month_column(month_column, day_of_week)
        return (date, fiscal_year, url)
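
A self-contained sketch with a hand-built row (the cell layout and the relative zip href are assumptions modeled on the ExPORTER catalog table this code targets):

    >>> from bs4 import BeautifulSoup
    >>> html = ('<table><tr><td>RePORTER</td><td>September, 2015 - WEEK 1</td><td>2015</td>'
    ...         '<td><a href="XMLData/final/RePORTER_PRJ_X_FY2015_035.zip">XML</a></td></tr></table>')
    >>> row = BeautifulSoup(html, 'lxml').find('tr')
    >>> parse_row(row('td'), 0)
    (datetime.date(2015, 9, 7), 2015, 'XMLData/final/RePORTER_PRJ_X_FY2015_035.zip')
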
def parse_rows(rows, day_of_week):
    """
    A generator to parse all the rows
    """
    for row in rows:
        yield parse_row(row('td'), day_of_week)

def construct_urls(base_url, start_date, end_date, rows, day_of_week=0):
    """
    Given a date range, construct the URLs of the corresponding XML files.
    """
    dates = [i for i in get_days_of_week(start_date, end_date, day_of_week)]
    fiscal_years = get_fiscal_years(dates)
    for data in parse_rows(rows, day_of_week):
        if data[0] in dates or (data[0] is None and data[1] in fiscal_years):
            yield "".join([base_url, data[2]])
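
Continuing the sketch above (reusing row and date from the earlier examples): a row is kept when its week date falls inside the requested range, or when it is an "All"-month row for one of the covered fiscal years; the relative href is then joined onto base_url.

    >>> list(construct_urls('http://exporter.nih.gov/', date(2015, 8, 31), date(2015, 9, 16), [row]))
    ['http://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2015_035.zip']
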
def get_xml_files(urls):
    for zip_url in urls:
        data = requests.get(zip_url)
        # each weekly archive is expected to contain a single XML file
        zipfile = ZipFile(BytesIO(data.content))
        with zipfile.open(zipfile.namelist()[0], 'r') as f:
            yield f.read()
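
A network-free sketch of the same in-memory unzip pattern (building a small archive instead of downloading one; the member name is hypothetical):

    >>> from io import BytesIO
    >>> from zipfile import ZipFile
    >>> buf = BytesIO()
    >>> with ZipFile(buf, 'w') as zf:
    ...     zf.writestr('RePORTER_PRJ_X_FY2015_035.xml', '<root><row/></root>')
    >>> archive = ZipFile(BytesIO(buf.getvalue()))
    >>> archive.open(archive.namelist()[0]).read() == b'<root><row/></root>'
    True
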
def xml_records(files):
    for xml_file in files:
        records = etree.XML(xml_file).xpath('row')
        for record in records:
            yield record
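
The xpath('row') call implies each weekly XML file carries its records as <row> elements directly under the root. A minimal sketch:

    >>> recs = list(xml_records(['<root><row/><row/></root>']))
    >>> [r.tag for r in recs]
    ['row', 'row']
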
def add_affiliation(name, org_name):
    name['affiliation'] = [{'name': org_name.text}]
    return name

def nih_name_parser(names, org_name):
    """
    Takes a list of names and organization names, and attempts to parse them
    """
    names = default_name_parser(names)
    return list(map(add_affiliation, names, org_name))
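
A sketch of the positional pairing: each parsed name dict gets the matching ORG_NAME node's text as its affiliation. The _Org class here is a hypothetical stand-in for an lxml element's .text attribute; the real name dicts come from scrapi's default_name_parser and aren't reproduced here.

    >>> class _Org(object):
    ...     text = 'EXAMPLE UNIVERSITY'
    >>> name = add_affiliation({'name': 'JANE DOE'}, _Org())
    >>> name['affiliation']
    [{'name': 'EXAMPLE UNIVERSITY'}]
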
class NIHHarvesters(XMLHarvester):

    short_name = 'nih'
    long_name = 'NIH Research Portal Online Reporting Tools'
    url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'
    project_base_url = 'https://projectreporter.nih.gov/project_info_description.cfm?aid={}'
    foa_base_url = 'http://grants.nih.gov/grants/guide/pa-files/{}.html'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None
    @property
    def schema(self):
        return {
            "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
            "uris": {
                "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)),
                "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                                   self.construct_descriptor_uris)
            },
            "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)),
            "title": ('//PROJECT_TITLE/node()', single_result),
            "tags": ('//PROJECT_TERMSX/TERM/node()'),
            "otherProperties": build_properties(
                ("applicationID", "//APPLICATION_ID/node()"),
                ('activity', '//ACTIVITY/node()'),
                ('administeringIC', '//ADMINISTERING_IC/node()'),
                ('arraFunded', '//ARRA_FUNDED/node()'),

Contributor: what is arra?

                ('budgetStart', '//BUDGET_START/node()'),
                ('budgetEnd', '//BUDGET_END/node()'),
                ('FOANumber', '//FOA_NUMBER/node()'),

Contributor: Uri?

                ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),

Contributor: Uri?

                ('fundingICs', '//FUNDING_ICs/node()'),

Contributor: Uris?

                ('fiscalYear', '//FY/node()'),
                ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
                ('organizationCity', '//ORG_CITY/node()'),
                ('organizationCountry', '//ORG_CONTRY/node()'),
                ('organizationDistrict', '//ORG_DISTRICT/node()'),
                ('organizationDUNS', '//ORG_DUNS/node()'),
                ('organizationDept', '//ORG_DEPT/node()'),
                ('organizationFIPS', '//ORG_FIPS/node()'),
                ('organizationState', '//ORG_STATE/node()'),
                ('organizationZipcode', '//ORG_ZIPCODE/node()'),
                ('ICName', '//IC_NAME/node()'),
                ('organizationName', '//ORG_NAME/node()'),
                ('projectStart', '//PROJECT_START/node()'),
                ('projectEnd', '//PROJECT_END/node()'),
                ('PHR', '//PHR/node()'),
                ('serialNumber', '//SERIAL_NUMBER/node()'),

Contributor: Uri?

                ('studySection', '//STUDY_SECTION/node()'),
                ('studySectionName', '//STUDY_SECTION_NAME/node()'),
                ('supportYear', '//SUPPORT_YEAR/node()'),
                ('suffix', '//SUFFIX/node()'),
                ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),

Contributor: Uri?

                ('totalCost', '//TOTAL_COST/node()'),
                ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
                ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),

Contributor: Uri?

                ('CFDACode', '//CFDA_CODE/node()'),
                ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
                ('edInstType', '//ED_INST_TYPE/node()'),
                ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
                ('fundingMechanism', '//FUNDING_MECHANISM/node()')
            )
        }
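
Each schema value follows scrapi's (xpath, ..., transform) convention: the XPath results are passed to the trailing callable, and the file's own uses imply compose chains right to left, i.e. compose(f, g)(x) == f(g(x)), so single_result extracts the first XPath hit before construct_project_url or datetime_formatter runs. A hedged illustration of that ordering, assuming compose accepts arbitrary callables:

    >>> compose(lambda s: s.upper(), lambda results: results[0])(['heart disease'])
    'HEART DISEASE'
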
    def construct_project_url(self, application_id):
        return self.project_base_url.format(application_id)

    def construct_descriptor_uris(self, application_id, foa_number):
        return [
            self.project_base_url.format(application_id[0]) if application_id else None,
            self.foa_base_url.format(foa_number[0] if foa_number else None)
        ]
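
A sketch with made-up identifiers (the aid value and FOA number are hypothetical, and direct instantiation of the harvester is assumed to work). Note the asymmetry in construct_descriptor_uris: a missing application_id yields None, while a missing foa_number is formatted into the URL as the string 'None'.

    >>> h = NIHHarvesters()
    >>> h.construct_project_url('8828986')
    'https://projectreporter.nih.gov/project_info_description.cfm?aid=8828986'
    >>> h.construct_descriptor_uris(['8828986'], ['PA-14-071'])
    ['https://projectreporter.nih.gov/project_info_description.cfm?aid=8828986', 'http://grants.nih.gov/grants/guide/pa-files/PA-14-071.html']
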
    namespaces = {'xsi': "http://www.w3.org/2001/XMLSchema-instance"}

    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = [i for i in construct_urls(base_url, start_date, end_date, rows)]

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
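
A sketch of a manual run (network access, scrapi settings, and direct instantiation assumed): harvest() scrapes the catalog page, downloads the matching weekly zips, and wraps each <row> element as a RawDocument.

    from datetime import date

    harvester = NIHHarvesters()
    raw_docs = harvester.harvest(start_date=date(2015, 9, 1), end_date=date(2015, 9, 16))
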

Contributor: Can this turn into a URL?