diff --git a/city_scrapers/spiders/il_health_facilities.py b/city_scrapers/spiders/il_health_facilities.py deleted file mode 100644 index 7e7f0a699..000000000 --- a/city_scrapers/spiders/il_health_facilities.py +++ /dev/null @@ -1,195 +0,0 @@ -from datetime import datetime - -import scrapy -from city_scrapers_core.constants import BOARD, FORUM, NOT_CLASSIFIED -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider - - -class IlHealthFacilitiesSpider(CityScrapersSpider): - name = "il_health_facilities" - agency = "Illinois Health Facilities and Services Review Board" - timezone = "America/Chicago" - start_urls = [ - "https://www2.illinois.gov/sites/hfsrb/events/Pages/Board-Meetings.aspx" - ] - - def parse(self, response): - """ - `parse` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your - scraping needs. - """ - links = response.css("a") - parsed_links = [] - for link_element in links: - inner_link_element = link_element.css("h3") - - if inner_link_element: - href = link_element.attrib["href"] - - parsed_links.append(href) - - for link in parsed_links: - yield scrapy.http.Request(link, callback=self.parse_event_page) - - def parse_event_page(self, response): - # An example demonstrating the structure of the time data on the page: - #
- #

When

- # Tuesday, March 21, 2023 - #
- # 9:00 AM - 4:00 PM - #
- time_data = response.css("div.soi-event-data").get() - - time_data = time_data.replace("\r", "").replace("\t", "").replace("\n", "") - - time_data = time_data.split("")[1].split("
") - - date_list = time_data[0].strip().split(" ") - - year = date_list[3].strip(",").strip() - - month = date_list[1].strip(",").strip() - - day = date_list[2].strip(",").strip() - - time_list = time_data[1].split() - - start_hr = time_list[0].split(":")[0] - - start_min = time_list[0].split(":")[1] - - end_hr = time_list[3].split(":")[0] - - end_min = time_list[3].split(":")[1] - - start_meridiem = time_list[1] - - end_meridiem = time_list[4] - - start_date_time = datetime.strptime( - f"{year}_{month}_{day}_{start_hr}_{start_min}_{start_meridiem}", - "%Y_%B_%d_%I_%M_%p", - ) - - end_date_time = datetime.strptime( - f"{year}_{month}_{day}_{end_hr}_{end_min}_{end_meridiem}", - "%Y_%B_%d_%I_%M_%p", - ) - - meeting = Meeting( - title=self._parse_title(response), - description=self._parse_description(response), - classification=self._parse_classification(response), - start=start_date_time, - end=end_date_time, - all_day=self._parse_all_day(response), - time_notes=self._parse_time_notes(response), - location=self._parse_location(response), - links=self._parse_links(response), - source=self._parse_source(response), - ) - - meeting["status"] = self._get_status(meeting) - meeting["id"] = self._get_id(meeting) - yield meeting - - def _parse_title(self, item): - """Parse or generate meeting title.""" - # The structure of the html section where we parse the title - # is as follows: - #
- #
- #

- # March 21, 2023 State Board Meeting - #

- #
- #
- - title = item.css("h1::text").get().strip() - return title - - def _parse_description(self, item): - """Parse or generate meeting description.""" - return "" - - def _parse_classification(self, item): - """Parse or generate classification from allowed options.""" - - # Structure for the html we need to parse: - #

- # Event Type: - # Board Meeting - #

- - event_type_string = item.css("p.soi-eventType").get() - event_type_string = event_type_string.split("")[1].strip().lower() - - if "board" in event_type_string: - return BOARD - - elif "forum" in event_type_string: - return FORUM - else: - return NOT_CLASSIFIED - - def _parse_start(self, item): - """Parse start datetime as a naive datetime object.""" - return None - - def _parse_end(self, item): - """Parse end datetime as a naive datetime object. Added by pipeline if None""" - return None - - def _parse_time_notes(self, item): - """Parse any additional notes on the timing of the meeting""" - return "" - - def _parse_all_day(self, item): - """Parse or generate all-day status. Defaults to False.""" - return False - - def _parse_location(self, item): - """Parse or generate location.""" - - # The address data for this webpage is a little malformed - - location_str1 = item.css("div.soi-event-title::text").get().strip() - location_str2 = item.css("div.soi-event-location-address1::text").get().strip() - location_str3 = item.css("div.soi-event-location-address2::text").get().strip() - - address_string = location_str1 + ", " + location_str2 + location_str3 - - address_string = address_string.replace("`", "") - - return { - "address": address_string, - "name": "", - } - - def _parse_links(self, item): - """Parse or generate links.""" - - links = [ - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Board-Meetings.aspx", # noqa - "title": "Board and Subcommittee Meetings", - }, - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Previous-Meetings.aspx", # noqa - "title": "Previous Meeting", - }, - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Public-Hearing.aspx", # noqa - "title": "Public Hearings", - }, - ] - - return links - - def _parse_source(self, response): - """Parse or generate source.""" - return response.url diff --git a/tests/files/il_health_facilities.html b/tests/files/il_health_facilities.html deleted file mode 100644 index 4bfb2b192..000000000 --- a/tests/files/il_health_facilities.html +++ /dev/null @@ -1,694 +0,0 @@ - - - - - - - Board and Subcommittee Meetings - - Events - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
- - - -
- - - - - - - - - - - - -
-
- - - -
- -
- - - - -
-
- -
- - -
-
- - - - - - -
-
- - - -
- -
- - - - -
-
-

- Board and Subcommittee Meetings -

-
-
- - -
- - -
- -
- - -
- - -
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- - -
-
- -
-
- - - -
-
- -
-
- - -
-
-
- -
-
-
- - -
- - - -
-
- - - -
-
- -
-
- - - -
-
- -
-
- -
-
- -
- - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - \ No newline at end of file diff --git a/tests/files/il_health_facilities_helper.html b/tests/files/il_health_facilities_helper.html deleted file mode 100644 index 51c6bd505..000000000 --- a/tests/files/il_health_facilities_helper.html +++ /dev/null @@ -1,748 +0,0 @@ - - - - - - - - March 21, 2023 State Board Meeting - - Events - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -
- - - -
- - - - - - - - - - - - -
-
- - - -
- -
- - - - -
-
- -
- - -
-
- - - - - - -
-
- - - -
- -
- - - - -
-
-

- March 21, 2023 State Board Meeting -

-
-
- - -
- - -
- -
- - -
- -
-
-
-

When

- - Tuesday, March 21, 2023 -
- 9:00 AM - 4:00 PM -
-

Location

-
-
- 2001 Rodeo Drive` -
-
- Bolingbrok, Illinois -
-
-   -
-
-   - , -   -
-
- Bolingbrok, Illinois , , -
-
-
-
-
-
 
-
-
-
-
-
-
-

Details

-

Event Type: - Board Meeting

-
-
-
-
-
-
-

Documents

-
-
-
- - - - -
-
-
- - - -
-
- - - -
-
- -
-
- - - -
-
- -
-
- -
-
- -
- - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - \ No newline at end of file diff --git a/tests/test_il_health_facilities.py b/tests/test_il_health_facilities.py deleted file mode 100644 index 71b147b1c..000000000 --- a/tests/test_il_health_facilities.py +++ /dev/null @@ -1,143 +0,0 @@ -from os.path import dirname, join -from unittest.mock import patch - -import pytest -import scrapy -from city_scrapers_core.constants import BOARD, TENTATIVE -from city_scrapers_core.utils import file_response -from freezegun import freeze_time - -from city_scrapers.spiders.il_health_facilities import IlHealthFacilitiesSpider - -spider = IlHealthFacilitiesSpider() - -test_response = file_response( - join(dirname(__file__), "files", "il_health_facilities.html"), - url="https://www2.illinois.gov/sites/hfsrb/events/Pages/Board-Meetings.aspx", -) - -# The crawler for il_health_facilities grabs information from some pages that are -# linked to from the original page. -# As such, we need to test the adjacent links as well - -adjacent_links = [ - "https://www2.illinois.gov/sites/hfsrb/events/Pages/March-21%202023-State-Board-Meeting.aspx", # noqa - "https://www2.illinois.gov/sites/hfsrb/events/Pages/May-9-2023-State-Board-Meeting.aspx", # noqa - "https://www2.illinois.gov/sites/hfsrb/events/Pages/June-27-2023%20State%20Board%20Meeting.aspx", # noqa - "https://www2.illinois.gov/sites/hfsrb/events/Pages/August-15-2023-State-Board-Meeting.aspx", # noqa - "https://www2.illinois.gov/sites/hfsrb/events/Pages/October-3-2023-State-Board-Meeting.aspx", # noqa - "https://www2.illinois.gov/sites/hfsrb/events/Pages/December-5-2023%20State%20Board%20Meeting.aspx", # noqa -] - - -def mock_scrapy_request(link, callback): - with open( - join(dirname(__file__), "files", "il_health_facilities_helper.html"), "rb" - ) as f: - body = f.read() - - response = scrapy.http.HtmlResponse( - url="my HTML string", body=body, encoding="utf-8" - ) - - result = next(callback(response)) - return result - - -@patch("scrapy.http.Request", mock_scrapy_request) -def generate_parsed_items(): - freezer = freeze_time("2023-02-09") - freezer.start() - - parsed_items = [item for item in spider.parse(test_response)] - - freezer.stop() - return parsed_items - - -parsed_items = generate_parsed_items() - - -def test_num_meetings_found(): - assert len(parsed_items) == 6 - - -@pytest.mark.parametrize("item", parsed_items) -def test_title(item): - assert item["title"] == "March 21, 2023 State Board Meeting" - - -@pytest.mark.parametrize("item", parsed_items) -def test_description(item): - assert item["description"] == "" - - -@pytest.mark.parametrize("item", parsed_items) -def test_start(item): - assert item["start"].strftime("%Y_%B_%d_%I_%M_%p") == "2023_March_21_09_00_AM" - - -@pytest.mark.parametrize("item", parsed_items) -def test_end(item): - assert item["end"].strftime("%Y_%B_%d_%I_%M_%p") == "2023_March_21_04_00_PM" - - -@pytest.mark.parametrize("item", parsed_items) -def test_time_notes(item): - assert item["time_notes"] == "" - - -@pytest.mark.parametrize("item", parsed_items) -def test_id(item): - assert ( - item["id"] - == "il_health_facilities/202303210900/x/march_21_2023_state_board_meeting" - ) - - -@pytest.mark.parametrize("item", parsed_items) -def test_status(item): - assert item["status"] == TENTATIVE - - -@pytest.mark.parametrize("item", parsed_items) -def test_location(item): - assert item["location"] == { - "name": "", - "address": "2001 Rodeo Drive, Bolingbrok, Illinois", - } - - -@pytest.mark.parametrize("item", parsed_items) -def test_source(item): - item[ - "source" - ] == "https://www2.illinois.gov/sites/hfsrb/events/Pages/Board-Meetings.aspx" - - -@pytest.mark.parametrize("item", parsed_items) -def test_links(item): - assert item["links"] == [ - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Board-Meetings.aspx", # noqa - "title": "Board and Subcommittee Meetings", - }, - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Previous-Meetings.aspx", # noqa - "title": "Previous Meeting", - }, - { - "href": "https://www2.illinois.gov/sites/hfsrb/events/Pages/Public-Hearing.aspx", # noqa - "title": "Public Hearings", - }, - ] - - -@pytest.mark.parametrize("item", parsed_items) -def test_classification(item): - assert item["classification"] == BOARD - - -@pytest.mark.parametrize("item", parsed_items) -def test_all_day(item): - assert item["all_day"] is False