Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for CHI Pub health spider #1026

Merged
merged 1 commit into from
Mar 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions city_scrapers/spiders/chi_pubhealth.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from city_scrapers_core.spiders import CityScrapersSpider





class ChiPubHealthSpider(CityScrapersSpider):
name = "chi_pubhealth"
agency = "Chicago Department of Public Health"
Expand All @@ -22,8 +25,10 @@ def start_urls(self):
standard_url = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health-meetings.html" # noqa
url_variant_1 = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health.html" # noqa

# current_year = 2021
current_year = datetime.now().year


return [
standard_url.format(current_year),
url_variant_1.format(current_year),
Expand All @@ -45,21 +50,27 @@ def parse(self, response):
# The description and meeting dates are a series of p elements
for idx, item in enumerate(response.css(".page-description-above p"), start=1):
if idx == 1:
# inspect_response(response, self)
# Description is the first p element
description = item.xpath("text()").extract_first()
if "333 S" not in description:
raise ValueError("Meeting location has changed")
description = item.xpath("text()").getall()
# description = item.xpath("text()").extract_first()
if "333 S" not in description[1]:
raise ValueError(description)
continue

# Skip empty rows
if not item.css("*::text").extract_first().strip():
continue

start = self._parse_start(item)
if start is None:
continue

meeting = Meeting(
title="Board of Health",
description="",
classification=BOARD,
start=self._parse_start(item),
start=start,
end=None,
time_notes="",
all_day=False,
Expand All @@ -86,24 +97,33 @@ def _parse_date(self, item):
if not date_text:
# Past meetings are links to the agenda
date_text = item.xpath("a/text()").extract_first()

if date_text is None:
return None
# Remove extra whitespace characters
date_text = re.sub(r"\s+", " ", date_text).strip()
date_text = re.sub(r"\s+", " ", str(date_text)).strip()



# Handle typos like "December18"
if re.match(r"[a-zA-Z]+\d+", date_text):
if re.match(r"[a-zA-Z]+\d+", str(date_text)):
date_match = re.search(r"(?P<month>[a-zA-Z]+)(?P<day>\d+)", date_text)
date_text = "{} {}".format(
date_match.group("month"), date_match.group("day")
)
# Extract date formatted like "January 12"

return datetime.strptime(date_text, "%B %d")


def _parse_start(self, item):
"""
Parse the meeting date and set start time to 9am.
"""
datetime_obj = self._parse_date(item)

if datetime_obj is None:
return None

return datetime(self.year, datetime_obj.month, datetime_obj.day, 9)

def _parse_links(self, item, response):
Expand Down
Loading