This repository has been archived by the owner on Oct 2, 2023. It is now read-only.

Commit

Filter official gazette by date (#31)
anapaulagomes committed Dec 27, 2019
1 parent 5101a90 commit ff23ff3
Showing 3 changed files with 35 additions and 22 deletions.
1 change: 1 addition & 0 deletions scraper/scraper/runner.py
@@ -15,6 +15,7 @@ def run_crawlers(start_date):
     process.crawl("cityhall_payments", start_date=start_date)
     process.crawl("cityhall_contracts", start_date=start_date)
     process.crawl("citycouncil_agenda", start_date=start_date)
+    process.crawl("gazettes", start_date=start_date)
     process.start()
 
 
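With the spider registered in run_crawlers, the gazettes crawl can presumably also be triggered on its own. A minimal sketch, assuming start_date arrives as a datetime.date (the exact type the runner passes is not shown in this diff):

from datetime import date
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Minimal sketch: run only the new "gazettes" spider for a single day.
# Passing a datetime.date is an assumption based on the comparison made
# inside the spider's parse(); the runner may use another type.
process = CrawlerProcess(get_project_settings())
process.crawl("gazettes", start_date=date(2019, 12, 27))
process.start()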
1 change: 1 addition & 0 deletions scraper/scraper/spiders/citycouncil.py
@@ -26,6 +26,7 @@ def parse(self, response):
             start_date = self.start_date
         else:
             start_date = self.initial_date
+        self.logger.info(f"Data inicial: {start_date}")
 
         extracted_years = response.css("select#ano option ::text").extract()
         years = []
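The hasattr/else fallback used here (and again in gazette.py below) could be written more compactly; a purely illustrative equivalent, not part of this commit:

# Illustrative rewrite of the fallback above:
# use the start_date spider argument when present and truthy,
# otherwise fall back to the spider's initial_date.
start_date = getattr(self, "start_date", None) or self.initial_date
self.logger.info(f"Data inicial: {start_date}")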
55 changes: 33 additions & 22 deletions scraper/scraper/spiders/gazette.py
@@ -14,7 +14,7 @@ class LegacyGazetteSpider(scrapy.Spider):
     Example: http://www.feiradesantana.ba.gov.br/seadm/leis.asp?acao=ir&p=24&ano=2015
     """
 
-    name = "legacy_gazette"
+    name = "legacy_gazettes"
     start_urls = [
         f"http://www.feiradesantana.ba.gov.br/servicos.asp?"
         f"acao=ir&s=a&link=seadm/leis.asp&p=1&"
@@ -97,34 +97,45 @@ class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
     handle_httpstatus_list = [302]
 
     def parse(self, response):
+        if hasattr(self, "start_date") and self.start_date:
+            start_date = self.start_date
+        else:
+            start_date = self.initial_date
+        self.logger.info(f"Data inicial: {start_date}")
+
         gazette_table = response.css(".style166")
         gazettes_links = gazette_table.xpath("a//@href").extract()
         dates = gazette_table.css("a::text").extract()
 
-        for url, date in zip(gazettes_links, dates):
-            edition = self.extract_edition(url)
-            power = self.extract_power(url)
-            power_id = self.powers[power]
-
-            gazette = dict(
-                date=date,
-                power=power,
-                url=response.urljoin(url),
-                file_url=response.urljoin(f"abrir.asp?edi={edition}&p={power_id}"),
-            )
+        for url, gazette_date in zip(gazettes_links, dates):
+            date_obj = datetime.strptime(gazette_date, "%d/%m/%Y")
+            if date_obj.date() == start_date:
+                edition = self.extract_edition(url)
+                power = self.extract_power(url)
+                power_id = self.powers[power]
+
+                gazette = dict(
+                    date=gazette_date,
+                    power=power,
+                    url=response.urljoin(url),
+                    file_url=response.urljoin(f"abrir.asp?edi={edition}&p={power_id}"),
+                )
 
-            yield Request(
-                gazette["url"], callback=self.parse_details, meta={"gazette": gazette}
-            )
+                yield Request(
+                    gazette["url"],
+                    callback=self.parse_details,
+                    meta={"gazette": gazette},
+                )
 
-        current_page_selector = "#pages ul li.current::text"
-        current_page = response.css(current_page_selector).extract_first()
-        next_page = int(current_page) + 1
-        next_page_url = response.urljoin(f"/?p={next_page}")
+        if hasattr(self, "start_date") is False: # all gazettes
+            current_page_selector = "#pages ul li.current::text"
+            current_page = response.css(current_page_selector).extract_first()
+            next_page = int(current_page) + 1
+            next_page_url = response.urljoin(f"/?p={next_page}")
 
-        if next_page > self.last_page:
-            self.last_page = next_page
-            yield Request(next_page_url)
+            if next_page > self.last_page:
+                self.last_page = next_page
+                yield Request(next_page_url)
 
     def parse_details(self, response):
         gazette = response.meta["gazette"]
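The heart of the change is the per-row date filter in parse(): each scraped gazette date ("dd/mm/YYYY") is parsed with datetime.strptime and only rows matching start_date are turned into requests. A standalone sketch of that comparison, with hypothetical values and assuming start_date is a datetime.date:

from datetime import date, datetime

start_date = date(2019, 12, 27)               # assumed to arrive as a datetime.date
scraped_dates = ["26/12/2019", "27/12/2019"]  # hypothetical values from the gazette table

for gazette_date in scraped_dates:
    date_obj = datetime.strptime(gazette_date, "%d/%m/%Y")
    if date_obj.date() == start_date:
        # the spider would build the gazette dict and yield a Request here
        print("keep", gazette_date)
    else:
        print("skip", gazette_date)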
