diff --git a/scraper/scraper/runner.py b/scraper/scraper/runner.py
index 6cbaeefc..f2b472a9 100644
--- a/scraper/scraper/runner.py
+++ b/scraper/scraper/runner.py
@@ -5,17 +5,17 @@
 from scrapy.utils.project import get_project_settings
 
 
-def run_crawlers(start_date):
+def run_crawlers(start_from_date):
     process = CrawlerProcess(get_project_settings())
 
-    # FIXME enable this when all spiders are ready
-    # for spider in process.spider_loader.list():
-    #     process.crawl(spider, start_date=start_date)
+    process.crawl("cityhall_payments", start_from_date=start_from_date)
+    process.crawl("cityhall_contracts", start_from_date=start_from_date)
+    process.crawl("cityhall_bids", start_from_date=start_from_date)
+    process.crawl("citycouncil_agenda", start_from_date=start_from_date)
+    process.crawl("gazettes", start_from_date=start_from_date)
 
-    process.crawl("cityhall_payments", start_date=start_date)
-    process.crawl("cityhall_contracts", start_date=start_date)
-    process.crawl("citycouncil_agenda", start_date=start_date)
-    process.crawl("gazettes", start_date=start_date)
+    if start_from_date is None:  # --all must include legacy pages
+        process.crawl("legacy_gazettes")
 
     process.start()
 
@@ -26,9 +26,9 @@ def run_crawlers(start_date):
     )
     args = parser.parse_args()
     if args.all:
-        start_date = None
+        start_from_date = None
     else:
         yesterday = datetime.now() - timedelta(days=1)
-        start_date = yesterday.date()
+        start_from_date = yesterday.date()
 
-    run_crawlers(start_date)
+    run_crawlers(start_from_date)
diff --git a/scraper/scraper/spiders/__init__.py b/scraper/scraper/spiders/__init__.py
index ebd689ac..f76246cc 100644
--- a/scraper/scraper/spiders/__init__.py
+++ b/scraper/scraper/spiders/__init__.py
@@ -1,4 +1,16 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
+import scrapy
+
+
+class BaseSpider(scrapy.Spider):
+    """Base spider: crawls from start_from_date when given, else from initial_date."""
+
+    start_from_date = None
+
+    @property
+    def start_date(self):
+        return self.start_from_date or self.initial_date
+
+    @property
+    def collect_all(self):
+        # no start date given means a full crawl was requested
+        return not self.start_from_date
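With this change a start date reaches every spider the same way: process.crawl(..., start_from_date=...) hands the keyword argument to Scrapy's default Spider.__init__, which copies it onto the instance, and BaseSpider derives start_date and collect_all from it. A minimal sketch of that behaviour — FakeSpider and the dates are hypothetical, and the import assumes the project's package layout:

    from datetime import date

    from scraper.spiders import BaseSpider


    class FakeSpider(BaseSpider):  # hypothetical spider, for illustration only
        name = "fake"
        initial_date = date(2010, 1, 1)


    # Without start_from_date the spider falls back to initial_date (full crawl).
    spider = FakeSpider()
    assert spider.start_date == date(2010, 1, 1)
    assert spider.collect_all

    # Scrapy copies crawl kwargs onto the instance, so start_from_date wins when set.
    spider = FakeSpider(start_from_date=date(2020, 5, 1))
    assert spider.start_date == date(2020, 5, 1)
    assert not spider.collect_all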
diff --git a/scraper/scraper/spiders/citycouncil.py b/scraper/scraper/spiders/citycouncil.py
index 117dce99..ade05903 100644
--- a/scraper/scraper/spiders/citycouncil.py
+++ b/scraper/scraper/spiders/citycouncil.py
@@ -3,8 +3,10 @@
 import scrapy
 from scraper.items import CityCouncilAgendaItem
 
+from . import BaseSpider
 
-class AgendaSpider(scrapy.Spider):
+
+class AgendaSpider(BaseSpider):
     name = "citycouncil_agenda"
     start_urls = ["https://www.feiradesantana.ba.leg.br/agenda"]
     initial_date = date(2010, 1, 1)
@@ -22,11 +24,7 @@ def get_type(event_title):
         return "not_found"
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         extracted_years = response.css("select#ano option ::text").extract()
         years = []
@@ -37,9 +35,9 @@
             pass
 
         for year in range(min(years), max(years) + 1):
-            if start_date.year <= year:
+            if self.start_date.year <= year:
                 for month in range(1, 13):
-                    if start_date.month <= month:
+                    if self.start_date.month <= month:
                         url = (
                             "https://www.feiradesantana.ba.leg.br/agenda"
                             f"?mes={month}&ano={year}&Acessar=OK"
diff --git a/scraper/scraper/spiders/cityhall.py b/scraper/scraper/spiders/cityhall.py
index a74fd7ea..3333378e 100644
--- a/scraper/scraper/spiders/cityhall.py
+++ b/scraper/scraper/spiders/cityhall.py
@@ -4,16 +4,27 @@
 import scrapy
 from scraper.items import CityHallBidItem, CityHallContractItem, CityHallPaymentsItem
 
-from .utils import identify_contract_id
+from . import BaseSpider
+from .utils import extract_param, identify_contract_id
 
 
-class BidsSpider(scrapy.Spider):
-    name = "bids"
+class BidsSpider(BaseSpider):
+    name = "cityhall_bids"
     start_urls = ["http://www.feiradesantana.ba.gov.br/seadm/licitacoes.asp"]
+    initial_date = date(2001, 1, 1)
+
+    def follow_this_date(self, url):
+        month_year = extract_param(url, "dt")
+        month_year = datetime.strptime(month_year, "%m-%Y")
+
+        # compare (year, month) pairs so e.g. 01-2020 passes a 06-2019 start date
+        this_date = (month_year.year, month_year.month)
+        return this_date >= (self.start_date.year, self.start_date.month)
 
     def parse(self, response):
         urls = response.xpath("//table/tbody/tr/td[1]/div/a//@href").extract()
         base_url = "http://www.feiradesantana.ba.gov.br"
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         for url in urls:
             if base_url not in url:
@@ -22,7 +33,9 @@ def parse(self, response):
                     url = response.urljoin(f"{base_url}/{url}")
                 else:
                     url = response.urljoin(f"{base_url}/seadm/{url}")
-            yield response.follow(url, self.parse_page)
+
+            if self.collect_all or self.follow_this_date(url):
+                yield response.follow(url, self.parse_page)
 
     def parse_page(self, response):
         raw_modalities = response.xpath("//tr/td[1]/table/tr/td/text()").extract()
@@ -108,7 +121,7 @@ def _parse_date(self, raw_date):
         return [date[1:] for date in raw_date]
 
 
-class ContractsSpider(scrapy.Spider):
+class ContractsSpider(BaseSpider):
     """Collects contracts from the contracts page.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=contratos
@@ -126,10 +139,7 @@ class ContractsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
 
         today = datetime.now().date()
@@ -219,7 +229,7 @@ def clean_details(self, raw_details):
         return valid_details
 
 
-class PaymentsSpider(scrapy.Spider):
+class PaymentsSpider(BaseSpider):
     """Collects payments made.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa
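follow_this_date compares (year, month) tuples rather than checking month and year independently; independent checks would skip, say, a January 2020 page when the start date is June 2019, because 1 >= 6 fails. A self-contained illustration with made-up dates:

    from datetime import date, datetime

    start = date(2019, 6, 1)                            # crawl from June 2019 on
    month_year = datetime.strptime("01-2020", "%m-%Y")  # page dated January 2020

    # Independent comparisons wrongly reject the page: month 1 >= 6 is False.
    assert not (month_year.month >= start.month and month_year.year >= start.year)

    # Tuples compare chronologically, so the later page is kept.
    assert (month_year.year, month_year.month) >= (start.year, start.month)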
@@ -238,10 +248,7 @@ class PaymentsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
 
         today = datetime.now().date()
diff --git a/scraper/scraper/spiders/gazette.py b/scraper/scraper/spiders/gazette.py
index bc3f2428..e257eb45 100644
--- a/scraper/scraper/spiders/gazette.py
+++ b/scraper/scraper/spiders/gazette.py
@@ -1,13 +1,13 @@
-from datetime import datetime
+from datetime import date, datetime
 
-import scrapy
 from scraper.items import GazetteEventItem, LegacyGazetteItem
 from scrapy import Request
 
+from . import BaseSpider
 from .utils import replace_query_param
 
 
-class LegacyGazetteSpider(scrapy.Spider):
+class LegacyGazetteSpider(BaseSpider):
     """Collects the official gazette of Feira de Santana up to 2015.
 
     Years: 1999 to 2015
@@ -86,7 +86,7 @@ def extract_events(self, response):
         return events, events_urls
 
 
-class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
+class ExecutiveAndLegislativeGazetteSpider(BaseSpider):
     """Collects the Official Gazette of the executive and legislative branches."""
 
     name = "gazettes"
@@ -95,13 +95,10 @@ class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
     powers = {"executivo": 1, "legislativo": 2}
     last_page = 1
     handle_httpstatus_list = [302]
+    initial_date = date(2015, 1, 1)
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         gazette_table = response.css(".style166")
         gazettes_links = gazette_table.xpath("a//@href").extract()
@@ -109,7 +106,7 @@ def parse(self, response):
 
         for url, gazette_date in zip(gazettes_links, dates):
             date_obj = datetime.strptime(gazette_date, "%d/%m/%Y")
-            if date_obj.date() == start_date:
+            if date_obj.date() >= self.start_date:
                 edition = self.extract_edition(url)
                 power = self.extract_power(url)
                 power_id = self.powers[power]
@@ -127,7 +124,7 @@ def parse(self, response):
                     meta={"gazette": gazette},
                 )
 
-        if hasattr(self, "start_date") is False:  # all gazettes
+        if self.collect_all:
             current_page_selector = "#pages ul li.current::text"
             current_page = response.css(current_page_selector).extract_first()
             next_page = int(current_page) + 1
diff --git a/scraper/scraper/spiders/utils.py b/scraper/scraper/spiders/utils.py
index 44592731..ceb28adb 100644
--- a/scraper/scraper/spiders/utils.py
+++ b/scraper/scraper/spiders/utils.py
@@ -1,4 +1,5 @@
 import re
+from urllib.parse import parse_qs, urlparse
 
 
 def replace_query_param(url, field, value):
@@ -10,3 +11,12 @@ def identify_contract_id(text):
     result = re.findall(CONTRACT_NUMBER_PATTERN, text)
     if result:
         return result[0]
+
+
+def extract_param(url, param):
+    """Returns the first value of a query string parameter, or None if missing."""
+    parsed = urlparse(url)
+    try:
+        return parse_qs(parsed.query)[param][0]
+    except KeyError:
+        return None
diff --git a/scraper/scraper/tests/test_utils.py b/scraper/scraper/tests/test_utils.py
index 13e6385e..8bd94cd7 100644
--- a/scraper/scraper/tests/test_utils.py
+++ b/scraper/scraper/tests/test_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from ..spiders.utils import identify_contract_id, replace_query_param
+from ..spiders.utils import extract_param, identify_contract_id, replace_query_param
 
 
 @pytest.mark.parametrize(
@@ -55,3 +55,20 @@ def test_replace_query_parameter_from_a_url(old_url, field, value, new_url):
     )
 def test_identify_contract_ids(text, expected_contract_id):
     assert identify_contract_id(text) == expected_contract_id
+
+
+@pytest.mark.parametrize(
+    "url, param, value",
+    [
+        (
+            "http://www.feiradesantana.ba.gov.br/seadm/servicos.asp?"
+            "id=2&s=a&link=seadm/licitacoes_pm.asp&cat=PMFS&dt=01-2019#links",
+            "dt",
+            "01-2019",
+        ),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "dt", "01-2019"),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "invalid", None),
+    ],
+)
+def test_extract_param(url, param, value):
+    assert extract_param(url, param) == value
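Taken together, the --all flag is now the single switch between a full and an incremental crawl. A condensed sketch of the dispatch in runner.py — resolve_start_from_date is a hypothetical helper, and the argparse wiring outside the hunks shown is assumed unchanged:

    from datetime import datetime, timedelta


    def resolve_start_from_date(crawl_all):
        """Mirrors the __main__ branch in runner.py: None requests a full crawl."""
        if crawl_all:
            return None  # run_crawlers then also schedules legacy_gazettes
        return (datetime.now() - timedelta(days=1)).date()  # crawl since yesterday


    assert resolve_start_from_date(True) is None
    assert resolve_start_from_date(False) < datetime.now().date()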