
Commit 795d348
Add collection start date for bids (#32)
* Add a start date to the spider

* Extract a parameter from a URL

* Fix the --all option when collecting gazettes

* Isolate the date logic in a base spider

* Adapt the spiders
anapaulagomes committed Dec 30, 2019
1 parent 01a57d2 commit 795d348
Showing 7 changed files with 93 additions and 49 deletions.
22 changes: 11 additions & 11 deletions scraper/scraper/runner.py
@@ -5,17 +5,17 @@
 from scrapy.utils.project import get_project_settings
 
 
-def run_crawlers(start_date):
+def run_crawlers(start_from_date):
     process = CrawlerProcess(get_project_settings())
 
     # FIXME enable this when all spiders are ready
     # for spider in process.spider_loader.list():
-    #     process.crawl(spider, start_date=start_date)
-
-    process.crawl("cityhall_payments", start_date=start_date)
-    process.crawl("cityhall_contracts", start_date=start_date)
-    process.crawl("citycouncil_agenda", start_date=start_date)
-    process.crawl("gazettes", start_date=start_date)
-
+    process.crawl("cityhall_payments", start_from_date=start_from_date)
+    process.crawl("cityhall_contracts", start_from_date=start_from_date)
+    process.crawl("cityhall_bids", start_from_date=start_from_date)
+    process.crawl("citycouncil_agenda", start_from_date=start_from_date)
+    process.crawl("gazettes", start_from_date=start_from_date)
+    if start_from_date is None:  # --all should include legacy pages
+        process.crawl("legacy_gazettes")
     process.start()
 
@@ -26,9 +26,9 @@ def run_crawlers(start_date):
     )
     args = parser.parse_args()
     if args.all:
-        start_date = None
+        start_from_date = None
     else:
         yesterday = datetime.now() - timedelta(days=1)
-        start_date = yesterday.date()
+        start_from_date = yesterday.date()
 
-    run_crawlers(start_date)
+    run_crawlers(start_from_date)
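
The runner now has two modes: by default it collects from yesterday onward, while --all passes start_from_date=None so each spider falls back to its own initial_date and the legacy gazettes spider is scheduled as well. A minimal sketch of the two call paths (the scraper.runner import path is assumed from the file layout above):

from datetime import datetime, timedelta

from scraper.runner import run_crawlers  # assumed import path

# Daily run (no flag): collect everything published since yesterday.
yesterday = (datetime.now() - timedelta(days=1)).date()
run_crawlers(yesterday)

# Full run (--all): with no start date, each spider falls back to its
# initial_date and legacy_gazettes is also crawled. Note that a Scrapy
# CrawlerProcess can only be started once per OS process, so in practice
# you run one mode or the other.
run_crawlers(None)
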
22 changes: 18 additions & 4 deletions scraper/scraper/spiders/__init__.py
@@ -1,4 +1,18 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
+import scrapy
+
+
+class BaseSpider(scrapy.Spider):
+    start_from_date = None
+
+    @property
+    def start_date(self):
+        if self.start_from_date:
+            picked_date = self.start_from_date
+        else:
+            picked_date = self.initial_date
+
+        return picked_date
+
+    @property
+    def collect_all(self):
+        return bool(self.start_from_date) is False
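
BaseSpider centralizes the date fallback that each spider previously re-implemented with hasattr checks: start_date returns start_from_date when one was passed in (Scrapy copies process.crawl keyword arguments onto the spider as attributes), falling back to the class's initial_date otherwise, and collect_all is True only when no start date was given. A quick sketch of that behavior with a hypothetical subclass:

from datetime import date

from scraper.spiders import BaseSpider  # assumed import path


class ToySpider(BaseSpider):  # hypothetical spider, for illustration only
    name = "toy"
    initial_date = date(2010, 1, 1)


# Without start_from_date: fall back to initial_date and collect everything.
spider = ToySpider()
assert spider.start_date == date(2010, 1, 1)
assert spider.collect_all is True

# With start_from_date (what process.crawl(..., start_from_date=...) sets):
spider = ToySpider(start_from_date=date(2019, 12, 29))
assert spider.start_date == date(2019, 12, 29)
assert spider.collect_all is False
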
14 changes: 6 additions & 8 deletions scraper/scraper/spiders/citycouncil.py
@@ -3,8 +3,10 @@
 import scrapy
 from scraper.items import CityCouncilAgendaItem
 
+from . import BaseSpider
+
 
-class AgendaSpider(scrapy.Spider):
+class AgendaSpider(BaseSpider):
     name = "citycouncil_agenda"
     start_urls = ["https://www.feiradesantana.ba.leg.br/agenda"]
     initial_date = date(2010, 1, 1)
@@ -22,11 +24,7 @@ def get_type(event_title):
         return "not_found"
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         extracted_years = response.css("select#ano option ::text").extract()
         years = []
@@ -37,9 +35,9 @@ def parse(self, response):
                 pass
 
         for year in range(min(years), max(years) + 1):
-            if start_date.year <= year:
+            if self.start_date.year <= year:
                 for month in range(1, 13):
-                    if start_date.month <= month:
+                    if self.start_date.month <= month:
                         url = (
                             "https://www.feiradesantana.ba.leg.br/agenda"
                             f"?mes={month}&ano={year}&Acessar=OK"
35 changes: 21 additions & 14 deletions scraper/scraper/spiders/cityhall.py
@@ -4,16 +4,27 @@
 import scrapy
 from scraper.items import CityHallBidItem, CityHallContractItem, CityHallPaymentsItem
 
-from .utils import identify_contract_id
+from . import BaseSpider
+from .utils import extract_param, identify_contract_id
 
 
-class BidsSpider(scrapy.Spider):
-    name = "bids"
+class BidsSpider(BaseSpider):
+    name = "cityhall_bids"
     start_urls = ["http://www.feiradesantana.ba.gov.br/seadm/licitacoes.asp"]
+    initial_date = date(2001, 1, 1)
+
+    def follow_this_date(self, url):
+        month_year = extract_param(url, "dt")
+        month_year = datetime.strptime(month_year, "%m-%Y")
+
+        match_month = month_year.month >= self.start_date.month
+        match_year = month_year.year >= self.start_date.year
+        return match_month and match_year
 
     def parse(self, response):
         urls = response.xpath("//table/tbody/tr/td[1]/div/a//@href").extract()
         base_url = "http://www.feiradesantana.ba.gov.br"
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         for url in urls:
             if base_url not in url:
@@ -22,7 +33,9 @@ def parse(self, response):
                 url = response.urljoin(f"{base_url}/{url}")
             else:
                 url = response.urljoin(f"{base_url}/seadm/{url}")
-            yield response.follow(url, self.parse_page)
+
+            if self.collect_all or self.follow_this_date(url):
+                yield response.follow(url, self.parse_page)
 
     def parse_page(self, response):
         raw_modalities = response.xpath("//tr/td[1]/table/tr/td/text()").extract()
@@ -108,7 +121,7 @@ def _parse_date(self, raw_date):
         return [date[1:] for date in raw_date]
 
 
-class ContractsSpider(scrapy.Spider):
+class ContractsSpider(BaseSpider):
     """Collects contracts from the contracts page.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=contratos
@@ -126,10 +139,7 @@ class ContractsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
         today = datetime.now().date()
 
@@ -219,7 +229,7 @@ def clean_details(self, raw_details):
         return valid_details
 
 
-class PaymentsSpider(scrapy.Spider):
+class PaymentsSpider(BaseSpider):
     """Collects payments made.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa
@@ -238,10 +248,7 @@ class PaymentsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
         today = datetime.now().date()
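
follow_this_date filters bid links by their dt=MM-YYYY query parameter, but it compares month and year independently: with a start date of December 2019, a link for 01-2020 fails the month check (1 >= 12 is False) even though it is later. Comparing (year, month) tuples would order the dates correctly; a sketch of that variant, not part of this commit:

from datetime import date, datetime

from scraper.spiders.utils import extract_param  # assumed import path


def follow_this_date(url, start_date):
    """Tuple-based variant: True for any month on or after start_date."""
    month_year = datetime.strptime(extract_param(url, "dt"), "%m-%Y")
    return (month_year.year, month_year.month) >= (start_date.year, start_date.month)


# The commit's independent checks would skip this URL; the tuple version follows it.
url = "http://www.feiradesantana.ba.gov.br/seadm/licitacoes_pm.asp?dt=01-2020"  # illustrative
assert follow_this_date(url, date(2019, 12, 1)) is True
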
19 changes: 8 additions & 11 deletions scraper/scraper/spiders/gazette.py
@@ -1,13 +1,13 @@
-from datetime import datetime
+from datetime import date, datetime
 
 import scrapy
 from scraper.items import GazetteEventItem, LegacyGazetteItem
 from scrapy import Request
 
+from . import BaseSpider
 from .utils import replace_query_param
 
 
-class LegacyGazetteSpider(scrapy.Spider):
+class LegacyGazetteSpider(BaseSpider):
     """Collects the official gazette of Feira de Santana up to 2015.
 
     Years: 1999 to 2015
@@ -86,7 +86,7 @@ def extract_events(self, response):
         return events, events_urls
 
 
-class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
+class ExecutiveAndLegislativeGazetteSpider(BaseSpider):
     """Collects the Official Gazette of the executive and legislative branches."""
 
     name = "gazettes"
@@ -95,21 +95,18 @@ class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
     powers = {"executivo": 1, "legislativo": 2}
     last_page = 1
     handle_httpstatus_list = [302]
+    initial_date = date(2015, 1, 1)
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         gazette_table = response.css(".style166")
         gazettes_links = gazette_table.xpath("a//@href").extract()
         dates = gazette_table.css("a::text").extract()
 
         for url, gazette_date in zip(gazettes_links, dates):
             date_obj = datetime.strptime(gazette_date, "%d/%m/%Y")
-            if date_obj.date() == start_date:
+            if date_obj.date() >= self.start_date:
                 edition = self.extract_edition(url)
                 power = self.extract_power(url)
                 power_id = self.powers[power]
@@ -127,7 +124,7 @@ def parse(self, response):
                     meta={"gazette": gazette},
                 )
 
-        if hasattr(self, "start_date") is False:  # all gazettes
+        if self.collect_all:
             current_page_selector = "#pages ul li.current::text"
             current_page = response.css(current_page_selector).extract_first()
             next_page = int(current_page) + 1
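
Two behavior changes matter here: the date filter goes from an exact match (==, which only ever collected gazettes published exactly on start_date) to >=, which keeps everything from start_date forward, and pagination now continues only when collect_all is set. A tiny sketch of the new filter on sample "%d/%m/%Y" strings:

from datetime import date, datetime

start_date = date(2019, 12, 1)
raw_dates = ["29/11/2019", "01/12/2019", "30/12/2019"]  # sample values

kept = [d for d in raw_dates if datetime.strptime(d, "%d/%m/%Y").date() >= start_date]
assert kept == ["01/12/2019", "30/12/2019"]
# The old == filter would have kept only "01/12/2019".
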
11 changes: 11 additions & 0 deletions scraper/scraper/spiders/utils.py
@@ -1,4 +1,6 @@
 import re
+import urllib.parse as urlparse
+from urllib.parse import parse_qs
 
 
 def replace_query_param(url, field, value):
@@ -10,3 +12,12 @@ def identify_contract_id(text):
     result = re.findall(CONTRACT_NUMBER_PATTERN, text)
     if result:
         return result[0]
+
+
+def extract_param(url, param):
+    parsed = urlparse.urlparse(url)
+    try:
+        value = parse_qs(parsed.query)[param]
+        return value[0]
+    except KeyError:
+        return
19 changes: 18 additions & 1 deletion scraper/scraper/tests/test_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from ..spiders.utils import identify_contract_id, replace_query_param
+from ..spiders.utils import extract_param, identify_contract_id, replace_query_param
 
 
 @pytest.mark.parametrize(
@@ -55,3 +55,20 @@ def test_replace_query_parameter_from_a_url(old_url, field, value, new_url):
 )
 def test_identify_contract_ids(text, expected_contract_id):
     assert identify_contract_id(text) == expected_contract_id
+
+
+@pytest.mark.parametrize(
+    "url, param, value",
+    [
+        (
+            f"http://www.feiradesantana.ba.gov.br/seadm/servicos.asp?"
+            "id=2&s=a&link=seadm/licitacoes_pm.asp&cat=PMFS&dt=01-2019#links",
+            "dt",
+            "01-2019",
+        ),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "dt", "01-2019"),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "invalid", None),
+    ],
+)
+def test_extract_param(url, param, value):
+    assert extract_param(url, param) == value