This repository has been archived by the owner on Oct 2, 2023. It is now read-only.

Adds collection start date for bids #32

Merged (5 commits) on Dec 30, 2019
22 changes: 11 additions & 11 deletions scraper/scraper/runner.py
@@ -5,17 +5,17 @@
 from scrapy.utils.project import get_project_settings
 
 
-def run_crawlers(start_date):
+def run_crawlers(start_from_date):
     process = CrawlerProcess(get_project_settings())
 
     # FIXME enable this when all spiders are ready
     # for spider in process.spider_loader.list():
     # process.crawl(spider, start_date=start_date)
+    process.crawl("cityhall_payments", start_from_date=start_from_date)
+    process.crawl("cityhall_contracts", start_from_date=start_from_date)
+    process.crawl("cityhall_bids", start_from_date=start_from_date)
+    process.crawl("citycouncil_agenda", start_from_date=start_from_date)
+    process.crawl("gazettes", start_from_date=start_from_date)
 
-    process.crawl("cityhall_payments", start_date=start_date)
-    process.crawl("cityhall_contracts", start_date=start_date)
-    process.crawl("citycouncil_agenda", start_date=start_date)
-    process.crawl("gazettes", start_date=start_date)
+    if start_from_date is None:  # --all should include legacy pages
+        process.crawl("legacy_gazettes")
     process.start()
 
 
@@ -26,9 +26,9 @@ def run_crawlers(start_date):
     )
     args = parser.parse_args()
     if args.all:
-        start_date = None
+        start_from_date = None
     else:
         yesterday = datetime.now() - timedelta(days=1)
-        start_date = yesterday.date()
+        start_from_date = yesterday.date()
 
-    run_crawlers(start_date)
+    run_crawlers(start_from_date)
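
With this change, a default run still collects from yesterday onwards, while --all passes None through run_crawlers and additionally schedules the legacy gazettes spider. A minimal sketch of the two call paths (dates illustrative):

from datetime import datetime, timedelta

# Default run: every spider collects from yesterday onwards.
yesterday = datetime.now() - timedelta(days=1)
run_crawlers(yesterday.date())

# --all run: no start date, so each spider falls back to its own
# initial_date, and the legacy_gazettes spider is also scheduled.
run_crawlers(None)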
22 changes: 18 additions & 4 deletions scraper/scraper/spiders/__init__.py
@@ -1,4 +1,18 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
+import scrapy
+
+
+class BaseSpider(scrapy.Spider):
+    start_from_date = None
+
+    @property
+    def start_date(self):
+        if self.start_from_date:
+            picked_date = self.start_from_date
+        else:
+            picked_date = self.initial_date
+
+        return picked_date
+
+    @property
+    def collect_all(self):
+        return bool(self.start_from_date) is False
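
A quick illustration of how the new base class resolves the date a spider starts from; FakeSpider and the dates below are hypothetical, not part of the project:

from datetime import date

class FakeSpider(BaseSpider):
    name = "fake"
    initial_date = date(2010, 1, 1)

spider = FakeSpider()
# No start_from_date: fall back to the spider's own initial_date.
assert spider.start_date == date(2010, 1, 1)
assert spider.collect_all is True

# start_from_date set (as the runner does via process.crawl): it wins.
spider.start_from_date = date(2019, 12, 1)
assert spider.start_date == date(2019, 12, 1)
assert spider.collect_all is False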
14 changes: 6 additions & 8 deletions scraper/scraper/spiders/citycouncil.py
@@ -3,8 +3,10 @@
 import scrapy
 from scraper.items import CityCouncilAgendaItem
 
+from . import BaseSpider
 
-class AgendaSpider(scrapy.Spider):
+
+class AgendaSpider(BaseSpider):
     name = "citycouncil_agenda"
     start_urls = ["https://www.feiradesantana.ba.leg.br/agenda"]
     initial_date = date(2010, 1, 1)
@@ -22,11 +24,7 @@ def get_type(event_title):
         return "not_found"
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         extracted_years = response.css("select#ano option ::text").extract()
         years = []
@@ -37,9 +35,9 @@ def parse(self, response):
                 pass
 
         for year in range(min(years), max(years) + 1):
-            if start_date.year <= year:
+            if self.start_date.year <= year:
                 for month in range(1, 13):
-                    if start_date.month <= month:
+                    if self.start_date.month <= month:
                         url = (
                             "https://www.feiradesantana.ba.leg.br/agenda"
                             f"?mes={month}&ano={year}&Acessar=OK"
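
One caveat in the loop above: the month check runs for every year, so a start date of June 2010, for example, also skips January through May of 2011 and later years. A sketch of a filter over parse's loop that only trims months before the start date itself, assuming that is the intended semantics:

for year in range(min(years), max(years) + 1):
    for month in range(1, 13):
        # Tuples compare the year first, so earlier months are only
        # skipped within the starting year itself.
        if (year, month) >= (self.start_date.year, self.start_date.month):
            url = (
                "https://www.feiradesantana.ba.leg.br/agenda"
                f"?mes={month}&ano={year}&Acessar=OK"
            )
            # ...follow url as in the original parse method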
35 changes: 21 additions & 14 deletions scraper/scraper/spiders/cityhall.py
@@ -4,16 +4,27 @@
 import scrapy
 from scraper.items import CityHallBidItem, CityHallContractItem, CityHallPaymentsItem
 
-from .utils import identify_contract_id
+from . import BaseSpider
+from .utils import extract_param, identify_contract_id
 
 
-class BidsSpider(scrapy.Spider):
-    name = "bids"
+class BidsSpider(BaseSpider):
+    name = "cityhall_bids"
     start_urls = ["http://www.feiradesantana.ba.gov.br/seadm/licitacoes.asp"]
     initial_date = date(2001, 1, 1)
 
+    def follow_this_date(self, url):
+        month_year = extract_param(url, "dt")
+        month_year = datetime.strptime(month_year, "%m-%Y")
+
+        match_month = month_year.month >= self.start_date.month
+        match_year = month_year.year >= self.start_date.year
+        return match_month and match_year
+
     def parse(self, response):
         urls = response.xpath("//table/tbody/tr/td[1]/div/a//@href").extract()
         base_url = "http://www.feiradesantana.ba.gov.br"
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         for url in urls:
             if base_url not in url:
@@ -22,7 +33,9 @@ def parse(self, response):
                 url = response.urljoin(f"{base_url}/{url}")
             else:
                 url = response.urljoin(f"{base_url}/seadm/{url}")
-            yield response.follow(url, self.parse_page)
+
+            if self.collect_all or self.follow_this_date(url):
+                yield response.follow(url, self.parse_page)
 
     def parse_page(self, response):
         raw_modalities = response.xpath("//tr/td[1]/table/tr/td/text()").extract()
@@ -108,7 +121,7 @@ def _parse_date(self, raw_date):
         return [date[1:] for date in raw_date]
 
 
-class ContractsSpider(scrapy.Spider):
+class ContractsSpider(BaseSpider):
     """Collects contracts from the contracts page.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=contratos
@@ -126,10 +139,7 @@ class ContractsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
         today = datetime.now().date()
 
@@ -219,7 +229,7 @@ def clean_details(self, raw_details):
         return valid_details
 
 
-class PaymentsSpider(scrapy.Spider):
+class PaymentsSpider(BaseSpider):
     """Collects payments made.
 
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa
@@ -238,10 +248,7 @@ class PaymentsSpider(scrapy.Spider):
     initial_date = date(2010, 1, 1)
 
     def start_requests(self):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
+        start_date = self.start_date
         self.logger.info(f"Data inicial: {start_date}")
         today = datetime.now().date()
 
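
A similar caveat applies to follow_this_date: checking month and year independently rejects, say, a 01-2020 page when start_date is June 2019, because 1 >= 6 fails even though 2020 is a later year. A sketch of a tuple-based comparison instead, assuming "on or after the start date's month" is the intent:

def follow_this_date(self, url):
    month_year = datetime.strptime(extract_param(url, "dt"), "%m-%Y")
    # Lexicographic tuple comparison: the year decides first and the
    # month only breaks ties within the same year.
    return (month_year.year, month_year.month) >= (
        self.start_date.year,
        self.start_date.month,
    )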
19 changes: 8 additions & 11 deletions scraper/scraper/spiders/gazette.py
@@ -1,13 +1,13 @@
-from datetime import datetime
+from datetime import date, datetime
 
 import scrapy
 from scraper.items import GazetteEventItem, LegacyGazetteItem
 from scrapy import Request
 
+from . import BaseSpider
 from .utils import replace_query_param
 
 
-class LegacyGazetteSpider(scrapy.Spider):
+class LegacyGazetteSpider(BaseSpider):
     """Collects the official gazette of Feira de Santana up to 2015.
 
     Years: 1999 to 2015
@@ -86,7 +86,7 @@ def extract_events(self, response):
         return events, events_urls
 
 
-class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
+class ExecutiveAndLegislativeGazetteSpider(BaseSpider):
     """Collects the Official Gazette of the executive and legislative branches."""
 
     name = "gazettes"
@@ -95,21 +95,18 @@ class ExecutiveAndLegislativeGazetteSpider(scrapy.Spider):
     powers = {"executivo": 1, "legislativo": 2}
     last_page = 1
     handle_httpstatus_list = [302]
+    initial_date = date(2015, 1, 1)
 
     def parse(self, response):
-        if hasattr(self, "start_date") and self.start_date:
-            start_date = self.start_date
-        else:
-            start_date = self.initial_date
-        self.logger.info(f"Data inicial: {start_date}")
+        self.logger.info(f"Data inicial: {self.start_date}")
 
         gazette_table = response.css(".style166")
         gazettes_links = gazette_table.xpath("a//@href").extract()
         dates = gazette_table.css("a::text").extract()
 
         for url, gazette_date in zip(gazettes_links, dates):
             date_obj = datetime.strptime(gazette_date, "%d/%m/%Y")
-            if date_obj.date() == start_date:
+            if date_obj.date() >= self.start_date:
                 edition = self.extract_edition(url)
                 power = self.extract_power(url)
                 power_id = self.powers[power]
@@ -127,7 +124,7 @@ def parse(self, response):
                     meta={"gazette": gazette},
                 )
 
-        if hasattr(self, "start_date") is False:  # all gazettes
+        if self.collect_all:
             current_page_selector = "#pages ul li.current::text"
             current_page = response.css(current_page_selector).extract_first()
             next_page = int(current_page) + 1
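
The gazette date check also changed from == to >=, so a dated run now collects every gazette published on or after the start date instead of only exact matches. The comparison in isolation (values illustrative):

from datetime import date, datetime

start_date = date(2019, 12, 1)
gazette_date = "30/12/2019"  # dates are scraped as dd/mm/yyyy strings

# Published on or after the start date, so this gazette is collected.
assert datetime.strptime(gazette_date, "%d/%m/%Y").date() >= start_date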
11 changes: 11 additions & 0 deletions scraper/scraper/spiders/utils.py
@@ -1,4 +1,6 @@
 import re
+import urllib.parse as urlparse
+from urllib.parse import parse_qs
 
 
 def replace_query_param(url, field, value):
@@ -10,3 +12,12 @@ def identify_contract_id(text):
     result = re.findall(CONTRACT_NUMBER_PATTERN, text)
     if result:
         return result[0]
+
+
+def extract_param(url, param):
+    parsed = urlparse.urlparse(url)
+    try:
+        value = parse_qs(parsed.query)[param]
+        return value[0]
+    except KeyError:
+        return
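
As the tests below confirm, extract_param returns None when the parameter is absent (the KeyError path):

from scraper.spiders.utils import extract_param  # import path per the repo layout

url = "http://www.ba.gov.br/servicos.asp?dt=01-2019#links"
assert extract_param(url, "dt") == "01-2019"
assert extract_param(url, "missing") is None  # KeyError path returns None

Note that follow_this_date feeds this result straight into datetime.strptime, which raises TypeError on None, so a caller that cannot guarantee a dt parameter in the URL may want a None check first.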
19 changes: 18 additions & 1 deletion scraper/scraper/tests/test_utils.py
@@ -1,6 +1,6 @@
 import pytest
 
-from ..spiders.utils import identify_contract_id, replace_query_param
+from ..spiders.utils import extract_param, identify_contract_id, replace_query_param
 
 
 @pytest.mark.parametrize(
@@ -55,3 +55,20 @@ def test_replace_query_parameter_from_a_url(old_url, field, value, new_url):
 )
 def test_identify_contract_ids(text, expected_contract_id):
     assert identify_contract_id(text) == expected_contract_id
+
+
+@pytest.mark.parametrize(
+    "url, param, value",
+    [
+        (
+            f"http://www.feiradesantana.ba.gov.br/seadm/servicos.asp?"
+            "id=2&s=a&link=seadm/licitacoes_pm.asp&cat=PMFS&dt=01-2019#links",
+            "dt",
+            "01-2019",
+        ),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "dt", "01-2019"),
+        ("http://www.ba.gov.br/servicos.asp?dt=01-2019#links", "invalid", None),
+    ],
+)
+def test_extract_param(url, param, value):
+    assert extract_param(url, param) == value