From fc06bcff05225ea13ac9a5657693b04df45f4ba9 Mon Sep 17 00:00:00 2001 From: Ana Paula Gomes Date: Mon, 23 Dec 2019 09:52:19 -0300 Subject: [PATCH 1/4] Adiciona pycache e corrige texto --- .gitignore | 1 + README.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 193c12a1..03ac239b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__ .scrapy/ .vscode .env +.pytest_cache # data *.json diff --git a/README.md b/README.md index d32b3dbf..73d4a7c3 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ pip install -r dev_requirements.txt ``` E tenha o [Apache Tika](https://tika.apache.org/download.html) instalado. -Esse projeto vai extrair o texto dos PDFs. +Ele vai extrair o texto dos PDFs. No diretório `scraper` você poderá encontrar os _spiders_ responsáveis pela coleta dos dados. Para entender melhor como eles funcionam, dê uma olhada From ad20f1137852485ca58e4cb998289f5916df3d2d Mon Sep 17 00:00:00 2001 From: Ana Paula Gomes Date: Mon, 23 Dec 2019 09:52:51 -0300 Subject: [PATCH 2/4] Busca pagamentos um dia antes --- scraper/scraper/runner.py | 15 +++++++++++++++ scraper/scraper/spiders/cityhall.py | 13 ++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 scraper/scraper/runner.py diff --git a/scraper/scraper/runner.py b/scraper/scraper/runner.py new file mode 100644 index 00000000..bcbaa522 --- /dev/null +++ b/scraper/scraper/runner.py @@ -0,0 +1,15 @@ +from datetime import datetime, timedelta + +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings + + +process = CrawlerProcess(get_project_settings()) + +# TODO receber via CLI se todos ou não + +yesterday = datetime.now() - timedelta(days=1) +yesterday = yesterday.date() + +process.crawl("cityhall_payments", start_date=yesterday) +process.start() diff --git a/scraper/scraper/spiders/cityhall.py b/scraper/scraper/spiders/cityhall.py index a6895604..09df0a50 100644 --- 
a/scraper/scraper/spiders/cityhall.py +++ b/scraper/scraper/spiders/cityhall.py @@ -207,7 +207,7 @@ class PaymentsSpider(scrapy.Spider): http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa """ - name = "payments" + name = "cityhall_payments" url = "http://www.transparencia.feiradesantana.ba.gov.br/controller/despesa.php" data = { "POST_PARAMETRO": "PesquisaDespesas", @@ -219,17 +219,20 @@ class PaymentsSpider(scrapy.Spider): } def start_requests(self): - current_date = date(2010, 1, 1) # initial date + if self.start_date: + start_date = self.start_date + else: + start_date = date(2010, 1, 1) today = datetime.now().date() - while current_date <= today: - formatted_date = current_date.strftime("%d/%m/%Y") + while start_date < today: + formatted_date = start_date.strftime("%d/%m/%Y") data = self.data.copy() data["POST_DATA"] = f"{formatted_date} - {formatted_date}" yield scrapy.FormRequest( self.url, formdata=data, callback=self.parse, meta={"data": data} ) - current_date = current_date + timedelta(days=1) + start_date = start_date + timedelta(days=1) def parse(self, response): # ['��� Anterior', '1', '2', '33', 'Pr��ximo ���'] From 2fc6c961181c6ac30ebb4b5924f80a82d89b9682 Mon Sep 17 00:00:00 2001 From: Ana Paula Gomes Date: Mon, 23 Dec 2019 23:05:31 -0300 Subject: [PATCH 3/4] Adiciona opcao 'all' ao comando --- scraper/scraper/runner.py | 24 ++++++++++++++++++------ scraper/scraper/spiders/cityhall.py | 4 +++- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/scraper/scraper/runner.py b/scraper/scraper/runner.py index bcbaa522..4261322d 100644 --- a/scraper/scraper/runner.py +++ b/scraper/scraper/runner.py @@ -1,15 +1,27 @@ +import argparse from datetime import datetime, timedelta from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings -process = CrawlerProcess(get_project_settings()) +def run_crawlers(start_date): + process = CrawlerProcess(get_project_settings()) -# TODO receber via CLI se 
todos ou não + process.crawl("cityhall_payments", start_date=start_date) + process.start() -yesterday = datetime.now() - timedelta(days=1) -yesterday = yesterday.date() -process.crawl("cityhall_payments", start_date=yesterday) -process.start() +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--all", help="Coleta todos os itens desde a data inicial.", action="store_true" + ) + args = parser.parse_args() + if args.all: + start_date = None + else: + yesterday = datetime.now() - timedelta(days=1) + start_date = yesterday.date() + + run_crawlers(start_date) diff --git a/scraper/scraper/spiders/cityhall.py b/scraper/scraper/spiders/cityhall.py index 09df0a50..7efc8389 100644 --- a/scraper/scraper/spiders/cityhall.py +++ b/scraper/scraper/spiders/cityhall.py @@ -217,12 +217,14 @@ class PaymentsSpider(scrapy.Spider): "POST_NMCREDOR": "", "POST_CPFCNPJ": "", } + initial_date = date(2010, 1, 1) def start_requests(self): if self.start_date: start_date = self.start_date else: - start_date = date(2010, 1, 1) + start_date = self.initial_date + self.logger.info(f"Data inicial: {start_date}") today = datetime.now().date() while start_date < today: From d39b35baee4ca296ff3a2544507fb5c7d8abe993 Mon Sep 17 00:00:00 2001 From: Ana Paula Gomes Date: Mon, 23 Dec 2019 23:09:50 -0300 Subject: [PATCH 4/4] =?UTF-8?q?Adiciona=20instru=C3=A7=C3=B5es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 73d4a7c3..4058c591 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,18 @@ No diretório `scraper` você poderá encontrar os _spiders_ responsáveis pela coleta dos dados. Para entender melhor como eles funcionam, dê uma olhada na documentação do [scrapy](https://docs.scrapy.org/). 
+Para executar todos os _spiders_ desde o início, execute: + +``` +cd scraper && python runner.py --all +``` + +Para executar todos os _spiders_, coletando apenas os dados do dia anterior: + +``` +cd scraper && python runner.py +``` + Para executar um _spider_, execute: ```