diff --git a/scraper/scraper/runner.py b/scraper/scraper/runner.py
new file mode 100644
index 00000000..bcbaa522
--- /dev/null
+++ b/scraper/scraper/runner.py
@@ -0,0 +1,15 @@
+from datetime import datetime, timedelta
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+
+process = CrawlerProcess(get_project_settings())
+
+# TODO receive via CLI whether to crawl all dates or not
+
+yesterday = datetime.now() - timedelta(days=1)
+yesterday = yesterday.date()
+
+process.crawl("cityhall_payments", start_date=yesterday)
+process.start()
diff --git a/scraper/scraper/spiders/cityhall.py b/scraper/scraper/spiders/cityhall.py
index a6895604..09df0a50 100644
--- a/scraper/scraper/spiders/cityhall.py
+++ b/scraper/scraper/spiders/cityhall.py
@@ -207,7 +207,7 @@ class PaymentsSpider(scrapy.Spider):
     http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa
     """
 
-    name = "payments"
+    name = "cityhall_payments"
     url = "http://www.transparencia.feiradesantana.ba.gov.br/controller/despesa.php"
     data = {
         "POST_PARAMETRO": "PesquisaDespesas",
@@ -219,17 +219,20 @@ class PaymentsSpider(scrapy.Spider):
     }
 
     def start_requests(self):
-        current_date = date(2010, 1, 1)  # initial date
+        if self.start_date:
+            start_date = self.start_date
+        else:
+            start_date = date(2010, 1, 1)
         today = datetime.now().date()
 
-        while current_date <= today:
-            formatted_date = current_date.strftime("%d/%m/%Y")
+        while start_date < today:
+            formatted_date = start_date.strftime("%d/%m/%Y")
             data = self.data.copy()
             data["POST_DATA"] = f"{formatted_date} - {formatted_date}"
             yield scrapy.FormRequest(
                 self.url, formdata=data, callback=self.parse, meta={"data": data}
             )
-            current_date = current_date + timedelta(days=1)
+            start_date = start_date + timedelta(days=1)
 
     def parse(self, response):
         # ['« Anterior', '1', '2', '33', 'Próximo »']
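The TODO in runner.py leaves open how the choice between crawling the full history and only yesterday would be made. A minimal sketch of that idea, assuming a hypothetical --all flag parsed with argparse (the flag name and wiring are not part of the change above):

    # Sketch only: the --all flag and this argparse wiring are assumptions,
    # not part of the diff above.
    import argparse
    from datetime import datetime, timedelta

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    parser = argparse.ArgumentParser(description="Run the cityhall_payments spider.")
    parser.add_argument(
        "--all",
        action="store_true",
        help="Crawl the full history instead of only yesterday.",
    )
    args = parser.parse_args()

    process = CrawlerProcess(get_project_settings())

    if args.all:
        # Passing start_date=None lets the spider fall back to its
        # initial date, date(2010, 1, 1).
        process.crawl("cityhall_payments", start_date=None)
    else:
        yesterday = (datetime.now() - timedelta(days=1)).date()
        process.crawl("cityhall_payments", start_date=yesterday)

    process.start()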