
Collect payments per day #27

Merged: 4 commits, Dec 24, 2019
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ __pycache__
.scrapy/
.vscode
.env
+.pytest_cache

# data
*.json
14 changes: 13 additions & 1 deletion README.md
@@ -24,12 +24,24 @@ pip install -r dev_requirements.txt
```

And have [Apache Tika](https://tika.apache.org/download.html) installed.
-This project will extract the text from the PDFs.
+It will extract the text from the PDFs.

In the `scraper` directory you will find the _spiders_ responsible for
collecting the data. To better understand how they work, take a look at
the [scrapy](https://docs.scrapy.org/) documentation.

+To run all the _spiders_, collecting everything since the initial date, run:
+
+```
+cd scraper && python runner.py --all
+```
+
+To run all the _spiders_, collecting only the previous day:
+
+```
+cd scraper && python runner.py
+```

To run a single _spider_, run:

```
27 changes: 27 additions & 0 deletions scraper/scraper/runner.py
@@ -0,0 +1,27 @@
+import argparse
+from datetime import datetime, timedelta
+
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+
+def run_crawlers(start_date):
+    process = CrawlerProcess(get_project_settings())
+
+    process.crawl("cityhall_payments", start_date=start_date)
+    process.start()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--all", help="Collect all items since the initial date.", action="store_true"
+    )
+    args = parser.parse_args()
+    if args.all:
+        start_date = None
+    else:
+        yesterday = datetime.now() - timedelta(days=1)
+        start_date = yesterday.date()
+
+    run_crawlers(start_date)
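
For context (this code is not part of the PR): Scrapy's base `Spider.__init__` copies unknown keyword arguments onto the spider instance, which is how the `start_date` passed to `process.crawl()` above becomes `self.start_date` inside the spider. A minimal runnable sketch with a hypothetical spider:

```
# Minimal sketch (hypothetical DemoSpider, not part of this PR) showing how
# kwargs given to CrawlerProcess.crawl() end up as spider attributes.
from datetime import date

import scrapy
from scrapy.crawler import CrawlerProcess


class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # self.start_date was injected by process.crawl(..., start_date=...)
        self.logger.info(f"start_date: {self.start_date}")


process = CrawlerProcess()
process.crawl(DemoSpider, start_date=date(2019, 12, 23))
process.start()
```

Note that a spider started without this kwarg (e.g. a bare `scrapy crawl cityhall_payments`) would have no `start_date` attribute at all; the runner sidesteps this by always passing one, even if it is `None`.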
15 changes: 10 additions & 5 deletions scraper/scraper/spiders/cityhall.py
@@ -207,7 +207,7 @@ class PaymentsSpider(scrapy.Spider):
    http://www.transparencia.feiradesantana.ba.gov.br/index.php?view=despesa
    """

name = "payments"
name = "cityhall_payments"
url = "http://www.transparencia.feiradesantana.ba.gov.br/controller/despesa.php"
data = {
"POST_PARAMETRO": "PesquisaDespesas",
@@ -217,19 +217,24 @@
"POST_NMCREDOR": "",
"POST_CPFCNPJ": "",
}
initial_date = date(2010, 1, 1)

def start_requests(self):
current_date = date(2010, 1, 1) # initial date
if self.start_date:
start_date = self.start_date
else:
start_date = self.initial_date
self.logger.info(f"Data inicial: {start_date}")
today = datetime.now().date()

while current_date <= today:
formatted_date = current_date.strftime("%d/%m/%Y")
while start_date < today:
formatted_date = start_date.strftime("%d/%m/%Y")
data = self.data.copy()
data["POST_DATA"] = f"{formatted_date} - {formatted_date}"
yield scrapy.FormRequest(
self.url, formdata=data, callback=self.parse, meta={"data": data}
)
current_date = current_date + timedelta(days=1)
start_date = start_date + timedelta(days=1)

    def parse(self, response):
        # ['« Anterior', '1', '2', '33', 'Próximo »']
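
To make the new request window concrete, here is a standalone sketch (not part of the diff; the dates are hypothetical) of what the rewritten loop produces: one single-day `POST_DATA` range per day, with the `<` comparison excluding today:

```
from datetime import date, timedelta

start_date = date(2019, 12, 22)  # hypothetical start
today = date(2019, 12, 24)  # hypothetical "today"

while start_date < today:
    formatted_date = start_date.strftime("%d/%m/%Y")
    print(f"POST_DATA: {formatted_date} - {formatted_date}")
    start_date += timedelta(days=1)

# POST_DATA: 22/12/2019 - 22/12/2019
# POST_DATA: 23/12/2019 - 23/12/2019
```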