In [1]:
import requests
import pandas as pd
import json
from core.config import Config, get_data
from tqdm import tqdm
import os

from core.parlamentar import Parlamentar, ParlamentarDetalhes, ParlamentarProfissao

# Parlamentares

## Parlamentar detalhes

In [5]:
folder_detalhes = './data/parlamentares_detalhes'
folder_profissoes = './data/parlamentares_profissoes'
folder_ocupacoes = './data/parlamentares_ocupacoes'

with open(f'./data/parlamentares/deputados.json', 'r') as f:
    parls = json.load(f)

parls_ids = [p['id'] for p in parls]
for parl_id in tqdm(parls_ids):
    #detalhes
    config_det = Config(
        endpoint='deputados',
        parameters={'id': parl_id},
        file_name=parl_id
    )
    data_det = get_data(config_det.url)
    with open(f'{folder_detalhes}/{config_det.file_name}.json', 'w') as f:
        json.dump(data_det, f)
    #profissoes
    config_prof = Config(
        endpoint=f"deputados/{parl_id}/profissoes",
        file_name=parl_id
    )
    data_prof = get_data(config_prof.url)
    with open(f'{folder_profissoes}/{config_prof.file_name}.json', 'w') as f:
        json.dump(data_prof, f)
    #ocupacoes
    config_ocup = Config(
        endpoint=f"deputados/{parl_id}/ocupacoes",
        file_name=parl_id
    )
    data_ocup = get_data(config_ocup.url)
    with open(f'{folder_ocupacoes}/{config_ocup.file_name}.json', 'w') as f:
        json.dump(data_ocup, f)

100%|██████████| 1814/1814 [19:51<00:00,  1.52it/s]


# Votacoes

Por conta do erro 502 quando realizando a paginação, o algoritmo deve coletar os dados de votações dia a dia, pois assim não ele não ocorre

In [2]:
import datetime
from core.utils import infinite_date_generator

In [4]:
date_init = datetime.date(2019, 1, 1)
date_end = datetime.date(2022, 12, 31)
total_days = (date_end - date_init).days
pbar = tqdm(range(0, total_days))

data_ok = [file.split('.')[0] for file in os.listdir('./data/votacoes')]
for date in infinite_date_generator(date_init):
    if date == date_end:
        break
    date_str = date.strftime('%Y-%m-%d')
    config = Config(endpoint='votacoes',
                    file_name=f'votacoes_{date_str}',
                    parameters={'dataInicio': f'{date_str}', 'dataFim': f'{date_str}'})
    if config.file_name in data_ok:
        continue
    data = get_data(config.url)
    with open(f'./data/votacoes/{config.file_name}.json', 'w') as f:
        json.dump(data, f)
    pbar.update(1)
pbar.close()

  0%|          | 0/1460 [00:00<?, ?it/s]


# Votos

In [5]:
folder = './data/votos'

data_ok = [file.split('.')[0] for file in os.listdir(folder)]

votacoes = os.listdir('./data/votacoes')
votacoes = [file for file in votacoes if '2019' in file or '2020' in file or '2021' in file or '2022' in file]

configs = []
for votacao in votacoes:
    with open('./data/votacoes/' + votacao, 'r') as f:
        data = json.load(f)
    configs += [Config(endpoint=f"votacoes/{vot['id']}/votos", file_name=vot['id'])
                for vot in data if vot['id'] not in data_ok]

for config in tqdm(configs):
    if config.file_name in data_ok:
        continue
    data = get_data(config.url)
    with open(f'{folder}/{config.file_name}.json', 'w') as f:
        json.dump(data, f)


 43%|████▎     | 7048/16516 [35:42<46:22,  3.40it/s]  

ERRO 502, sleeping for 5 minutes


 49%|████▉     | 8099/16516 [46:00<1:18:50,  1.78it/s]  

ERRO 502, sleeping for 5 minutes


100%|██████████| 16516/16516 [1:32:56<00:00,  2.96it/s]  


# Orientações

In [10]:
folder = './data/orientacoes'
data_ok = [file.split('.')[0] for file in os.listdir(folder)]

votacoes = os.listdir('./data/votacoes')
votacoes = [file for file in votacoes if '2019' in file or '2020' in file or '2021' in file or '2022' in file]

configs = []
for votacao in votacoes:
    with open('./data/votacoes/' + votacao, 'r') as f:
        data = json.load(f)
    configs += [Config(endpoint=f"votacoes/{vot['id']}/orientacoes",
                       file_name=vot['id']) for vot in data if vot['id'] not in data_ok]

for config in tqdm(configs):
    if config.file_name in data_ok:
        continue
    data = get_data(config.url)
    with open(f'{folder}/{config.file_name}.json', 'w') as f:
        json.dump(data, f)


100%|██████████| 3111/3111 [36:35<00:00,  1.42it/s]  
