In [143]:
!pip install pycep-correios
!pip install nest_asyncio
!pip install "geopy[aiohttp]"
!pip install asgiref
!pip install ratelimit
!pip install tqdm
!pip install requests_async

Collecting requests_async
  Downloading requests-async-0.6.2.tar.gz (12 kB)
Collecting http3==0.6.*
  Downloading http3-0.6.7.tar.gz (35 kB)
Collecting h11==0.8.*
  Downloading h11-0.8.1-py2.py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.4 MB/s  eta 0:00:01
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 1.8 MB/s eta 0:00:011
[?25hCollecting rfc3986==1.*
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2.py3-none-any.whl (12 kB)
Collecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-none-any.whl (38 kB)
Building wheels for collected packages: requests-async, http3
  Building wheel for requests-async (setup.py) ... [?25ldone
[?25h  Created wheel for requests-async: filename=requests_async-0.6.2-py3-none-any.whl size=13933 sha256=541c35f919d34c0c2b20812219ecd20f9327334860666da8ec009a48c9cd33ac
  Stored i

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext

hive_context = HiveContext(sc)

spark = SparkSession \
    .builder \
    .appName("Teste API") \
    .enableHiveSupport() \
    .getOrCreate()

geo = spark.read.orc('/datalake/dadosbrutos/olist_geolocation_dataset.orc')

In [26]:
cep_array = [str(row.geolocation_zip_code_prefix) for row in geo.select('geolocation_zip_code_prefix').distinct().collect()]

# SYNC

In [5]:
from pycep_correios import get_address_from_cep, WebService, exceptions
import zeep
import pandas as pd
import time
# All rows generated with the adress will saved into empty result variable list 
start = time.time()
def get_adresses(limit=None):
    counter = 1
    result = []
    sufixos = ['000']# '500', '970', '900', '990', '960']
    found = 0
    for i in cep_array:
        for sufixo in sufixos:
            try:
                cep = i + sufixo
                print(f"{counter}º coleta, Cep {cep}", end="\r")
                endereco = get_address_from_cep(cep, webservice=WebService.CORREIOS)
                if(endereco["cep"] != ""):
                    result.append(endereco) 
                    found += 1
                    break
            except exceptions.BaseException as e:  
                pass
        counter += 1
        if counter > limit:break
            
    print(f"\n{limit - found} CEPS não encontrados")
    # Using filter() method to filter None values
    return result

if __name__ == "__main__":
    test = get_adresses(limit=30)

# Creating a dataframe from function return result
df = pd.DataFrame(test)
df = df[['cep','cidade','uf']]
end = time.time()
print(f"{end - start} segundos")
df.head(100)


30º coleta, Cep 40740000
9 CEPS não encontrados
11.111901044845581 segundos


Unnamed: 0,cep,cidade,uf
0,13282000,Vinhedo,SP
1,14887000,Jaboticabal,SP
2,16250000,Clementina,SP
3,17506000,Marília,SP
4,18306000,Capão Bonito,SP
5,76976000,Primavera de Rondônia,RO
6,77303000,Rio da Conceição,TO
7,77930000,Axixá do Tocantins,TO
8,78888000,Nova Ubiratã,MT
9,79785000,Angélica,MS


# ASYNC

In [4]:
from pycep_correios import get_address_from_cep, WebService, exceptions
import zeep
import pandas as pd
# All rows generated with the adress will saved into empty result variable list 

import asyncio
import aiohttp
import os
import time
import numpy as np
import nest_asyncio
from asgiref.sync import sync_to_async

start = time.time()
global results
global counter
results = []
counter = 0
def run_process(cep):
    try:
        get_address_from_cep(cep, webservice=WebService.CORREIOS)
    except exceptions.BaseException as e: 
        pass

async def get_adress(cep):
    response = await sync_to_async(get_address_from_cep)(cep, webservice=WebService.CORREIOS)
    if response['cep'] == "":return
    results.append(response)
    counter += 1
    print(f'{counter}')
    
        
async def get_all_adresses():
    async with aiohttp.ClientSession() as session:
        global tasks
        tasks = [] 
        sufixos = ['000']#, '500', '970', '900', '990', '960']
        tasks = [asyncio.ensure_future(get_adress(i + sufixo))
                 for i in cep_array[0:1000]
                 for sufixo in sufixos]
        await asyncio.gather(*tasks, return_exceptions=True)
                
nest_asyncio.apply()
loop = asyncio.get_event_loop().run_until_complete(get_all_adresses())

# Creating a dataframe from function return result
df = pd.DataFrame(results)
df = df[['cep','cidade','uf']]
df = df.replace('', np.nan)
df = df.dropna()

end = time.time()
#print(results)
#print(tasks)
print(f"{end - start} segundos")
df.head(100)

425.3440291881561 segundos


Unnamed: 0,cep,cidade,uf
0,13282000,Vinhedo,SP
1,14887000,Jaboticabal,SP
2,16250000,Clementina,SP
3,17506000,Marília,SP
4,18306000,Capão Bonito,SP
...,...,...,...
95,31330000,Belo Horizonte,MG
96,33840000,Ribeirão das Neves,MG
97,34565000,Sabará,MG
98,35438000,Acaiaca,MG


In [220]:
import asyncio
import time
import aiohttp
import nest_asyncio
from asgiref.sync import sync_to_async
global results
results = []
global URL
URL = 'https://buscacepinter.correios.com.br/app/endereco/carrega-cep-endereco.php'


async def get_address(session, cep):
    async with session.post(url=URL, data={'endereco': cep, 'tipoCEP': 'ALL'}) as response:
        response = await response.text()
        try:
            if json.loads(response)["dados"][0]["cep"] != '': 
                results.append(json.loads(response)["dados"][0])
                print(len(results), end="\r")
        except:
            pass


async def get_all_addresses(ceps):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for cep in ceps:
            task = asyncio.ensure_future(get_address(session, cep))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exceptions=False)
        


if __name__ == "__main__":
    nest_asyncio.apply()
    start_time = time.time()
    ceps_array = cep_array
    asyncio.get_event_loop().run_until_complete(get_all_addresses(ceps_array))
    
    df = pd.DataFrame(results)
    df = df[['cep','localidade','uf']]
    
    duration = time.time() - start_time
    print(f"Downloaded {len(ceps_array)} ceps in {duration} seconds")

df.head(100)

Downloaded 19015 ceps in 144.70813179016113 seconds


Unnamed: 0,cep,localidade,uf
0,76976970,Primavera de Rondônia,RO
1,77303970,Rio da Conceição,TO
2,79785971,Angélica,MS
3,16250970,Clementina,SP
4,77930970,Axixá do Tocantins,TO
...,...,...,...
95,32654804,Betim,MG
96,72314701,Brasília,DF
97,99690970,Liberato Salzano,RS
98,56505000,Arcoverde,PE


In [221]:
df.count()

cep           18616
localidade    18616
uf            18616
dtype: int64

In [222]:
len(cep_array)

19015

In [223]:
df[df['cep'].str.slice(0, 5).duplicated()]

Unnamed: 0,cep,localidade,uf
74,29903192,Linhares,ES
146,76824406,Porto Velho,RO
272,22630013,Rio de Janeiro,RJ
289,60125001,Fortaleza,CE
354,76824166,Porto Velho,RO
...,...,...,...
18604,69902410,Rio Branco,AC
18605,76808458,Porto Velho,RO
18609,60540510,Fortaleza,CE
18611,29145910,Cariacica,ES


# MULTIPROCESSING

In [99]:
from multiprocessing import Pool, Manager, cpu_count
from pycep_correios import get_address_from_cep, WebService, exceptions
from ratelimit import limits, sleep_and_retry
from functools import partial
from tqdm import tqdm
import requests
import json
import numpy as np
import time
import pandas as pd

ceps_array = cep_array[0:1000]
global URL
URL = 'https://buscacepinter.correios.com.br/app/endereco/carrega-cep-endereco.php'
start = time.time()
@sleep_and_retry
@limits(calls=100, period=60)
def call_api(cep):
    myobj = {'endereco': cep, "tipoCEP": "ALL"}
    response = requests.post(URL, data = myobj)
    return response

def get_adress_multiprocess(listManager=None, ceps_list=None, process=0):
    cep = ceps_list[process]
    info = None
    resolved = False
    
    try:
        while not resolved:  
            res = None
            adress_info = None
            tooManyCalls = False
            try:
                res = call_api(cep)
                if json.loads(res.text)["dados"][0]["cep"] == '':
                    resolved = True
                    break
           
            except Exception as e:
                if e == 'too many calls':
                    tooManyCalls =True
                    
            if tooManyCalls:
                time.sleep(60)

            elif res.status_code < 300:
                adress_info = json.loads(res.text)["dados"][0]
                resolved = True 
                
            elif res.status_code == 429:
                print(res.status_code)
                time.sleep(60)

            else:
                print(res.status_code)
                sleep_val = random.randint(1,10)
                time.sleep(sleep_val)
        
    except Exception as e:
        pass
    finally:
        if adress_info != None:
            listManager.append(adress_info)
            time.sleep(0.5)
            return

def main_adresses_run_multiprocessing():
    ## cannot be 0, so max(NUMBER,1) solves this
    workers = max(cpu_count()-1,1)

    ## create the pool
    manager = Manager()
    
    ## Need a manager to help get the values async, the values will be updated after join
    listManager = manager.list()
    pool = Pool(workers)
    try:
        part_get_clean_adress = partial(get_adress_multiprocess, listManager, ceps_array)
#         could do this the below is visualize the rate success /etc
#         pool.imap(part_get_clean_pokemon, list(range(0, len(links_pokemon))))
#         using tqdm to see progress imap works
        for _ in tqdm(pool.imap(part_get_clean_adress, list(range(0, len(ceps_array)))), total=len(ceps_array)):
            pass
        pool.close()
        pool.join()
    finally:
        pool.close()
        pool.join()
        
    adressList = list(listManager)
    
    df_adress = pd.DataFrame(adressList)
    return df_adress
end = time.time()

df_adress = main_adresses_run_multiprocessing()
df_adress = df_adress[['cep','localidade','uf']]
df_adress["original_cep"] = df_adress['cep'].str.slice(0, 5)
df_adress = df_adress.drop_duplicates(subset="original_cep")
print(f"{end - start} segundos")
df_adress.head(100)

  7%|▋         | 71/1000 [00:08<01:31, 10.18it/s]

list index out of range


  9%|▉         | 93/1000 [00:11<02:15,  6.70it/s]

list index out of range


 14%|█▍        | 141/1000 [00:16<01:34,  9.07it/s]

list index out of range


 18%|█▊        | 175/1000 [00:20<01:46,  7.75it/s]

list index out of range
list index out of range


 19%|█▊        | 187/1000 [00:21<01:23,  9.73it/s]

list index out of range


 20%|█▉        | 199/1000 [00:23<01:23,  9.58it/s]

list index out of range


 25%|██▍       | 249/1000 [00:29<01:39,  7.57it/s]

list index out of range


 34%|███▍      | 339/1000 [00:40<01:13,  8.94it/s]

list index out of range


 35%|███▍      | 347/1000 [00:40<01:01, 10.66it/s]

list index out of range


 35%|███▍      | 349/1000 [00:41<01:24,  7.67it/s]

list index out of range


 42%|████▎     | 425/1000 [00:49<01:03,  9.13it/s]

list index out of range


 43%|████▎     | 431/1000 [00:50<01:09,  8.19it/s]

list index out of range


 52%|█████▎    | 525/1000 [01:00<00:47,  9.93it/s]

list index out of range


 63%|██████▎   | 630/1000 [01:13<00:50,  7.40it/s]

list index out of range


 66%|██████▌   | 657/1000 [01:16<00:38,  8.90it/s]

list index out of range


 69%|██████▉   | 693/1000 [01:20<00:41,  7.46it/s]

list index out of range


 72%|███████▏  | 720/1000 [01:23<00:32,  8.67it/s]

list index out of range


 74%|███████▍  | 744/1000 [01:26<00:32,  7.87it/s]

list index out of range


 76%|███████▌  | 760/1000 [01:28<00:33,  7.17it/s]

list index out of range


 76%|███████▋  | 765/1000 [01:28<00:22, 10.61it/s]

list index out of range


 85%|████████▌ | 851/1000 [01:39<00:14, 10.57it/s]

list index out of range


 89%|████████▉ | 893/1000 [01:44<00:13,  7.98it/s]

list index out of range


100%|██████████| 1000/1000 [01:57<00:00,  8.48it/s]


0.0004487037658691406 segundos


Unnamed: 0,cep,localidade,uf,original_cep
0,16250970,Clementina,SP,16250
1,17506000,Marília,SP,17506
2,18306025,Capão Bonito,SP,18306
3,14887250,Jaboticabal,SP,14887
4,13282596,Vinhedo,SP,13282
...,...,...,...,...
96,95211435,Vacaria,RS,95211
97,54430030,Jaboatão dos Guararapes,PE,54430
98,93410175,Novo Hamburgo,RS,93410
100,92027033,Canoas,RS,92027


In [93]:
import zeep
import requests
import json
URL = 'https://buscacepinter.correios.com.br/app/endereco/carrega-cep-endereco.php'


myobj = {'endereco': '18130', 'tipoCEP': 'ALL'}

response = json.loads(requests.post(URL, data = myobj).text)["dados"][0]

print(response)

{'uf': 'SP', 'localidade': 'São Roque', 'locNoSem': '', 'locNu': '', 'localidadeSubordinada': '', 'logradouroDNEC': '5º Alto Dona Amasília Ribeiro Lopes', 'logradouroTextoAdicional': '', 'logradouroTexto': '', 'bairro': 'Vila Aguiar', 'baiNu': '', 'nomeUnidade': '', 'cep': '18130649', 'tipoCep': '2', 'numeroLocalidade': '', 'situacao': '', 'faixasCaixaPostal': [], 'faixasCep': []}
