## Lendo arquivodo HDFS

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext

hive_context = HiveContext(sc)

spark = SparkSession \
    .builder \
    .appName("Teste API") \
    .enableHiveSupport() \
    .getOrCreate()

geo = spark.read.orc('/datalake/dadosbrutos/geolocation_correios.orc')

## Consultando coordenadas por cidade e estados distintos 

In [2]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 6.1 MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


In [5]:
from geopy.geocoders import Nominatim
import time

start_time = time.time()
# Cidade e estado distintos
cidades_ufs = geo.select('cep_5_digitos','cidade','uf').distinct().collect()
qtde_cidades_ufs = geo.select('cep_5_digitos','cidade','uf').distinct().count()

coords = []
counter = 0

# Consulta com o Geopy qual as coordenadas para cada conjunto cidade, estado.
for linha in cidades_ufs:
    print(f"{counter}º Consulta de {qtde_cidades_ufs}", end="\r")
    try:
        geolocator = Nominatim(user_agent="test_app", timeout=15)
        location = geolocator.geocode(f'{linha["cidade"]}, {linha["uf"]}')
        coords.append([linha['cep_5_digitos'], location.raw["lat"], location.raw["lon"]])
        counter += 1
    except Exception as e:
        print(e)
        pass

# Cria o dataframe com as coordenadas de cada cep
coords_df = spark.createDataFrame(coords, schema=["cep_5_digitos","lat", "lon"])
coords_df.show(truncate=False)
duration = time.time() - start_time
print(f"Tempo total: {duration/60} minutes")

'NoneType' object has no attribute 'raw'
'NoneType' object has no attribute 'raw'
'NoneType' object has no attribute 'raw'
'NoneType' object has no attribute 'raw'
+-------------+------------------+------------------+
|cep_5_digitos|lat               |lon               |
+-------------+------------------+------------------+
|04609        |-23.5506507       |-46.6333824       |
|85887        |-25.2404183       |-53.9835004       |
|04410        |-23.5506507       |-46.6333824       |
|11606        |-23.8027866       |-45.4070527       |
|79841        |-22.2206145       |-54.812208        |
|75256        |-16.7013233       |-49.0914921       |
|65606        |-4.8654201        |-43.353664        |
|55355        |-8.894516849999999|-36.73692078185357|
|72901        |-15.9465415       |-48.2616527       |
|17063        |-22.3218102       |-49.0705863       |
|58180        |-6.7562744        |-36.4673179       |
|03896        |-23.5506507       |-46.6333824       |
|13030        |-22.90556  

## Grava o resultado no HDFS

In [6]:
coords_df.write.orc(f'/datalake/dadosbrutos/geolocation_coords.orc', 'overwrite')

## JOIN dos endereços obtidos no site Correios com as coordenadas

In [8]:
geo_final = geo.select('cep_5_digitos','cidade', 'uf')\
   .dropDuplicates()\
   .join(
        coords_df,
        on=["cep_5_digitos"],
        how="inner"
    )

In [9]:
geo_final.show()

+-------------+--------------------+---+------------------+------------------+
|cep_5_digitos|              cidade| uf|               lat|               lon|
+-------------+--------------------+---+------------------+------------------+
|        04609|           São Paulo| SP|       -23.5506507|       -46.6333824|
|        85887|          Matelândia| PR|       -25.2404183|       -53.9835004|
|        04410|           São Paulo| SP|       -23.5506507|       -46.6333824|
|        11606|       São Sebastião| SP|       -23.8027866|       -45.4070527|
|        79841|            Dourados| MS|       -22.2206145|        -54.812208|
|        75256|      Senador Canedo| GO|       -16.7013233|       -49.0914921|
|        65606|              Caxias| MA|        -4.8654201|        -43.353664|
|        55355|          Paranatama| PE|-8.894516849999999|-36.73692078185357|
|        72901|Santo Antônio do ...| GO|       -15.9465415|       -48.2616527|
|        17063|               Bauru| SP|       -22.3

## Grava resultado no HDFS

In [10]:
import pyspark.sql.functions as F
geo_final = geo_final\
.withColumn('lat', F.col('lat').cast('double'))\
.withColumn('lon', F.col('lon').cast('double'))

geo_final.printSchema()

root
 |-- cep_5_digitos: string (nullable = true)
 |-- cidade: string (nullable = true)
 |-- uf: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)



In [11]:
geo_final.write.orc(f'/datalake/dadosbrutos/geolocation_correios_coords.orc', 'overwrite')