###Inserindo Bases Auxiliares de Cep

In [0]:
# Import libraries

import requests
import pandas as pd
import re

from pyspark.sql.types import StringType
from pyspark.sql.functions import (regexp_extract, col, regexp_replace, lit, when, concat_ws, expr,
                                size, trim, split, lpad, lower, coalesce, count, row_number, first,
                                monotonically_increasing_id, concat)

from pyspark.sql.window import Window

import shutil


In [0]:
%sql
-- Create a new database for ceps in silver layer

CREATE DATABASE IF NOT EXISTS silver_aux
LOCATION 'dbfs:/mvp/database/silver_aux';

In [0]:
# Load cep bases from bronze_cep database

df_cep_sp = spark.table("bronze_aux.cep_sp")
df_cep_rj = spark.table("bronze_aux.cep_rio")


In [0]:
df_cep_sp.show(5)

+-------+--------------------+------------+------+----------+----------+
|    cep|          logradouro|      numero|bairro|cod_cidade|cod_estado|
+-------+--------------------+------------+------+----------+----------+
|1001000|         Praça da Sé|- lado ímpar|    Sé|      8966|        26|
|1001001|         Praça da Sé|  - lado par|    Sé|      8966|        26|
|1001010|Rua Filipe de Oli...|        null|    Sé|      8966|        26|
|1001900|     Praça da Sé 108|        null|    Sé|      8966|        26|
|1001901|     Praça da Sé 371|        null|    Sé|      8966|        26|
+-------+--------------------+------------+------+----------+----------+
only showing top 5 rows



In [0]:
print(df_cep_sp.count())

46395


In [0]:
# Cleaning cep sp dataset

df_cep_sp = df_cep_sp.withColumn("cep", lpad(col("cep").cast("string"), 8, "0"))

df_cep_sp = df_cep_sp.select(
    trim(col("cep")).alias("cep"),
    trim(col("logradouro")).alias("logradouro"),
    trim(col("numero")).alias("numero"),
    trim(col("bairro")).alias("bairro"),
    trim(col("cod_cidade")).alias("mun_ibge"),
    trim(col("cod_estado")).alias("uf_ibge")
)



In [0]:
# Save table in silver layer

df_cep_sp.write.format("delta").mode("overwrite").option("mergeSchema", 'true').saveAsTable("silver_aux.cep_sp")


In [0]:
# Cleaning cep rio dataset

df_cep_rj = df_cep_rj.withColumn("cep", lpad(col("cep").cast("string"), 8, "0"))

df_cep_rj = df_cep_rj.select(
    trim(col("cep")).alias("cep"),
    trim(col("logradouro")).alias("logradouro"),
    trim(col("numero")).alias("numero"),
    trim(col("bairro")).alias("bairro"),
    trim(col("cod_cidade")).alias("mun_ibge"),
    trim(col("cod_estado")).alias("uf_ibge")
)


In [0]:
# Save table in silver layer

df_cep_rj.write.format("delta").mode("overwrite").option("mergeSchema", 'true').saveAsTable("silver_aux.cep_rio")


In [0]:
print(df_cep_rj.count())

19884


In [0]:
# Union of two cep datasets

df_cep_SP_RJ = df_cep_sp.unionByName(df_cep_rj)

In [0]:
# Save table in silver layer

df_cep_SP_RJ.write.format("delta").mode("overwrite").option("mergeSchema", 'true').saveAsTable("silver_aux.cep_SP_RJ")


#### Obter coordenadas com Base_geo

In [0]:
# Open base_geo from bronze layer

Base_geo = spark.table("bronze_aux.geo_base")


In [0]:
display(Base_geo.show(5))

+--------+------------+-----------+---------------+
|postcode|         lon|        lat|     cd_geocodi|
+--------+------------+-----------+---------------+
|00000000| -46.6638301|-22.2576125|313490505000032|
|00000001| -67.8387383|   -9.92166|120040105000325|
|00000002|-67.88050835| -9.8820725|120040105000325|
|00000003| -46.5144433|  -21.82057|315180005000240|
|00000004| -67.8933533| -9.8820383|120040105000325|
+--------+------------+-----------+---------------+
only showing top 5 rows



In [0]:
Base_geo = Base_geo.withColumn("postcode", regexp_replace("postcode", "[^0-9]", ""))

In [0]:
Base_geo = Base_geo.withColumnRenamed('postcode', 'cep')

In [0]:
print(Base_geo.count())

429994


In [0]:
# Open base cep_sp_rj from silver

cep_sp_rj = spark.table("silver_aux.cep_sp_rj")


In [0]:
print(cep_sp_rj.count())

66279


In [0]:
# Join with cep_sp_rj

df_cep_geo = cep_sp_rj.join(
    Base_geo.select("cep", "lat", "lon", "cd_geocodi"),
    on="cep",
    how="left"
)


In [0]:
display(df_cep_geo.show(5))

+--------+--------------------+------------+------+--------+-------+-----------+-----------+---------------+
|     cep|          logradouro|      numero|bairro|mun_ibge|uf_ibge|        lat|        lon|     cd_geocodi|
+--------+--------------------+------------+------+--------+-------+-----------+-----------+---------------+
|01001000|         Praça da Sé|- lado ímpar|    Sé|    8966|     26|-23.5493251|-46.6336932|355030878000005|
|01001001|         Praça da Sé|  - lado par|    Sé|    8966|     26|  -23.55046|-46.6342517|355030878000005|
|01001010|Rua Filipe de Oli...|        null|    Sé|    8966|     26|-23.5513175|-46.6336815|355030878000004|
|01001900|     Praça da Sé 108|        null|    Sé|    8966|     26|       null|       null|           null|
|01001901|     Praça da Sé 371|        null|    Sé|    8966|     26|       null|       null|           null|
+--------+--------------------+------------+------+--------+-------+-----------+-----------+---------------+
only showing top 5 

In [0]:
print(df_cep_geo.filter(col("lat").isNull()).count())
print(df_cep_geo.count())

22877
66289


In [0]:
# Save table in silver layer

df_cep_geo.write.format("delta").mode("overwrite").option("mergeSchema", 'true').saveAsTable("silver_aux.geo_base")
