In [7]:
import os
# Set HADOOP_HOME for this process before any Spark session is created.
# NOTE(review): presumably the directory holding the local Hadoop/winutils
# binaries on this Windows machine -- confirm.
os.environ.update({"HADOOP_HOME": "C:\\hadoop"})

In [8]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

In [9]:
from pyspark.sql.types import *

In [10]:
# Despite its name, `session` is a SparkSession *builder*; no session exists
# until getOrCreate() is called on it. It carries the Delta Lake SQL extension
# and catalog settings.
session = (
    SparkSession.builder
    .appName('DW_municipio_completo')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

In [11]:
# Pass the builder through the delta helper, then start (or reuse) the session.
delta_builder = configure_spark_with_delta_pip(session)
spark = delta_builder.getOrCreate()

In [12]:
# Explicit schema for the municipality JSON records.
#
# Each record carries two nested hierarchies ("microrregiao" and
# "regiao-imediata") that both end in the same UF -> regiao sub-structure.
# That shared tail is defined once below and reused, so the two branches
# cannot drift apart. Field names, types, order and nullability are unchanged.

# Innermost level: regiao (id, nome, sigla).
regiao_type = StructType([
    StructField("id", LongType(), nullable=True),
    StructField("nome", StringType(), nullable=True),
    StructField("sigla", StringType(), nullable=True),
])

# UF level (id, sigla, nome) with its enclosing regiao.
uf_type = StructType([
    StructField("id", LongType(), nullable=True),
    StructField("sigla", StringType(), nullable=True),
    StructField("nome", StringType(), nullable=True),
    StructField("regiao", regiao_type, nullable=True),
])

schema = StructType([
    StructField("id", LongType(), nullable=True),
    StructField("nome", StringType(), nullable=True),
    # microrregiao -> mesorregiao -> UF -> regiao
    StructField("microrregiao", StructType([
        StructField("id", LongType(), nullable=True),
        StructField("nome", StringType(), nullable=True),
        StructField("mesorregiao", StructType([
            StructField("id", LongType(), nullable=True),
            StructField("nome", StringType(), nullable=True),
            StructField("UF", uf_type, nullable=True),
        ]), nullable=True),
    ]), nullable=True),
    # regiao-imediata -> regiao-intermediaria -> UF -> regiao
    StructField("regiao-imediata", StructType([
        StructField("id", LongType(), nullable=True),
        StructField("nome", StringType(), nullable=True),
        StructField("regiao-intermediaria", StructType([
            StructField("id", LongType(), nullable=True),
            StructField("nome", StringType(), nullable=True),
            StructField("UF", uf_type, nullable=True),
        ]), nullable=True),
    ]), nullable=True),
])

In [13]:
# Landing-zone JSON file to ingest.
json_path = r'C:\Users\ResTIC16\Documents\IBGE_PROJETO\datawarehouse_ibge\data\LND\municipio_completo\municipio_completo_25-07-17.json'

# Read the file with the explicit schema (no inference) and the multiline
# JSON option enabled.
df = (
    spark.read
    .schema(schema)
    .option("multiline", "true")
    .json(json_path)
)

In [14]:
# Preview the first 20 rows (show()'s default count) without truncating cell values.
df.show(n=20, truncate=False)

+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|id     |nome                    |microrregiao                                                                            |regiao-imediata                                                              |
+-------+------------------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------+
|1100015|Alta Floresta D'Oeste   |{11006, Cacoal, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, Norte, N}}}}           |{110005, Cacoal, {1102, Ji-Paraná, {11, RO, Rondônia, {1, Norte, N}}}}       |
|1100023|Ariquemes               |{11003, Ariquemes, {1102, Leste Rondoniense, {11, RO, Rondônia, {1, Norte, N}}}}        |{110002, Ariquemes, {1101, Porto Velho, {11, RO, Rondônia, {1, Norte,

In [15]:
# Target directory for the Delta table; existing contents are overwritten.
raw_path = r'C:\Users\ResTIC16\Documents\IBGE_PROJETO\datawarehouse_ibge\data\RAW\municipio_completo'
(
    df.write
    .format('delta')
    .mode('overwrite')
    .save(raw_path)
)