In [0]:
# libs
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, split
from pyspark.sql import functions as F

# Paths
# Bronze-layer input on DBFS. The leading "*" is a glob, so every file in the
# bronze folder whose name ends in "SOCIOCSV" is picked up by a single load.
path_bronze_soc = "dbfs:/FileStore/shared_uploads/default_user/bronze/*SOCIOCSV"

In [0]:
# Exploratory read of the bronze files: no header row, ';' as the field
# separator, and column types inferred by Spark (columns come back as _c0, _c1, ...).
df_raw_soc = (
    spark.read
    .format("csv")
    .option("header", False)
    .option("inferSchema", True)
    .option("sep", ';')
    .load(path_bronze_soc)
)

# Show the inferred column names and types.
df_raw_soc.printSchema()

In [0]:
# Preview at most 50 rows of the raw, schema-inferred data.
display(df_raw_soc.limit(50))

In [0]:
# Distinct values of raw column _c1 (second CSV field, named tipo_socio in the
# schema defined further down), sorted ascending; at most 100 are shown.
display(
    df_raw_soc
    .select("_c1")
    .distinct()
    .sort(col("_c1").asc())
    .limit(100)
)

In [0]:
# Distinct values of raw column _c6 (seventh CSV field, named
# documento_co_responsavel in the schema defined further down), sorted
# ascending; at most 100 are shown.
display(
    df_raw_soc
    .select("_c6")
    .distinct()
    .sort(col("_c6").asc())
    .limit(100)
)

In [0]:
# Schema definition
# Explicit schema for the partners (socios) CSV. The files are read without a
# header, so the fields below are applied to the CSV columns by position.
# NOTE(review): cnpj and cnpj_seg_num are typed as 32-bit integers, so leading
# zeros in the source text are dropped on parse and anything outside the int
# range cannot be represented. Confirm this is intended before using them as
# identifiers / join keys; changing the type would also change the schema of
# the table written at the end of this notebook.
schemaSocios = StructType([
    StructField("cnpj", IntegerType(), True),
    StructField("tipo_socio", IntegerType(), True),
    StructField("nome_socio", StringType(), True),
    StructField("documento_socio", StringType(), True),
    StructField("codigo_qualificacao_socio", IntegerType(), True),
    StructField("cnpj_seg_num", IntegerType(), True),
    StructField("documento_co_responsavel", StringType(), True),
    StructField("nome_co_responsavel", StringType(), True),
    StructField("codigo_qualificacao_co_responsavel", IntegerType(), True),
    StructField("id_registro_co_responsavel", IntegerType(), True)
])

# Re-read the bronze files with the explicit schema. inferSchema is not passed
# here: Spark skips inference whenever a schema is supplied, so the option
# would have no effect and only suggest otherwise.
df_list_soc = (
    spark.read
    .format("csv")
    .options(header=False, sep=';')
    .schema(schemaSocios)
    .load(path_bronze_soc)
)

In [0]:
# Preview at most 20 rows of the typed DataFrame.
display(df_list_soc.limit(20))

In [0]:
# Sanity check: show up to 100 nome_socio values that begin with a space.
display(
    df_list_soc
    .where(F.col("nome_socio").like(" %"))
    .select("nome_socio")
    .limit(100)
)

In [0]:
# Whitespace removal
# Strip leading whitespace from nome_socio (the regex is anchored at the start
# of the string, so trailing whitespace is left untouched) and keep only the
# five partner columns, in this order.
df_list_soc = df_list_soc.select(
    'cnpj',
    'tipo_socio',
    F.regexp_replace(col("nome_socio"), "^\\s+", "").alias("nome_socio"),
    'documento_socio',
    'codigo_qualificacao_socio',
)

In [0]:
# Preview at most 50 rows of the current df_list_soc.
display(df_list_soc.limit(50))

In [0]:
# Number of rows whose documento_socio contains a literal "*"
# (presumably masked documents; confirm against the data dictionary).
(
    df_list_soc
    .where(F.col("documento_socio").like('%*%'))
    .count()
)

In [0]:
# Create the target schema in the metastore if it does not exist yet.
spark.sql("CREATE SCHEMA IF NOT EXISTS hive_metastore.db_rfb")

# Save as a Delta table.
# NOTE(review): "append" adds these rows to whatever the table already holds.
# The input path is a glob over the bronze folder, so re-running this notebook
# presumably inserts the same rows again. Confirm whether an overwrite or a
# merge is what is actually wanted.
(
    df_list_soc.write
    .format("delta")
    .mode("append")
    .saveAsTable("hive_metastore.db_rfb.tbl_slv_socios")
)