In [0]:
# libs
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, split
from pyspark.sql import functions as F

# paths
path_bronze_soc = "dbfs:/FileStore/shared_uploads/default_user/bronze/*SOCIOCSV"

In [0]:
df_raw_soc = spark.read.options(header=False, inferSchema=True, sep=';').format("csv").load(path_bronze_soc)
df_raw_soc.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)



In [0]:
df_raw_soc.limit(50).display()

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10
7396865,2,GERSON HOFFMANN,***240659**,49,20050518,,***000000**,,0,5
7396865,2,MITZI HARTMANN,***126339**,22,20080509,,***000000**,,0,4
52302726,2,BENEDITO BEZERRA DE CARVALHO,***061354**,49,20050921,,***000000**,,0,7
7396923,2,RENATO PEDUTO,***069508**,49,20050516,,***000000**,,0,6
7396923,2,ELAINE GARCIA,***971118**,49,20050516,,***000000**,,0,6
3650261,2,DILTO DE SOUZA,***236669**,49,19991217,,***000000**,,0,7
3650261,2,TAMARA SOUZA FREITAS,***042859**,22,20050328,,***000000**,,0,4
7396929,2,PAOLA CACELLA LADEIRO,***763888**,49,20050504,,***000000**,,0,5
7396929,2,FABIANO AUGUSTO RUGGIERO CACHELE,***807908**,49,20051124,,***000000**,,0,6
25040718,2,BENJAMIN BENONI MARTINS SPADONI,***205061**,16,20050912,,***000000**,,0,9


In [0]:
# tipo de socio
df_raw_soc.select("_c1").distinct().orderBy("_c1", ascending=True).limit(100).display()

_c1
1
2
3


In [0]:
# tipo de socio
df_raw_soc.select("_c6").distinct().orderBy("_c6", ascending=True).limit(100).display()

_c6
""
13.0
23.0
31.0
37.0
40.0
41.0
47.0
53.0
63.0


In [0]:
# definicao de schema
schemaSocios = StructType([
    StructField("cnpj", IntegerType(), True),
    StructField("tipo_socio", IntegerType(), True),
    StructField("nome_socio", StringType(), True),
    StructField("documento_socio", StringType(), True),
    StructField("codigo_qualificacao_socio", IntegerType(), True),
    StructField("cnpj_seg_num", IntegerType(), True),
    StructField("documento_co_responsavel", StringType(), True),
    StructField("nome_co_responsavel", StringType(), True),
    StructField("codigo_qualificacao_co_responsavel", IntegerType(), True),
    StructField("id_registro_co_responsavel", IntegerType(), True)
])

df_list_soc = spark.read.options(header=False, inferSchema=True, sep=';') \
                .format("csv") \
                    .schema(schemaSocios) \
                        .load(path_bronze_soc)

In [0]:
df_list_soc.limit(20).display()

id,tipo_socio,nome_socio,documento_socio,codigo_qualificacao_socio,id_registro,documento_co_responsavel,nome_co_responsavel,codigo_qualificacao_co_responsavel,id_registro_co_responsavel
7396865,2,GERSON HOFFMANN,***240659**,49,20050518,,***000000**,,0
7396865,2,MITZI HARTMANN,***126339**,22,20080509,,***000000**,,0
52302726,2,BENEDITO BEZERRA DE CARVALHO,***061354**,49,20050921,,***000000**,,0
7396923,2,RENATO PEDUTO,***069508**,49,20050516,,***000000**,,0
7396923,2,ELAINE GARCIA,***971118**,49,20050516,,***000000**,,0
3650261,2,DILTO DE SOUZA,***236669**,49,19991217,,***000000**,,0
3650261,2,TAMARA SOUZA FREITAS,***042859**,22,20050328,,***000000**,,0
7396929,2,PAOLA CACELLA LADEIRO,***763888**,49,20050504,,***000000**,,0
7396929,2,FABIANO AUGUSTO RUGGIERO CACHELE,***807908**,49,20051124,,***000000**,,0
25040718,2,BENJAMIN BENONI MARTINS SPADONI,***205061**,16,20050912,,***000000**,,0


In [0]:
df_list_soc.select("nome_socio").filter(col("nome_socio").like(" %")).limit(100).display()

nome_socio
JMC STOPPIGLIA ADMINISTRACAO DE BENS PROPRIOS LTDA
SANTA TERESA PARTICIPACOES S.A.
GRAMADO PARKS INVESTIMENTOS E INTERMEDIACOES S.A - EM RECUPERACAO JUDICIAL
INCUB PARTICIPACOES EIRELI
GRAMADO PARKS INVESTIMENTOS E INTERMEDIACOES S.A - EM RECUPERACAO JUDICIAL


In [0]:
# Remoção de espaços em branco                       
df_list_soc = (df_list_soc
               .withColumn("nome_socio_format",
                           F.regexp_replace(col("nome_socio"), "^\\s+", ""))
                    .drop("nome_socio")
                        .withColumnRenamed("nome_socio_format", "nome_socio")
                        ).select(
                            'cnpj'
                            , 'tipo_socio'
                            , 'nome_socio'
                            , 'documento_socio'
                            , 'codigo_qualificacao_socio'
                        )

In [0]:
df_list_soc.limit(50).display()

cnpj,tipo_socio,nome_socio,documento_socio,codigo_qualificacao_socio
7396865,2,GERSON HOFFMANN,***240659**,49
7396865,2,MITZI HARTMANN,***126339**,22
52302726,2,BENEDITO BEZERRA DE CARVALHO,***061354**,49
7396923,2,RENATO PEDUTO,***069508**,49
7396923,2,ELAINE GARCIA,***971118**,49
3650261,2,DILTO DE SOUZA,***236669**,49
3650261,2,TAMARA SOUZA FREITAS,***042859**,22
7396929,2,PAOLA CACELLA LADEIRO,***763888**,49
7396929,2,FABIANO AUGUSTO RUGGIERO CACHELE,***807908**,49
25040718,2,BENJAMIN BENONI MARTINS SPADONI,***205061**,16


In [0]:
df_list_soc.select("documento_socio").filter(col("documento_socio").like('%*%')).count()

1975483

In [0]:
# Criar schema no metastore
spark.sql("CREATE SCHEMA IF NOT EXISTS hive_metastore.db_rfb")
# Salvar tabela delta
df_list_soc.write.mode("append").format("delta").saveAsTable("hive_metastore.db_rfb.tbl_slv_socios")