In [15]:
from pyspark.sql import SparkSession

# Build a Spark session that runs in local mode, or reuse the one already
# active in this kernel (getOrCreate), so re-running the cell is harmless.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("create temp view")
    .getOrCreate()
)
# Bare last expression: lets the notebook render the session summary.
spark

In [16]:
# imports and path
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql import functions as func

# Location of the "empresas" CSV data, relative to the notebook's working
# directory; it is handed to spark.read.csv in a later cell.
path = "../datasets/alura/01/empresas/"

In [17]:
# Load the ';'-separated CSV data found at `path` and let Spark infer the
# column types. No header option is set, so the columns keep Spark's
# positional default names, which a later cell renames.
emp = (
    spark.read
    .option("sep", ";")
    .option("inferSchema", True)
    .csv(path)
)

                                                                                

In [18]:
# Descriptive column names for the "empresas" data, listed in the same order
# as the positional columns they will replace.
empresasColNames = [
    'cnpj_basico',
    'razao_social_nome_empresarial',
    'natureza_juridica',
    'qualificacao_do_responsavel',
    'capital_social_da_empresa',
    'porte_da_empresa',
    'ente_federativo_responsavel',
]

In [19]:
# Replace each positional column label (_c0, _c1, ...) with the descriptive
# name at the same position in empresasColNames. withColumnRenamed leaves the
# frame unchanged when the old label is absent, so re-running this cell is safe.
for idx in range(len(empresasColNames)):
    name = empresasColNames[idx]
    emp = emp.withColumnRenamed("_c" + str(idx), name)

In [20]:
# Swap every ',' in capital_social_da_empresa for '.', in preparation for the
# numeric cast done in the next cell.
# NOTE(review): presumably the raw values use a decimal comma — confirm against the data.
emp = emp.withColumn(
    'capital_social_da_empresa',
    func.regexp_replace(func.col('capital_social_da_empresa'), ',', '.'),
)
# Preview the first five rows as a pandas DataFrame.
emp.limit(5).toPandas()

Unnamed: 0,cnpj_basico,razao_social_nome_empresarial,natureza_juridica,qualificacao_do_responsavel,capital_social_da_empresa,porte_da_empresa,ente_federativo_responsavel
0,306,FRANCAMAR REFRIGERACAO TECNICA S/C LTDA,2240,49,0.0,1,
1,1355,BRASILEIRO & OLIVEIRA LTDA,2062,49,0.0,5,
2,4820,"REGISTRO DE IMOVEIS, TABELIONATO 1 DE NOTAS E ...",3034,32,0.0,5,
3,5347,ROSELY APARECIDA MONTEIRO CALTABIANO FREITAS,2135,50,0.0,5,
4,6846,BADU E FILHOS TECIDOS LTDA,2062,49,4000.0,1,


In [21]:
# Convert capital_social_da_empresa to a double-precision column so it can be
# compared and aggregated numerically in the cells below.
emp = emp.withColumn(
    'capital_social_da_empresa',
    func.col('capital_social_da_empresa').cast(DoubleType()),
)
# Preview the first five rows as a pandas DataFrame.
emp.limit(5).toPandas()

Unnamed: 0,cnpj_basico,razao_social_nome_empresarial,natureza_juridica,qualificacao_do_responsavel,capital_social_da_empresa,porte_da_empresa,ente_federativo_responsavel
0,306,FRANCAMAR REFRIGERACAO TECNICA S/C LTDA,2240,49,0.0,1,
1,1355,BRASILEIRO & OLIVEIRA LTDA,2062,49,0.0,5,
2,4820,"REGISTRO DE IMOVEIS, TABELIONATO 1 DE NOTAS E ...",3034,32,0.0,5,
3,5347,ROSELY APARECIDA MONTEIRO CALTABIANO FREITAS,2135,50,0.0,5,
4,6846,BADU E FILHOS TECIDOS LTDA,2062,49,4000.0,1,


In [22]:
# Print up to five rows whose capital_social_da_empresa equals 50, with
# column truncation disabled so long company names are shown in full.
emp.filter("capital_social_da_empresa==50").show(n=5, truncate=False)

+-----------+------------------------------------+-----------------+---------------------------+-------------------------+----------------+---------------------------+
|cnpj_basico|razao_social_nome_empresarial       |natureza_juridica|qualificacao_do_responsavel|capital_social_da_empresa|porte_da_empresa|ente_federativo_responsavel|
+-----------+------------------------------------+-----------------+---------------------------+-------------------------+----------------+---------------------------+
|17350147   |ERIK MARCELO DOS SANTOS 42107848858 |2135             |50                         |50.0                     |1               |null                       |
|17833214   |ALEXANDRE MACHADO LIMA 73750123772  |2135             |50                         |50.0                     |1               |null                       |
|20860830   |YASMIN MOURA DA FONSECA 13457709793 |2135             |50                         |50.0                     |1               |null                 

In [23]:
# For each porte_da_empresa code: the average and the maximum of
# capital_social_da_empresa, plus how many non-null cnpj_basico values fall in
# the group. Rows are printed sorted by porte_da_empresa (ascending).
# NOTE(review): the alias "capital_social_medio_max" holds the plain maximum,
# not a maximum of averages — consider renaming it.
(
    emp
    .select('cnpj_basico', 'porte_da_empresa', 'capital_social_da_empresa')
    .groupBy("porte_da_empresa")
    .agg(
        func.avg("capital_social_da_empresa").alias("capital_social_medio"),
        func.max("capital_social_da_empresa").alias("capital_social_medio_max"),
        func.count("cnpj_basico").alias("frequencia"),
    )
    .orderBy('porte_da_empresa')
    .show()
)



+----------------+--------------------+------------------------+----------+
|porte_da_empresa|capital_social_medio|capital_social_medio_max|frequencia|
+----------------+--------------------+------------------------+----------+
|            null|    8.35421888053467|                 50000.0|      5985|
|               1|  339994.53313507047|        3.22014670262E11|   3129043|
|               3|  2601001.7677092687|        2.52006125741E11|    115151|
|               5|   708660.4208249793|                  5.0E10|   1335500|
+----------------+--------------------+------------------------+----------+



                                                                                

In [24]:
# Print only the mean and standard deviation of capital_social_da_empresa
# over the whole DataFrame.
emp.select('capital_social_da_empresa').summary("mean", "stddev").show()



+-------+-------------------------+
|summary|capital_social_da_empresa|
+-------+-------------------------+
|   mean|        503694.5478542674|
| stddev|     2.1118691490537727E8|
+-------+-------------------------+



                                                                                

In [25]:
# Register the DataFrame under the name "empresasView" so it can be queried
# with spark.sql; an existing view with that name is replaced, which keeps
# this cell re-runnable.
emp.createOrReplaceTempView("empresasView")

In [26]:
# Query the temporary view through Spark SQL, selecting every column and row.
res = spark.sql("SELECT * FROM empresasView")
# Bring only the first five rows to the driver as a pandas DataFrame for display.
res.limit(5).toPandas()

Unnamed: 0,cnpj_basico,razao_social_nome_empresarial,natureza_juridica,qualificacao_do_responsavel,capital_social_da_empresa,porte_da_empresa,ente_federativo_responsavel
0,306,FRANCAMAR REFRIGERACAO TECNICA S/C LTDA,2240,49,0.0,1,
1,1355,BRASILEIRO & OLIVEIRA LTDA,2062,49,0.0,5,
2,4820,"REGISTRO DE IMOVEIS, TABELIONATO 1 DE NOTAS E ...",3034,32,0.0,5,
3,5347,ROSELY APARECIDA MONTEIRO CALTABIANO FREITAS,2135,50,0.0,5,
4,6846,BADU E FILHOS TECIDOS LTDA,2062,49,4000.0,1,


In [27]:
# SQL counterpart of the earlier DataFrame filter: rows of the temporary view
# whose capital_social_da_empresa equals 50.
res = spark.sql("""
                SELECT * FROM empresasView WHERE capital_social_da_empresa = 50
                """)
# Bring only the first five rows to the driver as a pandas DataFrame for display.
res.limit(5).toPandas()

Unnamed: 0,cnpj_basico,razao_social_nome_empresarial,natureza_juridica,qualificacao_do_responsavel,capital_social_da_empresa,porte_da_empresa,ente_federativo_responsavel
0,17350147,ERIK MARCELO DOS SANTOS 42107848858,2135,50,50.0,1,
1,17833214,ALEXANDRE MACHADO LIMA 73750123772,2135,50,50.0,1,
2,20860830,YASMIN MOURA DA FONSECA 13457709793,2135,50,50.0,1,
3,22242856,JOAO CESAR MESSIAS 08707149883,2135,50,50.0,1,
4,23238540,EVERTON ROBERTO DA SILVA 42101963809,2135,50,50.0,1,


In [28]:
# Average capital_social_da_empresa per porte_da_empresa, computed with Spark
# SQL on the temporary view.
# ORDER BY makes the row order deterministic: a bare GROUP BY returns the
# groups in whatever order the shuffle yields, so the preview could change
# between runs. Ascending order places the NULL group first.
res = spark.sql("""
            SELECT porte_da_empresa, MEAN(capital_social_da_empresa) AS Media
            FROM empresasView
            GROUP BY porte_da_empresa
            ORDER BY porte_da_empresa
                """)
# Bring at most five groups to the driver as a pandas DataFrame for display.
res.limit(5).toPandas()

                                                                                

Unnamed: 0,porte_da_empresa,Media
0,,8.354219
1,1.0,339994.5
2,3.0,2601002.0
3,5.0,708660.4
