## Importações

In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Leitura

In [2]:
# Create (or reuse) the Spark session for this notebook; the driver is
# configured with 2 GB of memory.
spark = (
    SparkSession.builder
    .appName("AnaliseENEM")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Raw ENEM 2021 microdata: header row present, ';' as field separator,
# ISO-8859-1 text encoding. Column types are inferred by Spark
# (inferSchema), not declared here.
leitura = (
    spark.read
    .options(
        header="true",
        sep=";",
        encoding="ISO-8859-1",
        inferSchema="true",
    )
    .csv("data_raw/MICRODADOS_ENEM_2021.csv")
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/16 22:20:13 WARN Utils: Your hostname, DESKTOP-NR380U6, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/16 22:20:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 22:20:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

## Verificação da qualidade dos dados na camada bronze:

###  1. Dados faltantes por colunas

In [13]:
# Per-state (UF) completeness report: for each value of SG_UF_PROVA, the
# percentage of non-null cells across every other column of `leitura`.
coluna_uf = 'SG_UF_PROVA'

# Every column except the grouping key takes part in the completeness check.
colunas_analise = [c for c in leitura.columns if c != coluna_uf]
total_colunas = len(colunas_analise)

# F.count("*") counts rows per group, while F.count(c) counts only the
# non-null values of column c, so each aliased column holds the number of
# filled cells for that column within the state.
df_stats_uf = leitura.groupBy(coluna_uf).agg(
    F.count("*").alias("Qtd_Inscritos"),
    *[F.count(c).alias(c) for c in colunas_analise]
)

# Built-in sum() folds the per-column counts into one Column expression.
# The percentage divides filled cells by (rows in the state * columns analysed).
df_ranking = df_stats_uf.withColumn(
    "Total_Celulas_Preenchidas",
    sum(F.col(c) for c in colunas_analise)
).withColumn(
    "Perc_Preenchimento",
    (F.col("Total_Celulas_Preenchidas") / (F.col("Qtd_Inscritos") * total_colunas)) * 100
)

# Two actions are run on this report below (show + aggregate). Without
# cache() each action would re-execute the whole lazy plan, i.e. read the
# CSV and recompute the wide aggregation a second time. The report itself
# has only one row per state, so keeping it cached is cheap.
relatorio_estados = df_ranking.select(
    coluna_uf,
    "Qtd_Inscritos",
    "Perc_Preenchimento"
).orderBy(F.desc("Perc_Preenchimento")).cache()

relatorio_estados.show(27)

# Grand total of rows, obtained by summing the per-state counts of the
# (now cached) report instead of scanning the raw data again.
total_geral_inscritos = relatorio_estados.select(F.sum("Qtd_Inscritos")).collect()[0][0]

print(f"Total de inscritos no dataset: {total_geral_inscritos}")

                                                                                

+-----------+-------------+------------------+
|SG_UF_PROVA|Qtd_Inscritos|Perc_Preenchimento|
+-----------+-------------+------------------+
|         SC|        80765|  83.9418394931798|
|         CE|       220517| 83.90253812631225|
|         SP|       509954| 83.59789052868821|
|         PR|       144282| 83.25176159650314|
|         PI|        79969| 82.89278762187014|
|         RS|       150484| 82.78739710977025|
|         DF|        67501| 82.73718414048187|
|         ES|        64181| 82.70784188466992|
|         RN|        80820|  82.6341994555803|
|         PE|       193616| 82.62062363991957|
|         SE|        53796| 82.57521005279203|
|         PB|       102002| 82.51846042234466|
|         AL|        56584| 82.46187379235592|
|         RJ|       238347| 82.43666027542477|
|         MG|       327829| 82.16661328517814|
|         GO|       136915| 81.91900083993718|
|         MS|        42490| 81.90498156428964|
|         MA|       127905| 81.72775106524374|
|         BA|

[Stage 41:>                                                       (0 + 16) / 16]

Total de inscritos no dataset: 3389832


                                                                                