# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Start (or reuse) the Spark session for this analysis; the driver is
# configured with "spark.driver.memory" set to "2g".
spark = (
    SparkSession.builder
    .appName("AnaliseENEM")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Load the ENEM 2021 microdata CSV: the first row is the header, fields are
# separated by ";" and the file is decoded as ISO-8859-1. Schema inference
# is switched off, so columns are not type-inferred at read time.
df = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .option("encoding", "ISO-8859-1")
    .option("inferSchema", "false")
    .csv("data_raw/MICRODADOS_ENEM_2021.csv")
)

# Average of NU_NOTA_MT (presumably the math score) grouped by SG_UF_PROVA
# (presumably the state where the exam was taken), collected into a pandas
# DataFrame for plotting. The result has the columns "SG_UF_PROVA" and
# "avg(NU_NOTA_MT)" (Spark's auto-generated name for the aggregate).
grafico_1 = (
    df.select("SG_UF_PROVA", "NU_NOTA_MT")
    # The CSV is read without schema inference, so the score must be cast
    # before averaging. Cast to double rather than float: a 32-bit float keeps
    # only ~7 significant digits, and that rounding error would be carried
    # into the average (which Spark computes in double precision anyway).
    .withColumn("NU_NOTA_MT", col("NU_NOTA_MT").cast("double"))
    # Drop rows where either selected column is null.
    .dropna()
    .groupBy("SG_UF_PROVA")
    .avg("NU_NOTA_MT")
    .toPandas()
)

# Bar chart of the per-group averages, highest average first.
ranking = grafico_1.sort_values("avg(NU_NOTA_MT)", ascending=False)

plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated since seaborn 0.13. Assigning
# the x variable to `hue` keeps one colour per bar; dodge=False keeps each bar
# centred on its own tick in seaborn versions that would otherwise dodge by hue.
ax = sns.barplot(
    data=ranking,
    x="SG_UF_PROVA",
    y="avg(NU_NOTA_MT)",
    hue="SG_UF_PROVA",
    palette="viridis",
    dodge=False,
)
# Some seaborn versions add a legend whenever `hue` is set; here it would only
# repeat the x-axis tick labels, so remove it if it was created.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
plt.show()