In [1]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session under the application name "DataFrameQuality".
spark = (
    SparkSession.builder
    .appName("DataFrameQuality")
    .getOrCreate()
)

In [85]:
# Explicit schema for the sample data. Every field is nullable because the
# sample rows deliberately contain missing values in each column.
schema = T.StructType([
    T.StructField("id", T.LongType(), True),
    T.StructField("nome", T.StringType(), True),
    T.StructField("idade", T.IntegerType(), True),
    T.StructField("salário", T.DoubleType(), True),
    # "endereco" (address) holds free text, so it is a string column; a numeric
    # type would reject any real address value during schema verification.
    T.StructField("endereco", T.StringType(), True),
])

# Sample rows as (id, nome, idade, salário, endereco) tuples.
# "endereco" is null in every row, giving a fully-null column to profile.
data = [
    (1, "João", None, None, None),
    (2, "Maria", None, None, None),
    (3, None, 32, 4500.0, None),
    (None, "Pedro", None, 6000.0, None),
    (5, "Ana", None, 5500.0, None),
]

df = spark.createDataFrame(data, schema)

In [23]:
# Preview at most the first five rows of the sample DataFrame.
df.show(n=5)

+----+-----+-----+-------+--------+
|  id| nome|idade|salário|endereco|
+----+-----+-----+-------+--------+
|   1| João| NULL|   NULL|    NULL|
|   2|Maria| NULL|   NULL|    NULL|
|   3| NULL|   32| 4500.0|    NULL|
|NULL|Pedro| NULL| 6000.0|    NULL|
|   5|  Ana| NULL| 5500.0|    NULL|
+----+-----+-----+-------+--------+



In [33]:
# One aggregate per column: `when` produces a non-null value only for cells that
# are null, and `count` counts non-null values, so each result is the number of
# null cells in that column.
df_null_counts = df.select(
    [
        F.count(F.when(F.col(name).isNull(), name)).alias(f"null_count_{name}")
        for name in df.columns
    ]
)

In [34]:
# Print the per-column null counts.
df_null_counts.show()

+-------------+---------------+----------------+------------------+-------------------+
|null_count_id|null_count_nome|null_count_idade|null_count_salário|null_count_endereco|
+-------------+---------------+----------------+------------------+-------------------+
|            1|              1|               4|                 2|                  5|
+-------------+---------------+----------------+------------------+-------------------+



In [89]:
# Row count of the sample DataFrame: the denominator for every ratio below.
total_rows = df.count()

# Show the null ratio of each column first, then the raw counts (via "*").
# The ratios are fractions between 0 and 1 even though the column prefix is
# "percent_null".
df_null_counts.select(
    *[
        (F.col(name) / F.lit(total_rows)).alias(name.replace('null_count', 'percent_null'))
        for name in df_null_counts.columns
    ],
    "*",
).show()

+---------------+-----------------+------------------+--------------------+---------------------+-------------+---------------+----------------+------------------+-------------------+
|percent_null_id|percent_null_nome|percent_null_idade|percent_null_salário|percent_null_endereco|null_count_id|null_count_nome|null_count_idade|null_count_salário|null_count_endereco|
+---------------+-----------------+------------------+--------------------+---------------------+-------------+---------------+----------------+------------------+-------------------+
|            0.2|              0.2|               0.8|                 0.4|                  1.0|            1|              1|               4|                 2|                  5|
+---------------+-----------------+------------------+--------------------+---------------------+-------------+---------------+----------------+------------------+-------------------+



In [88]:
def quality_df(df):
    """Summarise the missing values of every column of a Spark DataFrame.

    Args:
        df: The DataFrame to profile.

    Returns:
        A one-row DataFrame. For each column ``c`` of ``df`` it contains a
        ``PercentNull_<c>`` column (null cells as a percentage of the rows of
        ``df``, 0-100), and after all of those a ``NullCount_<c>`` column
        (number of null cells). When ``df`` has no rows, every percentage is
        null and every count is 0.
    """
    count_prefix = 'NullCount_'
    percent_prefix = 'PercentNull_'

    # Count the rows of the DataFrame that was passed in. Relying on a
    # notebook-level variable would give wrong percentages for any other
    # DataFrame and fail on a fresh kernel.
    total_rows = df.count()

    # `when` yields a non-null value only for null cells and `count` counts
    # non-null values, so each aggregate is the column's null count.
    df_null_counts = df.select([
        F.count(F.when(F.col(c).isNull(), c)).alias(f"{count_prefix}{c}")
        for c in df.columns
    ])

    def percent_of_rows(count_column):
        # An empty DataFrame has no meaningful percentage; return a typed null
        # instead of dividing by zero (which raises under ANSI SQL mode).
        if total_rows == 0:
            return F.lit(None).cast("double")
        return F.col(count_column) / F.lit(total_rows) * 100

    # Derive both names from the source column name rather than by string
    # replacement, so a column whose own name contains the prefix is not mangled.
    df_null_percent = df_null_counts.select(
        *[
            percent_of_rows(f"{count_prefix}{c}").alias(f"{percent_prefix}{c}")
            for c in df.columns
        ],
        "*",
    )
    return df_null_percent

# Profile the sample DataFrame and print the resulting summary.
df_q = quality_df(df)
df_q.show()

+--------------+----------------+-----------------+-------------------+--------------------+------------+--------------+---------------+-----------------+------------------+
|PercentNull_id|PercentNull_nome|PercentNull_idade|PercentNull_salário|PercentNull_endereco|NullCount_id|NullCount_nome|NullCount_idade|NullCount_salário|NullCount_endereco|
+--------------+----------------+-----------------+-------------------+--------------------+------------+--------------+---------------+-----------------+------------------+
|          20.0|            20.0|             80.0|               40.0|               100.0|           1|             1|              4|                2|                 5|
+--------------+----------------+-----------------+-------------------+--------------------+------------+--------------+---------------+-----------------+------------------+

