In [5]:
import os

import findspark

In [6]:
# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine's directory layout; fall back to the
# original local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/gympass/spark-3.1.2-bin-hadoop3.2'))

In [7]:
from pyspark.sql import SparkSession

In [8]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('aggs')
    .getOrCreate()
)

In [9]:
# Load the CSV: the first line holds the column names, and column types are
# inferred from the data instead of defaulting to strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('ContainsNull.csv')

In [10]:
# Print the loaded rows as a text table; missing entries are rendered as "null".
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [11]:
# Print each column's name, inferred data type, and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



Drop rows that have fewer than 2 non-null values (`thresh=2` keeps only rows with at least 2 non-null values).

In [13]:
# Keep only the rows that contain at least 2 non-null values.
rows_with_two_values = df.na.drop(thresh=2)
rows_with_two_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



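By default `how='any'`, which drops a row if it contains at least one null value. With `how='all'`, a row is dropped only if all of its values are null. Here every row has a non-null `Id`, so no rows are dropped.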

In [16]:
# Drop a row only when every one of its values is null.
rows_not_all_null = df.na.drop(how='all')
rows_not_all_null.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



`subset=['Sales']` restricts the null check to that column: only rows where `Sales` is null are dropped.

In [17]:
# Drop rows whose Sales value is null; nulls in other columns are ignored.
rows_with_sales = df.na.drop(subset=['Sales'])
rows_with_sales.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



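Fill null values in string columns with 'FILL VALUE'. A string fill value only applies to string columns, so the numeric `Sales` column keeps its nulls.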

In [19]:
# Replace nulls with a placeholder string; only string columns are affected.
strings_filled = df.na.fill(value='FILL VALUE')
strings_filled.show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



`subset=['Name']` specifies which column to target.

In [23]:
# Replace nulls in the Name column only, leaving the other columns untouched.
names_filled = df.na.fill(value='<No Name>', subset=['Name'])
names_filled.show()

+----+---------+-----+
|  Id|     Name|Sales|
+----+---------+-----+
|emp1|     John| null|
|emp2|<No Name>| null|
|emp3|<No Name>|345.0|
|emp4|    Cindy|456.0|
+----+---------+-----+



In [24]:
from pyspark.sql.functions import mean

In [25]:
# Aggregate the Sales column to its mean, then bring the result to the driver
# as a list of rows.
sales_mean_df = df.select(mean('Sales'))
mean_val = sales_mean_df.collect()

In [29]:
# Take the first field of the first collected row: the mean of Sales.
first_row = mean_val[0]
mean_sales = first_row[0]

In [31]:
# Impute missing Sales values with the column mean computed earlier.
sales_imputed = df.na.fill(value=mean_sales, subset=['Sales'])
sales_imputed.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

