## Importing Libraries

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F



## Read data

In [0]:
# Load the sample CSV into a Spark DataFrame: the first line supplies the
# column names and the column types are inferred from the data.
null_csv_path = "/FileStore/tables/ContainsNull.csv"
df_null = spark.read.csv(null_csv_path, header=True, inferSchema="True")

display(df_null)

Id,Name,Sales
emp1,John,
emp2,,
emp3,,345.0
emp4,Cindy,456.0


## Drop null rows

In [0]:
# thresh=2 keeps only the rows that have at least 2 non-null values; with the
# three columns here, that drops every row holding 2 or more nulls (emp2).
at_least_two_values = df_null.na.drop(thresh=2)
display(at_least_two_values)

Id,Name,Sales
emp1,John,
emp3,,345.0
emp4,Cindy,456.0


In [0]:
# how='any' is the default of na.drop(): a row is removed as soon as any one
# of its columns is null, so only fully populated rows remain.
fully_populated_rows = df_null.na.drop(how='any')
display(fully_populated_rows)

Id,Name,Sales
emp4,Cindy,456.0


In [0]:
# how='all' removes a row only when every column in it is null, so any row
# that still holds at least one value is kept.
not_entirely_null = df_null.na.drop(how='all')
display(not_entirely_null)

Id,Name,Sales
emp1,John,
emp2,,
emp3,,345.0
emp4,Cindy,456.0


In [0]:
# subset restricts the null check to the listed columns: drop only the rows
# whose 'Sales' value is null, whatever the other columns contain.
rows_with_sales = df_null.na.drop(subset=['Sales'])
display(rows_with_sales)

Id,Name,Sales
emp3,,345.0
emp4,Cindy,456.0


## Fill null values

In [0]:
# Print each column's name and inferred type. fillna/na.fill only touches
# columns whose type matches the fill value, so check the types first.
df_null.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [0]:
# A string fill value is applied to string columns only: the nulls in Name
# are replaced, while the numeric Sales column keeps its nulls.
filled_text_columns = df_null.fillna('FILL VALUE')
display(filled_text_columns)

Id,Name,Sales
emp1,John,
emp2,FILL VALUE,
emp3,FILL VALUE,345.0
emp4,Cindy,456.0


In [0]:
# A numeric fill value is applied to numeric columns only: the nulls in Sales
# become 0.0, while the string Name column keeps its nulls.
filled_numeric_columns = df_null.fillna(0)
display(filled_numeric_columns)

Id,Name,Sales
emp1,John,0.0
emp2,,0.0
emp3,,345.0
emp4,Cindy,456.0


In [0]:
# Average of the non-null Sales values, collected to the driver as a list
# holding a single Row.
sales_avg_df = df_null.select(F.avg('Sales'))
mean_sales = sales_avg_df.collect()

mean_sales

Out[17]: [Row(avg(Sales)=400.5)]

In [0]:
# Pull the plain Python number out of the collected result:
# first (and only) Row, then its first field.
first_row = mean_sales[0]
mean_sales_value = first_row[0]
type(mean_sales_value)

Out[26]: float

In [0]:
# Impute missing Sales with the column mean; subset limits the fill to the
# Sales column so nulls in other columns are left untouched.
imputed_sales = df_null.na.fill(mean_sales_value, subset=['Sales'])
display(imputed_sales)

Id,Name,Sales
emp1,John,400.5
emp2,,400.5
emp3,,345.0
emp4,Cindy,456.0
