# Dealing with missing data 

In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("missing_data")
    .getOrCreate()
)

In [3]:
# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.csv(
    "ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Display the loaded rows to see where the null values are.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [5]:
# Print the column types that were inferred when the CSV was read.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [6]:
# With no arguments, na.drop() removes every row that contains at least one
# null value (the default is how='any'), so only fully populated rows remain.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# Use `thresh` to control which rows are dropped.

df.na.drop(thresh=2).show()
# thresh=2 keeps only the rows that have at least two non-null values;
# any row with fewer than two non-null values is dropped.

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [28]:
# Use `how` to choose the dropping rule.

"""
how = 'any'
    drop a row if it contains any null value

how = 'all'
    drop a row only if all of its values are null
"""

# Every row in this data has a non-null Id, so how='all' drops nothing here.
df.na.drop(how="all").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [31]:
"""
To look for nulls only in certain columns, pass those columns as `subset`.
With how='all', a row is dropped only when every listed column is null.
"""

df.na.drop(how="all", subset=["Sales", "Name"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [32]:
"""
Fill null values instead of dropping the rows that contain them.
"""

# Replace nulls in the Name column with a placeholder string.
df.na.fill("No Name", subset=["Name"]).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [33]:
# fill the na with the mean value 

from pyspark.sql.functions import mean 

In [34]:
# Aggregate the mean of the Sales column; collect() returns the result to the
# driver as a list of Row objects.
mean_val = df.select(mean(df['Sales'])).collect()

In [35]:
# Inspect the collected result: a one-element list holding a single Row.
mean_val

[Row(avg(Sales)=400.5)]

In [36]:
# First Row, first field: the mean itself as a plain number.
mean_val[0][0]

400.5

In [37]:
# Keep the scalar mean so it can be used as the fill value.
mean_sales = mean_val[0][0]

In [38]:
# Replace null Sales values with the column mean held in mean_sales.
df.na.fill(mean_sales, subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [40]:
"""
The same mean imputation written as a single expression,
without any intermediate variables.
"""

df.na.fill(
    df.select(mean(df["Sales"])).collect()[0][0],
    subset=["Sales"],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

