## 04-Missing_Data

In [0]:
# 04-Missing_Data
from pyspark.sql import SparkSession
# Get the existing SparkSession for this app name, or create one if there is none.
spark = (
    SparkSession.builder
    .appName("PySparkExamples")
    .getOrCreate()
)

In [0]:
# Load the sample CSV: the first line supplies the column names and
# the column types are inferred from the data rather than read as all-string.
df = spark.read.csv(
    "dbfs:/FileStore/tables/ContainsNull.csv",
    header=True,
    inferSchema=True,
)

# Quick inspection: schema, dimensions, the first two Row objects, then the table.
df.printSchema()
print(
    "DataFrame columns are:", df.columns,
    "with column count:", len(df.columns),
    "and with row count:", df.count(),
)
print(df.head(2))
df.show()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)

DataFrame columns are: ['Id', 'Name', 'Sales'] with column count: 3 and with row count: 4
[Row(Id='emp1', Name='John', Sales=None), Row(Id='emp2', Name=None, Sales=None)]
+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# First the rows that have a Name, then the rows where Name is missing.
# `where` is an alias of `filter`; df["Name"] is the same column as df.Name.
df.where(df["Name"].isNotNull()).show()
df.where(df["Name"].isNull()).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp4|Cindy|456.0|
+----+-----+-----+

+----+----+-----+
|  Id|Name|Sales|
+----+----+-----+
|emp2|null| null|
|emp3|null|345.0|
+----+----+-----+



In [0]:
# `df.dropna(...)` is the same operation as `df.na.drop(...)`.

# Default behaviour: remove every row that has at least one null.
df.dropna().show()

# thresh=2: keep only rows that have at least 2 non-null values.
df.dropna(thresh=2).show()

# subset=["Sales"]: only the Sales column is checked; rows with a null Sales are removed.
df.dropna(subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# how="any": remove a row as soon as any one of its columns is null.
df.dropna(how="any").show()

# how="all": remove a row only when every column is null
# (no such row exists here, so all four rows are kept).
df.dropna(how="all").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# Replace nulls with a constant. The replacement is applied only to columns whose
# type matches the value: a string fills the string column Name and leaves the
# numeric Sales column untouched, while a number fills Sales and leaves Name as null.
df.fillna('NEW VALUE').show()
df.fillna(0).show()

+----+---------+-----+
|  Id|     Name|Sales|
+----+---------+-----+
|emp1|     John| null|
|emp2|NEW VALUE| null|
|emp3|NEW VALUE|345.0|
|emp4|    Cindy|456.0|
+----+---------+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# Restrict the fill to specific columns with `subset`; here only Name is considered.
df.fillna('No Name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [0]:
# Common practice is to fill missing values with the mean value for the column.
from pyspark.sql.functions import mean

# Build the aggregate once and reuse it for both the display and the collect,
# rather than rebinding one name from a DataFrame to a list.
mean_df = df.select(mean(df['Sales']))
mean_df.show()

# collect() returns a list of Row objects; this aggregate yields a single Row.
mean_val = mean_df.collect()
print("Mean value is:", mean_val, "and type is:", type(mean_val))
print(mean_val[0], mean_val[0][0])

# First (and only) field of the single Row is the average itself.
mean_sales = mean_val[0][0]

# The average is None when every Sales value is null, and na.fill() rejects None,
# so only fill when a mean actually exists.
if mean_sales is None:
    print("Sales has no non-null values; nothing to fill with.")
    df.show()
else:
    df.na.fill(mean_sales, ["Sales"]).show()

+----------+
|avg(Sales)|
+----------+
|     400.5|
+----------+

Mean value is: [Row(avg(Sales)=400.5)] and type is: <class 'list'>
Row(avg(Sales)=400.5) 400.5
+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

