# PySpark: handling missing values
- Dropping columns
- Dropping rows
- Various parameters of the dropping functions (`how`, `thresh`, `subset`)
- Handling missing values by mean, median and mode

### Notes: 
To see the description (docstring) of a function, place the cursor on the function name and press Shift+Tab.

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session used by this notebook (or reuse an existing one).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [6]:
# Load the semicolon-separated sample file: the first line holds the column
# names, and the column types are inferred from the data.
df_pyspark = spark.read.csv(
    'test3.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-----+----+----------+------+
| Name| Age|Experience|Salary|
+-----+----+----------+------+
|  Max|  31|        10| 30000|
|Abdel|  30|         8| 25000|
| Manu|  29|         4| 20000|
| Paul|  24|         3| 20000|
|Kevin|  21|         1| 15000|
|Jonas|  23|         2| 18000|
| Ertu|null|      null| 40000|
| null|  34|        10| 38000|
| null|  36|      null|  null|
+-----+----+----------+------+



In [7]:
# Drop a column: drop() returns a new DataFrame without 'Name';
# df_pyspark itself is left unchanged.
df_without_name = df_pyspark.drop('Name')
df_without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [8]:
# Without arguments, na.drop() removes every row that contains at least one null.
rows_without_nulls = df_pyspark.na.drop()
rows_without_nulls.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|  Max| 31|        10| 30000|
|Abdel| 30|         8| 25000|
| Manu| 29|         4| 20000|
| Paul| 24|         3| 20000|
|Kevin| 21|         1| 15000|
|Jonas| 23|         2| 18000|
+-----+---+----------+------+



In [17]:
# how='any': drop a row as soon as any of its columns is null
# (this gives the same result as calling na.drop() with no arguments).
(
    df_pyspark.na
    .drop(how='any')
    .show()
)

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|  Max| 31|        10| 30000|
|Abdel| 30|         8| 25000|
| Manu| 29|         4| 20000|
| Paul| 24|         3| 20000|
|Kevin| 21|         1| 15000|
|Jonas| 23|         2| 18000|
+-----+---+----------+------+



In [16]:
# thresh=2: drop rows that have fewer than 2 non-null values.
# `how` is deliberately omitted: when `thresh` is given it overrides `how`,
# so passing both would only be misleading.
df_pyspark.na.drop(thresh=2).show()

+-----+----+----------+------+
| Name| Age|Experience|Salary|
+-----+----+----------+------+
|  Max|  31|        10| 30000|
|Abdel|  30|         8| 25000|
| Manu|  29|         4| 20000|
| Paul|  24|         3| 20000|
|Kevin|  21|         1| 15000|
|Jonas|  23|         2| 18000|
| Ertu|null|      null| 40000|
| null|  34|        10| 38000|
+-----+----+----------+------+



In [19]:
# subset: only the listed columns are checked for nulls, so rows with a null
# 'Experience' are dropped while nulls in the other columns are kept.
experience_known = df_pyspark.na.drop(how='any', subset=['Experience'])
experience_known.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|  Max| 31|        10| 30000|
|Abdel| 30|         8| 25000|
| Manu| 29|         4| 20000|
| Paul| 24|         3| 20000|
|Kevin| 21|         1| 15000|
|Jonas| 23|         2| 18000|
| null| 34|        10| 38000|
+-----+---+----------+------+



In [24]:
# Fill missing values: a string replacement is applied to string columns only,
# so the numeric columns keep their nulls.
df_pyspark.na.fill(value='Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|           Max|  31|        10| 30000|
|         Abdel|  30|         8| 25000|
|          Manu|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|         Kevin|  21|         1| 15000|
|         Jonas|  23|         2| 18000|
|          Ertu|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [27]:
# Restrict the fill to a subset of columns. The replacement is a string, so only
# 'Name' is filled; the numeric 'Age' column still shows its null afterwards.
df_pyspark.na.fill(value='Missing Values', subset=['Name', 'Age']).show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|           Max|  31|        10| 30000|
|         Abdel|  30|         8| 25000|
|          Manu|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|         Kevin|  21|         1| 15000|
|         Jonas|  23|         2| 18000|
|          Ertu|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [33]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute; each one gets a matching "<name>_imputed" output column.
impute_cols = ['Age', 'Experience', 'Salary']
imputed_cols = ["{}_imputed".format(col_name) for col_name in impute_cols]

# Nulls are replaced by the column mean; change strategy to "median" or "mode"
# to use those statistics instead.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=imputed_cols,
    strategy="mean",
)

In [32]:
# fit() learns the fill value for each input column; transform() then appends
# the *_imputed columns next to the original ones.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+-----+----+----------+------+-----------+------------------+--------------+
| Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-----+----+----------+------+-----------+------------------+--------------+
|  Max|  31|        10| 30000|         31|                10|         30000|
|Abdel|  30|         8| 25000|         30|                 8|         25000|
| Manu|  29|         4| 20000|         29|                 4|         20000|
| Paul|  24|         3| 20000|         24|                 3|         20000|
|Kevin|  21|         1| 15000|         21|                 1|         15000|
|Jonas|  23|         2| 18000|         23|                 2|         18000|
| Ertu|null|      null| 40000|         28|                 5|         40000|
| null|  34|        10| 38000|         34|                10|         38000|
| null|  36|      null|  null|         36|                 5|         25750|
+-----+----+----------+------+-----------+------------------+--------------+