# Missing values
In this tutorial we will cover:  
- how to find missing values  
- techniques for handling missing values: dropping columns/rows, imputing with the mean/median/mode, etc.

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session
# (getOrCreate returns the already-running session if there is one).
session_builder = SparkSession.builder.appName('Missing values')
spark = session_builder.getOrCreate()
spark

In [28]:
# The sample CSV is ';'-separated and has a header row; column types are inferred.
csv_reader = (
    spark.read
    .option('header', True)
    .option('delimiter', ';')
    .option('inferSchema', True)
)
data = csv_reader.csv('data/tut03_test.csv')
data.show()

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
| Andrew|       null|    90|
|Markesh|Firefighter|  null|
+-------+-----------+------+



### Dropping

In [29]:
### Dropping columns
# Remove the whole 'Job' column; the remaining columns are left as they are.
without_job = data.drop('Job')
without_job.show()

+-------+------+
|   Name|Salary|
+-------+------+
|   Paul|    70|
| Andrew|    90|
|Markesh|  null|
+-------+------+



In [30]:
### Dropping rows with null values
# how='any' is the default: a row is removed as soon as one of its values is null.
complete_rows = data.na.drop(how='any')
complete_rows.show()

+----+---------+------+
|Name|      Job|Salary|
+----+---------+------+
|Paul|Scientist|    70|
+----+---------+------+



In [31]:
# Drop only the rows in which every column is null.
# No row of the sample data is entirely null, so nothing is removed here.
not_entirely_null = data.na.drop(how='all')
not_entirely_null.show()

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
| Andrew|       null|    90|
|Markesh|Firefighter|  null|
+-------+-----------+------+



In [33]:
# Keep only rows that have at least `thresh` non-null values;
# rows with fewer non-null values are dropped.
# `how` is not passed: when `thresh` is given it overrides `how`,
# so combining the two would only be misleading.

# thresh=3: all three columns must be filled -> only Paul's row remains.
data.na.drop(thresh=3).show()

# thresh=2: one null per row is tolerated -> every row of the sample remains.
data.na.drop(thresh=2).show()

+----+---------+------+
|Name|      Job|Salary|
+----+---------+------+
|Paul|Scientist|    70|
+----+---------+------+

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
| Andrew|       null|    90|
|Markesh|Firefighter|  null|
+-------+-----------+------+



In [34]:
# Drop rows based on nulls in SPECIFIC columns only:
# nulls in columns outside `subset` do not cause a row to be removed.
for checked_column in ('Salary', 'Job'):
    data.na.drop(how='any', subset=[checked_column]).show()

+------+---------+------+
|  Name|      Job|Salary|
+------+---------+------+
|  Paul|Scientist|    70|
|Andrew|     null|    90|
+------+---------+------+

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
|Markesh|Firefighter|  null|
+-------+-----------+------+



### Filling

In [38]:
# Fill nulls with a custom value. The replacement is a string here, and as the
# output shows only the null in the string column 'Job' is replaced, while the
# null in the numeric column 'Salary' is left untouched.
filled_with_string = data.na.fill('job')
filled_with_string.show()

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
| Andrew|        job|    90|
|Markesh|Firefighter|  null|
+-------+-----------+------+



In [39]:
# Fill nulls with a custom value, restricted to the columns listed in `subset`.
job_filled = data.na.fill('Job missed', subset=['Job'])
job_filled.show()

+-------+-----------+------+
|   Name|        Job|Salary|
+-------+-----------+------+
|   Paul|  Scientist|    70|
| Andrew| Job missed|    90|
|Markesh|Firefighter|  null|
+-------+-----------+------+



In [44]:
# Fill nulls in specific columns with the column mean using Imputer.
from pyspark.ml.feature import Imputer

# Values are read from 'Salary'; the imputed result is written to a new
# column, 'Salary_imputed', so the original column stays available.
imp = Imputer(
    strategy='mean',
    inputCols=['Salary'],
    outputCols=['Salary_imputed'],
)

In [45]:
# Fit the imputer on the data (this computes the mean of 'Salary'), then apply
# it to add the 'Salary_imputed' column.
# The result is bound to a NEW name instead of overwriting `data`: this keeps
# the cell safe to re-run (otherwise the second run would feed a frame that
# already contains 'Salary_imputed' back into the imputer) and leaves the
# raw frame unchanged for the cells above.
imputer_model = imp.fit(data)
data_imputed = imputer_model.transform(data)
data_imputed.show()

+-------+-----------+------+--------------+
|   Name|        Job|Salary|Salary_imputed|
+-------+-----------+------+--------------+
|   Paul|  Scientist|    70|            70|
| Andrew|       null|    90|            90|
|Markesh|Firefighter|  null|            80|
+-------+-----------+------+--------------+

