In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already
# running under the same builder configuration.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [2]:
# Load the salary data. The first CSV line supplies the column names and
# Spark infers each column's type from the values instead of reading
# everything as strings.
df_pyspark = spark.read.csv(
    'Salary.csv',
    inferSchema=True,
    header=True,
)

In [3]:
# Print the raw data. Note the nulls in Name, Age and Experience in the last
# two rows; the cells below demonstrate different ways of handling them.
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Abhishek|  41|        17|  2000|
| Akansha|  36|        10|  2500|
|   Akash|  35|        10|  2600|
|  Prince|  30|         7|  1850|
|  Shweta|  31|         7|  2150|
|    Ivan|null|      null|  2200|
|    null|  30|         4|  1200|
+--------+----+----------+------+



In [5]:
# With no arguments, dropna() removes every row that contains at least one
# null (same result as how='any'). It returns a new DataFrame; df_pyspark
# itself is not modified.
df_pyspark.dropna().show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Abhishek| 41|        17|  2000|
| Akansha| 36|        10|  2500|
|   Akash| 35|        10|  2600|
|  Prince| 30|         7|  1850|
|  Shweta| 31|         7|  2150|
+--------+---+----------+------+



In [6]:
# how='all' removes a row only when every one of its columns is null.
# No row in this dataset is entirely null, so all 7 rows are kept.
df_pyspark.dropna(how='all').show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Abhishek|  41|        17|  2000|
| Akansha|  36|        10|  2500|
|   Akash|  35|        10|  2600|
|  Prince|  30|         7|  1850|
|  Shweta|  31|         7|  2150|
|    Ivan|null|      null|  2200|
|    null|  30|         4|  1200|
+--------+----+----------+------+



<font color = 'Blue'>Note: how='all' drops only those rows in which every value is null. No row in this dataset is entirely null, so nothing is removed. </font>

In [7]:
# how='any' removes a row as soon as a single column in it is null, so both
# rows containing nulls are dropped.
df_pyspark.dropna(how='any').show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Abhishek| 41|        17|  2000|
| Akansha| 36|        10|  2500|
|   Akash| 35|        10|  2600|
|  Prince| 30|         7|  1850|
|  Shweta| 31|         7|  2150|
+--------+---+----------+------+



<font color = 'Blue'> Note: how='any' will drop those rows where at least one value is null </font>

In [9]:
# thresh=1 keeps every row that has at least 1 non-null value.
# When thresh is given it takes precedence over how, so how='any' has no
# effect here: the rows containing nulls are still kept.
df_pyspark.dropna(how='any', thresh=1).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Abhishek|  41|        17|  2000|
| Akansha|  36|        10|  2500|
|   Akash|  35|        10|  2600|
|  Prince|  30|         7|  1850|
|  Shweta|  31|         7|  2150|
|    Ivan|null|      null|  2200|
|    null|  30|         4|  1200|
+--------+----+----------+------+



In [11]:
# thresh=3 keeps rows with at least 3 non-null values. The "Ivan" row has only
# 2 (Name and Salary) and is dropped; the row with a null Name has 3 and stays.
# As above, thresh takes precedence over how. na.drop() is the same operation
# as dropna().
df_pyspark.na.drop(how='any', thresh=3).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Abhishek| 41|        17|  2000|
| Akansha| 36|        10|  2500|
|   Akash| 35|        10|  2600|
|  Prince| 30|         7|  1850|
|  Shweta| 31|         7|  2150|
|    null| 30|         4|  1200|
+--------+---+----------+------+



<font color = 'Blue'> Note: thresh=3 drops rows with fewer than 3 non-null values. When thresh is specified it overrides how, so how='any' has no effect here. Also, dropna() works the same as na.drop() </font>

In [12]:
# subset limits the null check to the listed columns: only the row whose Name
# is null is removed, while nulls in Age/Experience are ignored.
df_pyspark.dropna(how='any', subset=['Name']).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Abhishek|  41|        17|  2000|
| Akansha|  36|        10|  2500|
|   Akash|  35|        10|  2600|
|  Prince|  30|         7|  1850|
|  Shweta|  31|         7|  2150|
|    Ivan|null|      null|  2200|
+--------+----+----------+------+



<font color = 'Blue'> Note: how='any', subset=['Name'] drops all rows where Name is null; nulls in other columns are ignored </font>

In [13]:
# Replace nulls in the Name column with a placeholder string. The second
# argument is the subset of columns to fill, so the nulls in Age and
# Experience are left as they are.
df_pyspark.fillna('No Name Available', ['Name']).show()

+-----------------+----+----------+------+
|             Name| Age|Experience|Salary|
+-----------------+----+----------+------+
|         Abhishek|  41|        17|  2000|
|          Akansha|  36|        10|  2500|
|            Akash|  35|        10|  2600|
|           Prince|  30|         7|  1850|
|           Shweta|  31|         7|  2150|
|             Ivan|null|      null|  2200|
|No Name Available|  30|         4|  1200|
+-----------------+----+----------+------+



In [14]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute. Keeping the list in one place guarantees the
# generated output names always line up one-to-one with the input columns.
impute_cols = ['Age', 'Experience', 'Salary']

# Fill nulls in each listed column using that column's median. Results are
# written to new "<column>_imputed" columns, so the original columns stay
# unchanged in the transformed DataFrame.
imputer = Imputer(
    strategy='median',
    inputCols=impute_cols,
    outputCols=['{}_imputed'.format(column) for column in impute_cols],
)

In [15]:
# fit() computes the fill value for each input column from df_pyspark;
# transform() then appends the *_imputed columns with nulls replaced by those
# values. The original Age/Experience/Salary columns are left as they were.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|Abhishek|  41|        17|  2000|         41|                17|          2000|
| Akansha|  36|        10|  2500|         36|                10|          2500|
|   Akash|  35|        10|  2600|         35|                10|          2600|
|  Prince|  30|         7|  1850|         30|                 7|          1850|
|  Shweta|  31|         7|  2150|         31|                 7|          2150|
|    Ivan|null|      null|  2200|         31|                 7|          2200|
|    null|  30|         4|  1200|         30|                 4|          1200|
+--------+----+----------+------+-----------+------------------+--------------+

