## PySpark: Handling Missing Values

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name 'Dataframe'.
sp = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [10]:
# Load the CSV, using its first line as column names and letting Spark infer
# the column types instead of reading everything as strings.
df_spark = sp.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [11]:
# Preview the loaded data; missing entries are displayed as NULL.
df_spark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|   Aman|  22|        10|100000|
|Anshita|  25|         8| 90000|
|   John|  27|         4| 80000|
|   zara|  23|         4| 70000|
|    Sam|  21|         3| 50000|
|   NULL|  22|         6| 80000|
|   NULL|NULL|      NULL|  NULL|
|  sumit|  28|      NULL| 78497|
+-------+----+----------+------+



In [29]:
# NOTE(review): left disabled. If enabled, this would overwrite the Salary
# column with Experience * 50000 and rebind df_spark to the modified frame.
# df_spark = df_spark.withColumn('Salary',df_spark['Experience']*50000)

In [12]:
# Display df_spark again; the data is unchanged from the first preview because
# the withColumn statement in the previous cell is commented out.
df_spark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|   Aman|  22|        10|100000|
|Anshita|  25|         8| 90000|
|   John|  27|         4| 80000|
|   zara|  23|         4| 70000|
|    Sam|  21|         3| 50000|
|   NULL|  22|         6| 80000|
|   NULL|NULL|      NULL|  NULL|
|  sumit|  28|      NULL| 78497|
+-------+----+----------+------+



### Dropping columns and rows with missing values

In [13]:
# Show the data without the 'Name' column. drop() returns a new DataFrame,
# so df_spark itself still contains 'Name' in the cells that follow.
df_spark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  22|        10|100000|
|  25|         8| 90000|
|  27|         4| 80000|
|  23|         4| 70000|
|  21|         3| 50000|
|  22|         6| 80000|
|NULL|      NULL|  NULL|
|  28|      NULL| 78497|
+----+----------+------+



In [14]:
# With no arguments, na.drop() removes every row that contains at least one
# NULL, leaving only the fully populated rows.
df_spark.na.drop().show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|   Aman| 22|        10|100000|
|Anshita| 25|         8| 90000|
|   John| 27|         4| 80000|
|   zara| 23|         4| 70000|
|    Sam| 21|         3| 50000|
+-------+---+----------+------+



In [15]:
# how='any' drops a row as soon as any of its columns is NULL; the result
# matches the argument-free na.drop() call above.
(
    df_spark.na
    .drop(how='any')
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|   Aman| 22|        10|100000|
|Anshita| 25|         8| 90000|
|   John| 27|         4| 80000|
|   zara| 23|         4| 70000|
|    Sam| 21|         3| 50000|
+-------+---+----------+------+



In [17]:
## Threshold
# Keep only rows that have at least `thresh` non-null values. When `thresh`
# is given it takes precedence over `how`, so `how` is omitted here: passing
# how="any" alongside it would wrongly suggest that rows containing any NULL
# are removed. With thresh=1 only the all-NULL row is dropped.
df_spark.na.drop(thresh=1).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|   Aman| 22|        10|100000|
|Anshita| 25|         8| 90000|
|   John| 27|         4| 80000|
|   zara| 23|         4| 70000|
|    Sam| 21|         3| 50000|
|   NULL| 22|         6| 80000|
|  sumit| 28|      NULL| 78497|
+-------+---+----------+------+



In [18]:
## Subset
# Only NULLs in the listed columns are considered: rows with a NULL Experience
# are dropped, while a row whose only NULL is in Name is kept.
df_spark.na.drop(
    how="any",
    subset=['Experience'],
).show()


+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|   Aman| 22|        10|100000|
|Anshita| 25|         8| 90000|
|   John| 27|         4| 80000|
|   zara| 23|         4| 70000|
|    Sam| 21|         3| 50000|
|   NULL| 22|         6| 80000|
+-------+---+----------+------+



## Filling Missing Values

In [19]:
# Replace NULLs with a placeholder string. Because the fill value is a string,
# only the string column (Name) is filled; the numeric columns keep their NULLs.
(
    df_spark.na
    .fill('Missing Values')
    .show()
)

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|          Aman|  22|        10|100000|
|       Anshita|  25|         8| 90000|
|          John|  27|         4| 80000|
|          zara|  23|         4| 70000|
|           Sam|  21|         3| 50000|
|Missing Values|  22|         6| 80000|
|Missing Values|NULL|      NULL|  NULL|
|         sumit|  28|      NULL| 78497|
+--------------+----+----------+------+



In [29]:
df_spark.na.fill('Missing Values',['Experience','Age']).show()


+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|   Aman|  22|        10|100000|
|Anshita|  25|         8| 90000|
|   John|  27|         4| 80000|
|   zara|  23|         4| 70000|
|    Sam|  21|         3| 50000|
|   NULL|  22|         6| 80000|
|   NULL|NULL|      NULL|  NULL|
|  sumit|  28|      NULL| 78497|
+-------+----+----------+------+



In [31]:
from pyspark.ml.feature import Imputer

# Configure mean imputation for the three numeric columns. Each imputed result
# is written to a new "<name>_inputer" column, leaving the originals in place.
# NOTE(review): the "_inputer" suffix and the lowercase 'age' look like typos
# for "_imputed" / 'Age'. They are kept as-is because they determine the
# output column names shown below — confirm before renaming.
imputer = Imputer(
    inputCols=['Age','Experience','Salary'],
    outputCols=[
        "{}_inputer".format(col_name)
        for col_name in ['age','Experience','Salary']
    ],
)
imputer = imputer.setStrategy("mean")

In [32]:
# Fit the imputer on df_spark (computing the per-column means) and apply it to
# the same frame, then display the original and imputed columns side by side.
# The imputed values show as whole numbers (e.g. Experience -> 5), presumably
# because the mean is cast back to the integer type of the input columns.
(
    imputer
    .fit(df_spark)
    .transform(df_spark)
    .show()
)

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|age_inputer|Experience_inputer|Salary_inputer|
+-------+----+----------+------+-----------+------------------+--------------+
|   Aman|  22|        10|100000|         22|                10|        100000|
|Anshita|  25|         8| 90000|         25|                 8|         90000|
|   John|  27|         4| 80000|         27|                 4|         80000|
|   zara|  23|         4| 70000|         23|                 4|         70000|
|    Sam|  21|         3| 50000|         21|                 3|         50000|
|   NULL|  22|         6| 80000|         22|                 6|         80000|
|   NULL|NULL|      NULL|  NULL|         24|                 5|         78356|
|  sumit|  28|      NULL| 78497|         28|                 5|         78497|
+-------+----+----------+------+-----------+------------------+--------------+

