## Pyspark Handling Missing Values
- Dropping Columns
- Dropping Rows
- Parameters of the drop functionality (`how`, `thresh`, `subset`)
- Handling Missing Values by Mean, Median and Mode

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [3]:
spark
# Displays the session summary. When running locally there is only one master node.

In [4]:
## Read the dataset - Method - 1
# The path is a raw string: it contains Windows backslashes (\P, \D, \S) that a
# normal string literal would treat as (invalid) escape sequences.
# NOTE(review): absolute local path - adjust it to wherever the CSV lives.
df_pyspark = spark.read.option('header', 'true').csv(
    r'E:\Programming Career\Pyspark\Pyspark-Introduction\Dataset\Sample_data - Part - 2.csv',
    inferSchema=True,
)

In [5]:
# Print the column names and the types inferred while reading the CSV.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Experience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# Preview the loaded rows, including the null entries handled below.
df_pyspark.show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
|  Naruto| 26|     F|       null|      null|  null|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
|    null| 21|  null|       null|      null|  null|
+--------+---+------+-----------+----------+------+



In [7]:
## Drop column: show the frame without 'Name' (df_pyspark itself is not reassigned)
df_pyspark.drop('Name').show()

+---+------+-----------+----------+------+
|Age|Gender|      State|Experience|Salary|
+---+------+-----------+----------+------+
| 27|     M|  Karnataka|       2.5|120000|
| 27|  null|  Karnataka|       3.5| 70000|
| 31|     F|       null|       5.0| 30000|
| 27|     F|  Karnataka|       2.5| 22000|
| 28|     M|Maharashtra|       2.2| 25000|
| 26|     F|       null|      null|  null|
| 25|     F|Maharashtra|       2.0| 25000|
| 28|     M|West Bengal|       4.0| 75000|
| 27|  null|       null|       0.0|  null|
| 25|     F| Tamil Nadu|       3.0| 30000|
| 21|  null|       null|      null|  null|
+---+------+-----------+----------+------+



In [8]:
# With no arguments, drop every row that contains at least one null/NaN value.
df_pyspark.na.drop().show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
+--------+---+------+-----------+----------+------+



In [9]:
## na.drop parameters and their default values: how='any', thresh=None, subset=None

## how='all': drop a row only if every column in it is null
df_pyspark.na.drop(how='all').show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
|  Naruto| 26|     F|       null|      null|  null|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
|    null| 21|  null|       null|      null|  null|
+--------+---+------+-----------+----------+------+



In [10]:
## how='any': drop a row if at least one column in it is null (the other option is 'all')
df_pyspark.na.drop(how='any').show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
+--------+---+------+-----------+----------+------+



In [11]:
## thresh = 4: keep only the rows that have at least 4 non-null values.
## `how` is left out on purpose - when `thresh` is given it overrides `how`.
df_pyspark.na.drop(thresh=4).show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
+--------+---+------+-----------+----------+------+



In [12]:
## thresh = 5: keep only the rows that have at least 5 non-null values.
## `how` is left out on purpose - when `thresh` is given it overrides `how`.
df_pyspark.na.drop(thresh=5).show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
+--------+---+------+-----------+----------+------+



In [13]:
## thresh = 6: keep only the rows that have at least 6 non-null values
## (all six columns populated).
## `how` is left out on purpose - when `thresh` is given it overrides `how`.
df_pyspark.na.drop(thresh=6).show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
+--------+---+------+-----------+----------+------+



In [14]:
## subset: only the named column is checked for nulls - here 'Age'
df_pyspark.na.drop(how = 'any' , subset='Age').show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
|  Naruto| 26|     F|       null|      null|  null|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
|    null| 21|  null|       null|      null|  null|
+--------+---+------+-----------+----------+------+



In [15]:
## subset: only the named column is checked for nulls - here 'Experience'
df_pyspark.na.drop(how = 'any' , subset='Experience').show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
+--------+---+------+-----------+----------+------+



In [16]:
## subset with several columns: drop a row if 'Experience' or 'Age' is null
df_pyspark.na.drop(how = 'any' , subset=['Experience','Age']).show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
+--------+---+------+-----------+----------+------+



In [17]:
### Filling Missing Values - all string columns
# A string replacement is applied only to string-typed columns;
# nulls in the numeric columns (Experience, Salary) are left as they are.
df_pyspark.na.fill('<<Missing Values>>').show()

+------------------+---+------------------+------------------+----------+------+
|              Name|Age|            Gender|             State|Experience|Salary|
+------------------+---+------------------+------------------+----------+------+
|          Bumbhole| 27|                 M|         Karnataka|       2.5|120000|
|            Vishnu| 27|<<Missing Values>>|         Karnataka|       3.5| 70000|
|            Amanta| 31|                 F|<<Missing Values>>|       5.0| 30000|
|           Samanta| 27|                 F|         Karnataka|       2.5| 22000|
|             Pallu| 28|                 M|       Maharashtra|       2.2| 25000|
|            Naruto| 26|                 F|<<Missing Values>>|      null|  null|
|           Samurai| 25|                 F|       Maharashtra|       2.0| 25000|
|           Shimanu| 28|                 M|       West Bengal|       4.0| 75000|
|             Mannu| 27|<<Missing Values>>|<<Missing Values>>|       0.0|  null|
|<<Missing Values>>| 25|    

In [18]:
### Filling Missing Values - specific columns
# NOTE(review): 'Age' is an integer column, so the string replacement is not
# applied to it; only 'Gender' is actually filled by this call.
df_pyspark.na.fill('<<Missing Values>>', ['Gender', 'Age']).show()

+--------+---+------------------+-----------+----------+------+
|    Name|Age|            Gender|      State|Experience|Salary|
+--------+---+------------------+-----------+----------+------+
|Bumbhole| 27|                 M|  Karnataka|       2.5|120000|
|  Vishnu| 27|<<Missing Values>>|  Karnataka|       3.5| 70000|
|  Amanta| 31|                 F|       null|       5.0| 30000|
| Samanta| 27|                 F|  Karnataka|       2.5| 22000|
|   Pallu| 28|                 M|Maharashtra|       2.2| 25000|
|  Naruto| 26|                 F|       null|      null|  null|
| Samurai| 25|                 F|Maharashtra|       2.0| 25000|
| Shimanu| 28|                 M|West Bengal|       4.0| 75000|
|   Mannu| 27|<<Missing Values>>|       null|       0.0|  null|
|    null| 25|                 F| Tamil Nadu|       3.0| 30000|
|    null| 21|<<Missing Values>>|       null|      null|  null|
+--------+---+------------------+-----------+----------+------+



In [19]:
# df_pyspark is unchanged: the drop/fill calls above returned new frames that were only displayed.
df_pyspark.show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
|  Naruto| 26|     F|       null|      null|  null|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
|    null| 21|  null|       null|      null|  null|
+--------+---+------+-----------+----------+------+



In [20]:
# Print the schema again; Age, Experience and Salary are the numeric columns imputed next.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Experience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [21]:
### Imputer function - mean
from pyspark.ml.feature import Imputer

# Each numeric input column gets a companion "<column>_imputed" output column.
imputer = Imputer(
    strategy="mean",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[f'{name}_imputed' for name in ('Age', 'Experience', 'Salary')],
)

In [22]:
## Fit the imputer and display the frame with the extra *_imputed columns
## (the result is only shown; df_pyspark is not reassigned)
imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|    Name|Age|Gender|      State|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|         27|               2.5|        120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|         27|               3.5|         70000|
|  Amanta| 31|     F|       null|       5.0| 30000|         31|               5.0|         30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|         27|               2.5|         22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|         28|               2.2|         25000|
|  Naruto| 26|     F|       null|      null|  null|         26|2.7444444444444445|         49625|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|         25|               2.0|         25000|
| Shimanu| 28|     M

In [23]:
### Imputer function - median
from pyspark.ml.feature import Imputer

# Each numeric input column gets a companion "<column>_imputed" output column.
imputer = Imputer(
    strategy="median",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[f'{name}_imputed' for name in ('Age', 'Experience', 'Salary')],
)

In [24]:
## Fit the imputer and display the frame with the extra *_imputed columns
## (the result is only shown; df_pyspark is not reassigned)
imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|    Name|Age|Gender|      State|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|         27|               2.5|        120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|         27|               3.5|         70000|
|  Amanta| 31|     F|       null|       5.0| 30000|         31|               5.0|         30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|         27|               2.5|         22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|         28|               2.2|         25000|
|  Naruto| 26|     F|       null|      null|  null|         26|               2.5|         30000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|         25|               2.0|         25000|
| Shimanu| 28|     M

In [25]:
### Imputer function - mode
from pyspark.ml.feature import Imputer

# Each numeric input column gets a companion "<column>_imputed" output column.
imputer = Imputer(
    strategy="mode",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[f'{name}_imputed' for name in ('Age', 'Experience', 'Salary')],
)

In [26]:
## Fit the imputer and display the frame with the extra *_imputed columns
## (the result is only shown; df_pyspark is not reassigned)
imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|    Name|Age|Gender|      State|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|         27|               2.5|        120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|         27|               3.5|         70000|
|  Amanta| 31|     F|       null|       5.0| 30000|         31|               5.0|         30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|         27|               2.5|         22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|         28|               2.2|         25000|
|  Naruto| 26|     F|       null|      null|  null|         26|               2.5|         25000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|         25|               2.0|         25000|
| Shimanu| 28|     M

## Pyspark DataFrames
- Filter Operations
- &,|,==
- ~

In [27]:
# Starting point for the filter examples: the original frame, nulls included.
df_pyspark.show()

+--------+---+------+-----------+----------+------+
|    Name|Age|Gender|      State|Experience|Salary|
+--------+---+------+-----------+----------+------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|
|  Amanta| 31|     F|       null|       5.0| 30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|
|  Naruto| 26|     F|       null|      null|  null|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|
| Shimanu| 28|     M|West Bengal|       4.0| 75000|
|   Mannu| 27|  null|       null|       0.0|  null|
|    null| 25|     F| Tamil Nadu|       3.0| 30000|
|    null| 21|  null|       null|      null|  null|
+--------+---+------+-----------+----------+------+



In [28]:
### Imputer function - median (rebuilt here because the previous imputer used "mode")
from pyspark.ml.feature import Imputer

# Each numeric input column gets a companion "<column>_imputed" output column.
imputer = Imputer(
    strategy="median",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=[f'{name}_imputed' for name in ('Age', 'Experience', 'Salary')],
)

In [29]:
## Fit the median imputer and keep the result: original columns plus the *_imputed columns
df_pyspark_no_null = imputer.fit(df_pyspark).transform(df_pyspark)

In [30]:
# Inspect the frame that now carries the imputed columns.
df_pyspark_no_null.show()

+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|    Name|Age|Gender|      State|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+---+------+-----------+----------+------+-----------+------------------+--------------+
|Bumbhole| 27|     M|  Karnataka|       2.5|120000|         27|               2.5|        120000|
|  Vishnu| 27|  null|  Karnataka|       3.5| 70000|         27|               3.5|         70000|
|  Amanta| 31|     F|       null|       5.0| 30000|         31|               5.0|         30000|
| Samanta| 27|     F|  Karnataka|       2.5| 22000|         27|               2.5|         22000|
|   Pallu| 28|     M|Maharashtra|       2.2| 25000|         28|               2.2|         25000|
|  Naruto| 26|     F|       null|      null|  null|         26|               2.5|         30000|
| Samurai| 25|     F|Maharashtra|       2.0| 25000|         25|               2.0|         25000|
| Shimanu| 28|     M

In [31]:
# Drop the original numeric columns so only their *_imputed versions remain.
df_pyspark_no_null = df_pyspark_no_null.drop('Experience','Age','Salary')
df_pyspark_no_null.show()

+--------+------+-----------+-----------+------------------+--------------+
|    Name|Gender|      State|Age_imputed|Experience_imputed|Salary_imputed|
+--------+------+-----------+-----------+------------------+--------------+
|Bumbhole|     M|  Karnataka|         27|               2.5|        120000|
|  Vishnu|  null|  Karnataka|         27|               3.5|         70000|
|  Amanta|     F|       null|         31|               5.0|         30000|
| Samanta|     F|  Karnataka|         27|               2.5|         22000|
|   Pallu|     M|Maharashtra|         28|               2.2|         25000|
|  Naruto|     F|       null|         26|               2.5|         30000|
| Samurai|     F|Maharashtra|         25|               2.0|         25000|
| Shimanu|     M|West Bengal|         28|               4.0|         75000|
|   Mannu|  null|       null|         27|               0.0|         30000|
|    null|     F| Tamil Nadu|         25|               3.0|         30000|
|    null|  

In [32]:
### Filling Missing Values - specific columns
# One fill call with a {column: replacement} mapping gives each string column
# its own placeholder, instead of three chained reassignments.
df_pyspark_no_null = df_pyspark_no_null.na.fill({
    'Name': 'Missing Name',
    'Gender': 'Missing Gender',
    'State': 'Missing State',
})
df_pyspark_no_null.show()

+------------+--------------+-------------+-----------+------------------+--------------+
|        Name|        Gender|        State|Age_imputed|Experience_imputed|Salary_imputed|
+------------+--------------+-------------+-----------+------------------+--------------+
|    Bumbhole|             M|    Karnataka|         27|               2.5|        120000|
|      Vishnu|Missing Gender|    Karnataka|         27|               3.5|         70000|
|      Amanta|             F|Missing State|         31|               5.0|         30000|
|     Samanta|             F|    Karnataka|         27|               2.5|         22000|
|       Pallu|             M|  Maharashtra|         28|               2.2|         25000|
|      Naruto|             F|Missing State|         26|               2.5|         30000|
|     Samurai|             F|  Maharashtra|         25|               2.0|         25000|
|     Shimanu|             M|  West Bengal|         28|               4.0|         75000|
|       Ma

### Filter Operation

In [33]:
### Salary is less than or equal to 30000 (condition written as a SQL expression string)
df_pyspark_no_null.filter('Salary_imputed<=30000').show()

+------------+--------------+-------------+-----------+------------------+--------------+
|        Name|        Gender|        State|Age_imputed|Experience_imputed|Salary_imputed|
+------------+--------------+-------------+-----------+------------------+--------------+
|      Amanta|             F|Missing State|         31|               5.0|         30000|
|     Samanta|             F|    Karnataka|         27|               2.5|         22000|
|       Pallu|             M|  Maharashtra|         28|               2.2|         25000|
|      Naruto|             F|Missing State|         26|               2.5|         30000|
|     Samurai|             F|  Maharashtra|         25|               2.0|         25000|
|       Mannu|Missing Gender|Missing State|         27|               0.0|         30000|
|Missing Name|             F|   Tamil Nadu|         25|               3.0|         30000|
|Missing Name|Missing Gender|Missing State|         21|               2.5|         30000|
+---------

In [34]:
### Salary is less than or equal to 27000, showing only specific columns
df_pyspark_no_null.filter('Salary_imputed<=27000').select(['Name','Age_imputed','Salary_imputed']).show()

+-------+-----------+--------------+
|   Name|Age_imputed|Salary_imputed|
+-------+-----------+--------------+
|Samanta|         27|         22000|
|  Pallu|         28|         25000|
|Samurai|         25|         25000|
+-------+-----------+--------------+



In [35]:
### Salary is less than or equal to 27000, showing only specific columns - Method - 2
### (condition written as a Column expression instead of a SQL string)
df_pyspark_no_null.filter(df_pyspark_no_null['Salary_imputed']<=27000).select(['Name','Age_imputed','Salary_imputed']).show()

+-------+-----------+--------------+
|   Name|Age_imputed|Salary_imputed|
+-------+-----------+--------------+
|Samanta|         27|         22000|
|  Pallu|         28|         25000|
|Samurai|         25|         25000|
+-------+-----------+--------------+



In [36]:
### Multiple conditions (&): salary is less than or equal to 27000 AND age is more than 26, showing specific columns
### Each comparison is wrapped in parentheses because & binds tighter than <= and >.
df_pyspark_no_null.filter((df_pyspark_no_null['Salary_imputed']<=27000) & 
                          (df_pyspark_no_null['Age_imputed'] > 26)).select(['Name','Age_imputed','Salary_imputed']).show()

+-------+-----------+--------------+
|   Name|Age_imputed|Salary_imputed|
+-------+-----------+--------------+
|Samanta|         27|         22000|
|  Pallu|         28|         25000|
+-------+-----------+--------------+



In [37]:
### Multiple conditions (|): salary is less than or equal to 27000 OR salary is at least 80000, showing specific columns
df_pyspark_no_null.filter((df_pyspark_no_null['Salary_imputed']<=27000) | 
                          (df_pyspark_no_null['Salary_imputed'] >= 80000)).select(['Name','Age_imputed','Salary_imputed']).show()

+--------+-----------+--------------+
|    Name|Age_imputed|Salary_imputed|
+--------+-----------+--------------+
|Bumbhole|         27|        120000|
| Samanta|         27|         22000|
|   Pallu|         28|         25000|
| Samurai|         25|         25000|
+--------+-----------+--------------+



In [38]:
### Negation (~): keep rows where "salary <= 27000" is false, showing specific columns
df_pyspark_no_null.filter(~(df_pyspark_no_null['Salary_imputed']<=27000)).select(['Name','Age_imputed','Salary_imputed']).show()

+------------+-----------+--------------+
|        Name|Age_imputed|Salary_imputed|
+------------+-----------+--------------+
|    Bumbhole|         27|        120000|
|      Vishnu|         27|         70000|
|      Amanta|         31|         30000|
|      Naruto|         26|         30000|
|     Shimanu|         28|         75000|
|       Mannu|         27|         30000|
|Missing Name|         25|         30000|
|Missing Name|         21|         30000|
+------------+-----------+--------------+

