In [2]:
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a local Spark session for this notebook.
# "local[1]" is the master URL passed to the builder; "PySparkLearning" is the app name.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("PySparkLearning")
    .getOrCreate()
)


In [3]:
# Sample rows with deliberately missing values (None) in `state` and `gender`,
# so that NULL-based filters have something to match.
columns = ["name", "state", "gender"]
data = [
    ("James", None, "M"),
    ("Anna", "NY", "F"),
    ("Julia", None, None),
]

# Column names are supplied as the schema; column types are left to Spark.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
| Anna|   NY|     F|
|Julia| null|  null|
+-----+-----+------+



### Filter Rows with NULL Values in DataFrame

In PySpark, you can use the DataFrame's `filter()` or `where()` function to select rows with NULL values, checking either with the SQL expression `IS NULL` or with the Column method `isNull()`.

In [5]:
from pyspark.sql.functions import col

# Three equivalent ways to express the same predicate. Each filter() call
# KEEPS only the rows whose `state` column is NULL and returns a new
# DataFrame (`df` itself is not modified), so all three print the same output.
state_is_null_forms = (
    "state is NULL",         # SQL expression string
    df.state.isNull(),       # Column accessed as a DataFrame attribute
    col("state").isNull(),   # Column built with col()
)
for predicate in state_is_null_forms:
    df.filter(predicate).show()

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+



### Filter Rows with NULL on Multiple Columns

Let’s see how to filter rows with NULL values on multiple columns in a DataFrame. To do so, combine the conditions with `AND` inside a SQL expression string, or with the `&` operator on Column expressions (PySpark uses `&`, not `&&`).

In [6]:
# Keep only the rows where both `state` and `gender` are NULL.
# Form 1: a single SQL expression string joined with AND.
df.filter(
    "state IS NULL AND gender IS NULL"
).show()

# Form 2: two Column conditions combined with the `&` operator.
df.filter(
    df.state.isNull() & df.gender.isNull()
).show()

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+



### Filter Rows with IS NOT NULL or isNotNull

`IS NOT NULL` (in a SQL expression) or `isNotNull()` (as a Column method) is used to keep only the rows whose value in a given column is not NULL.

In [7]:
# Four equivalent predicates that keep only the rows whose `state` is not NULL;
# each one is applied in turn and its result printed.
state_not_null_forms = (
    "state IS NOT NULL",         # SQL expression string
    "NOT state IS NULL",         # negated IS NULL, still a SQL string
    df.state.isNotNull(),        # Column accessed as a DataFrame attribute
    col("state").isNotNull(),    # Column built with col()
)
for condition in state_not_null_forms:
    df.filter(condition).show()

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+



Alternatively, you can get the same result using `df.na.drop()` with the `subset` argument, which drops the rows that have NULL in any of the listed columns.

In [8]:
# Drop the rows whose `state` is NULL; `subset` limits the NULL check to that column.
(
    df.na
    .drop(subset=["state"])
    .show()
)

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+



### PySpark SQL Filter Rows with NULL Values

If you are familiar with SQL, you can register the DataFrame as a temporary view and then use `IS NULL` and `IS NOT NULL` in a query to filter its rows.



In [9]:
# Register the DataFrame as a temporary view named DATA so it can be queried with SQL.
df.createOrReplaceTempView("DATA")

# Run each NULL / NOT NULL query against the view and print its result.
null_check_queries = (
    "SELECT * FROM DATA where STATE IS NULL",
    "SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL",
    "SELECT * FROM DATA where STATE IS NOT NULL",
)
for query in null_check_queries:
    spark.sql(query).show()

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

