# PySpark `filter()`
by Aishwarya Raut

# 1. PySpark DataFrame filter() Syntax

`filter(condition)`

In [3]:
# Setup: start a Spark session and build the sample DataFrame that the
# filter examples in this notebook operate on.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

spark = SparkSession.builder.appName('SP').getOrCreate()

# Each record is (name, languages, state, gender); name is itself a
# (firstname, middlename, lastname) tuple.
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Nested struct describing the three nullable string parts of a name.
name_type = StructType([
    StructField(part, StringType(), True)
    for part in ('firstname', 'middlename', 'lastname')
])

schema = StructType([
    StructField('name', name_type),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



# 2. DataFrame filter() with Column Condition

In [5]:
# Equality condition: keep rows whose state is "OH"
in_ohio = df["state"] == "OH"
df.filter(in_ohio).show(truncate=False)

# Inequality, written two ways: the != operator, and negating the
# equality condition with ~
df.filter(df["state"] != "OH").show(truncate=False)
df.filter(~in_ohio).show(truncate=False)

In [None]:
# Refer to the column by name with col() rather than through the DataFrame
from pyspark.sql.functions import col

state_is_oh = col("state") == "OH"
df.filter(state_is_oh).show(truncate=False)

# 3. DataFrame filter() with SQL Expression

In [None]:
# filter() also accepts a SQL expression string: one equality test followed
# by the two spellings of "not equal" (!= and <>).
for sql_condition in ("gender == 'M'", "gender != 'M'", "gender <> 'M'"):
    df.filter(sql_condition).show()

# 4. PySpark Filter with Multiple Conditions


In [None]:
# Combine Column conditions with & (logical AND). Each comparison needs its
# own parentheses because & binds more tightly than == in Python.
from_ohio = df["state"] == "OH"
is_male = df["gender"] == "M"
df.filter(from_ohio & is_male).show()

# 5. Filter Based on List Values

In [None]:
# IN list: keep rows whose state appears in the list
li = ["OH", "CA", "DE"]
state_in_list = df.state.isin(li)
df.filter(state_in_list).show()

# NOT IN list: this shows the NY rows, because NY is not in the list.
# Two equivalent spellings: negation with ~, and comparison against False.
df.filter(~state_in_list).show()
df.filter(state_in_list == False).show()

# 6. Filter Based on Starts With, Ends With, Contains

In [None]:
state = df["state"]

# startswith: state values beginning with "N"
df.filter(state.startswith("N")).show()

# endswith: state values ending in "H"
df.filter(state.endswith("H")).show()

# contains: state values with "H" anywhere in them
df.filter(state.contains("H")).show()


# 7. PySpark Filter like and rlike

In [None]:
# Prepare data: (id, name) pairs whose names differ in the case of "rose"
data2 = [(2,"Michael Rose"),(3,"Robert Williams"),
     (4,"Rames Rose"),(5,"Rames rose")
  ]

df2 = spark.createDataFrame(data = data2, schema = ["id","name"])

# like - SQL LIKE pattern. % matches any run of characters; the literal
# part is compared case-sensitively, so only the lowercase "rose" matches.
df2.filter(df2.name.like("%rose%")).show()

# rlike - SQL RLIKE pattern (regular-expression match).
# (?i) turns on case-insensitive matching and $ anchors "rose" to the end
# of the name. rlike looks for the pattern anywhere in the string, so no
# leading wildcard is needed. The earlier pattern "(?i)^*rose$" put a
# quantifier on the ^ anchor, which is meaningless.
df2.filter(df2.name.rlike("(?i)rose$")).show()

# 8. Filter on an Array column

In [None]:
from pyspark.sql.functions import array_contains

# Keep rows whose languages array has "Java" as one of its elements
knows_java = array_contains(df["languages"], "Java")
df.filter(knows_java).show(truncate=False)

# 9. Filtering on Nested Struct columns


In [None]:
# Condition on a field nested inside the name struct column
lastname = df["name"]["lastname"]
df.filter(lastname == "Williams").show(truncate=False)

**What is the difference between where and filter in PySpark?**

In PySpark, both `filter()` and `where()` keep only the rows of a DataFrame that satisfy a given condition. They can be used interchangeably and perform exactly the same operation, because `where()` is an alias for `filter()`.
