### Overview of Filter or Where Function on Spark DataFrame

In [8]:
# Import libraries
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
import datetime

In [4]:
# Obtain the SparkSession for this notebook under the application name 'SparkFunctions'.
spark = (
    SparkSession.builder
    .appName('SparkFunctions')
    .getOrCreate()
)

In [19]:
# Sample records that the DataFrame below is built from: six user dicts sharing the same keys.
# `phone_numbers` holds a nested Row (a struct column in the schema) and `courses`
# holds a list of integers (an array column). Some values are None or empty so the
# data also contains nulls (e.g. id 5 has no amount_paid, id 4 has no courses).
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "gender": "Female",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop=None),
                "courses": [1, 3, 5, 7],
                "email": "pheobebuffay@abc.com",
                "is_customer": True,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "gender": "Male",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= None, shop=None),
                "courses": [2, 4, 5],
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "gender": "Female",
                "phone_numbers": Row(mobile= None, home= None, office= None, shop=None),  # every phone field is None
                "courses": [2],
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "gender": "Male",
                "phone_numbers": Row(mobile= "82349238942", home= None, office= None, shop=None),
                "courses": [],  # empty list: no course ids
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "gender": "Female",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop= "5343434654"),
                "courses": [3],
                "email": "rachel@abc.com",
                "is_customer": True,
                "amount_paid": None,  # missing amount; displayed as null in the DataFrame output
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "gender": "Male",
                "phone_numbers": Row(mobile= "8273929", home= None, office= None, shop=None),
                "courses": [2, 4],
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [20]:
# Turn each user dict into a Row (dict keys become field names) and build the DataFrame from them.
usersDF = spark.createDataFrame(
    [Row(**record) for record in users]
)

In [21]:
# Print the DataFrame contents as a table; long cell values are truncated in the output.
usersDF.show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|    [82349238942,,,]|          []|        ross@abc.com|       true|    1200.55|   202

In [14]:
# Print the column names, data types and nullability of usersDF as a tree.
usersDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [15]:
# Show the built-in documentation for DataFrame.filter.
help(usersDF.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition) method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    :param condition: a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]
    
    .. versionadded:: 1.3



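* `where` and `filter` are synonyms (`where` is an alias for `filter`).
* We can pass the condition either in SQL style (a SQL expression string) or in non-SQL style (a `Column` expression).
* For non-SQL style, we can refer to a column either with the `col` function applied to the column name as a string, or with the `df['col_name']` notation.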

In [22]:
# Non-SQL style: the condition is a Column expression built with col().
usersDF.filter(col('gender') == 'Female').show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------

In [25]:
# Non-SQL style: the column is referenced by indexing the DataFrame instead of using col().
usersDF.filter(usersDF['gender'] == 'Female').show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------

In [29]:
# SQL style: the condition is passed as a SQL expression string.
usersDF.filter("gender == 'Female'").show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------

In [30]:
# where() is an alias for filter(), so it takes the same Column-expression condition.
usersDF.where(col('gender') == 'Female').show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------

In [31]:
# where() also accepts the condition as a SQL expression string.
usersDF.where("gender == 'Female'").show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------

In [32]:
# Using Spark SQL syntax: register the DataFrame as a temporary view named 'users'
# so that it can be referenced by name in spark.sql queries.
usersDF.createOrReplaceTempView('users')

In [33]:
# Plain Spark SQL: apply the filter with a WHERE clause on the 'users' view.
spark.sql("SELECT * FROM users WHERE gender = 'Female'").show()

+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|[82349238942, 234...|         [3]|      rachel@abc.com|       true|       null|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+--------------------+------------+--------------------+-----------+-----------+------