### Overview of Filter or Where Function on Spark DataFrame

In [1]:
# Import libraries
import datetime

import pandas as pd
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the Spark session that every cell below relies on
spark = (
    SparkSession.builder
    .appName('SparkFunctions')
    .getOrCreate()
)

In [3]:
# Sample user records used by every example in this notebook.
# A few rows deliberately carry edge-case values so the filters below have something to show:
#   id 2 -> current_city is None (null)
#   id 3 -> current_city is an empty string and every phone number is None
#   id 4 -> courses is an empty list
#   id 5 -> amount_paid is NaN
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "gender": "Female",
                "current_city": "Dallas",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop=None),
                "courses": [1, 3, 5, 7],
                "email": "pheobebuffay@abc.com",
                "is_customer": False,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "gender": "Male",
                "current_city": None,  # null city
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= None, shop=None),
                "courses": [2, 4, 5],
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "gender": "Female",
                "current_city": "",  # empty string, which is not the same as null
                "phone_numbers": Row(mobile= None, home= None, office= None, shop=None),
                "courses": [2],
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "gender": "Male",
                "current_city": "Dallas",
                "phone_numbers": Row(mobile= "82349238942", home= None, office= None, shop=None),
                "courses": [],  # no courses
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "gender": "Female",
                "current_city": "Houston",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop= "5343434654"),
                "courses": [3],
                "email": "rachel@abc.com",
                "is_customer": False,
                "amount_paid": float('nan'),  # NaN amount, used by the isnan examples
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "gender": "Male",
                "current_city": "Dallas",
                "phone_numbers": Row(mobile= "8273929", home= None, office= None, shop=None),
                "courses": [2, 4],
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [4]:
# Each user dict is unpacked into a Row; no schema is passed to createDataFrame here
usersDF = spark.createDataFrame([Row(**user) for user in users])

In [5]:
usersDF.show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      D

In [6]:
usersDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- current_city: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [7]:
help(usersDF.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition) method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    :param condition: a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]
    
    .. versionadded:: 1.3



* `where` and `filter` are synonyms
* We can pass conditions either by using SQL Style or Non SQL Style.
* For Non SQL Style we can pass columns using `col` function on column name as string or using the notation of df['col_name']

In [8]:
usersDF.filter(col('gender') == 'Female').show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

In [9]:
usersDF.filter(usersDF['gender'] == 'Female').show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

In [10]:
usersDF.filter("gender == 'Female'").show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

In [11]:
usersDF.where(col('gender') == 'Female').show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

In [12]:
usersDF.where("gender == 'Female'").show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

In [13]:
# Using spark sql syntax
usersDF.createOrReplaceTempView('users')

In [14]:
spark.sql("SELECT * FROM users WHERE gender = 'Female'").show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|         [3]|      rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+-------

### Overview of Conditions and Operators

* Equal: `=` or `==`
* Not Equal: `!=`
* Greater Than: `>`
* Less Than: `<`
* Greater Than or Equal To: `>=`
* Less Than or Equal To: `<=`
* IN Operator: `isin` function or SQL `IN` (note that the `contains` function is a substring match on a string column, not a list-membership test)
* BETWEEN Operator: `between` function or `BETWEEN` with `AND`

In [15]:
usersDF.show(2)

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|    New York|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
only showing top 2 rows



#### Equal to

* Get the list of customers (rows where the `is_customer` flag is set to true)

In [16]:
usersDF.filter(col('is_customer') == 'true').show() # 'true' or True

+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|         email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|    New York|[82349238942, 234...|[2, 4, 5]|  joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|      [2]|monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      Dallas|    [82349238942,,,]|       []|  ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  6|  Chandler|     Bing|  Male|      Dallas|        [8273929,,,]|   [2, 4]|  bing@abc.com|  

In [17]:
# Use sql style syntax
usersDF.filter('is_customer = "true"').show() # True or "true" or true

+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|         email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|    New York|[82349238942, 234...|[2, 4, 5]|  joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|      [2]|monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      Dallas|    [82349238942,,,]|       []|  ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  6|  Chandler|     Bing|  Male|      Dallas|        [8273929,,,]|   [2, 4]|  bing@abc.com|  

In [18]:
usersDF.createOrReplaceTempView('users')

In [19]:
# SQL query
spark.sql("""SELECT * FROM users WHERE is_customer = 'true'""").show() # True or true

+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|         email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+--------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|    New York|[82349238942, 234...|[2, 4, 5]|  joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|      [2]|monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      Dallas|    [82349238942,,,]|       []|  ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  6|  Chandler|     Bing|  Male|      Dallas|        [8273929,,,]|   [2, 4]|  bing@abc.com|  

* Get users from Dallas

In [20]:
usersDF.filter(col('current_city') == 'Dallas').show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  4|      Ross|   Geller|  Male|      Dallas|    [82349238942,,,]|          []|        ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  6|  Chandler|     Bing|  Male|      Dallas|        [8273929,,,]|      [2, 4]|        bing@abc.com|       true|     1000.8|   2021-02-22|2021-02-25 07:33:00|
+---+----------+---------+------+-------

* Get the customers who paid 900.0

In [21]:
usersDF.filter(col('amount_paid') == 900.0).show() # '900.0'

+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|       email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|    New York|[82349238942, 234...|[2, 4, 5]|joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+



* Get the customers where paid amount is not a number

In [22]:
usersDF.select('amount_paid', isnan('amount_paid')).show()

+-----------+------------------+
|amount_paid|isnan(amount_paid)|
+-----------+------------------+
|    1000.55|             false|
|      900.0|             false|
|     1000.9|             false|
|    1200.55|             false|
|        NaN|              true|
|     1000.8|             false|
+-----------+------------------+



In [23]:
usersDF.filter(isnan(col('amount_paid'))).show()

+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|courses|         email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|    [3]|rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+



In [24]:
usersDF.filter('isnan(amount_paid) = True').show()

+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|courses|         email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+
|  5|    Rachel|    Green|Female|     Houston|[82349238942, 234...|    [3]|rachel@abc.com|      false|        NaN|   2021-02-24|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+-------+--------------+-----------+-----------+-------------+-------------------+



#### Not Equal to

* Get all the users who are not living in Dallas

In [52]:
usersDF.select('id', 'current_city').show()

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|        null|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [29]:
usersDF.select('id', 'current_city').filter(col('current_city') != 'Dallas').show()

+---+------------+
| id|current_city|
+---+------------+
|  3|            |
|  5|     Houston|
+---+------------+



* Get all the users whose city name is not empty string. Null can be ignored.

In [30]:
usersDF.select('id', 'current_city').filter(col('current_city') != '').show()

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [51]:
# Null or Not dallas: a plain != comparison skips null cities, so they are added back with isNull()
(
    usersDF
    .select('id', 'current_city')
    .filter((col('current_city') != 'Dallas') | (col('current_city').isNull()))
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  2|        null|
|  3|            |
|  5|     Houston|
+---+------------+



In [53]:
usersDF \
.select('id', 'current_city') \
.filter("current_city != 'Dallas' OR current_city IS NULL").show()

+---+------------+
| id|current_city|
+---+------------+
|  2|        null|
|  3|            |
|  5|     Houston|
+---+------------+



In [54]:
usersDF \
.select('id', 'current_city') \
.filter((col('current_city') != '')).show()

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [57]:
# Keep every row whose city is not null (empty strings are kept).
# isNotNull() states the intent directly. Comparing with the string 'None' gives the
# same rows on this data only because a comparison with null is never true, and it
# would also drop a city that is literally named "None".
usersDF \
.select('id', 'current_city') \
.filter(col('current_city').isNotNull()).show()

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



**NOTE:**
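* If we use `col('current_city') != ''`, it will ignore both `null values` and `empty` strings, because a comparison with `null` evaluates to `null` and `filter` drops such rows.
* If we use `col('current_city') != 'None'`, it will only `ignore null values` – not because `'None'` stands for null, but because it is an ordinary string that matches no city, so only the rows with a null city fail the comparison. Use `col('current_city').isNotNull()` to express this intent explicitly.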
* If we use `col('current_city') == ''`, it will only consider `empty strings`.
* Check examples below.

In [71]:
city = [(1, 'Kolkata'), (2, 'Indore'), (3, 'Satna'), (4, ''), (5, ), (6, None)]

In [72]:
pd.DataFrame(city)

Unnamed: 0,0,1
0,1,Kolkata
1,2,Indore
2,3,Satna
3,4,
4,5,
5,6,


In [76]:
# pandas is imported in the first cell together with the other libraries, so the
# earlier `pd.DataFrame(city)` cell also works on a fresh "Restart & Run All".
# The explicit DDL schema fixes the column names and types of the Spark DataFrame.
df = spark.createDataFrame(pd.DataFrame(city), 'id INT, city STRING')

In [77]:
df.show()

+---+-------+
| id|   city|
+---+-------+
|  1|Kolkata|
|  2| Indore|
|  3|  Satna|
|  4|       |
|  5|   null|
|  6|   null|
+---+-------+



In [78]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- city: string (nullable = true)



In [80]:
df.filter(col('city').isNull()).show()

+---+----+
| id|city|
+---+----+
|  5|null|
|  6|null|
+---+----+



In [81]:
df.filter((col('city').isNull()) | (col('city') == '')).show()

+---+----+
| id|city|
+---+----+
|  4|    |
|  5|null|
|  6|null|
+---+----+



In [83]:
df.filter((col('city') != '')).show()

+---+-------+
| id|   city|
+---+-------+
|  1|Kolkata|
|  2| Indore|
|  3|  Satna|
+---+-------+



In [84]:
df.filter((col('city') == '')).show()

+---+----+
| id|city|
+---+----+
|  4|    |
+---+----+



#### BETWEEN

In [85]:
usersDF.show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      D

* Get user_id and email whose last updated timestamp is between 2021 Feb 15th and 2021 Feb 28th

In [92]:
help(usersDF['last_updated_ts'].between)

Help on method between in module pyspark.sql.column:

between(lowerBound, upperBound) method of pyspark.sql.column.Column instance
    A boolean expression that is evaluated to true if the value of this
    expression is between the given columns.
    
    >>> df.select(df.name, df.age.between(2, 4)).show()
    +-----+---------------------------+
    | name|((age >= 2) AND (age <= 4))|
    +-----+---------------------------+
    |Alice|                       true|
    |  Bob|                      false|
    +-----+---------------------------+
    
    .. versionadded:: 1.3



In [96]:
# BUG: should return 5 rows
# The upper bound '2021-02-28' has no time part, so a timestamp later on Feb 28th
# (id 3, 07:33:00) compares as greater than the bound and is dropped from the result.
usersDF \
.select('id', 'email', 'last_updated_ts') \
.filter(col('last_updated_ts').between('2021-02-15', '2021-02-28')) \
.show()

+---+--------------+-------------------+
| id|         email|    last_updated_ts|
+---+--------------+-------------------+
|  2|  joey@abc.com|2021-02-18 03:33:00|
|  4|  ross@abc.com|2021-02-18 01:10:00|
|  5|rachel@abc.com|2021-02-18 03:33:00|
|  6|  bing@abc.com|2021-02-25 07:33:00|
+---+--------------+-------------------+



In [99]:
# FIX: specify full timestamp bounds.
# between() is inclusive on both ends (lower <= value <= upper), so the upper bound
# is written down to the last microsecond of Feb 28th. Stopping at '23:59:59' would
# still drop values with a fractional second such as 23:59:59.5.
usersDF \
.select('id', 'email', 'last_updated_ts') \
.filter(col('last_updated_ts').between('2021-02-15 00:00:00', '2021-02-28 23:59:59.999999')) \
.show()

+---+--------------+-------------------+
| id|         email|    last_updated_ts|
+---+--------------+-------------------+
|  2|  joey@abc.com|2021-02-18 03:33:00|
|  3|monica@abc.com|2021-02-28 07:33:00|
|  4|  ross@abc.com|2021-02-18 01:10:00|
|  5|rachel@abc.com|2021-02-18 03:33:00|
|  6|  bing@abc.com|2021-02-25 07:33:00|
+---+--------------+-------------------+



In [100]:
# SQL like syntax
# BETWEEN is inclusive on both ends, so the upper bound covers the whole last second
# of Feb 28th; '23:59:59' alone would miss timestamps with fractional seconds.
usersDF \
.select('id', 'email', 'last_updated_ts') \
.filter("last_updated_ts BETWEEN '2021-02-15 00:00:00' AND '2021-02-28 23:59:59.999999'") \
.show()

+---+--------------+-------------------+
| id|         email|    last_updated_ts|
+---+--------------+-------------------+
|  2|  joey@abc.com|2021-02-18 03:33:00|
|  3|monica@abc.com|2021-02-28 07:33:00|
|  4|  ross@abc.com|2021-02-18 01:10:00|
|  5|rachel@abc.com|2021-02-18 03:33:00|
|  6|  bing@abc.com|2021-02-25 07:33:00|
+---+--------------+-------------------+



* Get all the users whose payment is in the range of 850 and 900

In [101]:
usersDF.filter(col('amount_paid').between(850, 900)).show()

+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|       email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|[2, 4, 5]|joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+



In [102]:
# SQL like syntax
usersDF.filter("amount_paid BETWEEN 850 AND 900").show()

+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|       email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|[2, 4, 5]|joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+



In [103]:
# If you pass 850 and 900 as string, it will implicitly get converted into float or int or double
usersDF.filter("amount_paid BETWEEN '850' AND '900'").show()

+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|  courses|       email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|[2, 4, 5]|joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+---------+------------+-----------+-----------+-------------+-------------------+



### Boolean Operations
* Boolean OR
* Boolean AND
* Negation

In [104]:
# True or True
True or True

True

In [105]:
# True or False
True or False

True

In [106]:
# False or True
False or True

True

In [107]:
# False or False
False or False

False

In [108]:
# True and True
True and True

True

In [109]:
# True and False
True and False

False

In [110]:
# False and True
False and True

False

In [111]:
# False and False
False and False

False

In [112]:
not True

False

In [113]:
not False

True

In [114]:
usersDF.show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      D

* Get list of users whose city is null or empty string

In [115]:
# Show only the id and city columns so the null / empty-string rows are easy to spot.
(
    usersDF
    .select('id', 'current_city')
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|        null|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [117]:
# A city counts as missing when it is either NULL or an empty string;
# the two Column conditions are combined with | (boolean OR).
(
    usersDF
    .select('id', 'current_city')
    .filter(col('current_city').isNull() | (col('current_city') == ''))
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  2|        null|
|  3|            |
+---+------------+



In [118]:
# Same NULL-or-empty check, written as a SQL expression string.
(
    usersDF
    .select('id', 'current_city')
    .filter("current_city IS NULL OR current_city = '' ")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  2|        null|
|  3|            |
+---+------------+



* Get list of users whose city is either Houston or Dallas

In [125]:
# Two equality checks joined with | ; each comparison needs its own parentheses
# because | binds more tightly than == in Python.
(
    usersDF
    .select('id', 'current_city')
    .filter(
        (col('current_city') == 'Dallas')
        | (col('current_city') == 'Houston')
    )
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [121]:
# isin() expresses the same "Houston or Dallas" membership test in one call.
(
    usersDF
    .select('id', 'current_city')
    .filter(col('current_city').isin('Houston', 'Dallas'))
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [126]:
# Adding '' to the isin() list also matches rows whose city is an empty string.
(
    usersDF
    .select('id', 'current_city')
    .filter(col('current_city').isin('Houston', 'Dallas', ''))
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [124]:
# SQL-style membership test using IN.
(
    usersDF
    .select('id', 'current_city')
    .filter("current_city IN ('Houston', 'Dallas')")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [128]:
# SQL-style IN list that also includes the empty string.
(
    usersDF
    .select('id', 'current_city')
    .filter("current_city IN ('Houston', 'Dallas', '')")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



* Get list of users whose city is either Houston or Dallas or empty string or Null

In [131]:
# The isin() list covers Houston, Dallas and the empty string; NULL cities are
# picked up by the separate isNull() check joined with | .
(
    usersDF
    .select('id', 'current_city')
    .filter(
        col('current_city').isin('Houston', 'Dallas', '')
        | col('current_city').isNull()
    )
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|        null|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



In [132]:
# SQL-style version: IN list combined with an explicit IS NULL check.
(
    usersDF
    .select('id', 'current_city')
    .filter("current_city IN ('Houston', 'Dallas', '') OR current_city IS NULL")
    .show()
)

+---+------------+
| id|current_city|
+---+------------+
|  1|      Dallas|
|  2|        null|
|  3|            |
|  4|      Dallas|
|  5|     Houston|
|  6|      Dallas|
+---+------------+



* `>`
* `<`
* `>=` (equivalent to the boolean expression `col1 > val1 or col1 = val1`)
* `<=` (equivalent to the boolean expression `col1 < val1 or col1 = val1`)

* Get customers who paid greater than 900

In [6]:
# Preview the users DataFrame before running the amount_paid comparisons.
usersDF.show()

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|Female|            |               [,,,]|         [2]|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|  Male|      D

In [12]:
# Keep rows that paid more than 900; the ~isnan(...) guard drops rows whose
# amount_paid is NaN so they cannot slip through the comparison.
(
    usersDF
    .select('id', 'amount_paid')
    .filter((col('amount_paid') > 900) & ~isnan(col('amount_paid')))
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
|  3|     1000.9|
|  4|    1200.55|
|  6|     1000.8|
+---+-----------+



* Get customers who paid less than 900

In [15]:
# Keep rows that paid less than 900, again excluding NaN amounts.
(
    usersDF
    .select('id', 'amount_paid')
    .filter((col('amount_paid') < 900) & ~isnan(col('amount_paid')))
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
+---+-----------+



* Get customers who paid greater than or equal to 900

In [17]:
# Keep rows that paid 900 or more (boundary included), excluding NaN amounts.
(
    usersDF
    .select('id', 'amount_paid')
    .filter((col('amount_paid') >= 900) & ~isnan(col('amount_paid')))
    .show()
)

+---+-----------+
| id|amount_paid|
+---+-----------+
|  1|    1000.55|
|  2|      900.0|
|  3|     1000.9|
|  4|    1200.55|
|  6|     1000.8|
+---+-----------+



* Get the users who became customers after `2021-01-21`

In [18]:
# "After 2021-01-21" is a strict comparison, so use > rather than >= :
# the boundary date itself must not be included.
# customer_from is a date column, so it is compared against a plain date literal
# instead of relying on how a timestamp-formatted string would be coerced.
(
    usersDF
    .select('id', 'customer_from')
    .filter(col('customer_from') > '2021-01-21')
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
|  3|   2021-02-22|
|  5|   2021-02-24|
|  6|   2021-02-22|
+---+-------------+



In [21]:
# SQL-style strict "after" comparison on the customer_from date.
(
    usersDF
    .select('id', 'customer_from')
    .filter("customer_from > '2021-01-21'")
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
|  3|   2021-02-22|
|  5|   2021-02-24|
|  6|   2021-02-22|
+---+-------------+



### Boolean AND Condition 

In [22]:
# Show only the first 2 rows as a quick reminder of the columns available.
usersDF.show(2)

+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|gender|current_city|       phone_numbers|     courses|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|Female|      Dallas|[82349238942, 234...|[1, 3, 5, 7]|pheobebuffay@abc.com|      false|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|  Male|        null|[82349238942, 234...|   [2, 4, 5]|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
+---+----------+---------+------+------------+--------------------+------------+--------------------+-----------+-----------+-------------+-------------------+
only showing top 2 rows



In [25]:
# Print column names and data types; the types determine how filter comparisons behave.
usersDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- current_city: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



* Get Male Customers (is_customer = true)

In [29]:
# Both conditions must hold, so they are joined with & (boolean AND).
# is_customer is already a boolean column, so it is used directly as a predicate.
(
    usersDF
    .select('id', 'gender', 'is_customer')
    .filter((col('gender') == 'Male') & col('is_customer'))
    .show()
)

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  2|  Male|       true|
|  4|  Male|       true|
|  6|  Male|       true|
+---+------+-----------+



In [30]:
# SQL-style version of the male-customer filter using AND.
(
    usersDF
    .select('id', 'gender', 'is_customer')
    .filter("gender = 'Male' AND is_customer = True")
    .show()
)

+---+------+-----------+
| id|gender|is_customer|
+---+------+-----------+
|  2|  Male|       true|
|  4|  Male|       true|
|  6|  Male|       true|
+---+------+-----------+



* Get users who became customers between 2021 Jan 20th and 2021 Feb 15th (both dates inclusive)

In [31]:
# between() is inclusive on both ends, i.e. the same as
# (customer_from >= '2021-01-20') & (customer_from <= '2021-02-15').
(
    usersDF
    .select('id', 'customer_from')
    .filter(col('customer_from').between('2021-01-20', '2021-02-15'))
    .show()
)

+---+-------------+
| id|customer_from|
+---+-------------+
|  2|   2021-02-14|
+---+-------------+



### Boolean OR Condition 


* Get id and email of users who are not customers or whose city is an empty string.

In [32]:
# Show the columns involved in the OR examples: email, city and customer flag.
(
    usersDF
    .select('id', 'email', 'current_city', 'is_customer')
    .show()
)

+---+--------------------+------------+-----------+
| id|               email|current_city|is_customer|
+---+--------------------+------------+-----------+
|  1|pheobebuffay@abc.com|      Dallas|      false|
|  2|        joey@abc.com|        null|       true|
|  3|      monica@abc.com|            |       true|
|  4|        ross@abc.com|      Dallas|       true|
|  5|      rachel@abc.com|     Houston|      false|
|  6|        bing@abc.com|      Dallas|       true|
+---+--------------------+------------+-----------+



In [33]:
# A row qualifies when the user is not a customer OR the city is an empty string.
# ~ negates the boolean is_customer column.
(
    usersDF
    .select('id', 'email', 'current_city', 'is_customer')
    .filter((~col('is_customer')) | (col('current_city') == ''))
    .show()
)

+---+--------------------+------------+-----------+
| id|               email|current_city|is_customer|
+---+--------------------+------------+-----------+
|  1|pheobebuffay@abc.com|      Dallas|      false|
|  3|      monica@abc.com|            |       true|
|  5|      rachel@abc.com|     Houston|      false|
+---+--------------------+------------+-----------+



* Get id and email of users who are not customers, or whose last updated time is before 2021-03-01.

In [37]:
# A row qualifies when the user is not a customer OR the record was last
# updated before 2021-03-01.
(
    usersDF
    .select('id', 'email', 'is_customer', 'last_updated_ts')
    .filter((~col('is_customer')) | (col('last_updated_ts') < '2021-03-01'))
    .show()
)

+---+--------------------+-----------+-------------------+
| id|               email|is_customer|    last_updated_ts|
+---+--------------------+-----------+-------------------+
|  1|pheobebuffay@abc.com|      false|2021-02-10 01:15:00|
|  2|        joey@abc.com|       true|2021-02-18 03:33:00|
|  3|      monica@abc.com|       true|2021-02-28 07:33:00|
|  4|        ross@abc.com|       true|2021-02-18 01:10:00|
|  5|      rachel@abc.com|      false|2021-02-18 03:33:00|
|  6|        bing@abc.com|       true|2021-02-25 07:33:00|
+---+--------------------+-----------+-------------------+

