In [1]:
import findspark

findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('Rows, Columns and Datatypes').getOrCreate()

In [3]:
spark

In [4]:
sc = spark.sparkContext

In [5]:
sc.defaultParallelism

8

In [10]:
spark.conf.get('spark.sql.shuffle.partitions')

'200'

In [8]:
spark.conf.set("spark.sql.shuffle.partitions", sc.defaultParallelism)

In [12]:
spark.conf.get('spark.sql.shuffle.partitions')

'8'

In [13]:
spark.conf.get('spark.sql.codegen.wholeStage')

'true'

# Spark Data Types :-
----------------------
These are the datatype classes used in Schema definitions of the dataframes.

In [7]:
from pyspark.sql.types import IntegerType, LongType, FloatType, DoubleType, StringType, BooleanType, ArrayType, MapType, StructType, StructField


In [16]:
data_schema = StructType([
    StructField('fname', StringType(), True),
    StructField('lname', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('univ_roll', LongType(), True),
    StructField('dgpa', FloatType(), True),
    StructField('intelligence', DoubleType(), True),
    StructField('profiles', ArrayType(StringType()), True),
    StructField('marks', MapType( StringType(), IntegerType() ), True ),
    StructField('properties', MapType( StringType(), StringType() ), True),
    StructField('isCreative', BooleanType(), True)
])

data_list = [
    (
        'Debanjan', 
        'Sarkar', 
        23, 
        19101105023,
        8.84,
        100.00,
        ['facebook', 'instagram', 'twitter', 'linkedin', 'github', 'bugcrowd', 'telegram', 'whatsapp'],
        {
            'maths': 95,
            'physics': 95,
            'chemistry': 95,
            'biology': 95,
            'english': 87
        },
        {
            'height': '168 cm',
            'weight': '74 kg',
            'irish': 'black'
        },
        True
    ),
    (
        'Atul',
        'Kumar',
        24,
        20408192350,
        9.26,
        100.00,
        ['facebook', 'whatsapp', 'instagram'],
        {
            'maths': 98,
            'physics': 96,
            'chemistry': 91,
            'biology': 70,
            'english': 85
        },
        {
            'height': '180 cm',
            'weight': '65 kg',
            'irish': 'brown'
        },
        False
    ),
    (
        'Soumyadip',
        'Mondal',
        24,
        1593891120,
        8.21,
        90.85,
        ['facebook', 'whatsapp', 'instagram', 'snapchat'],
        {
            'maths': 88,
            'physics': 76,
            'chemistry': 91,
            'biology': 71,
            'english': 89
        },
        {
            'height': '170 cm',
            'weight': '59 kg',
            'irish': 'brown'
        },
        True
    )
]

In [17]:
friend_df = spark.createDataFrame( data = data_list, schema = data_schema )

In [19]:
friend_df.printSchema()

root
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- univ_roll: long (nullable = true)
 |-- dgpa: float (nullable = true)
 |-- intelligence: double (nullable = true)
 |-- profiles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- marks: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- isCreative: boolean (nullable = true)



In [20]:
friend_df.show()

+---------+------+---+-----------+----+------------+--------------------+--------------------+--------------------+----------+
|    fname| lname|age|  univ_roll|dgpa|intelligence|            profiles|               marks|          properties|isCreative|
+---------+------+---+-----------+----+------------+--------------------+--------------------+--------------------+----------+
| Debanjan|Sarkar| 23|19101105023|8.84|       100.0|[facebook, instag...|{chemistry -> 95,...|{irish -> black, ...|      true|
|     Atul| Kumar| 24|20408192350|9.26|       100.0|[facebook, whatsa...|{chemistry -> 91,...|{irish -> brown, ...|     false|
|Soumyadip|Mondal| 24| 1593891120|8.21|       90.85|[facebook, whatsa...|{chemistry -> 91,...|{irish -> brown, ...|      true|
+---------+------+---+-----------+----+------------+--------------------+--------------------+--------------------+----------+



In [21]:
friend_df.dtypes

[('fname', 'string'),
 ('lname', 'string'),
 ('age', 'int'),
 ('univ_roll', 'bigint'),
 ('dgpa', 'float'),
 ('intelligence', 'double'),
 ('profiles', 'array<string>'),
 ('marks', 'map<string,int>'),
 ('properties', 'map<string,string>'),
 ('isCreative', 'boolean')]

In [22]:
friend_df.schema

StructType([StructField('fname', StringType(), True), StructField('lname', StringType(), True), StructField('age', IntegerType(), True), StructField('univ_roll', LongType(), True), StructField('dgpa', FloatType(), True), StructField('intelligence', DoubleType(), True), StructField('profiles', ArrayType(StringType(), True), True), StructField('marks', MapType(StringType(), IntegerType(), True), True), StructField('properties', MapType(StringType(), StringType(), True), True), StructField('isCreative', BooleanType(), True)])

In [6]:
from pyspark.sql import functions as F

In [25]:
friend_df.select( 'fname', 'lname', F.explode('profiles').alias('handles') ).show()

+---------+------+---------+
|    fname| lname|  handles|
+---------+------+---------+
| Debanjan|Sarkar| facebook|
| Debanjan|Sarkar|instagram|
| Debanjan|Sarkar|  twitter|
| Debanjan|Sarkar| linkedin|
| Debanjan|Sarkar|   github|
| Debanjan|Sarkar| bugcrowd|
| Debanjan|Sarkar| telegram|
| Debanjan|Sarkar| whatsapp|
|     Atul| Kumar| facebook|
|     Atul| Kumar| whatsapp|
|     Atul| Kumar|instagram|
|Soumyadip|Mondal| facebook|
|Soumyadip|Mondal| whatsapp|
|Soumyadip|Mondal|instagram|
|Soumyadip|Mondal| snapchat|
+---------+------+---------+



In [29]:
friend_df.select( 'fname', 'lname', F.explode('marks') ).show()

+---------+------+---------+-----+
|    fname| lname|      key|value|
+---------+------+---------+-----+
| Debanjan|Sarkar|chemistry|   95|
| Debanjan|Sarkar|  english|   87|
| Debanjan|Sarkar|  biology|   95|
| Debanjan|Sarkar|    maths|   95|
| Debanjan|Sarkar|  physics|   95|
|     Atul| Kumar|chemistry|   91|
|     Atul| Kumar|  english|   85|
|     Atul| Kumar|  biology|   70|
|     Atul| Kumar|    maths|   98|
|     Atul| Kumar|  physics|   96|
|Soumyadip|Mondal|chemistry|   91|
|Soumyadip|Mondal|  english|   89|
|Soumyadip|Mondal|  biology|   71|
|Soumyadip|Mondal|    maths|   88|
|Soumyadip|Mondal|  physics|   76|
+---------+------+---------+-----+



# <i>'pyspark.sql.Row'</i> objects:
 <hr>
 <i>'pyspark.sql.Row'</i> object is immutable (read-only), that is, once created, the values cannot be changed.

In [5]:
from pyspark.sql import Row

In [31]:
row_obj = Row(
            fname = 'Debanjan',
            lname = 'Sarkar',
            age = 23,
            univ_roll = 19101105023
)

In [32]:
row_obj.fname

'Debanjan'

In [33]:
'lname' in row_obj

True

### Creating Custom class using 'Row', and using it:-
-------------------------------------------------------

In [9]:
Person = Row("name", "age", "gender")

In [15]:
type(Person)

pyspark.sql.types.Row

In [16]:
Person.name

AttributeError: __fields__

In [10]:
p1 = Person("Debanjan Sarkar", 23, "Male")
p2 = Person("Atul Kumar", 24, "Male")
p3 = Person("Debadrita Malakar", 6, "Female")

In [11]:
p1

Row(name='Debanjan Sarkar', age=23, gender='Male')

In [17]:
p1.name

'Debanjan Sarkar'

In [12]:
type(p1)

pyspark.sql.types.Row

In [13]:
person_df = spark.createDataFrame( data = [p1,p2,p3] )

In [14]:
person_df.show()

+-----------------+---+------+
|             name|age|gender|
+-----------------+---+------+
|  Debanjan Sarkar| 23|  Male|
|       Atul Kumar| 24|  Male|
|Debadrita Malakar|  6|Female|
+-----------------+---+------+



### Row class functions:-
<hr>
<ul>
    <li><b>count()</b> - Returns the number of occurences of the value, that is passed as parameter.</li>
    <li><b>index()</b> - Returns the first index at which the value (data in parameter) occurs in the object.</li>
    <li><b>asDict()</b> - Return the Row object as a Dictionary.</li>
</ul>

In [20]:
row_obj = Row(
            fname = 'Debanjan',
            lname = 'Sarkar',
            age = 23,
            univ_roll = 19101105023,
            family_name = 'Sarkar'
)

In [19]:
row_obj

Row(fname='Debanjan', lname='Sarkar', age=23, univ_roll=19101105023)

In [22]:
row_obj.count('Sarkar')

2

In [23]:
row_obj.count(23)

1

In [24]:
row_obj.index('Sarkar')

1

In [25]:
row_obj.index('Debanjan')

0

In [26]:
row_obj.index(23)

2

In [27]:
row_obj.asDict()

{'fname': 'Debanjan',
 'lname': 'Sarkar',
 'age': 23,
 'univ_roll': 19101105023,
 'family_name': 'Sarkar'}

In [28]:
row_dict = row_obj.asDict()

In [29]:
type(row_dict)

dict

In [32]:
for i in row_dict.keys():
    print(i)

fname
lname
age
univ_roll
family_name


In [33]:
row_dict.values()

dict_values(['Debanjan', 'Sarkar', 23, 19101105023, 'Sarkar'])

# Columns :-
<hr>

In [13]:
help( "pyspark.sql.column.Column" )

Help on class Column in pyspark.sql.column:

pyspark.sql.column.Column = class Column(builtins.object)
 |  pyspark.sql.column.Column(jc: py4j.java_gateway.JavaObject) -> None
 |  
 |  A column in a DataFrame.
 |  
 |  :class:`Column` instances can be created by::
 |  
 |      # 1. Select a column out of a DataFrame
 |  
 |      df.colName
 |      df["colName"]
 |  
 |      # 2. Create from an expression
 |      df.colName + 1
 |      1 / df.colName
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  Methods defined here:
 |  
 |  __add__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __and__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __bool__ = __nonzero__(self) -> None
 |  
 |  __contains__(self, item: Any) -> N

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, TimestampType, StringType

ord_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("order_date", TimestampType(), True),
    StructField("cust_id", LongType(), True),
    StructField("order_status", StringType(), True)
])

ord_df = spark.read.csv('data/orders.csv', schema = ord_schema)

In [6]:
ord_df.show(5)

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
+--------+-------------------+-------+---------------+
only showing top 5 rows



In [27]:
ord_df.rdd.take(5)

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), cust_id=11599, order_status='CLOSED'),
 Row(order_id=2, order_date=datetime.datetime(2013, 7, 25, 0, 0), cust_id=256, order_status='PENDING_PAYMENT'),
 Row(order_id=3, order_date=datetime.datetime(2013, 7, 25, 0, 0), cust_id=12111, order_status='COMPLETE'),
 Row(order_id=4, order_date=datetime.datetime(2013, 7, 25, 0, 0), cust_id=8827, order_status='CLOSED'),
 Row(order_id=5, order_date=datetime.datetime(2013, 7, 25, 0, 0), cust_id=11318, order_status='COMPLETE')]

In [10]:
ord_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [11]:
type(ord_df.order_id)

pyspark.sql.column.Column

### Accessing column in the form of a datatype:-
-------------------------------------------------
Suppose, we want to select a column named <b><i>col1</i></b> in a dataframe named <b><i>df1</i></b> :
<ul>
    <li> <b><i>df1.col1</i></b> - Accessing using dot notation, just like member variable.</li>
    <li> <b><i>df1[ 'col1' ]</i></b> - Accesssing the column as we access a particulary value using key in dictionary.</li>
    <li> <b><i>F.col( "col1" )</i></b> - Accessing the column by passing the column name as string to the <i>pyspark.sql.functions.col()</i> function</li>
    <li> <b><i>"col1"</i></b> - In some places such as in <i>df.select()</i> method, column can be accessed by simply passing the column name as string.</li>
</ul>

In [14]:
ord_df.select( ord_df.order_id ).show(5)

+--------+
|order_id|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
+--------+
only showing top 5 rows



In [15]:
ord_df.select( ord_df["order_id"] ).show(5)

+--------+
|order_id|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
+--------+
only showing top 5 rows



In [16]:
from pyspark.sql import functions as F

ord_df.select( F.col('order_id') ).show(5)

+--------+
|order_id|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
+--------+
only showing top 5 rows



In [17]:
ord_df.select( "order_id" ).show(5)

+--------+
|order_id|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
+--------+
only showing top 5 rows



### Selecting all the columns:-
---------------------------------

In [18]:
ord_df.select("*").show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
|       6|2013-07-25 00:00:00|   7130|       COMPLETE|
|       7|2013-07-25 00:00:00|   4530|       COMPLETE|
|       8|2013-07-25 00:00:00|   2911|     PROCESSING|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|

### Setting column alias:-
<hr>
Alias of a column can be set using the <b><i>'alias()'</i></b> column method.

In [20]:
ord_df.select( ord_df.cust_id.alias("Customer Id to whom the order belongs") ).show()

+-------------------------------------+
|Customer Id to whom the order belongs|
+-------------------------------------+
|                                11599|
|                                  256|
|                                12111|
|                                 8827|
|                                11318|
|                                 7130|
|                                 4530|
|                                 2911|
|                                 5657|
|                                 5648|
|                                  918|
|                                 1837|
|                                 9149|
|                                 9842|
|                                 2568|
|                                 7276|
|                                 2667|
|                                 1205|
|                                 9488|
|                                 9198|
+-------------------------------------+
only showing top 20 rows



### Ordering based on column values / sorting :-
-------------------------------------------------
- asc()
- asc_nulls__first()
- asc_nulls_last()
- desc()
- desc_nulls_first()
- desc_nulls_last()

In [21]:
ord_df.show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
|       6|2013-07-25 00:00:00|   7130|       COMPLETE|
|       7|2013-07-25 00:00:00|   4530|       COMPLETE|
|       8|2013-07-25 00:00:00|   2911|     PROCESSING|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|

In [25]:
ord_df.filter( "order_status is NULL" ).show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
+--------+----------+-------+------------+



In [26]:
ord_df.filter( "order_date is NULL" ).show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
+--------+----------+-------+------------+



In [29]:
ord_df.orderBy( ord_df.order_status.asc() ).distinct().show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|     215|2013-07-26 00:00:00|   5925|         CLOSED|
|     537|2013-07-28 00:00:00|  10437|       COMPLETE|
|     663|2013-07-28 00:00:00|   3531| PAYMENT_REVIEW|
|     685|2013-07-29 00:00:00|  10745|        ON_HOLD|
|     708|2013-07-29 00:00:00|   5695|         CLOSED|
|    1214|2013-07-31 00:00:00|   7892|        ON_HOLD|
|    1280|2013-07-31 00:00:00|   5712|         CLOSED|
|    1351|2013-08-01 00:00:00|   9452|     PROCESSING|
|    1739|2013-08-03 00:00:00|   1791|PENDING_PAYMENT|
|    2251|2013-08-06 00:00:00|   4876| PAYMENT_REVIEW|
|    2326|2013-08-06 00:00:00|   8716|       COMPLETE|
|    2573|2013-08-07 00:00:00|   5992|         CLOSED|
|    2680|2013-08-08 00:00:00|   8781|SUSPECTED_FRAUD|
|    3124|2013-08-11 00:00:00|   5509|         CLOSED|
|    3897|2013-08-17 00:00:00|   6769|       COMPLETE|
|    4013|

### Casting the dataype of a Column:-
<hr>

In [9]:
ord_df.select( ord_df.order_id ).printSchema()

root
 |-- order_id: integer (nullable = true)



In [8]:
ord_df.select( ord_df.order_id.cast('string') ).printSchema()

root
 |-- order_id: string (nullable = true)



### Applying filters:-
--------------------------

In [10]:
ord_df.withColumn( "order id between 10 and 20", ord_df.order_id.between(10,20) ).show()

+--------+-------------------+-------+---------------+--------------------------+
|order_id|         order_date|cust_id|   order_status|order id between 10 and 20|
+--------+-------------------+-------+---------------+--------------------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|                     false|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|                     false|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|                     false|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|                     false|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|                     false|
|       6|2013-07-25 00:00:00|   7130|       COMPLETE|                     false|
|       7|2013-07-25 00:00:00|   4530|       COMPLETE|                     false|
|       8|2013-07-25 00:00:00|   2911|     PROCESSING|                     false|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|                     false|
|      10|2013-0

In [11]:
ord_df.where( ord_df.order_id.between(10,20) ).show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|2013-07-25 00:00:00|   7276|PENDING_PAYMENT|
|      17|2013-07-25 00:00:00|   2667|       COMPLETE|
|      18|2013-07-25 00:00:00|   1205|         CLOSED|
|      19|2013-07-25 00:00:00|   9488|PENDING_PAYMENT|
|      20|2013-07-25 00:00:00|   9198|     PROCESSING|
+--------+-------------------+-------+---------------+



In [15]:
ord_df.filter( (ord_df.order_id>=10) & (ord_df.order_id<=20) ).show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|2013-07-25 00:00:00|   7276|PENDING_PAYMENT|
|      17|2013-07-25 00:00:00|   2667|       COMPLETE|
|      18|2013-07-25 00:00:00|   1205|         CLOSED|
|      19|2013-07-25 00:00:00|   9488|PENDING_PAYMENT|
|      20|2013-07-25 00:00:00|   9198|     PROCESSING|
+--------+-------------------+-------+---------------+



### Column methods for string processing and pattern matching:-
-----------------------------------------------------------------
- contains()
- startswith()
- endswith()
- like()
- rlike()

In [17]:
ord_df.select( ord_df.order_status.contains( "PAYMENT" ) ).distinct().show()

+-------------------------------+
|contains(order_status, PAYMENT)|
+-------------------------------+
|                           true|
|                          false|
+-------------------------------+



In [26]:
order_status_df = ord_df.select("order_status")

order_status_df.show()

+---------------+
|   order_status|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|       COMPLETE|
|       COMPLETE|
|       COMPLETE|
|     PROCESSING|
|PENDING_PAYMENT|
|PENDING_PAYMENT|
| PAYMENT_REVIEW|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
|       COMPLETE|
|PENDING_PAYMENT|
|       COMPLETE|
|         CLOSED|
|PENDING_PAYMENT|
|     PROCESSING|
+---------------+
only showing top 20 rows



In [29]:
order_status_df.where( order_status_df.order_status.contains("PAYMENT") ).distinct().show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
| PAYMENT_REVIEW|
+---------------+



In [30]:
order_status_df.where( order_status_df.order_status.like("%END%") ).distinct().show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|        PENDING|
+---------------+



In [32]:
order_status_df.where( order_status_df.order_status.startswith("P") ).distinct().show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
| PAYMENT_REVIEW|
|     PROCESSING|
|        PENDING|
+---------------+



In [34]:
order_status_df.where( order_status_df.order_status.endswith("ING") ).distinct().show()

+------------+
|order_status|
+------------+
|  PROCESSING|
|     PENDING|
+------------+



In [35]:
order_status_df.filter( order_status_df.order_status.like("%ING") ).distinct().show()

+------------+
|order_status|
+------------+
|  PROCESSING|
|     PENDING|
+------------+



In [36]:
# Orders with status CLOSED

ord_df.where( ord_df.order_status.contains("CLOSED") ).show()

+--------+-------------------+-------+------------+
|order_id|         order_date|cust_id|order_status|
+--------+-------------------+-------+------------+
|       1|2013-07-25 00:00:00|  11599|      CLOSED|
|       4|2013-07-25 00:00:00|   8827|      CLOSED|
|      12|2013-07-25 00:00:00|   1837|      CLOSED|
|      18|2013-07-25 00:00:00|   1205|      CLOSED|
|      24|2013-07-25 00:00:00|  11441|      CLOSED|
|      25|2013-07-25 00:00:00|   9503|      CLOSED|
|      37|2013-07-25 00:00:00|   5863|      CLOSED|
|      51|2013-07-25 00:00:00|  12271|      CLOSED|
|      57|2013-07-25 00:00:00|   7073|      CLOSED|
|      61|2013-07-25 00:00:00|   4791|      CLOSED|
|      62|2013-07-25 00:00:00|   9111|      CLOSED|
|      87|2013-07-25 00:00:00|   3065|      CLOSED|
|      90|2013-07-25 00:00:00|   9131|      CLOSED|
|     101|2013-07-25 00:00:00|   5116|      CLOSED|
|     116|2013-07-26 00:00:00|   8763|      CLOSED|
|     129|2013-07-26 00:00:00|   9937|      CLOSED|
|     133|20

In [37]:
ord_df.filter( ord_df.order_status.contains("CLOSED") ).show()

+--------+-------------------+-------+------------+
|order_id|         order_date|cust_id|order_status|
+--------+-------------------+-------+------------+
|       1|2013-07-25 00:00:00|  11599|      CLOSED|
|       4|2013-07-25 00:00:00|   8827|      CLOSED|
|      12|2013-07-25 00:00:00|   1837|      CLOSED|
|      18|2013-07-25 00:00:00|   1205|      CLOSED|
|      24|2013-07-25 00:00:00|  11441|      CLOSED|
|      25|2013-07-25 00:00:00|   9503|      CLOSED|
|      37|2013-07-25 00:00:00|   5863|      CLOSED|
|      51|2013-07-25 00:00:00|  12271|      CLOSED|
|      57|2013-07-25 00:00:00|   7073|      CLOSED|
|      61|2013-07-25 00:00:00|   4791|      CLOSED|
|      62|2013-07-25 00:00:00|   9111|      CLOSED|
|      87|2013-07-25 00:00:00|   3065|      CLOSED|
|      90|2013-07-25 00:00:00|   9131|      CLOSED|
|     101|2013-07-25 00:00:00|   5116|      CLOSED|
|     116|2013-07-26 00:00:00|   8763|      CLOSED|
|     129|2013-07-26 00:00:00|   9937|      CLOSED|
|     133|20

In [39]:
# Show all the orders which are CLOSED or PENDING

ord_df.where( ord_df.order_status.isin( "CLOSED", "PENDING", "PENDING_PAYMENT" ) ).show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      16|2013-07-25 00:00:00|   7276|PENDING_PAYMENT|
|      18|2013-07-25 00:00:00|   1205|         CLOSED|
|      19|2013-07-25 00:00:00|   9488|PENDING_PAYMENT|
|      21|2013-07-25 00:00:00|   2711|        PENDING|
|      23|2013-07-25 00:00:00|   4367|PENDING_PAYMENT|
|      24|2013-07-25 00:00:00|  11441|         CLOSED|
|      25|2013-07-25 00:00:00|   9503|         CLOSED|
|      27|2013-07-25 00:00:00|   3241|PENDING_PAYMENT|
|      30|

### Handling 'NULL' value while checking for equality:
-------------------------------------------------------

In [45]:
things_data = [
                {"category": "car", "brand": "maruti"},
                {"category": "mobile", "brand": "realme"},
                {"category": "tv", "brand": "toshiba"},
                {"category": "salt", "brand": "tata"},
                {"category": "tea", "brand": "tata"},
                {"category": "sugar", "brand": None},
                {"category": "flour", "brand": None}
]

In [46]:
things_df = spark.createDataFrame( data = things_data )

In [48]:
things_df.show()

+-------+--------+
|  brand|category|
+-------+--------+
| maruti|     car|
| realme|  mobile|
|toshiba|      tv|
|   tata|    salt|
|   tata|     tea|
|   null|   sugar|
|   null|   flour|
+-------+--------+



As can be seen in the cell below, when equality operator is used to filter out record, it does not give truth value for null values. For null value handling, <b><i>'eqNullSafe()'</i></b> Column method iss best, as it gives <b>False</b> for the null values.

In [51]:
things_df.select( things_df.brand == 'tata', things_df.brand.eqNullSafe("tata") ).show()

+--------------+----------------+
|(brand = tata)|(brand <=> tata)|
+--------------+----------------+
|         false|           false|
|         false|           false|
|         false|           false|
|          true|            true|
|          true|            true|
|          null|           false|
|          null|           false|
+--------------+----------------+



In [52]:
things_df.where( things_df.brand != 'tata' ).show()

+-------+--------+
|  brand|category|
+-------+--------+
| maruti|     car|
| realme|  mobile|
|toshiba|      tv|
+-------+--------+



In [54]:
things_df.where( ~things_df.brand.eqNullSafe("tata") ).show()

+-------+--------+
|  brand|category|
+-------+--------+
| maruti|     car|
| realme|  mobile|
|toshiba|      tv|
|   null|   sugar|
|   null|   flour|
+-------+--------+



### Fetching only those records that has Null in a particular column:

In [56]:
things_df.where( things_df.brand.isNull() ).show()

+-----+--------+
|brand|category|
+-----+--------+
| null|   sugar|
| null|   flour|
+-----+--------+



In [57]:
things_df.where( things_df.brand.isNotNull() ).show()

+-------+--------+
|  brand|category|
+-------+--------+
| maruti|     car|
| realme|  mobile|
|toshiba|      tv|
|   tata|    salt|
|   tata|     tea|
+-------+--------+

