### Overview of special data types in spark

* Here are the special data types in spark
    * ARRAY
    * MAP
    * STRUCT
* Python structures such as list and dict can be implicitly converted to Spark ARRAY and MAP resp.
* We need to use few spark related APIs to convert Python data structures to STRUCT type.

In [6]:
# Import libraries
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
import datetime

In [2]:
# Initiate spark session
spark = SparkSession \
        .builder \
        .appName('SparkSpecialDataTypes') \
        .getOrCreate()

#### ARRAY ~ LIST

In [15]:
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "phone_numbers": ["82349238942", "2348910249", "8273929"],
                "email": "pheobebuffay@abc.com",
                "is_customer": True,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "phone_numbers": ["82349238942", "2348910249"],
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "phone_numbers": None,
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "phone_numbers": ["82349238942"],
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "phone_numbers": ["82349238942", "2348910249", "8273929", "5343434654"],
                "email": "rachel@abc.com",
                "is_customer": True,
                "amount_paid": None,
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "phone_numbers": ["8273929"],
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [16]:
user_df = spark.createDataFrame([Row(**user) for user in users])

In [17]:
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [18]:
user_df.show()

+---+----------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|       phone_numbers|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|[82349238942, 234...|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|[82349238942, 234...|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|                null|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|       [82349238942]|        ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  5|    Rachel|    Green|[82349238942, 234...|      rachel@abc.com|       true|       null|   2021-02-24|2021-

In [20]:
user_df.select(['id', 'phone_numbers']).show()

+---+--------------------+
| id|       phone_numbers|
+---+--------------------+
|  1|[82349238942, 234...|
|  2|[82349238942, 234...|
|  3|                null|
|  4|       [82349238942]|
|  5|[82349238942, 234...|
|  6|           [8273929]|
+---+--------------------+



In [22]:
user_df.select(['id', 'phone_numbers']).show(truncate=False)

+---+----------------------------------------------+
|id |phone_numbers                                 |
+---+----------------------------------------------+
|1  |[82349238942, 2348910249, 8273929]            |
|2  |[82349238942, 2348910249]                     |
|3  |null                                          |
|4  |[82349238942]                                 |
|5  |[82349238942, 2348910249, 8273929, 5343434654]|
|6  |[8273929]                                     |
+---+----------------------------------------------+



In [23]:
user_df.columns 

['id',
 'first_name',
 'last_name',
 'phone_numbers',
 'email',
 'is_customer',
 'amount_paid',
 'customer_from',
 'last_updated_ts']

In [25]:
# Create new rows for each element in the array or map
user_df \
.withColumn('phone_number', explode('phone_numbers')) \
.drop('phone_numbers') \
.show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|phone_number|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+------------+
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00| 82349238942|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|  2348910249|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|     8273929|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00| 82349238942|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|  2348910249|
|  4|      Ross|   Geller|        ross@abc.com| 

In [26]:
# Unlike explode, explode_outer will create row for user with no phone number
user_df \
.withColumn('phone_number', explode_outer('phone_numbers')) \
.drop('phone_numbers') \
.show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|phone_number|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+------------+
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00| 82349238942|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|  2348910249|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|     8273929|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00| 82349238942|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|  2348910249|
|  3|    Monica|   Geller|      monica@abc.com| 

In [27]:
user_df \
.select('id', col('phone_numbers')[0].alias('mobile'), col('phone_numbers')[1].alias('home')) \
.show()

+---+-----------+----------+
| id|     mobile|      home|
+---+-----------+----------+
|  1|82349238942|2348910249|
|  2|82349238942|2348910249|
|  3|       null|      null|
|  4|82349238942|      null|
|  5|82349238942|2348910249|
|  6|    8273929|      null|
+---+-----------+----------+



#### MAP ~ DICT

In [28]:
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "phone_numbers": {"mobile": "82349238942", "home": "2348910249", "office": "8273929"},
                "email": "pheobebuffay@abc.com",
                "is_customer": True,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "phone_numbers": {"mobile": "82349238942", "home": "2348910249"},
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "phone_numbers": {},
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "phone_numbers": {"mobile": "82349238942"},
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "phone_numbers": {"mobile": "82349238942", "home": "2348910249", "office": "8273929", "shop": "5343434654"},
                "email": "rachel@abc.com",
                "is_customer": True,
                "amount_paid": None,
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "phone_numbers": {"mobile": "8273929"},
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [30]:
user_df = spark.createDataFrame([Row(**user) for user in users])

In [31]:
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [32]:
user_df.show()

+---+----------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
| id|first_name|last_name|       phone_numbers|               email|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+---------+--------------------+--------------------+-----------+-----------+-------------+-------------------+
|  1|    Pheobe|   Buffay|[mobile -> 823492...|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|
|  2|      Joey|Tribbiani|[mobile -> 823492...|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Monica|   Geller|                  []|      monica@abc.com|       true|     1000.9|   2021-02-22|2021-02-28 07:33:00|
|  4|      Ross|   Geller|[mobile -> 823492...|        ross@abc.com|       true|    1200.55|   2021-01-19|2021-02-18 01:10:00|
|  5|    Rachel|    Green|[mobile -> 823492...|      rachel@abc.com|       true|       null|   2021-02-24|2021-

In [33]:
user_df.columns

['id',
 'first_name',
 'last_name',
 'phone_numbers',
 'email',
 'is_customer',
 'amount_paid',
 'customer_from',
 'last_updated_ts']

In [36]:
user_df.dtypes

[('id', 'bigint'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('phone_numbers', 'map<string,string>'),
 ('email', 'string'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'double'),
 ('customer_from', 'date'),
 ('last_updated_ts', 'timestamp')]

In [35]:
user_df.select(['id', 'phone_numbers']).show(truncate=False)

+---+----------------------------------------------------------------------------------+
|id |phone_numbers                                                                     |
+---+----------------------------------------------------------------------------------+
|1  |[mobile -> 82349238942, office -> 8273929, home -> 2348910249]                    |
|2  |[mobile -> 82349238942, home -> 2348910249]                                       |
|3  |[]                                                                                |
|4  |[mobile -> 82349238942]                                                           |
|5  |[mobile -> 82349238942, shop -> 5343434654, office -> 8273929, home -> 2348910249]|
|6  |[mobile -> 8273929]                                                               |
+---+----------------------------------------------------------------------------------+



In [40]:
user_df \
.select('id', col('phone_numbers')['mobile'].alias('mobile'), col('phone_numbers')['home'].alias('mobile')) \
.show()

+---+-----------+----------+
| id|     mobile|    mobile|
+---+-----------+----------+
|  1|82349238942|2348910249|
|  2|82349238942|2348910249|
|  3|       null|      null|
|  4|82349238942|      null|
|  5|82349238942|2348910249|
|  6|    8273929|      null|
+---+-----------+----------+



In [44]:
user_df.columns

['id',
 'first_name',
 'last_name',
 'phone_numbers',
 'email',
 'is_customer',
 'amount_paid',
 'customer_from',
 'last_updated_ts']

In [48]:
# Won't be able to give alias when we explode on top of map columns
user_df \
.select('id', explode('phone_numbers')) \
.show()

+---+------+-----------+
| id|   key|      value|
+---+------+-----------+
|  1|mobile|82349238942|
|  1|office|    8273929|
|  1|  home| 2348910249|
|  2|mobile|82349238942|
|  2|  home| 2348910249|
|  4|mobile|82349238942|
|  5|mobile|82349238942|
|  5|  shop| 5343434654|
|  5|office|    8273929|
|  5|  home| 2348910249|
|  6|mobile|    8273929|
+---+------+-----------+



In [49]:
# Consider null values as well
user_df \
.select('id', explode_outer('phone_numbers')) \
.show()

+---+------+-----------+
| id|   key|      value|
+---+------+-----------+
|  1|mobile|82349238942|
|  1|office|    8273929|
|  1|  home| 2348910249|
|  2|mobile|82349238942|
|  2|  home| 2348910249|
|  3|  null|       null|
|  4|mobile|82349238942|
|  5|mobile|82349238942|
|  5|  shop| 5343434654|
|  5|office|    8273929|
|  5|  home| 2348910249|
|  6|mobile|    8273929|
+---+------+-----------+



In [52]:
user_df \
.select('*', explode_outer('phone_numbers')) \
.withColumnRenamed('key', 'phone_type') \
.withColumnRenamed('value', 'phone_number') \
.drop('phone_numbers') \
.show()

+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+----------+------------+
| id|first_name|last_name|               email|is_customer|amount_paid|customer_from|    last_updated_ts|phone_type|phone_number|
+---+----------+---------+--------------------+-----------+-----------+-------------+-------------------+----------+------------+
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|    mobile| 82349238942|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|    office|     8273929|
|  1|    Pheobe|   Buffay|pheobebuffay@abc.com|       true|    1000.55|   2021-01-13|2021-02-10 01:15:00|      home|  2348910249|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|    mobile| 82349238942|
|  2|      Joey|Tribbiani|        joey@abc.com|       true|      900.0|   2021-02-14|2021-

#### Struct Type - has predefined structure

In [65]:
users = [
            {
                "id": 1,
                "first_name": "Pheobe",
                "last_name": "Buffay",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop=None),
                "email": "pheobebuffay@abc.com",
                "is_customer": True,
                "amount_paid": 1000.55,
                "customer_from": datetime.date(2021, 1, 13),
                "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
            },
            {
                "id": 2,
                "first_name": "Joey",
                "last_name": "Tribbiani",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= None, shop=None),
                "email": "joey@abc.com",
                "is_customer": True,
                "amount_paid": 900.0,
                "customer_from": datetime.date(2021, 2, 14),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 3,
                "first_name": "Monica",
                "last_name": "Geller",
                "phone_numbers": Row(mobile= None, home= None, office= None, shop=None),
                "email": "monica@abc.com",
                "is_customer": True,
                "amount_paid": 1000.90,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 28, 7, 33, 0)
            },
            {
                "id": 4,
                "first_name": "Ross",
                "last_name": "Geller",
                "phone_numbers": Row(mobile= "82349238942", home= None, office= None, shop=None),
                "email": "ross@abc.com",
                "is_customer": True,
                "amount_paid": 1200.55,
                "customer_from": datetime.date(2021, 1, 19),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 1, 10, 0)
            },
            {
                "id": 5,
                "first_name": "Rachel",
                "last_name": "Green",
                "phone_numbers": Row(mobile= "82349238942", home= "2348910249", office= "8273929", shop= "5343434654"),
                "email": "rachel@abc.com",
                "is_customer": True,
                "amount_paid": None,
                "customer_from": datetime.date(2021, 2, 24),
                "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
            },
            {
                "id": 6,
                "first_name": "Chandler",
                "last_name": "Bing",
                "phone_numbers": Row(mobile= "8273929", home= None, office= None, shop=None),
                "email": "bing@abc.com",
                "is_customer": True,
                "amount_paid": 1000.80,
                "customer_from": datetime.date(2021, 2, 22),
                "last_updated_ts": datetime.datetime(2021, 2, 25, 7, 33, 0)
            }
        ]

In [66]:
user_df = spark.createDataFrame([Row(**user) for user in users])

In [67]:
user_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |    |-- office: string (nullable = true)
 |    |-- shop: string (nullable = true)
 |-- email: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)



In [69]:
user_df.select('id', 'phone_numbers').show(truncate=False)

+---+----------------------------------------------+
|id |phone_numbers                                 |
+---+----------------------------------------------+
|1  |[82349238942, 2348910249, 8273929,]           |
|2  |[82349238942, 2348910249,,]                   |
|3  |[,,,]                                         |
|4  |[82349238942,,,]                              |
|5  |[82349238942, 2348910249, 8273929, 5343434654]|
|6  |[8273929,,,]                                  |
+---+----------------------------------------------+



In [70]:
user_df.columns

['id',
 'first_name',
 'last_name',
 'phone_numbers',
 'email',
 'is_customer',
 'amount_paid',
 'customer_from',
 'last_updated_ts']

In [72]:
user_df.select('id', 'phone_numbers.mobile', 'phone_numbers.home').show(truncate=False)

+---+-----------+----------+
|id |mobile     |home      |
+---+-----------+----------+
|1  |82349238942|2348910249|
|2  |82349238942|2348910249|
|3  |null       |null      |
|4  |82349238942|null      |
|5  |82349238942|2348910249|
|6  |8273929    |null      |
+---+-----------+----------+



In [74]:
user_df.select('id', col('phone_numbers')['mobile'], col('phone_numbers')['home']).show(truncate=False)

+---+--------------------+------------------+
|id |phone_numbers.mobile|phone_numbers.home|
+---+--------------------+------------------+
|1  |82349238942         |2348910249        |
|2  |82349238942         |2348910249        |
|3  |null                |null              |
|4  |82349238942         |null              |
|5  |82349238942         |2348910249        |
|6  |8273929             |null              |
+---+--------------------+------------------+



In [75]:
user_df.select('id', 'phone_numbers.*').show(truncate=False)

+---+-----------+----------+-------+----------+
|id |mobile     |home      |office |shop      |
+---+-----------+----------+-------+----------+
|1  |82349238942|2348910249|8273929|null      |
|2  |82349238942|2348910249|null   |null      |
|3  |null       |null      |null   |null      |
|4  |82349238942|null      |null   |null      |
|5  |82349238942|2348910249|8273929|5343434654|
|6  |8273929    |null      |null   |null      |
+---+-----------+----------+-------+----------+

