In [0]:
import datetime

from pyspark.sql import Row
from pyspark.sql.functions import col, explode

# Sample users whose 'phone_numbers' field is a plain Python list of strings
# (user 4 has no list at all, user 5 has a single entry).
users_with_lists = [
    dict(
        id=1,
        first_name="Corrie",
        last_name="Van den Oord",
        email="cvandenoord0@etsy.com",
        phone_numbers=["+1 234 567 8901", "+1 234 567 8911"],
        is_customer=True,
        amount_paid=1000.55,
        customer_from=datetime.date(2021, 1, 15),
        last_updated_ts=datetime.datetime(2021, 2, 10, 1, 15, 0),
    ),
    dict(
        id=2,
        first_name="Nikolaus",
        last_name="Brewitt",
        email="nbrewitt1@dailymail.co.uk",
        phone_numbers=["+1 234 567 8923", "+1 234 567 8934"],
        is_customer=True,
        amount_paid=900.0,
        customer_from=datetime.date(2021, 2, 14),
        last_updated_ts=datetime.datetime(2021, 2, 18, 3, 33, 0),
    ),
    dict(
        id=3,
        first_name="Orelie",
        last_name="Penney",
        email="openney2@vistaprint.com",
        phone_numbers=["+1 714 512 9752", "+1 714 512 6601"],
        is_customer=True,
        amount_paid=850.55,
        customer_from=datetime.date(2021, 1, 21),
        last_updated_ts=datetime.datetime(2021, 3, 15, 15, 16, 55),
    ),
    dict(
        id=4,
        first_name="Ashby",
        last_name="Maddocks",
        email="amaddocks3@home.pl",
        phone_numbers=None,
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 10, 17, 45, 30),
    ),
    dict(
        id=5,
        first_name="Kurt",
        last_name="Rome",
        email="krome4@shutterfly.com",
        phone_numbers=["+1 817 934 7142"],
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 2, 0, 55, 18),
    ),
]

# Sample users whose 'phone_numbers' field is a Python dict (key -> value).
# Rows do not have to share the same keys: user 4 has no dict at all and
# user 5 carries an extra 'age' key while lacking 'home'.
# All dict values are kept as strings so that every row agrees on a single
# value type for the column (a map column has one key type and one value type).
users_with_dicts = [
    {
        "id": 1,
        "first_name": "Corrie",
        "last_name": "Van den Oord",
        "email": "cvandenoord0@etsy.com",
        "phone_numbers": {"mobile": "+1 234 567 8901", "home": "+1 234 567 8911"},
        "is_customer": True,
        "amount_paid": 1000.55,
        "customer_from": datetime.date(2021, 1, 15),
        "last_updated_ts": datetime.datetime(2021, 2, 10, 1, 15, 0)
    },
    {
        "id": 2,
        "first_name": "Nikolaus",
        "last_name": "Brewitt",
        "email": "nbrewitt1@dailymail.co.uk",
        "phone_numbers": {"mobile": "+1 234 567 8923", "home": "+1 234 567 8934"},
        "is_customer": True,
        "amount_paid": 900.0,
        "customer_from": datetime.date(2021, 2, 14),
        "last_updated_ts": datetime.datetime(2021, 2, 18, 3, 33, 0)
    },
    {
        "id": 3,
        "first_name": "Orelie",
        "last_name": "Penney",
        "email": "openney2@vistaprint.com",
        "phone_numbers": {"mobile": "+1 714 512 9752", "home": "+1 714 512 6601"},
        "is_customer": True,
        "amount_paid": 850.55,
        "customer_from": datetime.date(2021, 1, 21),
        "last_updated_ts": datetime.datetime(2021, 3, 15, 15, 16, 55)
    },
    {
        "id": 4,
        "first_name": "Ashby",
        "last_name": "Maddocks",
        "email": "amaddocks3@home.pl",
        "phone_numbers": None,
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 10, 17, 45, 30)
    },
    {
        "id": 5,
        "first_name": "Kurt",
        "last_name": "Rome",
        "email": "krome4@shutterfly.com",
        # 'age' is stored as the string "25" (not the int 25) to match the
        # string values used by every other entry of this column.
        "phone_numbers": {"mobile": "+1 817 934 7142", "age": "25"},
        "is_customer": False,
        "amount_paid": None,
        "customer_from": None,
        "last_updated_ts": datetime.datetime(2021, 4, 2, 0, 55, 18)
    }
]

# Sample users whose 'phone_numbers' field is a PySpark Row with the named
# fields 'mobile' and 'home'; missing numbers are written as explicit None.
users_with_row = [
    dict(
        id=1,
        first_name="Corrie",
        last_name="Van den Oord",
        email="cvandenoord0@etsy.com",
        phone_numbers=Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        is_customer=True,
        amount_paid=1000.55,
        customer_from=datetime.date(2021, 1, 15),
        last_updated_ts=datetime.datetime(2021, 2, 10, 1, 15, 0),
    ),
    dict(
        id=2,
        first_name="Nikolaus",
        last_name="Brewitt",
        email="nbrewitt1@dailymail.co.uk",
        phone_numbers=Row(mobile="+1 234 567 8923", home="+1 234 567 8934"),
        is_customer=True,
        amount_paid=900.0,
        customer_from=datetime.date(2021, 2, 14),
        last_updated_ts=datetime.datetime(2021, 2, 18, 3, 33, 0),
    ),
    dict(
        id=3,
        first_name="Orelie",
        last_name="Penney",
        email="openney2@vistaprint.com",
        phone_numbers=Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        is_customer=True,
        amount_paid=850.55,
        customer_from=datetime.date(2021, 1, 21),
        last_updated_ts=datetime.datetime(2021, 3, 15, 15, 16, 55),
    ),
    dict(
        id=4,
        first_name="Ashby",
        last_name="Maddocks",
        email="amaddocks3@home.pl",
        phone_numbers=Row(mobile=None, home=None),
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 10, 17, 45, 30),
    ),
    dict(
        id=5,
        first_name="Kurt",
        last_name="Rome",
        email="krome4@shutterfly.com",
        phone_numbers=Row(mobile="+1 817 934 7142", home=None),
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 2, 0, 55, 18),
    ),
]

In [0]:
# Three container column types and the Python objects they are created from:
# Arrays  <-- list (elements are accessed by index)
# Maps    <-- dict (values are accessed by key)
# Structs <-- PySpark's Row (fields are accessed by dot notation or by key)

# col(container_type_column) - lets us access the elements of the container column
# col(XXX).alias('new_name') - the equivalent of SQL's SELECT XXX AS new_name
# explode(array_or_map_column) - turns every element of the container into its own row

## Arrays
#### Perfect for lists of values

In [0]:
# Schema inference turns the Python list stored in 'phone_numbers'
# into a Spark array column (array<string>).

users_with_lists_df = spark.createDataFrame(
    [Row(**user_fields) for user_fields in users_with_lists]
)
users_with_lists_df.printSchema()
users_with_lists_df.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)

+---+----------+------------+-------------------------+----------------------------------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |phone_numbers                     |is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+----------------------------------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy.com    |[+1 234 567 8901, +1 234 567 8911]|true       |1000.55    |2021-01-1

In [0]:
# dtypes lists every column as a (name, type string) pair;
# the list-backed column is reported as 'array<string>'.
users_with_lists_df.dtypes

[('id', 'bigint'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('email', 'string'),
 ('phone_numbers', 'array<string>'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'double'),
 ('customer_from', 'date'),
 ('last_updated_ts', 'timestamp')]

In [0]:
# Array elements are addressed by position: col(...)[i] picks element i and
# alias() names the resulting column.
# A missing element (user 5 has one number) and a missing array (user 4)
# both show up as NULL in the output below.
# NOTE(review): with ANSI mode enabled an out-of-range index may raise instead
# of returning NULL - confirm for the Spark version in use.

users_with_lists_df.select(
    "id",
    col("phone_numbers")[0].alias("mobile"),
    col("phone_numbers")[1].alias("home"),
).show(truncate=False)

+---+---------------+---------------+
|id |mobile         |home           |
+---+---------------+---------------+
|1  |+1 234 567 8901|+1 234 567 8911|
|2  |+1 234 567 8923|+1 234 567 8934|
|3  |+1 714 512 9752|+1 714 512 6601|
|4  |NULL           |NULL           |
|5  |+1 817 934 7142|NULL           |
+---+---------------+---------------+



## Maps
#### Great for flexible key-value pairs

In [0]:
# Schema inference turns the Python dict stored in 'phone_numbers'
# into a Spark map column (string keys, string values).

users_with_dicts_df = spark.createDataFrame(
    [Row(**user_fields) for user_fields in users_with_dicts]
)
users_with_dicts_df.printSchema()
users_with_dicts_df.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)

+---+----------+------------+-------------------------+----------------------------------------------------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |phone_numbers                                       |is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+----------------------------------------------------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy

In [0]:
# dtypes lists every column as a (name, type string) pair;
# the dict-backed column is reported as 'map<string,string>'.
users_with_dicts_df.dtypes

[('id', 'bigint'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('email', 'string'),
 ('phone_numbers', 'map<string,string>'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'double'),
 ('customer_from', 'date'),
 ('last_updated_ts', 'timestamp')]

In [0]:
# Map values are looked up with col(...)[key]; in contrast to arrays,
# the lookup uses a key instead of a positional index.
#
# Rows do not need to share the same keys: when a row's map lacks the
# requested key, the result for that row is NULL.
# Columns selected without alias() get a generated name such as phone_numbers[home].

users_with_dicts_df.select(
    "id",
    col("phone_numbers")["mobile"].alias("mobile"),
    col("phone_numbers")["home"],
    col("phone_numbers")["age"],
).show(truncate=False)

+---+---------------+-------------------+------------------+
|id |mobile         |phone_numbers[home]|phone_numbers[age]|
+---+---------------+-------------------+------------------+
|1  |+1 234 567 8901|+1 234 567 8911    |NULL              |
|2  |+1 234 567 8923|+1 234 567 8934    |NULL              |
|3  |+1 714 512 9752|+1 714 512 6601    |NULL              |
|4  |NULL           |NULL               |NULL              |
|5  |+1 817 934 7142|NULL               |25                |
+---+---------------+-------------------+------------------+



## Structs
#### The best for fixed, known fields

In [0]:
# Structs resemble maps, but their set of fields is fixed: every row has to
# provide every field. That is why the 4th and 5th users spell out their
# missing phone numbers as explicit None values.

users_with_row_df = spark.createDataFrame(
    [Row(**user_fields) for user_fields in users_with_row]
)
users_with_row_df.printSchema()
users_with_row_df.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: string (nullable = true)
 |    |-- home: string (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- amount_paid: double (nullable = true)
 |-- customer_from: date (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)

+---+----------+------------+-------------------------+----------------------------------+-----------+-----------+-------------+-------------------+
|id |first_name|last_name   |email                    |phone_numbers                     |is_customer|amount_paid|customer_from|last_updated_ts    |
+---+----------+------------+-------------------------+----------------------------------+-----------+-----------+-------------+-------------------+
|1  |Corrie    |Van den Oord|cvandenoord0@etsy.com    |{+1 234 567 8901, +1 234 567 89

In [0]:
# dtypes lists every column as a (name, type string) pair;
# the Row-backed column is reported as 'struct<mobile:string,home:string>'.
users_with_row_df.dtypes

[('id', 'bigint'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('email', 'string'),
 ('phone_numbers', 'struct<mobile:string,home:string>'),
 ('is_customer', 'boolean'),
 ('amount_paid', 'double'),
 ('customer_from', 'date'),
 ('last_updated_ts', 'timestamp')]

In [0]:
# Struct fields can be read either as attributes (.mobile) or with a key
# lookup (['home']); both forms are shown here.
# The un-aliased field gets the generated column name phone_numbers.home.

users_with_row_df.select(
    "id",
    col("phone_numbers").mobile.alias("mobile"),
    col("phone_numbers")["home"],
).show(truncate=False)

+---+---------------+------------------+
|id |mobile         |phone_numbers.home|
+---+---------------+------------------+
|1  |+1 234 567 8901|+1 234 567 8911   |
|2  |+1 234 567 8923|+1 234 567 8934   |
|3  |+1 714 512 9752|+1 714 512 6601   |
|4  |NULL           |NULL              |
|5  |+1 817 934 7142|NULL              |
+---+---------------+------------------+



#### Explode

In [0]:
# explode() produces one output row per array element, repeating the other
# selected columns ('id' here) next to each element.
# The generated column is named 'col' unless it is given an alias.
# A NULL array produces no rows at all, which is why user 4 is absent from
# the result below.
# (explode is imported in the import cell at the top of the notebook, so all
# dependencies are declared in one place and survive Restart & Run All.)

users_with_lists_df.select("id", explode("phone_numbers")).show()

+---+---------------+
| id|            col|
+---+---------------+
|  1|+1 234 567 8901|
|  1|+1 234 567 8911|
|  2|+1 234 567 8923|
|  2|+1 234 567 8934|
|  3|+1 714 512 9752|
|  3|+1 714 512 6601|
|  5|+1 817 934 7142|
+---+---------------+

