In [0]:
from pyspark.sql import Row
import pandas as pd
from pyspark.sql.functions import *
import datetime
# Sample user records: one dict per user, every dict carrying the same keys.
# Users 1-3 are customers; users 4 and 5 are not (no courses, and both
# amount_paid and customer_from are None).
users = [
    dict(
        id=1,
        first_name="Corrie",
        last_name="Van den Oord",
        email="cvandenoord0@etsy.com",
        phone_numbers=Row(mobile="+1 234 567 8901", home="+1 234 567 8911"),
        courses=[1, 2],
        is_customer=True,
        amount_paid=1000.55,
        customer_from=datetime.date(2021, 1, 15),
        last_updated_ts=datetime.datetime(2021, 2, 10, 1, 15, 0),
    ),
    dict(
        id=2,
        first_name="Nikolaus",
        last_name="Brewitt",
        email="nbrewitt1@dailymail.co.uk",
        # NOTE(review): this home number has no leading "+" unlike the others - confirm intended
        phone_numbers=Row(mobile="+1 234 567 8923", home="1 234 567 8934"),
        courses=[3],
        is_customer=True,
        amount_paid=900.0,
        customer_from=datetime.date(2021, 2, 14),
        last_updated_ts=datetime.datetime(2021, 2, 18, 3, 33, 0),
    ),
    dict(
        id=3,
        first_name="Orelie",
        last_name="Penney",
        email="openney2@vistaprint.com",
        phone_numbers=Row(mobile="+1 714 512 9752", home="+1 714 512 6601"),
        courses=[2, 4],
        is_customer=True,
        amount_paid=850.55,
        customer_from=datetime.date(2021, 1, 21),
        last_updated_ts=datetime.datetime(2021, 3, 15, 15, 16, 55),
    ),
    dict(
        id=4,
        first_name="Ashby",
        last_name="Maddocks",
        email="amaddocks3@home.pl",
        phone_numbers=Row(mobile=None, home=None),
        courses=[],
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 10, 17, 45, 30),
    ),
    dict(
        id=5,
        first_name="Kurt",
        last_name="Rome",
        email="krome4@shutterfly.com",
        phone_numbers=Row(mobile="+1 817 934 7142", home=None),
        courses=[],
        is_customer=False,
        amount_paid=None,
        customer_from=None,
        last_updated_ts=datetime.datetime(2021, 4, 2, 0, 55, 18),
    ),
]

# Build the Spark DataFrame used by every cell below, going through pandas.
# `spark` is not created in this notebook - presumably the session object is
# injected by the notebook environment; confirm before running elsewhere.
#
# Arrow-based pandas<->Spark conversion is switched off first.
# NOTE(review): presumably because the nested Row / list columns do not convert
# cleanly through Arrow - confirm.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', False)
# NOTE(review): the pandas round-trip may turn the None values of amount_paid
# into NaN rather than Spark nulls; NaN and null are ordered differently by
# Spark, so check the output below before relying on null-ordering behaviour
# for that column.
users_df = spark.createDataFrame(pd.DataFrame(users))
users_df.show()

In [0]:
# Print the unsorted DataFrame again as a baseline for the sorted outputs that follow.
users_df.show()

In [0]:
# SORTING - quick tour; each technique is shown again in its own cell further down

# Basic: no direction given, so the order is ascending
users_df.orderBy('first_name').show()

# Choosing a direction: desc() function, Column.asc() method, ascending= keyword
users_df.sort(desc('first_name')).show()
users_df.sort(col('first_name').asc()).show()
users_df.sort('first_name', ascending=False).show()

# Nulls - place them explicitly instead of relying on the default position
users_df.sort(col('customer_from').asc_nulls_last()).show()
users_df.sort(col('customer_from').desc_nulls_first()).show()

# Composite sorting: several keys, each with its own direction
users_df.sort('is_customer', 'amount_paid').show()
users_df.sort('is_customer', desc('amount_paid')).show()
# one direction flag per key: is_customer descending, amount_paid ascending
users_df.sort(['is_customer', 'amount_paid'], ascending=[False, True]).show()

# Custom sorting logic: rank rows by number of courses
# (none -> 0, exactly one -> 1, anything else -> 2), then by amount_paid descending
sorting_category = (
    when(size('courses') == 0, 0)
    .when(size('courses') == 1, 1)
    .otherwise(2)
)
users_df.sort(sorting_category, col('amount_paid').desc()).show()

In [0]:
# Basic sorting - ascending is the default
# The same sort key is passed to sort() and to orderBy(); per the PySpark API
# docs orderBy is an alias of sort, so both outputs should match.

users_df.sort('first_name').show()
users_df.orderBy('first_name').show()

In [0]:
# Descending - 3 ways:
#   desc()           - function (from pyspark.sql.functions) applied to the column name
#   col().desc()     - method called on a Column object
#   ascending=False  - keyword argument of sort(), with the column given by name
# All three calls below sort on first_name.

users_df.sort(desc('first_name')).show()
users_df.sort(col('first_name').desc()).show()
users_df.sort('first_name', ascending=False).show()

In [0]:
# Nulls are treated as the lowest value by default...
# i.e. they come first when sorting ascending and last when sorting descending.
# customer_from is None for users 4 and 5 in the source data, which is what
# makes the effect visible here.

users_df.sort('customer_from').show()
users_df.sort(desc('customer_from')).show()

In [0]:
# ... but we can change it with these methods:
#   asc_nulls_last()    - ascending order, nulls after the non-null values
#   desc_nulls_first()  - descending order, nulls before the non-null values

users_df.sort(col('customer_from').asc_nulls_last()).show()
users_df.sort(col('customer_from').desc_nulls_first()).show()

In [0]:
# Composite sorting: several sort keys, each with its own direction

# both keys ascending (no direction given)
users_df.sort('is_customer', 'amount_paid').show()
# is_customer ascending, amount_paid descending
users_df.sort('is_customer', desc('amount_paid')).show()
# keys as a list plus one direction flag per key:
# is_customer descending, amount_paid ascending
users_df.sort(['is_customer', 'amount_paid'], ascending=[False, True]).show()

In [0]:
# Custom sorting logic
# when(condition, value).when(condition2, value2)[...].otherwise(default)
# Conditions are evaluated in order; rows matching none get the otherwise() value.

# Rank by number of courses: none -> 0, exactly one -> 1, anything else -> 2
sorting_category = (
    when(size('courses') == 0, 0)
    .when(size('courses') == 1, 1)
    .otherwise(2)
)
users_df.sort(sorting_category, col('amount_paid').desc()).show()