## Joining Spark DataFrames

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.utils import AnalysisException
import datetime

In [2]:
# Name used as the prefix of the Spark application name.
userName = 'CodeInDNA'

# Build the session through the builder API with a descriptive application name.
spark = (
    SparkSession.builder
    .appName(f'{userName} - JoinSparkDF')
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook renders it as the cell output.
spark

In [4]:
# Sample course catalogue: one dict per course, keyed by column name.
# Each tuple below is paired with the column names in order.
courses = [
    dict(zip(
        ('course_id', 'course_title', 'course_published_dt', 'is_active', 'last_updated_ts'),
        record
    ))
    for record in (
        (1, 'Mastering Python', datetime.date(2021, 1, 14), True,
         datetime.datetime(2021, 2, 18, 16, 57, 25)),
        (2, 'Data Engineering Essentials', datetime.date(2021, 2, 10), True,
         datetime.datetime(2021, 3, 5, 12, 7, 33)),
        (3, 'Mastering PySpark', datetime.date(2021, 1, 7), True,
         datetime.datetime(2021, 4, 6, 10, 5, 42)),
        (4, 'AWS Essentials', datetime.date(2021, 3, 19), False,
         datetime.datetime(2021, 4, 10, 2, 25, 36)),
        (5, 'Docker 101', datetime.date(2021, 2, 28), True,
         datetime.datetime(2021, 3, 21, 7, 18, 52)),
    )
]

In [5]:
# Wrap each course dict in a Row, then build the DataFrame from those rows.
course_rows = [Row(**course) for course in courses]
courses_df = spark.createDataFrame(course_rows)

courses_df.show()

+---------+--------------------+-------------------+---------+-------------------+
|course_id|        course_title|course_published_dt|is_active|    last_updated_ts|
+---------+--------------------+-------------------+---------+-------------------+
|        1|    Mastering Python|         2021-01-14|     true|2021-02-18 16:57:25|
|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05 12:07:33|
|        3|   Mastering PySpark|         2021-01-07|     true|2021-04-06 10:05:42|
|        4|      AWS Essentials|         2021-03-19|    false|2021-04-10 02:25:36|
|        5|          Docker 101|         2021-02-28|     true|2021-03-21 07:18:52|
+---------+--------------------+-------------------+---------+-------------------+



In [6]:
# Sample users: one dict per user, keyed by column name.
# Each tuple below is paired with the column names in order.
users = [
    dict(zip(
        ("user_id", "user_first_name", "user_last_name", "user_email"),
        record
    ))
    for record in (
        (1, "Phoebe", "Buffay", "phoebebuffay@example.com"),
        (2, "Joey", "Tribiyani", "joeytribiyani@example.com"),
        (3, "Ross", "Geller", "rossgeller@example.com"),
        (4, "Monica", "Geller", "monicageller@example.com"),
        (5, "Chandler", "Bing", "chandlerbing@example.com"),
        (6, "Sandra", "Sharma", "sandrasharma@example.com"),
        (7, "Judit", "Stevenson", "juditstevenson@example.com"),
        (8, "Jane", "Doe", "janedoe@example.com"),
        (9, "Jerry", "Colt", "jerrycolt@example.com"),
        (10, "Amiya", "Shetty", "amiyashetty@example.com"),
    )
]

In [7]:
# Wrap each user dict in a Row, then build the DataFrame from those rows.
user_rows = [Row(**user) for user in users]
users_df = spark.createDataFrame(user_rows)

users_df.show()

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|
+-------+---------------+--------------+--------------------+
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|
|      3|           Ross|        Geller|rossgeller@exampl...|
|      4|         Monica|        Geller|monicageller@exam...|
|      5|       Chandler|          Bing|chandlerbing@exam...|
|      6|         Sandra|        Sharma|sandrasharma@exam...|
|      7|          Judit|     Stevenson|juditstevenson@ex...|
|      8|           Jane|           Doe| janedoe@example.com|
|      9|          Jerry|          Colt|jerrycolt@example...|
|     10|          Amiya|        Shetty|amiyashetty@examp...|
+-------+---------------+--------------+--------------------+



In [8]:
# Sample enrolments: one dict per enrolment, linking a user_id to a course_id.
# Each tuple below is paired with the column names in order.
course_enrolments = [
    dict(zip(
        ("course_enrolment_id", "user_id", "course_id", "price_pad"),
        record
    ))
    for record in (
        (1, 10, 2, 9.99),
        (2, 5, 2, 9.99),
        (3, 7, 5, 10.99),
        (4, 9, 2, 9.99),
        (5, 8, 2, 10.99),
        (6, 5, 5, 10.99),
        (7, 4, 5, 10.99),
        (8, 7, 3, 10.99),
        (9, 8, 5, 10.99),
        (10, 3, 3, 10.99),
        (11, 7, 5, 10.99),
        (12, 3, 2, 9.99),
        (13, 5, 2, 9.99),
        (14, 4, 3, 10.99),
        (15, 8, 2, 9.99),
    )
]

In [9]:
# Wrap each enrolment dict in a Row, then build the DataFrame from those rows.
enrolment_rows = [Row(**ce) for ce in course_enrolments]
course_enrolment_df = spark.createDataFrame(enrolment_rows)

course_enrolment_df.show()

+-------------------+-------+---------+---------+
|course_enrolment_id|user_id|course_id|price_pad|
+-------------------+-------+---------+---------+
|                  1|     10|        2|     9.99|
|                  2|      5|        2|     9.99|
|                  3|      7|        5|    10.99|
|                  4|      9|        2|     9.99|
|                  5|      8|        2|    10.99|
|                  6|      5|        5|    10.99|
|                  7|      4|        5|    10.99|
|                  8|      7|        3|    10.99|
|                  9|      8|        5|    10.99|
|                 10|      3|        3|    10.99|
|                 11|      7|        5|    10.99|
|                 12|      3|        2|     9.99|
|                 13|      5|        2|     9.99|
|                 14|      4|        3|    10.99|
|                 15|      8|        2|     9.99|
+-------------------+-------+---------+---------+



* **Inner Join** - Join or Inner Join
* **Left or Right Outer Join**
* **Full Outer Join** - **a left outer join b** union **a right outer join b**
* **Cross Join**
* Spark Data Frames have a function called `join`. It can be used to perform inner or outer or full outer join.
* We need to specify **join condition** for inner or outer or full outer join.

In [10]:
# Show the signature and docstring of DataFrame.join (parameters: other, on, how).
help(courses_df.join)

Help on method join in module pyspark.sql.dataframe:

join(other, on=None, how=None) method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    :param other: Right side of the join
    :param on: a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    :param how: str, default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,
        ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
        ``anti``, ``leftanti`` and ``left_anti``.
    
    The following performs a full outer join between ``df1`` and ``df2``.
    >>> from pyspark.sql.functions impo

* Get the user details who have enrolled for the courses.
    * Need to join **users_df** and **course_enrolment_df**.
    * Here are the fields that need to be displayed.
        * All fields from users_df
        * course_id and course_enrolment_id from course_enrolment_df

In [11]:
# Print the column names and types of users_df before joining.
users_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- user_first_name: string (nullable = true)
 |-- user_last_name: string (nullable = true)
 |-- user_email: string (nullable = true)



In [12]:
# Print the column names and types of course_enrolment_df; user_id is the column shared with users_df.
course_enrolment_df.printSchema()

root
 |-- course_enrolment_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- course_id: long (nullable = true)
 |-- price_pad: double (nullable = true)



In [13]:
# Inner join on an explicit column expression; the result keeps user_id from both sides.
(
    users_df
    .join(course_enrolment_df, users_df.user_id == course_enrolment_df.user_id)
    .show()
)

+-------+---------------+--------------+--------------------+-------------------+-------+---------+---------+
|user_id|user_first_name|user_last_name|          user_email|course_enrolment_id|user_id|course_id|price_pad|
+-------+---------------+--------------+--------------------+-------------------+-------+---------+---------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  3|      7|        5|    10.99|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  8|      7|        3|    10.99|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                 11|      7|        5|    10.99|
|      9|          Jerry|          Colt|jerrycolt@example...|                  4|      9|        2|     9.99|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  2|      5|        2|     9.99|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  6|      5|        5|    10.99|
|      5| 

In [14]:
# Joining on the column name (instead of an expression) keeps a single user_id column in the result.
(
    users_df
    .join(course_enrolment_df, on='user_id')
    .show()
)

+-------+---------------+--------------+--------------------+-------------------+---------+---------+
|user_id|user_first_name|user_last_name|          user_email|course_enrolment_id|course_id|price_pad|
+-------+---------------+--------------+--------------------+-------------------+---------+---------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  3|        5|    10.99|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  8|        3|    10.99|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                 11|        5|    10.99|
|      9|          Jerry|          Colt|jerrycolt@example...|                  4|        2|     9.99|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  2|        2|     9.99|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  6|        5|    10.99|
|      5|       Chandler|          Bing|chandlerbing@exam...|                 13| 

In [15]:
# Inner join on user_id, then project every users_df column plus the two enrolment identifiers.
(
    users_df
    .join(course_enrolment_df, on='user_id')
    .select(
        users_df['*'],
        course_enrolment_df['course_id'],
        course_enrolment_df['course_enrolment_id']
    )
    .show()
)

+-------+---------------+--------------+--------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|course_enrolment_id|
+-------+---------------+--------------+--------------------+---------+-------------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                  3|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        3|                  8|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                 11|
|      9|          Jerry|          Colt|jerrycolt@example...|        2|                  4|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|                  2|
|      5|       Chandler|          Bing|chandlerbing@exam...|        5|                  6|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|                 13|
|     10|          Amiya|        Shetty|amiyashetty@examp...|        2|         

In [16]:
# Same projection as before, but the columns are addressed through the aliases 'u' and 'ce'.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), on='user_id')
    .select('u.*', 'ce.course_id', 'ce.course_enrolment_id')
    .show()
)

+-------+---------------+--------------+--------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|course_enrolment_id|
+-------+---------------+--------------+--------------------+---------+-------------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                  3|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        3|                  8|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                 11|
|      9|          Jerry|          Colt|jerrycolt@example...|        2|                  4|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|                  2|
|      5|       Chandler|          Bing|chandlerbing@exam...|        5|                  6|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|                 13|
|     10|          Amiya|        Shetty|amiyashetty@examp...|        2|         

In [17]:
# Get number of courses enrolled by each user
# Fails as user_id is part of both the dataframes: the expression-based join keeps
# both user_id columns, so the bare name in groupBy cannot be resolved.
# The expected AnalysisException is caught and printed so that running the whole
# notebook top to bottom does not stop at this cell.
try:
    (
        users_df.alias('u')
        .join(course_enrolment_df.alias('ce'), users_df.user_id == course_enrolment_df.user_id)
        .groupBy('user_id')
        .count()
        .show()
    )
except AnalysisException as error:
    print(f'AnalysisException: {error}')

AnalysisException: Reference 'user_id' is ambiguous, could be: u.user_id, ce.user_id.;

In [18]:
# Joining on the column name leaves one user_id column, so grouping by the bare name works.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), 'user_id')
    .groupBy('user_id')
    .count()
    .show()
)

+-------+-----+
|user_id|count|
+-------+-----+
|      7|    3|
|      9|    1|
|      5|    3|
|     10|    1|
|      3|    2|
|      8|    3|
|      4|    2|
+-------+-----+



In [19]:
# Same aggregation, with the grouping column qualified by the 'u' alias.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), 'user_id')
    .groupBy('u.user_id')
    .count()
    .show()
)

+-------+-----+
|user_id|count|
+-------+-----+
|      7|    3|
|      9|    1|
|      5|    3|
|     10|    1|
|      3|    2|
|      8|    3|
|      4|    2|
+-------+-----+



### LEFT JOIN

* Get all the user details along with course enrolment details (if the user has any course enrolments).
* If a user does not have any course enrolments, we still need to get all the user details. The course enrolment details will be substituted with null values.
    * Need to perform a left or right outer join between **users_df** and **course_enrolment_df**.
    * We will use left for this notebook. As `users_df` is from the parent table and we are going to use `left outer join`, we need to invoke `join` on top of `users_df`.
    * Here are the fields that need to be displayed.
        * All fields from `users_df`
        * `course_id` and `course_enrolment_id` from `course_enrolment_df`
    * For this example, with these DataFrames, using just `outer` also gives the same results, because every enrolment refers to an existing user. But it is not correct to use `outer`.
    * `how='outer'` means **full outer join**.

In [20]:
# Left outer join: every user is kept; enrolment columns are null for users without enrolments.
# 'left_outer' is one of the spellings listed in the join() help for the `how` parameter.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), on='user_id', how='left_outer')
    .select('u.*', 'ce.course_id', 'ce.course_enrolment_id')
    .show()
)

+-------+---------------+--------------+--------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|course_enrolment_id|
+-------+---------------+--------------+--------------------+---------+-------------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                  3|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        3|                  8|
|      7|          Judit|     Stevenson|juditstevenson@ex...|        5|                 11|
|      6|         Sandra|        Sharma|sandrasharma@exam...|     null|               null|
|      9|          Jerry|          Colt|jerrycolt@example...|        2|                  4|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|                  2|
|      5|       Chandler|          Bing|chandlerbing@exam...|        5|                  6|
|      5|       Chandler|          Bing|chandlerbing@exam...|        2|         

In [21]:
# Get all the users who have not enrolled for any courses
# Recommended to use primary key in the child table when comparing with null values,
# so the filter tests ce.course_enrolment_id (the enrolment table's own id column).
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), 'user_id', how='left')
    .filter('ce.course_enrolment_id IS NULL')
    .select('u.*', 'ce.course_id', 'ce.course_enrolment_id')
    .show()
)

+-------+---------------+--------------+--------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|course_enrolment_id|
+-------+---------------+--------------+--------------------+---------+-------------------+
|      6|         Sandra|        Sharma|sandrasharma@exam...|     null|               null|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|     null|               null|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|     null|               null|
+-------+---------------+--------------+--------------------+---------+-------------------+



* Get the number of courses each user is enrolled in.
* If a user has no enrolments, the count should be 0.

In [22]:
# count will give incorrect results
# Even though users 1, 2, 6 are not enrolled for any courses, it returns 1
# (the left join still emits one row for each of those users, and count() counts rows).
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), users_df.user_id == course_enrolment_df.user_id, 'left')
    .groupBy('u.user_id')
    .count()
    .orderBy('u.user_id')
    .show()
)

+-------+-----+
|user_id|count|
+-------+-----+
|      1|    1|
|      2|    1|
|      3|    2|
|      4|    2|
|      5|    3|
|      6|    1|
|      7|    3|
|      8|    3|
|      9|    1|
|     10|    1|
+-------+-----+



In [23]:
# Solution
# Add 1 for every row whose enrolment-side user_id is present and 0 when it is null,
# so users without enrolments end up with a course_count of 0.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), users_df.user_id == course_enrolment_df.user_id, 'left')
    .groupBy('u.user_id')
    .agg(
        sum(
            when(course_enrolment_df['user_id'].isNull(), 0).otherwise(1)
        ).alias('course_count')
    )
    .orderBy('u.user_id')
    .show()
)

+-------+------------+
|user_id|course_count|
+-------+------------+
|      1|           0|
|      2|           0|
|      3|           2|
|      4|           2|
|      5|           3|
|      6|           0|
|      7|           3|
|      8|           3|
|      9|           1|
|     10|           1|
+-------+------------+



In [24]:
# SQL STYLE Syntax
# Same 0/1 flag as the when/otherwise version, written as a CASE expression on the 'ce' alias.
(
    users_df.alias('u')
    .join(course_enrolment_df.alias('ce'), users_df.user_id == course_enrolment_df.user_id, 'left')
    .groupBy('u.user_id')
    .agg(sum(expr("""
        CASE WHEN ce.user_id IS NULL THEN 0 ELSE 1 END 
    """)).alias('course_count'))
    .orderBy('u.user_id')
    .show()
)

+-------+------------+
|user_id|course_count|
+-------+------------+
|      1|           0|
|      2|           0|
|      3|           2|
|      4|           2|
|      5|           3|
|      6|           0|
|      7|           3|
|      8|           3|
|      9|           1|
|     10|           1|
+-------+------------+



### RIGHT JOIN

In [25]:
 # Right outer join with users_df on the right; the expression condition keeps both user_id columns.
 (
     course_enrolment_df
     .join(users_df, course_enrolment_df.user_id == users_df.user_id, 'right')
     .show()
 )

+-------------------+-------+---------+---------+-------+---------------+--------------+--------------------+
|course_enrolment_id|user_id|course_id|price_pad|user_id|user_first_name|user_last_name|          user_email|
+-------------------+-------+---------+---------+-------+---------------+--------------+--------------------+
|                  3|      7|        5|    10.99|      7|          Judit|     Stevenson|juditstevenson@ex...|
|                  8|      7|        3|    10.99|      7|          Judit|     Stevenson|juditstevenson@ex...|
|                 11|      7|        5|    10.99|      7|          Judit|     Stevenson|juditstevenson@ex...|
|               null|   null|     null|     null|      6|         Sandra|        Sharma|sandrasharma@exam...|
|                  4|      9|        2|     9.99|      9|          Jerry|          Colt|jerrycolt@example...|
|                  2|      5|        2|     9.99|      5|       Chandler|          Bing|chandlerbing@exam...|
|         

In [26]:
 # Right outer join on the column name, which leaves a single user_id column in the result.
 (
     course_enrolment_df
     .join(users_df, 'user_id', 'right')
     .show()
 )

+-------+-------------------+---------+---------+---------------+--------------+--------------------+
|user_id|course_enrolment_id|course_id|price_pad|user_first_name|user_last_name|          user_email|
+-------+-------------------+---------+---------+---------------+--------------+--------------------+
|      7|                  3|        5|    10.99|          Judit|     Stevenson|juditstevenson@ex...|
|      7|                  8|        3|    10.99|          Judit|     Stevenson|juditstevenson@ex...|
|      7|                 11|        5|    10.99|          Judit|     Stevenson|juditstevenson@ex...|
|      6|               null|     null|     null|         Sandra|        Sharma|sandrasharma@exam...|
|      9|                  4|        2|     9.99|          Jerry|          Colt|jerrycolt@example...|
|      5|                  2|        2|     9.99|       Chandler|          Bing|chandlerbing@exam...|
|      5|                  6|        5|    10.99|       Chandler|          Bing|ch

In [27]:
# Right outer join, projecting all users_df columns followed by the two enrolment identifiers.
(
    course_enrolment_df
    .join(users_df, users_df['user_id'] == course_enrolment_df.user_id, 'right')
    .select(
        users_df["*"],
        course_enrolment_df['course_enrolment_id'],
        course_enrolment_df['course_id']
    )
    .show()
)

+-------+---------------+--------------+--------------------+-------------------+---------+
|user_id|user_first_name|user_last_name|          user_email|course_enrolment_id|course_id|
+-------+---------------+--------------+--------------------+-------------------+---------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  3|        5|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                  8|        3|
|      7|          Judit|     Stevenson|juditstevenson@ex...|                 11|        5|
|      6|         Sandra|        Sharma|sandrasharma@exam...|               null|     null|
|      9|          Jerry|          Colt|jerrycolt@example...|                  4|        2|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  2|        2|
|      5|       Chandler|          Bing|chandlerbing@exam...|                  6|        5|
|      5|       Chandler|          Bing|chandlerbing@exam...|                 13

In [28]:
# Users without any enrolment, found through a right outer join.
# The null check uses ce.course_enrolment_id, the enrolment table's own id column,
# rather than a foreign-key column such as course_id.
(
    course_enrolment_df.alias('ce')
    .join(users_df.alias('u'), 'user_id', how='right')
    .filter('ce.course_enrolment_id IS NULL')
    .select('u.*', 'ce.course_id', 'ce.course_enrolment_id')
    .show()
)

+-------+---------------+--------------+--------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|course_enrolment_id|
+-------+---------------+--------------+--------------------+---------+-------------------+
|      6|         Sandra|        Sharma|sandrasharma@exam...|     null|               null|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|     null|               null|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|     null|               null|
+-------+---------------+--------------+--------------------+---------+-------------------+



In [29]:
# count will give incorrect results
# Even though users 1, 2, 6 are not enrolled for any courses, it returns 1
# (the right join still emits one row for each of those users, and count() counts rows).
(
    course_enrolment_df.alias('ce')
    .join(users_df.alias('u'), users_df.user_id == course_enrolment_df.user_id, 'right')
    .groupBy('u.user_id')
    .count()
    .orderBy('u.user_id')
    .show()
)

+-------+-----+
|user_id|count|
+-------+-----+
|      1|    1|
|      2|    1|
|      3|    2|
|      4|    2|
|      5|    3|
|      6|    1|
|      7|    3|
|      8|    3|
|      9|    1|
|     10|    1|
+-------+-----+



In [30]:
# Add 1 for every row whose enrolment-side user_id is present and 0 when it is null,
# so users without enrolments end up with a course_count of 0.
(
    course_enrolment_df.alias('ce')
    .join(users_df.alias('u'), users_df.user_id == course_enrolment_df.user_id, 'right')
    .groupBy('u.user_id')
    .agg(
        sum(
            when(course_enrolment_df['user_id'].isNull(), 0).otherwise(1)
        ).alias('course_count')
    )
    .orderBy('u.user_id')
    .show()
)

+-------+------------+
|user_id|course_count|
+-------+------------+
|      1|           0|
|      2|           0|
|      3|           2|
|      4|           2|
|      5|           3|
|      6|           0|
|      7|           3|
|      8|           3|
|      9|           1|
|     10|           1|
+-------+------------+



**NOTE:**
* Left or Right is used based on the side of the driving DataFrame.
* Between users and course_enrolments, users is typically the driving DataFrame, as there is a one-to-many relationship between users and course_enrolments.
* Here is how we typically perform outer join between **users** and **course_enrolments**.
    * **users** left outer join **course_enrolments**.
    * **course_enrolments** right outer join **users**.
* Also here is how we typically perform outer join between **courses** and **course_enrolments**.
    * **courses** left outer join **course_enrolments**.
    * **course_enrolments** right outer join **courses**.

### FULL OUTER JOIN

In [32]:
# Re-display users_df for reference before the full outer join examples.
users_df.show()

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|
+-------+---------------+--------------+--------------------+
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|
|      3|           Ross|        Geller|rossgeller@exampl...|
|      4|         Monica|        Geller|monicageller@exam...|
|      5|       Chandler|          Bing|chandlerbing@exam...|
|      6|         Sandra|        Sharma|sandrasharma@exam...|
|      7|          Judit|     Stevenson|juditstevenson@ex...|
|      8|           Jane|           Doe| janedoe@example.com|
|      9|          Jerry|          Colt|jerrycolt@example...|
|     10|          Amiya|        Shetty|amiyashetty@examp...|
+-------+---------------+--------------+--------------------+



In [34]:
# Second user list: ids 1, 3, 6, 7 and 8 repeat entries from `users`; ids 11 and 13 are new.
# Each tuple below is paired with the column names in order.
users1 = [
    dict(zip(
        ("user_id", "user_first_name", "user_last_name", "user_email"),
        record
    ))
    for record in (
        (1, "Phoebe", "Buffay", "phoebebuffay@example.com"),
        (3, "Ross", "Geller", "rossgeller@example.com"),
        (6, "Sandra", "Sharma", "sandrasharma@example.com"),
        (7, "Judit", "Stevenson", "juditstevenson@example.com"),
        (8, "Jane", "Doe", "janedoe@example.com"),
        (11, "Joe", "Dane", "joedane@example.com"),
        (13, "Anu", "K", "anuk@example.com"),
    )
]

In [35]:
# Built directly from the list of dicts, without the Row(**...) wrapper used for users_df.
# NOTE(review): the schema printed further down lists these columns alphabetically rather than
# in dict order — presumably a result of dict-based schema inference; confirm if column order matters.
user_df1 = spark.createDataFrame(users1)



In [36]:
# Schema of users_df, for comparison with user_df1.
users_df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- user_first_name: string (nullable = true)
 |-- user_last_name: string (nullable = true)
 |-- user_email: string (nullable = true)



In [37]:
# Schema of user_df1: same column names as users_df.
user_df1.printSchema()

root
 |-- user_email: string (nullable = true)
 |-- user_first_name: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- user_last_name: string (nullable = true)



In [38]:
# Full outer join on an explicit expression: rows from both sides are kept,
# and the result carries the columns of both DataFrames (including both user_id columns).
(
    users_df
    .join(user_df1, users_df.user_id == user_df1.user_id, 'full')
    .show()
)

+-------+---------------+--------------+--------------------+--------------------+---------------+-------+--------------+
|user_id|user_first_name|user_last_name|          user_email|          user_email|user_first_name|user_id|user_last_name|
+-------+---------------+--------------+--------------------+--------------------+---------------+-------+--------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|juditstevenson@ex...|          Judit|      7|     Stevenson|
|      6|         Sandra|        Sharma|sandrasharma@exam...|sandrasharma@exam...|         Sandra|      6|        Sharma|
|      9|          Jerry|          Colt|jerrycolt@example...|                null|           null|   null|          null|
|      5|       Chandler|          Bing|chandlerbing@exam...|                null|           null|   null|          null|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|phoebebuffay@exam...|         Phoebe|      1|        Buffay|
|     10|          Amiya

In [40]:
# users_df (10) + user_df1 unique(2)
(
    users_df
    .join(user_df1, 'user_id', 'full')
    .count()
)

12

In [43]:
# Union doesn't remove the duplicates (Count: 17)
# Build the left and right outer joins separately, then stack them.
left_joined = users_df.join(user_df1, 'user_id', 'left')
right_joined = users_df.join(user_df1, 'user_id', 'right')

left_joined.union(right_joined).show()

+-------+---------------+--------------+--------------------+--------------------+---------------+--------------+
|user_id|user_first_name|user_last_name|          user_email|          user_email|user_first_name|user_last_name|
+-------+---------------+--------------+--------------------+--------------------+---------------+--------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|juditstevenson@ex...|          Judit|     Stevenson|
|      6|         Sandra|        Sharma|sandrasharma@exam...|sandrasharma@exam...|         Sandra|        Sharma|
|      9|          Jerry|          Colt|jerrycolt@example...|                null|           null|          null|
|      5|       Chandler|          Bing|chandlerbing@exam...|                null|           null|          null|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|phoebebuffay@exam...|         Phoebe|        Buffay|
|     10|          Amiya|        Shetty|amiyashetty@examp...|                null|      

In [44]:
# use distinct to remove duplicates
# Rows matched on both sides appear in both joins; distinct() collapses them.
(
    users_df
    .join(user_df1, 'user_id', 'left')
    .union(users_df.join(user_df1, 'user_id', 'right'))
    .distinct()
    .count()
)

12

In [47]:
# Projecting data after full outer join
# Get the details from the users_df, if missing, get details from user_df1

# One coalesce per output column: prefer the users_df value, fall back to user_df1.
merged_columns = [
    coalesce(users_df[column_name], user_df1[column_name]).alias(column_name)
    for column_name in ('user_id', 'user_first_name', 'user_last_name', 'user_email')
]

(
    users_df
    .join(user_df1, users_df.user_id == user_df1.user_id, 'full')
    .select(*merged_columns)
    .show()
)

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|
+-------+---------------+--------------+--------------------+
|      7|          Judit|     Stevenson|juditstevenson@ex...|
|      6|         Sandra|        Sharma|sandrasharma@exam...|
|      9|          Jerry|          Colt|jerrycolt@example...|
|      5|       Chandler|          Bing|chandlerbing@exam...|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|
|     10|          Amiya|        Shetty|amiyashetty@examp...|
|      3|           Ross|        Geller|rossgeller@exampl...|
|      8|           Jane|           Doe| janedoe@example.com|
|     11|            Joe|          Dane| joedane@example.com|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|
|      4|         Monica|        Geller|monicageller@exam...|
|     13|            Anu|             K|    anuk@example.com|
+-------+---------------+--------------+--------------------+



### CROSS JOIN

In [48]:
# Show the signature and docstring of DataFrame.crossJoin (takes only the other DataFrame).
help(users_df.crossJoin)

Help on method crossJoin in module pyspark.sql.dataframe:

crossJoin(other) method of pyspark.sql.dataframe.DataFrame instance
    Returns the cartesian product with another :class:`DataFrame`.
    
    :param other: Right side of the cartesian product.
    
    >>> df.select("age", "name").collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    >>> df2.select("name", "height").collect()
    [Row(name='Tom', height=80), Row(name='Bob', height=85)]
    >>> df.crossJoin(df2.select("height")).select("age", "name", "height").collect()
    [Row(age=2, name='Alice', height=80), Row(age=2, name='Alice', height=85),
     Row(age=5, name='Bob', height=80), Row(age=5, name='Bob', height=85)]
    
    .. versionadded:: 2.1



In [49]:
# Cartesian product: every user row is paired with every course row.
(
    users_df
    .crossJoin(courses_df)
    .show()
)

+-------+---------------+--------------+--------------------+---------+--------------------+-------------------+---------+-------------------+
|user_id|user_first_name|user_last_name|          user_email|course_id|        course_title|course_published_dt|is_active|    last_updated_ts|
+-------+---------------+--------------+--------------------+---------+--------------------+-------------------+---------+-------------------+
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|        1|    Mastering Python|         2021-01-14|     true|2021-02-18 16:57:25|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|        1|    Mastering Python|         2021-01-14|     true|2021-02-18 16:57:25|
|      1|         Phoebe|        Buffay|phoebebuffay@exam...|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05 12:07:33|
|      2|           Joey|     Tribiyani|joeytribiyani@exa...|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05 12:07:33|

In [50]:
# Number of records will be equal to
# Number of records in 1st DF * Number of records in 2nd DF
# 10 * 5 = 50
(
    users_df
    .crossJoin(courses_df)
    .count()
)

50

**NOTE**

* A cross join without a join condition or `where` clause gives the Cartesian product. The result set of a Cartesian product contains the number of rows in the first table multiplied by the number of rows in the second table.