In [0]:
%run "./00 - Setup Data Sets"

In [0]:
from pyspark.sql.functions import *

In [0]:
# Join on an explicit condition.
# Each input keeps its own user_id column, so user_id shows up twice in the result.
(
    users_df
    .join(
        course_enrolments_df,
        on=users_df.user_id == course_enrolments_df.user_id,
        how="inner",
    )
    .show()
)

# Join on a shared column name.
# The shared column shows up only once in the result.
(
    users_df
    .join(course_enrolments_df, on="user_id", how="inner")
    .show()
)

# Same join, with an explicit broadcast hint on users_df.
(
    broadcast(users_df)
    .join(course_enrolments_df, on="user_id", how="inner")
    .show()
)

# JOIN OPTIONS
"""
> inner (default)                       INNER JOIN
> outer, full, fullouter, full_outer    FULL OUTER JOIN
> left, leftouter, left_outer           LEFT JOIN
> right, rightouter, right_outer        RIGHT JOIN
> anti, leftanti, left_anti             Returns rows from the left table that have no match in the right table
> semi, leftsemi, left_semi             Returns rows from the left table that have a match in the right table (left columns only)
> cross                                 Cartesian (You can also use .crossJoin(<table>))
"""

+---------+--------------------+-------------------+---------+-------------------+
|course_id|        course_title|course_published_dt|is_active|    last_updated_ts|
+---------+--------------------+-------------------+---------+-------------------+
|        1|    Mastering Python|         2021-01-14|     true|2021-02-18 16:57:25|
|        2|Data Engineering ...|         2021-02-10|     true|2021-03-05 12:07:33|
|        3|   Mastering Pyspark|         2021-01-07|     true|2021-04-06 10:05:42|
|        4|      AWS Essentials|         2021-03-19|    false|2021-04-10 02:25:36|
|        5|          Docker 101|         2021-02-28|     true|2021-03-21 07:18:52|
+---------+--------------------+-------------------+---------+-------------------+

+-------+---------------+--------------+--------------------+
|user_id|user_first_name|user_last_name|          user_email|
+-------+---------------+--------------+--------------------+
|      1|         Sandra|        Karpov|    skarpov0@ovh.net|
|   

In [0]:
# JOIN OPTIONS

"""
> inner (default)                       INNER JOIN
> outer, full, fullouter, full_outer    FULL OUTER JOIN
> left, leftouter, left_outer           LEFT JOIN
> right, rightouter, right_outer        RIGHT JOIN
> anti, leftanti, left_anti             Returns rows from the left table that have no match in the right table
> semi, leftsemi, left_semi             Returns rows from the left table that have a match in the right table (left columns only)
> cross                                 Cartesian (You can also use .crossJoin(<table>))
"""

In [0]:
# EXAMPLE

# Inner join driven by an explicit equality condition.
# Both user_id columns survive, so the result carries user_id twice.
(
    users_df
    .join(
        course_enrolments_df,
        on=users_df.user_id == course_enrolments_df.user_id,
        how="inner",
    )
    .show()
)

# Inner join driven by a column name shared by both DataFrames.
# The shared column is emitted only once in the result.
(
    users_df
    .join(course_enrolments_df, on="user_id", how="inner")
    .show()
)

+-------+---------------+--------------+--------------------+-------------------+-------+---------+----------+
|user_id|user_first_name|user_last_name|          user_email|course_enrolment_id|user_id|course_id|price_paid|
+-------+---------------+--------------+--------------------+-------------------+-------+---------+----------+
|      3|         Joanna|      Spennock|jspennock2@redcro...|                 10|      3|        3|     10.99|
|      3|         Joanna|      Spennock|jspennock2@redcro...|                 12|      3|        2|      9.99|
|      4|         Hirsch|       Conaboy|hconaboy3@barnesa...|                  7|      4|        5|     10.99|
|      4|         Hirsch|       Conaboy|hconaboy3@barnesa...|                 14|      4|        3|     10.99|
|      5|         Loreen|         Malin|lmalin4@independe...|                  2|      5|        2|      9.99|
|      5|         Loreen|         Malin|lmalin4@independe...|                  6|      5|        5|     10.99|
|

In [0]:
# ALIAS

# Aliasing each side of the join lets the following steps refer to columns by a
# qualified name such as 'c.course_id' or 'u.user_email'.
# NOTE(review): the earlier wording here said aliases cannot be used in ON
# conditions. Alias-qualified columns, e.g. col('u.user_id') == col('c.user_id'),
# are normally accepted as a join condition -- verify before relying on either claim.

(
    users_df.alias('u')
    .join(course_enrolments_df.alias('c'), on='user_id')
    .where(col('c.course_id') <= lit(3))   # where() is an alias of filter()
    .select('c.*', 'u.user_email')
    .show()
)

+-------+-------------------+---------+----------+--------------------+
|user_id|course_enrolment_id|course_id|price_paid|          user_email|
+-------+-------------------+---------+----------+--------------------+
|      3|                 10|        3|     10.99|jspennock2@redcro...|
|      3|                 12|        2|      9.99|jspennock2@redcro...|
|      4|                 14|        3|     10.99|hconaboy3@barnesa...|
|      5|                  2|        2|      9.99|lmalin4@independe...|
|      5|                 13|        2|      9.99|lmalin4@independe...|
|      7|                  8|        3|     10.99|     tchoupin6@de.vu|
|      8|                  5|        2|      9.99|ngrimsdell7@sohu.com|
|      8|                 15|        2|      9.99|ngrimsdell7@sohu.com|
|      9|                  4|        2|      9.99|vtamas8@businessw...|
|     10|                  1|        2|      9.99|wsimpkins9@amazon...|
+-------+-------------------+---------+----------+--------------

In [0]:
# BROADCAST JOIN
#
# In a broadcast join, the smaller DataFrame is broadcast to all executors and
# kept in memory. The larger DataFrame is split and distributed across the
# executors. This allows the join to run without shuffling the larger DataFrame,
# as the required data is already colocated on each executor.

# If the size of the smaller DataFrame is smaller than or equal to this threshold,
# it is broadcast automatically.
# The value is printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated but never displayed.
print(spark.conf.get('spark.sql.autoBroadcastJoinThreshold'))

# Broadcast example: explicit hint on users_df, regardless of the threshold above.
(
    broadcast(users_df)
    .join(course_enrolments_df, 'user_id', "inner")
    .show()
)

+-------+---------------+--------------+--------------------+-------------------+---------+----------+
|user_id|user_first_name|user_last_name|          user_email|course_enrolment_id|course_id|price_paid|
+-------+---------------+--------------+--------------------+-------------------+---------+----------+
|     10|          Wells|      Simpkins|wsimpkins9@amazon...|                  1|        2|      9.99|
|      5|         Loreen|         Malin|lmalin4@independe...|                  2|        2|      9.99|
|      7|         Trudey|       Choupin|     tchoupin6@de.vu|                  3|        5|     10.99|
|      9|        Vassily|         Tamas|vtamas8@businessw...|                  4|        2|      9.99|
|      8|         Nadine|     Grimsdell|ngrimsdell7@sohu.com|                  5|        2|      9.99|
|      5|         Loreen|         Malin|lmalin4@independe...|                  6|        5|     10.99|
|      4|         Hirsch|       Conaboy|hconaboy3@barnesa...|            