In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext , SparkConf
from pyspark.sql import SparkSession
# Local, single-process Spark configuration for this notebook.
conf = SparkConf().setAppName('join').setMaster('local')

# Use getOrCreate so the cell stays re-runnable: calling SparkContext(conf=conf)
# a second time in the same kernel raises because a context is already active.
sc = SparkContext.getOrCreate(conf=conf)

# The builder reuses the active SparkContext (or an existing session) rather
# than constructing a second session object by hand.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [2]:
# Read the processed races data, keep only rows where race_year is 2019, and
# rename "name" to "race_name" so the column is self-describing after joins.
races_df = (
    spark.read.parquet("processed/races")
    .filter("race_year = 2019")
    .withColumnRenamed("name", "race_name")
)
races_df.show(2)

+-------+-----+----------+--------------------+-------------------+--------------------+---------+
|race_id|round|circuit_id|           race_name|     race_timestamp|      ingestion_date|race_year|
+-------+-----+----------+--------------------+-------------------+--------------------+---------+
|   1010|    1|         1|Australian Grand ...|2019-03-17 05:10:00|2024-01-06 12:59:...|     2019|
|   1011|    2|         3|  Bahrain Grand Prix|2019-03-31 15:10:00|2024-01-06 12:59:...|     2019|
+-------+-----+----------+--------------------+-------------------+--------------------+---------+
only showing top 2 rows



In [3]:
# Read the processed circuit data and rename "name" to "circuit_name" so the
# column is self-describing after joins.
circuit_df = (
    spark.read.parquet("processed/circuit")
    .withColumnRenamed("name", "circuit_name")
)
circuit_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|        circuit_name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showing top 2 rows



In [4]:
# Inner join: only circuit rows that have a matching race row (on circuit id)
# appear in the result.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="inner",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
joined_df.show(2)

+--------------------+---------+---------+--------------------+-----+
|        circuit_name| location|  country|           race_name|round|
+--------------------+---------+---------+--------------------+-----+
|Albert Park Grand...|Melbourne|Australia|Australian Grand ...|    1|
|Bahrain Internati...|   Sakhir|  Bahrain|  Bahrain Grand Prix|    2|
+--------------------+---------+---------+--------------------+-----+
only showing top 2 rows



In [5]:
# Reload the circuit data, this time keeping only rows with circuitId < 70, so
# that some races have no matching circuit in the outer-join examples.
# NOTE: this rebinds circuit_df, replacing the unfiltered frame loaded earlier.
circuit_df = (
    spark.read.parquet("processed/circuit")
    .filter("circuitId < 70")
    .withColumnRenamed("name", "circuit_name")
)
circuit_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|        circuit_name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showing top 2 rows



In [6]:
# Left outer join ("left" is accepted as well as "leftouter"): every circuit
# row is kept; race columns are null where no race matches the circuit.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="leftouter",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
joined_df.show(2)
# All circuit records are present in the result.
joined_df.count()

+--------------------+------------+---------+--------------------+-----+
|        circuit_name|    location|  country|           race_name|round|
+--------------------+------------+---------+--------------------+-----+
|Albert Park Grand...|   Melbourne|Australia|Australian Grand ...|    1|
|Sepang Internatio...|Kuala Lumpur| Malaysia|                null| null|
+--------------------+------------+---------+--------------------+-----+
only showing top 2 rows



69

In [7]:
# Right outer join ("rightouter" is accepted as well as "right"): every race
# row is kept; circuit columns are null where no circuit matches the race.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="right",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
joined_df.show(2)
# All race records are present in the result.
joined_df.count()

+--------------------+---------+---------+--------------------+-----+
|        circuit_name| location|  country|           race_name|round|
+--------------------+---------+---------+--------------------+-----+
|Albert Park Grand...|Melbourne|Australia|Australian Grand ...|    1|
|Bahrain Internati...|   Sakhir|  Bahrain|  Bahrain Grand Prix|    2|
+--------------------+---------+---------+--------------------+-----+
only showing top 2 rows



21

In [8]:
# Full outer join ("fullouter" is accepted as well as "full"): rows from both
# DataFrames are kept, with nulls filled in on the side that has no match.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="full",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
joined_df.show(2)
# Records from both DataFrames are present; unmatched ones carry nulls.
joined_df.count()

+--------------------+------------+---------+--------------------+-----+
|        circuit_name|    location|  country|           race_name|round|
+--------------------+------------+---------+--------------------+-----+
|Albert Park Grand...|   Melbourne|Australia|Australian Grand ...|    1|
|Sepang Internatio...|Kuala Lumpur| Malaysia|                null| null|
+--------------------+------------+---------+--------------------+-----+
only showing top 2 rows



72

In [9]:
# Semi join: similar to an inner join in that only circuit rows with a matching
# race are returned, but the result exposes columns from the left DataFrame
# (circuit_df) only. Columns of races_df cannot be selected from the result.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="semi",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
    )
)
joined_df.show(2)
joined_df.count()

+--------------------+---------+---------+
|        circuit_name| location|  country|
+--------------------+---------+---------+
|Albert Park Grand...|Melbourne|Australia|
|Bahrain Internati...|   Sakhir|  Bahrain|
+--------------------+---------+---------+
only showing top 2 rows



18

In [10]:
# Anti join: the opposite of a semi join. Only circuit rows with NO matching
# race are returned, and the result exposes columns from the left DataFrame
# (circuit_df) only. Columns of races_df cannot be selected from the result.
joined_df = (
    circuit_df.join(
        races_df,
        on=races_df["circuit_id"] == circuit_df["circuitId"],
        how="anti",
    )
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
    )
)
joined_df.show(2)
joined_df.count()

+--------------------+------------+--------+
|        circuit_name|    location| country|
+--------------------+------------+--------+
|Sepang Internatio...|Kuala Lumpur|Malaysia|
|       Istanbul Park|    Istanbul|  Turkey|
+--------------------+------------+--------+
only showing top 2 rows



51

In [12]:
# Cross join: no join condition, so every circuit row is paired with every
# race row. Only circuit columns are selected from the result.
joined_df = (
    circuit_df.crossJoin(races_df)
    .select(
        circuit_df["circuit_name"],
        circuit_df["location"],
        circuit_df["country"],
    )
)
joined_df.show(2)

# Print the row count explicitly: a bare expression is only echoed by the
# notebook when it is the last line of the cell, so an un-printed count() here
# would run a Spark job and silently discard the result.
print(f"cross join rows: {joined_df.count()}")

# Expected size of the cross join: |circuit_df| * |races_df|.
print(f"cartesian product: {circuit_df.count() * races_df.count()}")

+--------------------+------------+---------+
|        circuit_name|    location|  country|
+--------------------+------------+---------+
|Albert Park Grand...|   Melbourne|Australia|
|Sepang Internatio...|Kuala Lumpur| Malaysia|
+--------------------+------------+---------+
only showing top 2 rows

cartesian product: 1449
