# 0. Setting up

In [0]:
%run "../includes/configuration"

In [0]:
# Read the processed races dataset (Parquet) from the folder configured by the included notebook.
races_df = (
    spark.read
    .parquet(f'{processed_folder_path}/races')
)

In [0]:
# Read the processed circuits dataset (Parquet) from the folder configured by the included notebook.
circuits_df = (
    spark.read
    .parquet(f'{processed_folder_path}/circuits')
)

# 1. Inner Join

In [0]:
# Inner join on circuit_id: only races with a matching circuit row are kept.
# Both frames have a `name` column, so each one is selected through its source
# DataFrame and aliased to an unambiguous output name.
race_circuit_df = (
    races_df
    .join(circuits_df, on='circuit_id', how='inner')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
        races_df['name'].alias('race_name'),
        races_df['round'].alias('race_round'),
    )
)
# Number of rows produced by the join (shown as the cell output).
race_circuit_df.count()

1058

In [0]:
# Preview the first 10 rows of the joined result.
display(race_circuit_df.limit(10))

circuit_name,circuit_location,circuit_country,race_name,race_round
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1
Sepang International Circuit,Kuala Lumpur,Malaysia,Malaysian Grand Prix,2
Shanghai International Circuit,Shanghai,China,Chinese Grand Prix,3
Bahrain International Circuit,Sakhir,Bahrain,Bahrain Grand Prix,4
Circuit de Barcelona-Catalunya,Montmeló,Spain,Spanish Grand Prix,5
Circuit de Monaco,Monte-Carlo,Monaco,Monaco Grand Prix,6
Istanbul Park,Istanbul,Turkey,Turkish Grand Prix,7
Silverstone Circuit,Silverstone,UK,British Grand Prix,8
Nürburgring,Nürburg,Germany,German Grand Prix,9
Hungaroring,Budapest,Hungary,Hungarian Grand Prix,10


# 2. Outer Join

## a. Left Outer Join

In [0]:
# Left outer join with circuits on the left: every circuit is kept, and the
# race columns are null for circuits that have no matching race.
race_circuit_df = (
    circuits_df
    .join(races_df, on='circuit_id', how='left')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
        races_df['name'].alias('race_name'),
        races_df['round'].alias('race_round'),
    )
)
# Number of rows produced by the join (shown as the cell output).
race_circuit_df.count()

1060

In [0]:
# Preview only the first 10 rows instead of rendering the whole joined DataFrame;
# this keeps the cell output small and matches the inner-join preview.
race_circuit_df.limit(10).display()

circuit_name,circuit_location,circuit_country,race_name,race_round
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,21.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0


## b. Right Outer Join

In [0]:
# Right outer join with races on the right: every race is kept, and the
# circuit columns are null for races that have no matching circuit.
race_circuit_df = (
    circuits_df
    .join(races_df, on='circuit_id', how='right')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
        races_df['name'].alias('race_name'),
        races_df['round'].alias('race_round'),
    )
)
# Number of rows produced by the join (shown as the cell output).
race_circuit_df.count()

1058

In [0]:
# Preview only the first 10 rows instead of rendering the whole joined DataFrame;
# this keeps the cell output small and matches the inner-join preview.
race_circuit_df.limit(10).display()

circuit_name,circuit_location,circuit_country,race_name,race_round
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1
Sepang International Circuit,Kuala Lumpur,Malaysia,Malaysian Grand Prix,2
Shanghai International Circuit,Shanghai,China,Chinese Grand Prix,3
Bahrain International Circuit,Sakhir,Bahrain,Bahrain Grand Prix,4
Circuit de Barcelona-Catalunya,Montmeló,Spain,Spanish Grand Prix,5
Circuit de Monaco,Monte-Carlo,Monaco,Monaco Grand Prix,6
Istanbul Park,Istanbul,Turkey,Turkish Grand Prix,7
Silverstone Circuit,Silverstone,UK,British Grand Prix,8
Nürburgring,Nürburg,Germany,German Grand Prix,9
Hungaroring,Budapest,Hungary,Hungarian Grand Prix,10


## c. Full Outer Join

In [0]:
# Full outer join: rows from both sides are kept, with nulls filling the
# columns of whichever side has no match for a given circuit_id.
race_circuit_df = (
    circuits_df
    .join(races_df, on='circuit_id', how='full')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
        races_df['name'].alias('race_name'),
        races_df['round'].alias('race_round'),
    )
)
# Number of rows produced by the join (shown as the cell output).
race_circuit_df.count()

1060

In [0]:
# Preview only the first 10 rows instead of rendering the whole joined DataFrame;
# this keeps the cell output small and matches the inner-join preview.
race_circuit_df.limit(10).display()

circuit_name,circuit_location,circuit_country,race_name,race_round
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,3.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0
Albert Park Grand Prix Circuit,Melbourne,Australia,Australian Grand Prix,1.0


# 3. Semi Join

In [0]:
# Semi join: keeps circuits that have at least one matching race. Only the
# left side's (circuits) columns are available afterwards, so no race columns
# are selected.
race_circuit_df = (
    circuits_df
    .join(races_df, on='circuit_id', how='semi')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
    )
)
# Number of circuits with at least one race (shown as the cell output).
race_circuit_df.count()

75

In [0]:
# Preview only the first 10 rows instead of rendering the whole result;
# this keeps the cell output small and matches the inner-join preview.
race_circuit_df.limit(10).display()

circuit_name,circuit_location,circuit_country
Albert Park Grand Prix Circuit,Melbourne,Australia
Sepang International Circuit,Kuala Lumpur,Malaysia
Bahrain International Circuit,Sakhir,Bahrain
Circuit de Barcelona-Catalunya,Montmeló,Spain
Istanbul Park,Istanbul,Turkey
Circuit de Monaco,Monte-Carlo,Monaco
Circuit Gilles Villeneuve,Montreal,Canada
Circuit de Nevers Magny-Cours,Magny Cours,France
Silverstone Circuit,Silverstone,UK
Hockenheimring,Hockenheim,Germany


# 4. Anti Join

In [0]:
# Anti join: keeps circuits that have NO matching race. Only the left side's
# (circuits) columns are available afterwards.
race_circuit_df = (
    circuits_df
    .join(races_df, on='circuit_id', how='anti')
    .select(
        circuits_df['name'].alias('circuit_name'),
        circuits_df['location'].alias('circuit_location'),
        circuits_df['country'].alias('circuit_country'),
    )
)
# Number of circuits without any race (shown as the cell output).
race_circuit_df.count()

2

In [0]:
# Show the circuits that have no races.
display(race_circuit_df)

circuit_name,circuit_location,circuit_country
Port Imperial Street Circuit,New Jersey,USA
Hanoi Street Circuit,Hanoi,Vietnam


# 5. Cross Join

In [0]:
# Cross join: pairs every circuit row with every race row (Cartesian product),
# so the row count is the product of the two input row counts.
race_circuit_df = (
    circuits_df
    .crossJoin(races_df)
)
race_circuit_df.count()

81466

In [0]:
# Row count of races_df, one of the two factors of the cross-join row count.
races_df.count()

1058

In [0]:
# Row count of circuits_df, the other factor of the cross-join row count.
circuits_df.count()

77