# 0. Prepare configurations and tables

In [0]:
%run "../includes/configuration"

In [0]:
# Load the processed "races" parquet dataset; the trailing bare expression
# makes the cell display the DataFrame's column names.
races_df = (
    spark.read
    .parquet(f'{processed_folder_path}/races')
)
races_df.columns

['race_id',
 'race_year',
 'round',
 'circuit_id',
 'name',
 'ingestion_date',
 'race_timestamp',
 'data_source']

In [0]:
# Load the processed "circuits" parquet dataset; the trailing bare expression
# makes the cell display the DataFrame's column names.
circuits_df = (
    spark.read
    .parquet(f'{processed_folder_path}/circuits')
)
circuits_df.columns

['circuit_id',
 'circuit_ref',
 'name',
 'location',
 'country',
 'latitude',
 'longitude',
 'altitude',
 'data_source',
 'ingestion_date']

In [0]:
# Load the processed "drivers" parquet dataset; the trailing bare expression
# makes the cell display the DataFrame's column names.
drivers_df = (
    spark.read
    .parquet(f'{processed_folder_path}/drivers')
)
drivers_df.columns

['driver_id',
 'driver_ref',
 'number',
 'code',
 'name',
 'dob',
 'nationality',
 'data_source',
 'ingestion_date']

In [0]:
# Load the processed "constructors" parquet dataset; the trailing bare
# expression makes the cell display the DataFrame's column names.
constructors_df = (
    spark.read
    .parquet(f'{processed_folder_path}/constructors')
)
constructors_df.columns

['constructor_id',
 'constructor_ref',
 'name',
 'nationality',
 'data_source',
 'ingestion_date']

In [0]:
# Load the processed "results" parquet dataset; the trailing bare expression
# makes the cell display the DataFrame's column names.
results_df = (
    spark.read
    .parquet(f'{processed_folder_path}/results')
)
results_df.columns

['result_id',
 'driver_id',
 'constructor_id',
 'number',
 'grid',
 'position',
 'position_text',
 'position_order',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastest_lap',
 'rank',
 'fastest_lap_time',
 'fastest_lap_speed',
 'data_source',
 'ingestion_date',
 'race_id']

# 1. Get the required joined table

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Build the race-results table: inner-join every result row to its
# constructor, driver and race, then reach the circuit through the
# circuit_id column that the races table contributes.
final_df = (
    results_df
    .join(constructors_df, on='constructor_id', how='inner')
    .join(drivers_df, on='driver_id', how='inner')
    .join(races_df, on='race_id', how='inner')
    .join(circuits_df, on='circuit_id', how='inner')
    .select(
        # Each column is picked through its source DataFrame because several
        # inputs share column names (e.g. name, number, nationality).
        races_df['race_year'].alias('race_year'),
        races_df['name'].alias('race_name'),
        races_df['race_timestamp'].alias('race_date'),
        circuits_df['location'].alias('circuit_location'),
        drivers_df['name'].alias('driver_name'),
        drivers_df['number'].alias('driver_number'),
        drivers_df['nationality'].alias('driver_nationality'),
        constructors_df['name'].alias('team'),
        results_df['grid'].alias('grid'),
        results_df['fastest_lap'].alias('fastest_lap'),
        results_df['time'].alias('race_time'),
        results_df['points'].alias('points'),
        results_df['position'].alias('position'),
    )
    # Stamp each row with the timestamp at which the query is evaluated.
    .withColumn('created_date', current_timestamp())
)

# Preview the first ten rows of the joined table.
final_df.limit(10).display()


race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,position,created_date
1954,Indianapolis 500,,Indianapolis,Bill Vukovich,,American,Kurtis Kraft,19,,3:49:17.27,8.0,1,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Jimmy Bryan,,American,Kuzma,3,,+1:09.95,6.0,2,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Jack McGrath,,American,Kurtis Kraft,1,,+1:19.73,5.0,3,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Troy Ruttman,,American,Kurtis Kraft,11,,+2:52.68,1.5,4,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Mike Nazaruk,,American,Kurtis Kraft,14,,+3:24.55,2.0,5,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Fred Agabashian,,American,Kurtis Kraft,24,,+3:47.55,0.0,6,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Don Freeland,,American,Phillips,6,,+4:13.35,0.0,7,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Paul Russo,,American,Kurtis Kraft,32,,+5:01.17,0.0,8,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Larry Crockett,,American,Kurtis Kraft,25,,+7:07.24,0.0,9,2024-09-05T02:06:43.022Z
1954,Indianapolis 500,,Indianapolis,Cal Niday,,American,Stevens,13,,+7:07.69,0.0,10,2024-09-05T02:06:43.022Z


In [0]:
# Spot-check a single race: show the 2020 Abu Dhabi Grand Prix rows ordered
# from most to fewest points.
display(
    final_df
    .filter("race_year = 2020 and race_name ='Abu Dhabi Grand Prix'")
    .orderBy(final_df['points'].desc())
)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,position,created_date
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Max Verstappen,33,Dutch,Red Bull,1,14,1:36:28.645,25.0,1.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Valtteri Bottas,77,Finnish,Mercedes,2,40,+15.976,18.0,2.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lewis Hamilton,44,British,Mercedes,3,37,+18.415,15.0,3.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Alexander Albon,23,Thai,Red Bull,5,42,+19.987,12.0,4.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lando Norris,4,British,McLaren,4,53,+1:00.729,10.0,5.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Carlos Sainz,55,Spanish,McLaren,6,48,+1:05.662,8.0,6.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Daniel Ricciardo,3,Australian,Renault,11,55,+1:13.748,7.0,7.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Pierre Gasly,10,French,AlphaTauri,9,53,+1:29.718,4.0,8.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Esteban Ocon,31,French,Renault,10,47,+1:41.069,2.0,9.0,2024-09-05T02:06:44.899Z
2020,Abu Dhabi Grand Prix,2020-12-13T13:10:00Z,Abu Dhabi,Lance Stroll,18,Canadian,Racing Point,8,41,+1:42.738,1.0,10.0,2024-09-05T02:06:44.899Z


# 2. Save to parquet in presentation folder

In [0]:
# Persist the joined table as parquet under the presentation folder,
# replacing whatever a previous run wrote to the same path.
(
    final_df.write
    .mode('overwrite')
    .parquet(f'{presentation_folder_path}/race_results')
)