# 0. Setting up

In [0]:
%run "../includes/configuration"

In [0]:
# Read the race results stored as Parquet under the presentation folder.
# NOTE(review): presentation_folder_path is not defined in this notebook;
# presumably it comes from the %run'd configuration notebook -- confirm.
race_results_path = f'{presentation_folder_path}/race_results'
race_results_df = spark.read.parquet(race_results_path)

# Show only the first ten rows as a quick sanity check.
(
    race_results_df
    .limit(10)
    .display()
)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,created_date
1954,Indianapolis 500,,Indianapolis,Bill Vukovich,,American,Kurtis Kraft,19,,3:49:17.27,8.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Jimmy Bryan,,American,Kuzma,3,,+1:09.95,6.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Jack McGrath,,American,Kurtis Kraft,1,,+1:19.73,5.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Troy Ruttman,,American,Kurtis Kraft,11,,+2:52.68,1.5,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Mike Nazaruk,,American,Kurtis Kraft,14,,+3:24.55,2.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Fred Agabashian,,American,Kurtis Kraft,24,,+3:47.55,0.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Don Freeland,,American,Phillips,6,,+4:13.35,0.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Paul Russo,,American,Kurtis Kraft,32,,+5:01.17,0.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Larry Crockett,,American,Kurtis Kraft,25,,+7:07.24,0.0,2024-09-04T09:58:42.937Z
1954,Indianapolis 500,,Indianapolis,Cal Niday,,American,Stevens,13,,+7:07.69,0.0,2024-09-04T09:58:42.937Z


In [0]:
# Restrict the working frame to the 2020 season.
demo_df = race_results_df.where('race_year = 2020')

# Preview ten rows of the filtered frame.
(
    demo_df
    .limit(10)
    .display()
)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,created_date
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Lewis Hamilton,44,British,Mercedes,2,58,1:35:49.641,25.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Max Verstappen,33,Dutch,Red Bull,3,60,+4.470,19.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Daniel Ricciardo,3,Australian,Renault,6,53,+14.613,15.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Sergio Pérez,11,Mexican,Racing Point,9,58,+16.070,12.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Carlos Sainz,55,Spanish,McLaren,10,53,+21.905,10.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Pierre Gasly,10,French,AlphaTauri,12,53,+22.766,8.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Charles Leclerc,16,Monegasque,Ferrari,4,54,+30.814,6.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Nico Hülkenberg,27,German,Racing Point,20,58,+32.596,4.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Romain Grosjean,8,French,Haas F1 Team,16,54,+39.081,2.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Antonio Giovinazzi,99,Italian,Alfa Romeo,14,38,+40.035,1.0,2024-09-04T09:58:42.937Z


# 1. Simple aggregations

In [0]:
from pyspark.sql.functions import sum, count, countDistinct

In [0]:
# Total number of rows in demo_df.
(
    demo_df
    .select(count('*'))
    .display()
)

count(1)
340


In [0]:
# Number of distinct race names in demo_df.
(
    demo_df
    .select(countDistinct('race_name'))
    .display()
)

count(DISTINCT race_name)
17


In [0]:
# Total points and number of distinct races for Lewis Hamilton in demo_df.
#
# The aggregates are named with .alias() where they are computed instead of
# renaming Spark's auto-generated column names afterwards: withColumnRenamed
# silently does nothing when the generated name does not match, which would
# leave the output with the unintended default headers.
(
    demo_df
    .filter(demo_df.driver_name == 'Lewis Hamilton')
    .select(
        sum('points').alias('total_points'),
        countDistinct('race_name').alias('races_attended'),
    )
    .display()
)

total_points,races_attended
347.0,16


# 2. Now with Group By

In [0]:
# Per-driver total points and number of distinct races in demo_df.
#
# Output columns are named with .alias() at the aggregation site rather than
# by renaming Spark's auto-generated names, because withColumnRenamed is a
# silent no-op if the generated name ever differs from the literal used here.
(
    demo_df
    .groupBy('driver_name')
    .agg(
        sum('points').alias('total_points'),
        countDistinct('race_name').alias('races_attended'),
    )
    .display()
)

driver_name,total_points,races_attended
Jack Aitken,0.0,1
Daniil Kvyat,32.0,17
Kevin Magnussen,1.0,17
Sergio Pérez,125.0,15
Carlos Sainz,105.0,17
Kimi Räikkönen,4.0,17
Romain Grosjean,2.0,15
Charles Leclerc,98.0,17
Alexander Albon,105.0,17
Lance Stroll,75.0,16


# 3. Window Functions

In [0]:
# Widen the working frame to the 2019 and 2020 seasons.
# NOTE: this rebinds demo_df, which earlier held only 2020. Re-running an
# earlier cell after this one will make it operate on both seasons.
demo_df = race_results_df.where('race_year in (2019, 2020)')

# Preview ten rows of the widened frame.
(
    demo_df
    .limit(10)
    .display()
)

race_year,race_name,race_date,circuit_location,driver_name,driver_number,driver_nationality,team,grid,fastest_lap,race_time,points,created_date
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Lewis Hamilton,44,British,Mercedes,2,58,1:35:49.641,25.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Max Verstappen,33,Dutch,Red Bull,3,60,+4.470,19.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Daniel Ricciardo,3,Australian,Renault,6,53,+14.613,15.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Sergio Pérez,11,Mexican,Racing Point,9,58,+16.070,12.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Carlos Sainz,55,Spanish,McLaren,10,53,+21.905,10.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Pierre Gasly,10,French,AlphaTauri,12,53,+22.766,8.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Charles Leclerc,16,Monegasque,Ferrari,4,54,+30.814,6.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Nico Hülkenberg,27,German,Racing Point,20,58,+32.596,4.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Romain Grosjean,8,French,Haas F1 Team,16,54,+39.081,2.0,2024-09-04T09:58:42.937Z
2020,Eifel Grand Prix,2020-10-11T13:10:00Z,Nürburg,Antonio Giovinazzi,99,Italian,Alfa Romeo,14,38,+40.035,1.0,2024-09-04T09:58:42.937Z


In [0]:
# Aggregate per (season, driver): total points and number of distinct races,
# sorted by season. The former trailing .select('*') was dropped because it
# projects every column unchanged and therefore had no effect on the result.
grouped_demo_df = (
    demo_df
    .groupBy('race_year', 'driver_name')
    .agg(
        sum('points').alias('total_points'),
        countDistinct('race_name').alias('races_attended'),
    )
    .orderBy('race_year')
)

grouped_demo_df.display()

race_year,driver_name,total_points,races_attended
2019,Kevin Magnussen,20.0,21
2019,Robert Kubica,1.0,21
2019,Lance Stroll,21.0,21
2019,Lando Norris,49.0,21
2019,Sebastian Vettel,240.0,21
2019,George Russell,0.0,21
2019,Carlos Sainz,96.0,21
2019,Max Verstappen,278.0,21
2019,Valtteri Bottas,326.0,21
2019,Charles Leclerc,264.0,21


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

In [0]:
# Window over each season, ordered by total points from highest to lowest.
driver_rank_spec = (
    Window
    .partitionBy('race_year')
    .orderBy(desc('total_points'))
)

# Attach each driver's rank within their season as a 'rank' column.
# NOTE: grouped_demo_df is rebound to the ranked frame here.
grouped_demo_df = grouped_demo_df.withColumn(
    'rank',
    rank().over(driver_rank_spec),
)

grouped_demo_df.display()

race_year,driver_name,total_points,races_attended,rank
2019,Lewis Hamilton,413.0,21,1
2019,Valtteri Bottas,326.0,21,2
2019,Max Verstappen,278.0,21,3
2019,Charles Leclerc,264.0,21,4
2019,Sebastian Vettel,240.0,21,5
2019,Carlos Sainz,96.0,21,6
2019,Pierre Gasly,95.0,21,7
2019,Alexander Albon,92.0,21,8
2019,Daniel Ricciardo,54.0,21,9
2019,Sergio Pérez,52.0,21,10
