In [0]:
%run "../includes/common_code"

Use the race_results dataset from the presentation layer to compute the driver standings.

In [0]:
# Load the race_results dataset from the presentation layer.
race_results_path = f"{presentation_container_path}/race_results"
race_results_df = spark.read.parquet(race_results_path)

In [0]:
from pyspark.sql.functions import col, when, count, sum, desc, asc

First approach: aggregate and sort. This shows the driver standings in the correct order, but it does not produce a rank column.

In [0]:
# Aggregate per season, nationality, driver and team:
#   - points: sum of the points column
#   - Wins:   number of rows where position == 1 (count() skips the nulls that
#             when() yields for every other position)
# The result is sorted by points, then by wins, both descending.
standings_grouping = ['race_year', 'nationality', 'driver_name', 'team_name']
total_points = sum(col('points')).alias('points')
race_wins = count(when(col('position') == 1, True)).alias('Wins')

driver_standings_1 = (
    race_results_df
    .groupBy(*standings_grouping)
    .agg(total_points, race_wins)
    .orderBy(col('points').desc(), col('Wins').desc())
)

In [0]:
# Preview the standings for the 2020 season only.
standings_2020 = driver_standings_1.where(col('race_year') == 2020)
display(standings_2020)

race_year,nationality,driver_name,team_name,points,Wins
2020,British,Lewis Hamilton,Mercedes,347.0,11
2020,Finnish,Valtteri Bottas,Mercedes,223.0,2
2020,Dutch,Max Verstappen,Red Bull,214.0,2
2020,Mexican,Sergio Pérez,Racing Point,125.0,1
2020,Australian,Daniel Ricciardo,Renault,119.0,0
2020,Thai,Alexander Albon,Red Bull,105.0,0
2020,Spanish,Carlos Sainz,McLaren,105.0,0
2020,Monegasque,Charles Leclerc,Ferrari,98.0,0
2020,British,Lando Norris,McLaren,97.0,0
2020,French,Pierre Gasly,AlphaTauri,75.0,1


Let's do it using window functions instead; this approach also gives us a rank column.

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import rank

In [0]:
# Total points and number of wins (rows with position == 1) per season,
# nationality, driver and team. No sort is applied at this stage.
driver_standings_2 = (
    race_results_df
    .groupBy('race_year', 'nationality', 'driver_name', 'team_name')
    .agg(
        sum('points').alias('points'),
        count(when(col('position') == 1, True)).alias('Wins'),
    )
)

In [0]:
# Rank drivers within each season: most points first, ties broken by wins.
# rank() gives equal rows the same rank and leaves a gap afterwards
# (e.g. 6, 6, 8).
driver_rank = (
    Window
    .partitionBy('race_year')
    .orderBy(col('points').desc(), col('Wins').desc())
)
driver_standings_2 = driver_standings_2.withColumn('rank', rank().over(driver_rank))

In [0]:
# Let's reorder the columns to match the BBC driver standings page
driver_standings_final = driver_standings_2.select(
    'rank',
    'nationality',
    'driver_name',
    'team_name',
    'Wins',
    'points',
    'race_year',
)

In [0]:
# Preview the 2020 standings. The sort is explicit because Spark does not
# guarantee row order unless one is requested; the order left behind by the
# window computation is incidental. driver_name is a secondary key so that
# drivers sharing a rank are always shown in the same order.
display(
    driver_standings_final
    .filter(driver_standings_final.race_year == 2020)
    .orderBy('rank', 'driver_name')
)

rank,nationality,driver_name,team_name,Wins,points,race_year
1,British,Lewis Hamilton,Mercedes,11,347.0,2020
2,Finnish,Valtteri Bottas,Mercedes,2,223.0,2020
3,Dutch,Max Verstappen,Red Bull,2,214.0,2020
4,Mexican,Sergio Pérez,Racing Point,1,125.0,2020
5,Australian,Daniel Ricciardo,Renault,0,119.0,2020
6,Thai,Alexander Albon,Red Bull,0,105.0,2020
6,Spanish,Carlos Sainz,McLaren,0,105.0,2020
8,Monegasque,Charles Leclerc,Ferrari,0,98.0,2020
9,British,Lando Norris,McLaren,0,97.0,2020
10,French,Pierre Gasly,AlphaTauri,1,75.0,2020


In [0]:
# Let's write this into the presentation layer, overwriting any existing output
driver_standings_path = f"{presentation_container_path}/driver_standings"
driver_standings_final.write.parquet(driver_standings_path, mode="overwrite")

In [0]:
# End the notebook run and hand the string "success" back as its exit value
# (presumably read by a calling/orchestrating notebook — confirm against the caller).
dbutils.notebook.exit("success")