# 0. Prepare configurations and race_results table

In [0]:
%run "../includes/configuration"

In [0]:
# Load the race_results dataset from the presentation layer.
# presentation_folder_path is presumably defined by the included configuration notebook - confirm.
race_results_df = spark.read.parquet(
    f'{presentation_folder_path}/race_results'
)

# Bare last expression: the notebook renders the list of available column names.
race_results_df.columns

['race_year',
 'race_name',
 'race_date',
 'circuit_location',
 'driver_name',
 'driver_number',
 'driver_nationality',
 'team',
 'grid',
 'fastest_lap',
 'race_time',
 'points',
 'position',
 'created_date']

# 1. Compute driver standings with their corresponding year, team and nationality

In [0]:
from pyspark.sql.functions import sum, count, when, rank, desc
from pyspark.sql.window import Window

In [0]:
# Drivers are ranked within each season: highest total points first,
# with the number of wins as the tie-breaker.
driver_rank_spec = (
    Window
    .partitionBy('race_year')
    .orderBy(desc('points'), desc('wins'))
)

# One output row per (driver, season, team, nationality) combination.
# Because 'team' is a grouping key, a driver with results for more than one
# team in the same season gets a separate row (and rank) per team.
# NOTE: `sum` and `count` here are the pyspark.sql.functions versions imported
# by this notebook, not the Python builtins.
driver_standings_df = (
    race_results_df
    .groupBy('driver_name', 'race_year', 'team', 'driver_nationality')
    .agg(
        sum('points').alias('points'),
        # when() without otherwise() yields null for non-matching rows and
        # count() ignores nulls, so this counts only rows with position == 1.
        count(when(race_results_df['position'] == 1, True)).alias('wins'),
    )
    # rank() assigns the same rank to tied rows and leaves a gap after them.
    .withColumn('rank', rank().over(driver_rank_spec))
)

In [0]:
# Spot-check the result: render the standings rows for the 2020 season only.
(
    driver_standings_df
    .where('race_year = 2020')
    .display()
)

driver_name,race_year,team,driver_nationality,points,wins,rank
Lewis Hamilton,2020,Mercedes,British,347.0,11,1
Valtteri Bottas,2020,Mercedes,Finnish,223.0,2,2
Max Verstappen,2020,Red Bull,Dutch,214.0,2,3
Sergio Pérez,2020,Racing Point,Mexican,125.0,1,4
Daniel Ricciardo,2020,Renault,Australian,119.0,0,5
Carlos Sainz,2020,McLaren,Spanish,105.0,0,6
Alexander Albon,2020,Red Bull,Thai,105.0,0,6
Charles Leclerc,2020,Ferrari,Monegasque,98.0,0,8
Lando Norris,2020,McLaren,British,97.0,0,9
Pierre Gasly,2020,AlphaTauri,French,75.0,1,10


# 2. Save to parquet in presentation folder

In [0]:
# Persist the standings as parquet under the presentation folder;
# 'overwrite' mode replaces any data already present at that path.
(
    driver_standings_df.write
    .mode('overwrite')
    .parquet(f'{presentation_folder_path}/driver_standings')
)