# 0. Prepare configurations and race_results table

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Expose the file date as a notebook text widget named 'p_file_date'
# (default '2021-03-21') so the value can be supplied per run.
dbutils.widgets.text('p_file_date', '2021-03-21')
# Read the widget's current value; later cells use it as the file-date
# cut-off and stamp it onto the computed standings.
param_file_date = dbutils.widgets.get('p_file_date')

In [0]:
# Load the presentation-layer race results, then keep every row whose
# file_date is on or before the requested file date.
race_results_df = spark.read.table('f1_presentation.race_results')

# Compare through a Column expression instead of splicing the widget value
# into a SQL string: the value is bound as a literal, so a stray quote in the
# parameter can neither break nor alter the predicate.
race_results_df = race_results_df.filter(
    race_results_df['file_date'] <= param_file_date
)

# Show the available columns as the cell output.
race_results_df.columns

['race_id',
 'race_year',
 'race_name',
 'race_date',
 'circuit_location',
 'driver_name',
 'driver_number',
 'driver_nationality',
 'team',
 'grid',
 'fastest_lap',
 'race_time',
 'points',
 'position',
 'created_date',
 'file_date']

# 1. Get driver standings with their corresponding year and nationality

In [0]:
from pyspark.sql.functions import sum, count, when, rank, desc, lit
from pyspark.sql.window import Window

In [0]:
# Rank drivers within each season: highest points first, ties on points
# broken by number of wins.
driver_rank_spec = Window.partitionBy('race_year').orderBy(desc('points'), desc('wins'))

# One row per (driver, season, nationality) with total points and win count.
# The chain is wrapped in parentheses rather than joined with backslashes so
# trailing whitespace cannot break the statement.
driver_standings_df = (
    race_results_df
    .groupBy('driver_name', 'race_year', 'driver_nationality')
    .agg(
        # `sum` here is pyspark.sql.functions.sum (it shadows the builtin).
        sum('points').alias('points'),
        # when() without otherwise() yields NULL for non-winning rows and
        # count() skips NULLs, so this counts first-place finishes only.
        count(when(race_results_df.position == 1, True)).alias('wins'),
    )
    # rank() leaves gaps after ties (e.g. 6, 6, 8).
    .withColumn('rank', rank().over(driver_rank_spec))
    # Stamp every row with the file date this run was executed for.
    .withColumn('file_date', lit(param_file_date))
)

In [0]:
# Spot-check the computed standings for the 2021 season.
display(driver_standings_df.where('race_year = 2021'))

driver_name,race_year,driver_nationality,points,wins,rank,file_date
Lewis Hamilton,2021,British,44.0,1,1,2021-04-18
Max Verstappen,2021,Dutch,43.0,1,2,2021-04-18
Lando Norris,2021,British,27.0,0,3,2021-04-18
Charles Leclerc,2021,Monegasque,20.0,0,4,2021-04-18
Valtteri Bottas,2021,Finnish,16.0,0,5,2021-04-18
Carlos Sainz,2021,Spanish,14.0,0,6,2021-04-18
Daniel Ricciardo,2021,Australian,14.0,0,6,2021-04-18
Sergio Pérez,2021,Mexican,10.0,0,8,2021-04-18
Pierre Gasly,2021,French,6.0,0,9,2021-04-18
Lance Stroll,2021,Canadian,5.0,0,10,2021-04-18


# 2. Upsert to Delta table in presentation folder

In [0]:
# Rows are matched between the existing table ("target") and the freshly
# computed standings ("source") on driver name and season.
# The condition is built from adjacent string literals instead of a backslash
# continuation inside the literal, which embedded the source indentation in
# the SQL text and made its value depend on how the cell was indented.
merge_condition = (
    'target.driver_name = source.driver_name '
    'and target.race_year = source.race_year'
)
# NOTE(review): upsert_to_delta_table and presentation_folder_path are not
# defined in this notebook — presumably they come from the %run includes, and
# the final 'race_year' argument looks like the partition column; confirm.
upsert_to_delta_table('f1_presentation', 'driver_standings', presentation_folder_path, driver_standings_df, merge_condition, 'race_year')

In [0]:
# Read the persisted table back to verify the upsert.
# Note: this rebinds driver_standings_df from the computed DataFrame to the
# stored table.
driver_standings_df = spark.read.table('f1_presentation.driver_standings')

# Filter with a Column comparison rather than an f-string SQL predicate so the
# widget value is bound as a literal and never parsed as SQL.
# limit(10) is applied without an explicit ordering, so which ten rows are
# shown is not guaranteed.
display(
    driver_standings_df
    .filter(driver_standings_df['file_date'] == param_file_date)
    .limit(10)
)

driver_name,race_year,driver_nationality,points,wins,rank,file_date
Alberto Ascari,1952,Italian,53.5,6,1,2021-04-18
Nino Farina,1952,Italian,27.0,0,2,2021-04-18
Piero Taruffi,1952,Italian,22.0,1,3,2021-04-18
Rudi Fischer,1952,Swiss,10.0,0,4,2021-04-18
Mike Hawthorn,1952,British,10.0,0,4,2021-04-18
Robert Manzon,1952,French,9.0,0,6,2021-04-18
Troy Ruttman,1952,American,8.0,1,7,2021-04-18
Luigi Villoresi,1952,Italian,8.0,0,8,2021-04-18
José Froilán González,1952,Argentine,6.5,0,9,2021-04-18
Jim Rathmann,1952,American,6.0,0,10,2021-04-18
