In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName("aggregate").setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [2]:
race_df = spark.read.parquet('presentation/race_result')
race_df.show(2)

+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+
|race_year|        race_name|          race_date|circuit_location|   driver_name|driver_number|driver_nationality|   team|grid|fastest_lap|race_time|points|
+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+
|     1974|German Grand Prix|1974-08-04 00:00:00|         Nürburg|Clay Regazzoni|         null|             Swiss|Ferrari|   2|       null|1:41:35.0|   9.0|
|     1974|German Grand Prix|1974-08-04 00:00:00|         Nürburg|Jody Scheckter|         null|     South African|Tyrrell|   4|       null|    +50.7|   6.0|
+---------+-----------------+-------------------+----------------+--------------+-------------+------------------+-------+----+-----------+---------+------+
only showing top 2 rows



In [3]:
from pyspark.sql.functions import count, countDistinct, sum, max

In [4]:
filtered_df = race_df.filter('race_year = 2020').filter('driver_name = "Lewis Hamilton"').\
                    select(count("*").alias('Total Count'), 
                            countDistinct('race_name').alias('no. of races participated'),
                           sum('points').alias('total points earned'),
                          max('fastest_lap').alias('fastest lap'))
# filtered_df = filtered_df.filter('driver_name = Lewis Hamilton')
filtered_df.show(2)

+-----------+-------------------------+-------------------+-----------+
|Total Count|no. of races participated|total points earned|fastest lap|
+-----------+-------------------------+-------------------+-----------+
|         16|                       16|              347.0|         70|
+-----------+-------------------------+-------------------+-----------+



### Group By

In [5]:
race_df.groupBy("driver_name").\
        agg(count("*").alias('Total Count'), 
            countDistinct('race_name').alias('no. of races participated'),
            sum('points').alias('total points earned'),
            max('fastest_lap').alias('fastest lap')).show(2)

+------------------+-----------+-------------------------+-------------------+-----------+
|       driver_name|Total Count|no. of races participated|total points earned|fastest lap|
+------------------+-----------+-------------------------+-------------------+-----------+
|Piercarlo Ghinzani|        111|                       23|                2.0|       null|
|    Richie Ginther|         54|                       11|              107.0|       null|
+------------------+-----------+-------------------------+-------------------+-----------+
only showing top 2 rows



### Window Function

In [14]:
df = race_df.filter('race_year in (2019,2020)').groupBy("race_year","driver_name").\
        agg(countDistinct('race_name').alias('no. of races participated'),
            sum('points').alias('total points earned'),
            max('fastest_lap').alias('fastest lap'))

In [15]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

In [18]:
driverRankSpec = Window.partitionBy('race_year').orderBy(desc('total points earned'))

In [19]:
fnl_df = df.withColumn("rank",rank().over(driverRankSpec)).show(5)

+---------+----------------+-------------------------+-------------------+-----------+----+
|race_year|     driver_name|no. of races participated|total points earned|fastest lap|rank|
+---------+----------------+-------------------------+-------------------+-----------+----+
|     2019|  Lewis Hamilton|                       21|              413.0|         69|   1|
|     2019| Valtteri Bottas|                       21|              326.0|         69|   2|
|     2019|  Max Verstappen|                       21|              278.0|         69|   3|
|     2019| Charles Leclerc|                       21|              264.0|         63|   4|
|     2019|Sebastian Vettel|                       21|              240.0|         68|   5|
+---------+----------------+-------------------------+-------------------+-----------+----+
only showing top 5 rows

