In [21]:
# Date stamp of the raw-file drop to ingest: it is interpolated into the input
# path when the CSV is read and written to the `file_date` column further down.
file_date = '2021-04-18'

<div style="max-width:1400px;margin:0 auto">
<img src="images/laptime.png" width="600" alt="laptime"/>
</div>

In [22]:
import findspark
# Called before `import pyspark` so that the import below can resolve; findspark
# is documented to locate the Spark install and put pyspark on sys.path —
# confirm against the local environment.
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Build (or reuse) a local-mode session named "laptime"; the catalog
# implementation is set to Hive via spark.sql.catalogImplementation.
spark = (
    SparkSession.builder
    .master('local')
    .appName("laptime")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)

In [23]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from pyspark.sql.functions import col, current_timestamp,desc, lit, count

In [24]:
# Explicit schema applied when reading the lap_times CSV files. Only raceId is
# declared non-nullable; every other column is declared nullable.
lap_time_schema = StructType(fields=[
    StructField("raceId", IntegerType(), nullable=False),
    StructField("driverId", IntegerType(), nullable=True),
    StructField("lap", IntegerType(), nullable=True),
    StructField("position", IntegerType(), nullable=True),
    StructField("time", StringType(), nullable=True),
    StructField("milliseconds", IntegerType(), nullable=True),
])

In [25]:
# Read every CSV under the lap_times folder of the selected file_date using the
# explicit schema (no header option is set, so rows are read as data).
# The path uses forward slashes on purpose: a backslash is a directory
# separator only on Windows, whereas '/' is accepted by Spark on every OS.
lap_time_df = spark.read.csv(f'raw files/{file_date}/lap_times', schema=lap_time_schema)
# Preview the first two rows to confirm the columns parsed as expected.
lap_time_df.show(2)

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|    time|milliseconds|
+------+--------+---+--------+--------+------------+
|  1053|     830|  1|       1|1:38.603|       98603|
|  1053|     830|  2|       1|2:29.163|      149163|
+------+--------+---+--------+--------+------------+
only showing top 2 rows



In [26]:
# Row count of the frame just read, as a quick check on the ingest.
lap_time_df.count()

1124

In [27]:
# Rename the id columns to snake_case, then stamp every row with the current
# timestamp (ingestion_date) and the file_date being processed.
lap_time_fnl_df = (
    lap_time_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("file_date", lit(file_date))
)
lap_time_fnl_df.show(2)

+-------+---------+---+--------+--------+------------+--------------------+----------+
|race_id|driver_id|lap|position|    time|milliseconds|      ingestion_date| file_date|
+-------+---------+---+--------+--------+------------+--------------------+----------+
|   1053|      830|  1|       1|1:38.603|       98603|2024-02-01 12:34:...|2021-04-18|
|   1053|      830|  2|       1|2:29.163|      149163|2024-02-01 12:34:...|2021-04-18|
+-------+---------+---+--------+--------+------------+--------------------+----------+
only showing top 2 rows



In [28]:
# Execute the shared helper notebook inside this kernel; presumably this is
# where overwrite_partition (called in the next cell) is defined — TODO confirm.
%run "common_functions.ipynb"

In [29]:
# Hand the final frame to the helper with 'f1_processed', 'lap_times' and
# 'race_id'. These look like database, table and partition column, but that is
# inferred from the values and the helper's name — confirm against its definition.
overwrite_partition(lap_time_fnl_df, 'f1_processed', 'lap_times', 'race_id')

In [30]:
# Read the table back and list the row count per race_id, highest race_id
# first, to check what is stored after the write above.
df = spark.sql('SELECT * FROM f1_processed.lap_times')
print('df_file1_count for each raceId : ')
(
    df.groupBy('race_id')
    .agg(count('*'))
    .orderBy(desc('race_id'))
    .show()
)

df_file1_count for each raceId : 
+-------+--------+
|race_id|count(1)|
+-------+--------+
|   1053|    1124|
|   1052|    1026|
|   1047|    1043|
|   1046|    1531|
|   1045|    1016|
|   1044|    1076|
|   1043|    1128|
|   1042|    1288|
|   1041|    1017|
|   1040|     946|
|   1039|     778|
|   1038|     924|
|   1037|     766|
|   1036|    1274|
|   1035|    1025|
|   1034|     895|
|   1033|    1327|
|   1032|    1226|
|   1031|    1140|
|   1030|    1075|
+-------+--------+
only showing top 20 rows

