# 0. Get configuration parameters and functions

In [0]:
%run "../includes/configuration"

In [0]:
# Echo the raw-layer base path as a sanity check.
# NOTE(review): presumably defined by the configuration notebook run above — confirm.
raw_folder_path

'abfss://raw@aubdbcourse.dfs.core.windows.net'

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the text widget 'p_data_source' (empty default) and read its current
# value; the value is later written into the 'data_source' column.
data_source_widget = 'p_data_source'
dbutils.widgets.text(data_source_widget, '')
param_data_source = dbutils.widgets.get(data_source_widget)

# 1. Read data from results.json

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [0]:
# Explicit schema for results.json, listed in the attribute order of the file.
# Each entry is (JSON attribute, Spark type, nullable flag).
# Note: the printed schema after reading reports every column as nullable = true,
# so the nullable=False flags act as documentation rather than an enforced
# constraint on read.
result_field_specs = [
    ('resultId', IntegerType(), False),
    ('raceId', IntegerType(), False),
    ('driverId', IntegerType(), False),
    ('constructorId', IntegerType(), False),
    ('number', IntegerType(), True),
    ('grid', IntegerType(), False),
    ('position', IntegerType(), True),
    ('positionText', StringType(), False),
    ('positionOrder', IntegerType(), False),
    ('points', FloatType(), False),
    ('laps', IntegerType(), False),
    ('time', StringType(), True),
    ('milliseconds', IntegerType(), True),
    ('fastestLap', IntegerType(), True),
    ('rank', IntegerType(), True),
    ('fastestLapTime', StringType(), True),
    ('fastestLapSpeed', StringType(), True),
    # statusId is read as a string; it is dropped again during the reshaping step.
    ('statusId', StringType(), False),
]

results_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in result_field_specs
])

In [0]:
# Load the raw results file from the raw layer using the explicit schema.
# No 'inferSchema' option is set: it is not a JSON reader option, and passing
# .schema(...) already bypasses schema inference, so the option had no effect.
results_df = (
    spark.read
    .schema(results_schema)
    .json(f'{raw_folder_path}/results.json')
)

In [0]:
# Print the schema of the freshly read DataFrame to verify column names and types.
results_df.printSchema()

root
 |-- resultId: integer (nullable = true)
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- constructorId: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- grid: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- positionText: string (nullable = true)
 |-- positionOrder: integer (nullable = true)
 |-- points: float (nullable = true)
 |-- laps: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- milliseconds: integer (nullable = true)
 |-- fastestLap: integer (nullable = true)
 |-- rank: integer (nullable = true)
 |-- fastestLapTime: string (nullable = true)
 |-- fastestLapSpeed: string (nullable = true)
 |-- statusId: string (nullable = true)



In [0]:
# Preview the first 10 rows of the raw results.
display(results_df.limit(10))

resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
1,18,1,1,22,1,1.0,1,1,10.0,58,1:34:50.616,5690616.0,39,2,1:27.452,218.3,1
2,18,2,2,3,5,2.0,2,2,8.0,58,+5.478,5696094.0,41,3,1:27.739,217.586,1
3,18,3,3,7,7,3.0,3,3,6.0,58,+8.163,5698779.0,41,5,1:28.090,216.719,1
4,18,4,4,5,11,4.0,4,4,5.0,58,+17.181,5707797.0,58,7,1:28.603,215.464,1
5,18,5,1,23,3,5.0,5,5,4.0,58,+18.014,5708630.0,43,1,1:27.418,218.385,1
6,18,6,3,8,13,6.0,6,6,3.0,57,\N,,50,14,1:29.639,212.974,11
7,18,7,5,14,17,7.0,7,7,2.0,55,\N,,22,12,1:29.534,213.224,5
8,18,8,6,1,15,8.0,8,8,1.0,53,\N,,20,4,1:27.903,217.18,5
9,18,9,2,4,2,,R,9,0.0,47,\N,,15,9,1:28.753,215.1,4
10,18,10,7,12,18,,R,10,0.0,43,\N,,23,13,1:29.558,213.166,3


# 2. Reform the schema:
- rename the id, position and fastest-lap columns to the snake_case convention
- add the data source (from the `p_data_source` widget) and the ingestion date
- drop the `statusId` column

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# camelCase source attribute -> snake_case column name.
snake_case_names = {
    'resultId': 'result_id',
    'raceId': 'race_id',
    'driverId': 'driver_id',
    'constructorId': 'constructor_id',
    'positionText': 'position_text',
    'positionOrder': 'position_order',
    'fastestLap': 'fastest_lap',
    'fastestLapTime': 'fastest_lap_time',
    'fastestLapSpeed': 'fastest_lap_speed',
}

# Apply the renames one by one, in the order listed above.
for camel_name, snake_name in snake_case_names.items():
    results_df = results_df.withColumnRenamed(camel_name, snake_name)

# Tag each row with the data source parameter and the ingestion timestamp,
# then discard the statusId column.
results_df = (
    results_df
    .withColumn('data_source', lit(param_data_source))
    .withColumn('ingestion_date', current_timestamp())
    .drop('statusId')
)

In [0]:
# Preview the first 10 rows after renaming, enrichment and dropping statusId.
display(results_df.limit(10))

result_id,race_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,time,milliseconds,fastest_lap,rank,fastest_lap_time,fastest_lap_speed,ingestion_date
1,18,1,1,22,1,1.0,1,1,10.0,58,1:34:50.616,5690616.0,39,2,1:27.452,218.3,2024-09-04T06:15:05.84Z
2,18,2,2,3,5,2.0,2,2,8.0,58,+5.478,5696094.0,41,3,1:27.739,217.586,2024-09-04T06:15:05.84Z
3,18,3,3,7,7,3.0,3,3,6.0,58,+8.163,5698779.0,41,5,1:28.090,216.719,2024-09-04T06:15:05.84Z
4,18,4,4,5,11,4.0,4,4,5.0,58,+17.181,5707797.0,58,7,1:28.603,215.464,2024-09-04T06:15:05.84Z
5,18,5,1,23,3,5.0,5,5,4.0,58,+18.014,5708630.0,43,1,1:27.418,218.385,2024-09-04T06:15:05.84Z
6,18,6,3,8,13,6.0,6,6,3.0,57,\N,,50,14,1:29.639,212.974,2024-09-04T06:15:05.84Z
7,18,7,5,14,17,7.0,7,7,2.0,55,\N,,22,12,1:29.534,213.224,2024-09-04T06:15:05.84Z
8,18,8,6,1,15,8.0,8,8,1.0,53,\N,,20,4,1:27.903,217.18,2024-09-04T06:15:05.84Z
9,18,9,2,4,2,,R,9,0.0,47,\N,,15,9,1:28.753,215.1,2024-09-04T06:15:05.84Z
10,18,10,7,12,18,,R,10,0.0,43,\N,,23,13,1:29.558,213.166,2024-09-04T06:15:05.84Z


# 3. Save and read from parquet

In [0]:
# Persist the processed results as parquet, one partition directory per race_id.
# 'overwrite' mode replaces whatever is already stored at the target path.
(
    results_df.write
    .mode('overwrite')
    .partitionBy('race_id')
    .parquet(f'{processed_folder_path}/results')
)

In [0]:
# Read the data back from the processed layer to verify the write, then preview
# 10 rows. In the output below the partition column race_id appears last.
processed_results_path = f'{processed_folder_path}/results'
results_df = spark.read.parquet(processed_results_path)
display(results_df.limit(10))

result_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,time,milliseconds,fastest_lap,rank,fastest_lap_time,fastest_lap_speed,ingestion_date,race_id
19232,657,113,14,19,1,1,1,8.0,200,3:49:17.27,13757270,,,\N,\N,2024-09-04T06:15:06.293Z,800
19233,525,114,9,3,2,2,2,6.0,200,+1:09.95,13827220,,,\N,\N,2024-09-04T06:15:06.293Z,800
19234,658,113,2,1,3,3,3,5.0,200,+1:19.73,13837000,,,\N,\N,2024-09-04T06:15:06.293Z,800
19235,526,113,34,11,4,4,4,1.5,200,+2:52.68,13929950,,,\N,\N,2024-09-04T06:15:06.293Z,800
19236,673,113,73,14,5,5,5,2.0,200,+3:24.55,13961820,,,\N,\N,2024-09-04T06:15:06.293Z,800
19237,615,113,77,24,6,6,6,0.0,200,+3:47.55,13984820,,,\N,\N,2024-09-04T06:15:06.293Z,800
19238,528,109,7,6,7,7,7,0.0,200,+4:13.35,14010620,,,\N,\N,2024-09-04T06:15:06.293Z,800
19239,555,113,5,32,8,8,8,0.0,200,+5:01.17,14058440,,,\N,\N,2024-09-04T06:15:06.293Z,800
19240,674,113,28,25,9,9,9,0.0,200,+7:07.24,14184510,,,\N,\N,2024-09-04T06:15:06.293Z,800
19241,655,129,24,13,10,10,10,0.0,200,+7:07.69,14184960,,,\N,\N,2024-09-04T06:15:06.293Z,800


In [0]:
# Exit the notebook with the value 'success'.
dbutils.notebook.exit('success')