In [0]:
%run "../includes/common_code"

### Ingestion qualifying/qualifying_split*.json
1. Read qualifying/qualifying_split*.json file from raw container using Dataframe reader API
2. Select only the required columns or drop the unwanted columns
3. Rename few columns to the suitable name
4. Add audit column which specifies the ingestion date/time/timestamp

In [0]:
# listing all the mounts
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl-demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/mnt/formula1dlsiddhi/processed,abfss://processed@formula1dlsiddhi.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlsiddhi/demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dlsiddhi/raw,abfss://raw@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,


In [0]:
# listing all the files in raw container
display(dbutils.fs.ls(raw_container_path))

path,name,size,modificationTime
dbfs:/mnt/formula1dlsiddhi/raw/circuits.csv,circuits.csv,10044,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/constructors/,constructors/,0,1743863074000
dbfs:/mnt/formula1dlsiddhi/raw/constructors.json,constructors.json,30415,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/drivers.json,drivers.json,180812,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/lap_times/,lap_times/,0,1743845837000
dbfs:/mnt/formula1dlsiddhi/raw/pit_stops.json,pit_stops.json,1369387,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/qualifying/,qualifying/,0,1743845857000
dbfs:/mnt/formula1dlsiddhi/raw/races.csv,races.csv,116847,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/results.json,results.json,7165641,1743845810000


Visit [spark_documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.json.html) for read_json()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, FloatType

#### Defining the schema manually
1. Visit [spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html) for Struct_Type

In [0]:
qualifying_schema = StructType([ StructField("qualifyingId", IntegerType(), False),
                                StructField("raceId", IntegerType(), True),
                                StructField("driverId", IntegerType(), True),
                                StructField("constructorId", IntegerType(), True),
                                StructField("number", IntegerType(), True),
                                StructField("position", IntegerType(), True),
                                StructField("q1", StringType(), True),
                                StructField("q2", StringType(), True),
                                StructField("q3", StringType(), True)
                            ])

In [0]:
qualifying_df = spark.read.schema(qualifying_schema) \
                        .option("header", "true") \
                        .option("multiLine", "true") \
                        .json(f"{raw_container_path}/qualifying/qualifying_split*.json")

In [0]:
# pit_stops schema
qualifying_df.printSchema()

root
 |-- qualifyingId: integer (nullable = true)
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- constructorId: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- q1: string (nullable = true)
 |-- q2: string (nullable = true)
 |-- q3: string (nullable = true)



In [0]:
display(qualifying_df)

qualifyingId,raceId,driverId,constructorId,number,position,q1,q2,q3
,18,1,1,22,1,1:26.572,1:25.187,1:26.714
,18,9,2,4,2,1:26.103,1:25.315,1:26.869
,18,5,1,23,3,1:25.664,1:25.452,1:27.079
,18,13,6,2,4,1:25.994,1:25.691,1:27.178
,18,2,2,3,5,1:25.960,1:25.518,1:27.236
,18,15,7,11,6,1:26.427,1:26.101,1:28.527
,18,3,3,7,7,1:26.295,1:26.059,1:28.687
,18,14,9,9,8,1:26.381,1:26.063,1:29.041
,18,10,7,12,9,1:26.919,1:26.164,1:29.593
,18,20,5,15,10,1:26.702,1:25.842,\N


#### Further processing 1
1. Rename the following columns
  - qualifyingId ---> qualifying_id
  - driverId ---> driver_id
  - raceId ---> race_id
  - constructorId ---> constructor_id
3. Add the audit column ingestion_date

In [0]:
from pyspark.sql.functions import col, concat, lit, current_timestamp

In [0]:
qualifying_df_final = qualifying_df.withColumnRenamed("raceId", "race_id") \
                                   .withColumnRenamed("driverId", "driver_id") \
                                   .withColumnRenamed("constructorId", "constructor_id") \
                                   .withColumnRenamed("qualifyingId", "qualifying_id")


In [0]:
qualifying_df_final = ingest_current_timestamp(qualifying_df_final)

In [0]:
display(qualifying_df_final)

qualifying_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,ingest_date
,18,1,1,22,1,1:26.572,1:25.187,1:26.714,2025-04-06T11:31:05.917Z
,18,9,2,4,2,1:26.103,1:25.315,1:26.869,2025-04-06T11:31:05.917Z
,18,5,1,23,3,1:25.664,1:25.452,1:27.079,2025-04-06T11:31:05.917Z
,18,13,6,2,4,1:25.994,1:25.691,1:27.178,2025-04-06T11:31:05.917Z
,18,2,2,3,5,1:25.960,1:25.518,1:27.236,2025-04-06T11:31:05.917Z
,18,15,7,11,6,1:26.427,1:26.101,1:28.527,2025-04-06T11:31:05.917Z
,18,3,3,7,7,1:26.295,1:26.059,1:28.687,2025-04-06T11:31:05.917Z
,18,14,9,9,8,1:26.381,1:26.063,1:29.041,2025-04-06T11:31:05.917Z
,18,10,7,12,9,1:26.919,1:26.164,1:29.593,2025-04-06T11:31:05.917Z
,18,20,5,15,10,1:26.702,1:25.842,\N,2025-04-06T11:31:05.917Z


#### Write the final data into processed folder
1. Visit [spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.write.html?highlight=write) for write()

In [0]:
qualifying_df_final.write.mode("overwrite") \
                      .parquet(f"{processed_container_path}/qualifying")

In [0]:
# Verify that the data was written to the parquet file
df = spark.read.parquet(f"{processed_container_path}/qualifying")
display(df)

qualifying_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,ingest_date
,18,1,1,22,1,1:26.572,1:25.187,1:26.714,2025-04-06T11:31:22.596Z
,18,9,2,4,2,1:26.103,1:25.315,1:26.869,2025-04-06T11:31:22.596Z
,18,5,1,23,3,1:25.664,1:25.452,1:27.079,2025-04-06T11:31:22.596Z
,18,13,6,2,4,1:25.994,1:25.691,1:27.178,2025-04-06T11:31:22.596Z
,18,2,2,3,5,1:25.960,1:25.518,1:27.236,2025-04-06T11:31:22.596Z
,18,15,7,11,6,1:26.427,1:26.101,1:28.527,2025-04-06T11:31:22.596Z
,18,3,3,7,7,1:26.295,1:26.059,1:28.687,2025-04-06T11:31:22.596Z
,18,14,9,9,8,1:26.381,1:26.063,1:29.041,2025-04-06T11:31:22.596Z
,18,10,7,12,9,1:26.919,1:26.164,1:29.593,2025-04-06T11:31:22.596Z
,18,20,5,15,10,1:26.702,1:25.842,\N,2025-04-06T11:31:22.596Z


In [0]:
dbutils.notebook.exit("Success...")