In [0]:
# Notebook parameter: the dated sub-folder of the raw container to ingest.
# "2021-03-21" is the value the widget is created with.
file_date_widget = "v_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%run "../includes/common_code"

### Ingestion qualifying/qualifying_split*.json
1. Read the qualifying/qualifying_split*.json files from the raw container using the DataFrame reader API
2. Select only the required columns or drop the unwanted columns
3. Rename a few columns to suitable snake_case names
4. Add audit column which specifies the ingestion date/time/timestamp

In [0]:
# Inspect the mount points available to this workspace.
mount_points = dbutils.fs.mounts()
display(mount_points)

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl-demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/mnt/formula1dlsiddhi/processed,abfss://processed@formula1dlsiddhi.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlsiddhi/demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dlsiddhi/raw,abfss://raw@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,


In [0]:
# Show what is in the raw container (the captured output lists dated sub-folders).
raw_entries = dbutils.fs.ls(raw_container_path)
display(raw_entries)

path,name,size,modificationTime
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-21/,2021-03-21/,0,1744679356000
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-28/,2021-03-28/,0,1744679367000
dbfs:/mnt/formula1dlsiddhi/raw/2021-04-18/,2021-04-18/,0,1744679375000


See the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.json.html) for `DataFrameReader.json()`

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, FloatType

#### Defining the schema manually
1. See the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html) for `StructType`

In [0]:
qualifying_schema = StructType([ StructField("qualifyingId", IntegerType(), False),
                                StructField("raceId", IntegerType(), True),
                                StructField("driverId", IntegerType(), True),
                                StructField("constructorId", IntegerType(), True),
                                StructField("number", IntegerType(), True),
                                StructField("position", IntegerType(), True),
                                StructField("q1", StringType(), True),
                                StructField("q2", StringType(), True),
                                StructField("q3", StringType(), True)
                            ])

In [0]:
qualifying_df = spark.read.schema(qualifying_schema) \
                        .option("header", "true") \
                        .option("multiLine", "true") \
                        .json(f"{raw_container_path}/{v_file_date}/qualifying/")

In [0]:
# Print the schema of the qualifying DataFrame as read from the raw JSON
qualifying_df.printSchema()

root
 |-- qualifyingId: integer (nullable = true)
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- constructorId: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- q1: string (nullable = true)
 |-- q2: string (nullable = true)
 |-- q3: string (nullable = true)



In [0]:
# Preview the rows as read, before any renaming or audit columns are added
display(qualifying_df)

qualifyingId,raceId,driverId,constructorId,number,position,q1,q2,q3
,1053,1,131,44,1,1:14.823,1:14.817,1:14.411
,1053,815,9,11,2,1:15.395,1:14.716,1:14.446
,1053,830,9,33,3,1:15.109,1:14.884,1:14.498
,1053,844,6,16,4,1:15.413,1:14.808,1:14.740
,1053,842,213,10,5,1:15.548,1:14.927,1:14.790
,1053,817,1,3,6,1:15.669,1:15.033,1:14.826
,1053,846,1,4,7,1:15.009,1:14.718,1:14.875
,1053,822,131,77,8,1:14.672,1:14.905,1:14.898
,1053,839,214,31,9,1:15.385,1:15.117,1:15.210
,1053,840,117,18,10,1:15.522,1:15.138,\N


#### Further processing 1
1. Rename the following columns
  - qualifyingId ---> qualifying_id
  - driverId ---> driver_id
  - raceId ---> race_id
  - constructorId ---> constructor_id
2. Add the `file_date` and `ingest_date` audit columns

In [0]:
from pyspark.sql.functions import col, concat, lit, current_timestamp

In [0]:
# Convert the camelCase id columns to snake_case and tag every row with the
# file date this run was parameterised with.
qualifying_df_final = (
    qualifying_df
    .withColumnRenamed("qualifyingId", "qualifying_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("file_date", lit(v_file_date))
)


In [0]:
# Stamp the rows with the ingestion time (the captured output shows an added
# ingest_date column). ingest_current_timestamp is not defined in this
# notebook; presumably it comes from the included common_code notebook.
qualifying_df_final = ingest_current_timestamp(qualifying_df_final)

In [0]:
# Preview the final frame: renamed columns plus file_date and ingest_date
display(qualifying_df_final)

qualifying_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,file_date,ingest_date
,1053,1,131,44,1,1:14.823,1:14.817,1:14.411,2021-04-18,2025-04-18T13:24:40.857Z
,1053,815,9,11,2,1:15.395,1:14.716,1:14.446,2021-04-18,2025-04-18T13:24:40.857Z
,1053,830,9,33,3,1:15.109,1:14.884,1:14.498,2021-04-18,2025-04-18T13:24:40.857Z
,1053,844,6,16,4,1:15.413,1:14.808,1:14.740,2021-04-18,2025-04-18T13:24:40.857Z
,1053,842,213,10,5,1:15.548,1:14.927,1:14.790,2021-04-18,2025-04-18T13:24:40.857Z
,1053,817,1,3,6,1:15.669,1:15.033,1:14.826,2021-04-18,2025-04-18T13:24:40.857Z
,1053,846,1,4,7,1:15.009,1:14.718,1:14.875,2021-04-18,2025-04-18T13:24:40.857Z
,1053,822,131,77,8,1:14.672,1:14.905,1:14.898,2021-04-18,2025-04-18T13:24:40.857Z
,1053,839,214,31,9,1:15.385,1:15.117,1:15.210,2021-04-18,2025-04-18T13:24:40.857Z
,1053,840,117,18,10,1:15.522,1:15.138,\N,2021-04-18,2025-04-18T13:24:40.857Z


#### Write the final data into processed folder
1. Visit [spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.write.html?highlight=write) for write()

In [0]:
# Merge the batch into f1_processed.qualifying. A target row matches a source
# row when both qualifying_id and race_id are equal.
# mergeTable is defined outside this notebook (presumably in common_code);
# the last argument is presumably the partition column (TODO confirm).
merge_condition = " AND ".join([
    "target.qualifying_id = source.qualifying_id",
    "target.race_id = source.race_id",
])
mergeTable(qualifying_df_final, "f1_processed", "qualifying", merge_condition, "race_id")

In [0]:
# Alternative write path, currently disabled; the preceding cell calls mergeTable instead.
# overwrite_partition(qualifying_df_final, "f1_processed", "qualifying", "race_id")

In [0]:
# Verify the write by reading the Delta data back from the processed container.
# Only race_id 1053 is shown, the race that appears in the captured sample output.
df = spark.read.format("delta").load(f"{processed_container_path}/qualifying").filter("race_id = 1053")
display(df)

qualifying_id,race_id,driver_id,constructor_id,number,position,q1,q2,q3,file_date,ingest_date
,1053,1,131,44,1,1:14.823,1:14.817,1:14.411,2021-04-18,2025-04-18T13:24:41.505Z
,1053,815,9,11,2,1:15.395,1:14.716,1:14.446,2021-04-18,2025-04-18T13:24:41.505Z
,1053,830,9,33,3,1:15.109,1:14.884,1:14.498,2021-04-18,2025-04-18T13:24:41.505Z
,1053,844,6,16,4,1:15.413,1:14.808,1:14.740,2021-04-18,2025-04-18T13:24:41.505Z
,1053,842,213,10,5,1:15.548,1:14.927,1:14.790,2021-04-18,2025-04-18T13:24:41.505Z
,1053,817,1,3,6,1:15.669,1:15.033,1:14.826,2021-04-18,2025-04-18T13:24:41.505Z
,1053,846,1,4,7,1:15.009,1:14.718,1:14.875,2021-04-18,2025-04-18T13:24:41.505Z
,1053,822,131,77,8,1:14.672,1:14.905,1:14.898,2021-04-18,2025-04-18T13:24:41.505Z
,1053,839,214,31,9,1:15.385,1:15.117,1:15.210,2021-04-18,2025-04-18T13:24:41.505Z
,1053,840,117,18,10,1:15.522,1:15.138,\N,2021-04-18,2025-04-18T13:24:41.505Z


In [0]:
# End the notebook run, passing "Success..." as its exit value
dbutils.notebook.exit("Success...")