In [0]:
# Notebook parameter: the file date to ingest; it is used further down as the dated
# sub-folder of the raw container. Falls back to "2021-03-21" when no value is supplied.
dbutils.widgets.text("v_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("v_file_date")

In [0]:
%run "../includes/common_code"

### Ingest constructors.json
1. Read the constructors.json file from the raw container using the DataFrame reader API
2. Select only the required columns or drop the unwanted columns
3. Rename a few columns to suitable names
4. Add an audit column that records the ingestion timestamp
5. Write the final transformed data to the processed container

In [0]:
# List all DBFS mount points as a sanity check that the storage containers are mounted
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl-demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/mnt/formula1dlsiddhi/processed,abfss://processed@formula1dlsiddhi.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlsiddhi/demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dlsiddhi/raw,abfss://raw@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,


In [0]:
# List the contents of the raw container (one dated folder per file drop).
# raw_container_path is not defined in this notebook; presumably it comes from the included common_code notebook — TODO confirm
display(dbutils.fs.ls(f"{raw_container_path}"))

path,name,size,modificationTime
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-21/,2021-03-21/,0,1744679356000
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-28/,2021-03-28/,0,1744679367000
dbfs:/mnt/formula1dlsiddhi/raw/2021-04-18/,2021-04-18/,0,1744679375000


In [0]:
# Reference only: reading constructors.json with the Spark reader API and letting Spark infer the schema.
# inferSchema detects the datatypes automatically, but it needs one more scan over the data, so it is not recommended;
# an explicit schema is defined further down instead.
# The snippet sits inside a string literal, so it is never executed.
'''
constructors_df = spark.read.option("header", "true") \
                        .option("inferSchema", "true") \
                        .json("/mnt/formula1dlsiddhi/raw/constructors.json")
'''

'\nconstructors_df = spark.read.option("header", "true")                         .option("inferSchema", "true")                         .json("/mnt/formula1dlsiddhi/raw/constructors.json")\n'

In [0]:
# Explicit schema for the constructors dataframe, written as a DDL string
# (a StructType definition would work equally well). One column per line for readability;
# adjacent literals are concatenated into a single DDL string.
constructors_schema = (
    "constructorId INT, "
    "constructorRef STRING, "
    "name STRING, "
    "nationality STRING, "
    "url STRING"
)

In [0]:
# Read the constructors JSON for the requested file date, applying the explicit schema
# so Spark does not need to infer it.
# The "header" option was removed: it is a CSV reader option and has no effect on the JSON reader.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json(f"{raw_container_path}/{v_file_date}/constructors.json")
)

In [0]:
# Confirm the dataframe carries the explicitly declared column names and types
constructors_df.printSchema()

root
 |-- constructorId: integer (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# Preview the constructors data as read from the raw container
display(constructors_df)

constructorId,constructorRef,name,nationality,url
1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering
4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formula_One
5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
6,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari
7,toyota,Toyota,Japanese,http://en.wikipedia.org/wiki/Toyota_Racing
8,super_aguri,Super Aguri,Japanese,http://en.wikipedia.org/wiki/Super_Aguri_F1
9,red_bull,Red Bull,Austrian,http://en.wikipedia.org/wiki/Red_Bull_Racing
10,force_india,Force India,Indian,http://en.wikipedia.org/wiki/Racing_Point_Force_India


#### Further processing 1
1. The `url` column from the dataframe above is not needed, so either select only the required columns or drop `url`
2. Rename the following columns
  - constructorId ---> constructor_id
  - constructorRef ---> constructor_ref
3. Add the `file_date` column and the audit column `ingest_date`

In [0]:
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
# Remove the url column, which is not needed downstream.
# (Selecting only the required columns would give the same dataframe.)
constructors_df = constructors_df.drop("url")
display(constructors_df)

constructorId,constructorRef,name,nationality
1,mclaren,McLaren,British
2,bmw_sauber,BMW Sauber,German
3,williams,Williams,British
4,renault,Renault,French
5,toro_rosso,Toro Rosso,Italian
6,ferrari,Ferrari,Italian
7,toyota,Toyota,Japanese
8,super_aguri,Super Aguri,Japanese
9,red_bull,Red Bull,Austrian
10,force_india,Force India,Indian


In [0]:

# Switch the id/ref columns to snake_case and tag every row with the file date being ingested
constructors_df_final = (
    constructors_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Add the audit timestamp. ingest_current_timestamp is not defined in this notebook
# (presumably it comes from the included common_code notebook — TODO confirm);
# the displayed result shows an extra ingest_date column after this call.
constructors_df_final = ingest_current_timestamp(constructors_df_final)

In [0]:
# Preview the fully transformed dataframe before it is written out
display(constructors_df_final)

constructor_id,constructor_ref,name,nationality,file_date,ingest_date
1,mclaren,McLaren,British,2021-03-21,2025-04-18T12:34:10.081Z
2,bmw_sauber,BMW Sauber,German,2021-03-21,2025-04-18T12:34:10.081Z
3,williams,Williams,British,2021-03-21,2025-04-18T12:34:10.081Z
4,renault,Renault,French,2021-03-21,2025-04-18T12:34:10.081Z
5,toro_rosso,Toro Rosso,Italian,2021-03-21,2025-04-18T12:34:10.081Z
6,ferrari,Ferrari,Italian,2021-03-21,2025-04-18T12:34:10.081Z
7,toyota,Toyota,Japanese,2021-03-21,2025-04-18T12:34:10.081Z
8,super_aguri,Super Aguri,Japanese,2021-03-21,2025-04-18T12:34:10.081Z
9,red_bull,Red Bull,Austrian,2021-03-21,2025-04-18T12:34:10.081Z
10,force_india,Force India,Indian,2021-03-21,2025-04-18T12:34:10.081Z


In [0]:
# Persist the final dataframe as the Delta table f1_processed.constructors,
# replacing whatever the table held before
(
    constructors_df_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.constructors")
)

In [0]:
# Verify the write by reading back the table that was just saved.
# Reading by table name (instead of a storage path) checks exactly what saveAsTable wrote,
# without assuming where the f1_processed database keeps its files.
df = spark.read.table("f1_processed.constructors")
display(df)

constructor_id,constructor_ref,name,nationality,file_date,ingest_date
1,mclaren,McLaren,British,2021-03-21,2025-04-18T12:35:07.445Z
2,bmw_sauber,BMW Sauber,German,2021-03-21,2025-04-18T12:35:07.445Z
3,williams,Williams,British,2021-03-21,2025-04-18T12:35:07.445Z
4,renault,Renault,French,2021-03-21,2025-04-18T12:35:07.445Z
5,toro_rosso,Toro Rosso,Italian,2021-03-21,2025-04-18T12:35:07.445Z
6,ferrari,Ferrari,Italian,2021-03-21,2025-04-18T12:35:07.445Z
7,toyota,Toyota,Japanese,2021-03-21,2025-04-18T12:35:07.445Z
8,super_aguri,Super Aguri,Japanese,2021-03-21,2025-04-18T12:35:07.445Z
9,red_bull,Red Bull,Austrian,2021-03-21,2025-04-18T12:35:07.445Z
10,force_india,Force India,Indian,2021-03-21,2025-04-18T12:35:07.445Z


In [0]:
# End the notebook run and hand the status string back as the notebook's exit value
dbutils.notebook.exit("success...")