In [0]:
%run "../includes/common_code"

### Ingestion of constructors.json
1. Read the constructors.json file from the raw container using the DataFrame reader API
2. Select only the required columns or drop the unwanted columns
3. Rename a few columns to more suitable names
4. Add an audit column which records the ingestion date/time (timestamp)
5. Write the final transformed data into the processed container

In [0]:
# List every mount point visible to this workspace, to confirm the raw and processed containers are mounted
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl-demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/mnt/formula1dlsiddhi/processed,abfss://processed@formula1dlsiddhi.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlsiddhi/demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dlsiddhi/raw,abfss://raw@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,


In [0]:
# Show what is currently stored in the raw container (expects constructors.json to be present)
display(dbutils.fs.ls(str(raw_container_path)))

path,name,size,modificationTime
dbfs:/mnt/formula1dlsiddhi/raw/circuits.csv,circuits.csv,10044,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/constructors/,constructors/,0,1743863074000
dbfs:/mnt/formula1dlsiddhi/raw/constructors.json,constructors.json,30415,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/drivers.json,drivers.json,180812,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/lap_times/,lap_times/,0,1743845837000
dbfs:/mnt/formula1dlsiddhi/raw/pit_stops.json,pit_stops.json,1369387,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/qualifying/,qualifying/,0,1743845857000
dbfs:/mnt/formula1dlsiddhi/raw/races.csv,races.csv,116847,1743845809000
dbfs:/mnt/formula1dlsiddhi/raw/results.json,results.json,7165641,1743845810000


In [0]:
# Reference only: reading constructors.json with the Spark reader API and letting Spark infer the schema.
# Schema inference needs an additional pass over the data, so it is not recommended; the explicit schema defined in the next cell is used instead.
# The snippet is wrapped in a string literal so it never runs; the cell merely echoes that string as its output.
# NOTE(review): "header" and "inferSchema" look like leftovers from a CSV read, and the path is hardcoded rather than built from raw_container_path - confirm before reusing.
'''
constructors_df = spark.read.option("header", "true") \
                        .option("inferSchema", "true") \
                        .json("/mnt/formula1dlsiddhi/raw/constructors.json")
'''

'\nconstructors_df = spark.read.option("header", "true")                         .option("inferSchema", "true")                         .json("/mnt/formula1dlsiddhi/raw/constructors.json")\n'

In [0]:
# Explicit schema for the constructors data, expressed as a DDL string (a StructType would work equally well).
# One "<column> <TYPE>" entry per field, joined into the single comma-separated string Spark expects.
constructors_schema = ", ".join(
    (
        "constructorId INT",
        "constructorRef STRING",
        "name STRING",
        "nationality STRING",
        "url STRING",
    )
)

In [0]:
# Read constructors.json from the raw container, applying the explicit schema so Spark
# does not have to infer column types.
# The former .option("header", "true") was dropped: "header" is a CSV-reader option and
# is not one of the JSON data source options, so it only suggested a header row exists.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json(f"{raw_container_path}/constructors.json")
)

In [0]:
# Confirm the explicit schema was applied: constructorId as integer, the other four columns as strings
constructors_df.printSchema()

root
 |-- constructorId: integer (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# Preview the rows exactly as read from the raw JSON file, before any transformation
display(constructors_df)

constructorId,constructorRef,name,nationality,url
1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering
4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formula_One
5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
6,ferrari,Ferrari,Italian,http://en.wikipedia.org/wiki/Scuderia_Ferrari
7,toyota,Toyota,Japanese,http://en.wikipedia.org/wiki/Toyota_Racing
8,super_aguri,Super Aguri,Japanese,http://en.wikipedia.org/wiki/Super_Aguri_F1
9,red_bull,Red Bull,Austrian,http://en.wikipedia.org/wiki/Red_Bull_Racing
10,force_india,Force India,Indian,http://en.wikipedia.org/wiki/Racing_Point_Force_India


#### Further processing 1
1. From the above dataframe we don't need URL, so either select only the required columns or drop the unwanted column url
2. Rename the following columns
  - constructorId ---> constructor_id
  - constructorRef ---> constructor_ref
3. Add the audit column ingest_date

In [0]:
from pyspark.sql.functions import col, current_timestamp

In [0]:
# The url column is not needed downstream, so remove it with DataFrame.drop().
# Selecting only the wanted columns would give the same resulting dataframe.
constructors_df = constructors_df.drop("url")
display(constructors_df)

constructorId,constructorRef,name,nationality
1,mclaren,McLaren,British
2,bmw_sauber,BMW Sauber,German
3,williams,Williams,British
4,renault,Renault,French
5,toro_rosso,Toro Rosso,Italian
6,ferrari,Ferrari,Italian
7,toyota,Toyota,Japanese
8,super_aguri,Super Aguri,Japanese
9,red_bull,Red Bull,Austrian
10,force_india,Force India,Indian


In [0]:

# Rename the camelCase source columns to snake_case; name and nationality are kept as they are.
constructors_df_final = (
    constructors_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
)

In [0]:
# Add the audit column through the shared helper; the preview that follows shows a new ingest_date timestamp column.
# NOTE(review): ingest_current_timestamp is not defined in this notebook - presumably it is brought in by the %run of ../includes/common_code; confirm.
constructors_df_final = ingest_current_timestamp(constructors_df_final)

In [0]:
# Preview the final frame: renamed id/ref columns, url removed, ingest_date audit column added
display(constructors_df_final)

constructor_id,constructor_ref,name,nationality,ingest_date
1,mclaren,McLaren,British,2025-04-06T11:41:14.608Z
2,bmw_sauber,BMW Sauber,German,2025-04-06T11:41:14.608Z
3,williams,Williams,British,2025-04-06T11:41:14.608Z
4,renault,Renault,French,2025-04-06T11:41:14.608Z
5,toro_rosso,Toro Rosso,Italian,2025-04-06T11:41:14.608Z
6,ferrari,Ferrari,Italian,2025-04-06T11:41:14.608Z
7,toyota,Toyota,Japanese,2025-04-06T11:41:14.608Z
8,super_aguri,Super Aguri,Japanese,2025-04-06T11:41:14.608Z
9,red_bull,Red Bull,Austrian,2025-04-06T11:41:14.608Z
10,force_india,Force India,Indian,2025-04-06T11:41:14.608Z


In [0]:
# Persist the transformed dataframe as Parquet under the processed container.
# Overwrite mode replaces any earlier output, so re-running the notebook does not fail on existing data.
constructors_df_final.write.parquet(
    f"{processed_container_path}/constructors",
    mode="overwrite",
)

In [0]:
# Verify the write by reading the Parquet output back and displaying it for a visual check.
# NOTE(review): the ingest_date values read back differ from the earlier in-memory preview - presumably
# the helper relies on current_timestamp(), which is evaluated each time an action runs; confirm.
df = spark.read.parquet(f"{processed_container_path}/constructors")
display(df)

constructor_id,constructor_ref,name,nationality,ingest_date
1,mclaren,McLaren,British,2025-04-06T11:41:17.228Z
2,bmw_sauber,BMW Sauber,German,2025-04-06T11:41:17.228Z
3,williams,Williams,British,2025-04-06T11:41:17.228Z
4,renault,Renault,French,2025-04-06T11:41:17.228Z
5,toro_rosso,Toro Rosso,Italian,2025-04-06T11:41:17.228Z
6,ferrari,Ferrari,Italian,2025-04-06T11:41:17.228Z
7,toyota,Toyota,Japanese,2025-04-06T11:41:17.228Z
8,super_aguri,Super Aguri,Japanese,2025-04-06T11:41:17.228Z
9,red_bull,Red Bull,Austrian,2025-04-06T11:41:17.228Z
10,force_india,Force India,Indian,2025-04-06T11:41:17.228Z


In [0]:
# End the notebook and hand the status string back to the caller (e.g. a parent notebook that invoked this one)
dbutils.notebook.exit("success...")