In [0]:
# Notebook parameter: the file date to ingest. The widget falls back to "2021-03-21"
# when no value is supplied; the value is later used as the dated sub-folder of the raw path.
dbutils.widgets.text("v_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("v_file_date")

In [0]:
%run "../includes/common_code"

### Ingestion circuits.csv
1. Read circuits.csv file from raw container using Dataframe reader API
2. Select only the required columns or drop the unwanted columns
3. Rename the columns to suitable names
4. Add an audit column that records the ingestion timestamp
5. Write the final transformed data into the processed container

In [0]:
# List every mount point in the workspace to confirm the raw and processed containers are mounted
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/databricks-datasets,databricks-datasets,
/mnt/formula1dl-demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/mnt/formula1dlsiddhi/processed,abfss://processed@formula1dlsiddhi.dfs.core.windows.net/,
/Volumes,UnityCatalogVolumes,
/mnt/formula1dlsiddhi/demo,abfss://demo@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/mnt/formula1dlsiddhi/raw,abfss://raw@formula1dlsiddhi.dfs.core.windows.net/,
/databricks/mlflow-registry,databricks/mlflow-registry,
/Volume,DbfsReserved,


In [0]:
# Show what is inside the raw container (one dated folder per file date).
# raw_container_path is not defined in this notebook — presumably it is supplied by the
# ../includes/common_code notebook that is run earlier; confirm.
raw_container_entries = dbutils.fs.ls(raw_container_path)
display(raw_container_entries)

path,name,size,modificationTime
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-21/,2021-03-21/,0,1744679356000
dbfs:/mnt/formula1dlsiddhi/raw/2021-03-28/,2021-03-28/,0,1744679367000
dbfs:/mnt/formula1dlsiddhi/raw/2021-04-18/,2021-04-18/,0,1744679375000


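Visit the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/api/pyspark.pandas.read_csv.html#pyspark.pandas.read_csv) for `read_csv()` (pandas API on Spark). The cells below use the DataFrame reader instead, i.e. `spark.read.csv()`, which is documented under [DataFrameReader.csv](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.csv.html).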

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

#### Defining the schema manually
1. Visit the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html) for `StructType`

In [0]:
# Explicit schema for circuits.csv, declared as (column name, Spark type) pairs.
# Every column is nullable. NOTE(review): with an explicit schema the CSV reader maps
# columns by position by default, so this order should match the file — confirm.
circuit_field_specs = [
    ("circuitId", IntegerType()),
    ("circuitRef", StringType()),
    ("name", StringType()),
    ("location", StringType()),
    ("country", StringType()),
    ("lat", DoubleType()),
    ("lng", DoubleType()),
    ("alt", IntegerType()),
    ("url", StringType()),
]
circuits_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in circuit_field_specs]
)

In [0]:
# Read circuits.csv from the dated raw folder selected by v_file_date, applying the
# explicit schema and treating the first row as a header.
circuits_csv_path = f"{raw_container_path}/{v_file_date}/circuits.csv"
circuits_df = spark.read.csv(circuits_csv_path, schema=circuits_schema, header=True)

In [0]:
# Print the DataFrame schema to confirm the explicitly declared column types were applied
circuits_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



In [0]:
# Preview the circuits data exactly as read from the CSV, before any transformation
display(circuits_df)

circuitId,circuitRef,name,location,country,lat,lng,alt,url
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_International_Circuit
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_International_Circuit
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcelona-Catalunya
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Villeneuve
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers_Magny-Cours
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,http://en.wikipedia.org/wiki/Silverstone_Circuit
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,http://en.wikipedia.org/wiki/Hockenheimring


#### Further processing 1
1. From the above dataframe we don't need URL, so either select only the required columns or drop the unwanted column url
2. Rename the following columns
  - circuitId ---> circuit_id
  - circuitRef ---> circuit_ref
  - name ---> circuit_name
  - location ---> circuit_location
  - country ---> circuit_country
  - lat ---> circuit_lat
  - lng ---> circuit_lng
  - alt ---> circuit_altitude
3. Add the `file_date` column (taken from the `v_file_date` widget) and the audit column `ingest_date`

In [0]:
from pyspark.sql.functions import col, lit

In [0]:
# Keep every source column except url. Columns are referenced by plain name here;
# col("circuitId") or circuits_df.circuitId would work as well and would additionally
# allow .alias("circuit_id") to rename in the same step.
required_columns = ["circuitId", "circuitRef", "name", "location", "country", "lat", "lng", "alt"]
circuits_df_selected = circuits_df.select(*required_columns)
# Tag every row with the file date being ingested (a literal taken from the widget value).
circuits_df_selected = circuits_df_selected.withColumn("file_date", lit(v_file_date))


In [0]:
# Preview the selection: url is gone and file_date has been added
display(circuits_df_selected)

circuitId,circuitRef,name,location,country,lat,lng,alt,file_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2021-03-21
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2021-03-21
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2021-03-21
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2021-03-21
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2021-03-21
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,2021-03-21
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,2021-03-21
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,2021-03-21
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,2021-03-21
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,2021-03-21


In [0]:
# Source column name -> snake_case name used downstream.
# file_date is not listed here and therefore keeps its name.
circuit_column_renames = {
    "circuitId": "circuit_id",
    "circuitRef": "circuit_ref",
    "name": "circuit_name",
    "location": "circuit_location",
    "country": "circuit_country",
    "lat": "circuit_lat",
    "lng": "circuit_lng",
    "alt": "circuit_altitude",
}
# Apply the renames one at a time, starting from the selected DataFrame.
circuits_df_columns_renamed = circuits_df_selected
for source_name, target_name in circuit_column_renames.items():
    circuits_df_columns_renamed = circuits_df_columns_renamed.withColumnRenamed(source_name, target_name)
display(circuits_df_columns_renamed)

circuit_id,circuit_ref,circuit_name,circuit_location,circuit_country,circuit_lat,circuit_lng,circuit_altitude,file_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2021-03-21
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2021-03-21
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2021-03-21
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2021-03-21
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2021-03-21
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,2021-03-21
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,2021-03-21
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,2021-03-21
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,2021-03-21
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,2021-03-21


In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# ingest_current_timestamp is not defined in this notebook — presumably it comes from the
# ../includes/common_code notebook; confirm. The output below shows that it adds an
# ingest_date column to the DataFrame.
# NOTE(review): the ingest_date values differ between the displays in this notebook, so the
# timestamp appears to be evaluated per action rather than fixed at this point — confirm.
circuits_df_add_audit_column = ingest_current_timestamp(circuits_df_columns_renamed)
display(circuits_df_add_audit_column)

circuit_id,circuit_ref,circuit_name,circuit_location,circuit_country,circuit_lat,circuit_lng,circuit_altitude,file_date,ingest_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2021-03-21,2025-04-17T15:58:48.653Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2021-03-21,2025-04-17T15:58:48.653Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2021-03-21,2025-04-17T15:58:48.653Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2021-03-21,2025-04-17T15:58:48.653Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2021-03-21,2025-04-17T15:58:48.653Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,2021-03-21,2025-04-17T15:58:48.653Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,2021-03-21,2025-04-17T15:58:48.653Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,2021-03-21,2025-04-17T15:58:48.653Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,2021-03-21,2025-04-17T15:58:48.653Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,2021-03-21,2025-04-17T15:58:48.653Z


In [0]:
# Naming convention: expose the fully transformed DataFrame under the name circuits_df_final.
# This only binds a second name to the same DataFrame object; no data is copied.
circuits_df_final = circuits_df_add_audit_column
display(circuits_df_final)

circuit_id,circuit_ref,circuit_name,circuit_location,circuit_country,circuit_lat,circuit_lng,circuit_altitude,file_date,ingest_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2021-03-21,2025-04-17T15:58:49.012Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2021-03-21,2025-04-17T15:58:49.012Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2021-03-21,2025-04-17T15:58:49.012Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2021-03-21,2025-04-17T15:58:49.012Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2021-03-21,2025-04-17T15:58:49.012Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,2021-03-21,2025-04-17T15:58:49.012Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,2021-03-21,2025-04-17T15:58:49.012Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,2021-03-21,2025-04-17T15:58:49.012Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,2021-03-21,2025-04-17T15:58:49.012Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,2021-03-21,2025-04-17T15:58:49.012Z


#### Write the final data into processed folder
1. Visit [spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.write.html?highlight=write) for write()

In [0]:
# Save the result as the Parquet-backed table f1_processed.circuits, overwriting any
# existing contents. (The path-based alternative would be
# .write.mode("overwrite").parquet(...) under processed_container_path.)
circuits_df_final.write.saveAsTable("f1_processed.circuits", format="parquet", mode="overwrite")

In [0]:
# Verify the write by reading back the table that saveAsTable just produced.
# Reading by table name instead of a storage path checks exactly what was written,
# regardless of where the f1_processed database stores its files.
df = spark.read.table("f1_processed.circuits")
display(df)

circuit_id,circuit_ref,circuit_name,circuit_location,circuit_country,circuit_lat,circuit_lng,circuit_altitude,file_date,ingest_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,2021-03-21,2025-04-17T15:58:49.979Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,2021-03-21,2025-04-17T15:58:49.979Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,2021-03-21,2025-04-17T15:58:49.979Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,2021-03-21,2025-04-17T15:58:49.979Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,2021-03-21,2025-04-17T15:58:49.979Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,2021-03-21,2025-04-17T15:58:49.979Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,2021-03-21,2025-04-17T15:58:49.979Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,2021-03-21,2025-04-17T15:58:49.979Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153,2021-03-21,2025-04-17T15:58:49.979Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103,2021-03-21,2025-04-17T15:58:49.979Z


In [0]:
# End the notebook run and hand "success..." back as its exit value
dbutils.notebook.exit("success...")