In [0]:
%run "../includes/common_code"

### Ingest circuits.csv
1. Read the circuits.csv file from the raw container using the DataFrame reader API
2. Select only the required columns, or drop the unwanted columns
3. Rename a few columns to more suitable names
4. Add an audit column that records the ingestion timestamp
5. Write the final transformed data to the processed container

In [0]:
# List all current mount points as a quick sanity check before the containers are used.
display(dbutils.fs.mounts())

In [0]:
# List all the files in the raw container.
# NOTE(review): raw_container_path is not defined in this notebook; presumably it is
# provided by the %run of ../includes/common_code at the top — confirm there.
display(dbutils.fs.ls(raw_container_path))

Visit the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.csv.html) for `DataFrameReader.csv()`, the reader method used below.

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

#### Defining the schema manually
1. Visit the [Spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html) for `StructType`

In [0]:
# Explicit schema for circuits.csv, given as (column name, Spark type) pairs in the
# order the fields are declared. Every field is marked nullable.
circuits_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("circuitId", IntegerType()),
        ("circuitRef", StringType()),
        ("name", StringType()),
        ("location", StringType()),
        ("country", StringType()),
        ("lat", DoubleType()),
        ("lng", DoubleType()),
        ("alt", IntegerType()),
        ("url", StringType()),
    )
])

In [0]:
# Load circuits.csv from the raw container. The file's first line is treated as a
# header, and columns are typed by circuits_schema rather than inferred.
circuits_df = spark.read.csv(
    f"{raw_container_path}/circuits.csv",
    schema=circuits_schema,
    header=True,
)

In [0]:
# Print the schema of the loaded DataFrame so it can be checked against circuits_schema.
circuits_df.printSchema()

In [0]:
# Preview the circuits data exactly as it was read, before any transformation.
display(circuits_df)

#### Further processing 1
1. From the above dataframe we don't need URL, so either select only the required columns or drop the unwanted column url
2. Rename the following columns
  - circuitId ---> circuit_id
  - circuitRef ---> circuit_ref
  - name ---> circuit_name
  - location ---> circuit_location
  - country ---> circuit_country
  - lat ---> circuit_lat
  - lng ---> circuit_lng
  - alt ---> circuit_altitude
3. Add the audit column ingestion_date

In [0]:
from pyspark.sql.functions import col

In [0]:
# Keep every column except url.
# Columns are referenced by plain name here; col("circuitId") or circuits_df.circuitId
# expressions are equivalent alternatives and additionally allow .alias(...) inline.
circuits_df_selected = circuits_df.select(
    [
        "circuitId",
        "circuitRef",
        "name",
        "location",
        "country",
        "lat",
        "lng",
        "alt",
    ]
)


In [0]:
# Preview the DataFrame after the column selection (url is no longer present).
display(circuits_df_selected)

In [0]:
# Rename the camelCase / abbreviated source columns to snake_case names with a
# circuit_ prefix. Each pair is (source column, output column); the output keeps
# the same column order as circuits_df_selected.
circuits_df_columns_renamed = circuits_df_selected.select(
    [
        col(source_name).alias(target_name)
        for source_name, target_name in (
            ("circuitId", "circuit_id"),
            ("circuitRef", "circuit_ref"),
            ("name", "circuit_name"),
            ("location", "circuit_location"),
            ("country", "circuit_country"),
            ("lat", "circuit_lat"),
            ("lng", "circuit_lng"),
            ("alt", "circuit_altitude"),
        )
    ]
)
display(circuits_df_columns_renamed)

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Add the audit column by passing the renamed DataFrame through ingest_current_timestamp.
# NOTE(review): the helper is not defined in this notebook; presumably it comes from
# ../includes/common_code and appends the ingestion_date timestamp column — confirm there.
circuits_df_add_audit_column = ingest_current_timestamp(circuits_df_columns_renamed)
display(circuits_df_add_audit_column)

In [0]:
# Naming convention: expose the fully transformed DataFrame as circuits_df_final.
# This assignment only binds a second name to the same DataFrame object; no data is copied.
circuits_df_final = circuits_df_add_audit_column
display(circuits_df_final)

#### Write the final data into processed folder
1. Visit [spark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.write.html?highlight=write) for write()

In [0]:
# Persist the final DataFrame as Parquet under the processed container.
# "overwrite" replaces whatever a previous run left at the same location.
circuits_df_final.write.parquet(
    f"{processed_container_path}/circuits",
    mode="overwrite",
)

In [0]:
# Read the Parquet output back from the processed container and display it,
# as a visual check that the write above produced readable data.
df = spark.read.parquet(f"{processed_container_path}/circuits")
display(df)

In [0]:
# End the notebook run, handing the string "success..." back as its exit value.
# NOTE(review): a calling notebook may compare against this exact string — keep it
# unchanged unless the callers are updated too.
dbutils.notebook.exit("success...")