# Ingest race data

In [0]:
%run ../SetUp

In [0]:
from pyspark.sql.functions import current_timestamp

# Build the location of the raw races CSV. `session_helper` is not defined in
# this notebook; presumably it comes from the ../SetUp notebook run above.
csv_file_path = session_helper.get_storage_account_url(folder="raw", file="races.csv")

# Read the CSV (first row is the header, column types are inferred), rename
# the id/year columns to snake_case, stamp each row with the ingestion time
# and discard the `url` column.
races_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(csv_file_path)
    # All renames go through one call. Source names are spelled in the same
    # lowerCamelCase as `raceId`, so the rename does not depend on Spark's
    # case-insensitive column matching (a rename of a column that is not
    # found is silently skipped).
    # NOTE(review): confirm the header in races.csv really is `circuitId`.
    .withColumnsRenamed(
        {
            "raceId": "race_id",
            "year": "race_year",
            "circuitId": "circuit_id",
        }
    )
    .withColumn("ingestion_date", current_timestamp())
    .drop("url")
)

#races_df.display()

In [0]:
import pyspark.sql.functions as F

# Create the 'race_timestamp' column by combining the calendar parts of
# `date` with the colon-separated parts of `time`, then drop the two source
# columns and the temporary `time_split` helper column.
transformed_races_df = (
    races_df
    # `time` is split on ":" into [hours, minutes, seconds].
    .withColumn("time_split", F.split(F.col("time"), ":"))
    .withColumn(
        "race_timestamp",
        F.make_timestamp(
            # year/month/dayofmonth return integers directly, avoiding the
            # format-to-string-then-cast round trip through date_format.
            F.year(F.col("date")),
            F.month(F.col("date")),
            F.dayofmonth(F.col("date")),
            F.col("time_split").getItem(0),
            F.col("time_split").getItem(1),
            F.col("time_split").getItem(2),
        ),
    )
    .drop("time_split", "date", "time")
)

#transformed_races_df.display()

In [0]:
# Replace the contents of dev.races_bronze in a single Delta overwrite.
# A separate DROP TABLE before the write would leave a window in which the
# table does not exist, and would leave it dropped if the write then failed.
# `overwriteSchema` lets the overwrite also replace the table schema, which
# is what dropping and recreating the table achieved.
(
    transformed_races_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("dev.races_bronze")
)