# Ingest Races Data

In [0]:
# Declare a text widget named "env" (default "dev", label "Environment") and
# read its current value into `env`, which later cells use to gate previews.
dbutils.widgets.text(name="env", defaultValue="dev", label="Environment")
env = dbutils.widgets.get("env")

In [0]:
%run ../config $env=$env

In [0]:
%run ../utils

In [0]:
# Load the raw races CSV: the first row is treated as the header and column
# types are inferred by Spark.
# NOTE(review): `raw_data_folder_path` is not defined in this notebook —
# presumably it comes from ../config and ends with a path separator; confirm.
csv_file_path = raw_data_folder_path + "races.csv"

races_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df_races = races_reader.load(csv_file_path)

# Preview the raw frame only when running in the dev environment.
if env == "dev":
    df_races.display()

In [0]:
import pyspark.sql.functions as F

DATE_FORMAT_SPECIFIER = "yyyy-MM-dd"
# Placeholder string that is converted to NULL in the optional date/time columns.
NULL_PLACEHOLDER = r"\N"


def _null_if_placeholder(col_name):
    """Return a Column equal to `col_name`, with NULL_PLACEHOLDER values replaced by NULL."""
    return F.when(F.col(col_name) == F.lit(NULL_PLACEHOLDER), F.lit(None)).otherwise(
        F.col(col_name)
    )


date_cols = ["quali_date", "sprint_date"] + [f"fp{i}_date" for i in range(1, 4)]
time_cols = ["time", "quali_time", "sprint_time"] + [f"fp{i}_time" for i in range(1, 4)]

# Build every replacement expression up front so they can be applied in a single
# projection instead of one `withColumn` call per column.
# Time columns: placeholder -> NULL. Date columns: placeholder -> NULL, then parse to a date.
cleaned_cols = {name: _null_if_placeholder(name) for name in time_cols}
cleaned_cols.update(
    {
        name: F.to_date(_null_if_placeholder(name), DATE_FORMAT_SPECIFIER)
        for name in date_cols
    }
)

df_races_formatted = (
    df_races
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("name", "race_name")
    # Spelled in the same camelCase as "raceId" so the rename does not depend on
    # case-insensitive column resolution (withColumnRenamed is a silent no-op when
    # the name does not resolve).
    # NOTE(review): assumes the CSV header is `circuitId` — confirm against races.csv.
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .drop("url")
    .withColumns(cleaned_cols)
)

# Preview the formatted frame only when running in the dev environment.
if env == "dev":
    df_races_formatted.display()

In [0]:
# Run the `fillna_str` helper over the formatted frame with the r"\N" placeholder
# and rebind `df_races_formatted` to its result.
# NOTE(review): `fillna_str` is not defined in this notebook (presumably provided by
# ../utils). The previous cell converts r"\N" to NULL in the date/time columns, so
# confirm this call does not put the placeholder back into the `time` column that
# the next cell splits into hour/minute/second.
races_filled = fillna_str(df_races_formatted, r"\N")
df_races_formatted = races_filled

# Preview the result only when running in the dev environment.
if env == "dev":
    df_races_formatted.display()

In [0]:
# Derive `race_timestamp` from the `date` column and the colon-separated `time` column.
# The date components are taken with the integer extractors (year/month/dayofmonth)
# rather than formatted to strings, and the time components are cast explicitly to
# the numeric types `make_timestamp` takes, so nothing relies on implicit
# string-to-number coercion.
# NOTE(review): assumes `time` holds "HH:mm:ss"-style strings — confirm against the data.
time_parts = F.split(F.col("time"), ":")

df_races_transformed = df_races_formatted.withColumn(
    "race_timestamp",
    F.make_timestamp(
        F.year(F.col("date")),
        F.month(F.col("date")),
        F.dayofmonth(F.col("date")),
        time_parts.getItem(0).cast("int"),
        time_parts.getItem(1).cast("int"),
        time_parts.getItem(2).cast("decimal(16,6)"),
    ),
)

# Preview the transformed frame only when running in the dev environment.
if env == "dev":
    df_races_transformed.display()

In [0]:
# Save the transformed races frame as the Delta table `bronze_tbl_races`,
# using overwrite mode.
races_writer = df_races_transformed.write.format("delta").mode("overwrite")
races_writer.saveAsTable("bronze_tbl_races")