### Step 1 - Read CSV file using spark dataframe reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema for races.csv, listed as (column name, Spark type, nullable).
# Only raceId is declared non-nullable; "date" and "time" are kept as plain strings.
races_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            ("raceId", IntegerType(), False),
            ("year", IntegerType(), True),
            ("round", IntegerType(), True),
            ("circuitId", IntegerType(), True),
            ("name", StringType(), True),
            ("date", StringType(), True),
            ("time", StringType(), True),
            ("url", StringType(), True),
        )
    ]
)

In [0]:
# Read races.csv from the raw container. The first row is treated as a header
# and the explicit races_schema is applied to the columns.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv("abfss://raw@formula1dl1216.dfs.core.windows.net/races.csv")
)

# Preview the races DataFrame that was just loaded.
display(races_df)

### Step 2 - Select only the required columns and add ingestion_date and race_timestamp to the DataFrame

### Step 3 - Rename the columns in the same select

In [0]:
from pyspark.sql.functions import col, current_timestamp, to_timestamp, concat, lit

In [0]:
# Stamp every row with the current timestamp, then build race_timestamp by
# joining the "date" and "time" strings with a single space and parsing the
# result with the 'yyyy-MM-dd HH:mm:ss' pattern.
races_with_timestamp_df = (
    races_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col("date"), lit(" "), col("time")),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )
)


In [0]:
# Keep only the columns needed downstream and rename the camelCase source
# columns to snake_case in the same projection. The "date", "time" and "url"
# columns are not selected and are therefore dropped.
races_final_df = races_with_timestamp_df.select(
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("ingestion_date"),
    col("race_timestamp"),
)

### Step 4 - Write output to processed container in parquet format

In [0]:
races_final_df.write.mode("overwrite").partitionBy('race_year').parquet("abfss://processed@formula1dl1216.dfs.core.windows.net/")

In [0]:
%fs ls abfss://raw@formula1dl1216.dfs.core.windows.net/races

In [0]:
df = spark.read.parquet("abfss://raw@formula1dl1216.dfs.core.windows.net/races")
display(df) 