## Ingest races.csv file

### Step 1. Read the CSV file using the Spark DataFrame reader

In [1]:
# findspark.init is called before the pyspark imports; presumably this makes the
# Spark installation under /opt/spark importable from this kernel — TODO confirm.
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, current_timestamp, to_timestamp, concat, lit

In [2]:
# Get (or create) a SparkSession with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
# Bare trailing expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for races.csv, declared column by column as
# (name, Spark type, nullable). Only raceId is declared non-nullable.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [4]:
# Read the raw races file using the explicit schema; the first line of the
# file is treated as a header row rather than data.
races_df = spark.read.csv(
    "/user/jupyter/formula1/raw/races.csv",
    schema=races_schema,
    header=True,
)

### Step 2. Select and rename only required columns

In [5]:
# Keep only the columns used downstream (url is dropped) and rename the
# camelCase identifiers to snake_case.
races_selected_df = races_df.select(
    races_df["raceId"].alias("race_id"),
    races_df["year"].alias("race_year"),
    races_df["round"],
    races_df["circuitId"].alias("circuit_id"),
    races_df["name"],
    races_df["date"],
    races_df["time"],
)

### Step 3. Transform date and time to race_timestamp and add ingestion_date

In [6]:
# "<date> <time>" as a single string; the date column is implicitly cast to text.
race_datetime_text = concat(col("date"), lit(" "), col("time"))

# Parse the combined string into a timestamp and stamp each row with the
# time of ingestion.
races_final_df = (
    races_selected_df
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"))
    .withColumn("ingestion_date", current_timestamp())
)

### Step 4. Partition by race_year and save the transformed data to HDFS in Parquet format

In [7]:
# Replace any existing output at the target path, writing Parquet files
# partitioned by race_year.
(
    races_final_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet("/user/jupyter/formula1/processed/races")
)

                                                                                

In [8]:
# Stop the SparkSession once the notebook's work is finished.
spark.stop()