## Ingest lap_times CSV files

### Step1. Read multiple CSV files using the Spark DataFrame reader

In [5]:
import findspark
# Point findspark at the Spark installation under /opt/spark.
# NOTE(review): presumably this has to run before the pyspark imports below
# so that the pyspark package can be resolved — confirm for this environment.
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
# Type classes used to declare the explicit lap_times schema.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Used to stamp each row with an ingestion_date column.
from pyspark.sql.functions import current_timestamp

In [6]:
# Reuse the active SparkSession if one exists, otherwise create it,
# with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

In [7]:
# Explicit schema applied when reading the lap_times CSV files.
# Only raceId is declared non-nullable; every other column is nullable.
lap_times_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

In [8]:
# Load every file matching the lap_times_split_*.csv glob into a single
# DataFrame, applying the explicit schema defined for lap_times.
lap_times_df = spark.read.csv(
    "/user/jupyter/formula1/raw/lap_times/lap_times_split_*.csv",
    schema=lap_times_schema,
)

### Step2. Rename columns and add new columns


In [10]:
# Build the output DataFrame: rename the camelCase id columns to snake_case
# and stamp each row with the time of ingestion.
final_df = (
    lap_times_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [12]:
# Persist the result as Parquet, replacing any existing output at this path.
final_df.write.parquet(
    "/user/jupyter/formula1/processed/lap_times",
    mode="overwrite",
)

                                                                                

In [14]:
# Stop the Spark session at the end of the notebook.
spark.stop()