## Ingest pit_stops.json file

### Step 1. Read the multi-line JSON file using the Spark DataFrameReader

In [1]:
# findspark.init is given the Spark installation directory and is called
# before the pyspark imports below — presumably so that pyspark becomes
# importable in this kernel; keep this ordering.
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp

In [2]:
# Obtain the notebook's SparkSession (an existing one is reused if present),
# with Hive support enabled on the builder.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Explicit schema applied when reading pit_stops.json.
# Each entry is (column name, Spark type, nullable).
# NOTE(review): if the source records also carry a `duration` field, it is not
# listed here and would therefore be absent from the DataFrame — confirm that
# omitting it is intended.
pit_stops_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("raceId", IntegerType(), False),
        ("driverId", IntegerType(), True),
        ("stop", StringType(), True),
        ("lap", IntegerType(), True),
        ("time", StringType(), True),
        ("milliseconds", IntegerType(), True),
    )
])

In [7]:
# Load the raw pit stop records with the explicit schema. The multiLine option
# is enabled because the file is a multi-line JSON document rather than one
# record per line.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiLine", True)
    .json("/user/jupyter/formula1/raw/pit_stops.json")
)

### Step 2. Rename columns and add the ingestion date column

In [9]:
# Rename the camelCase source columns to snake_case and stamp every row with
# the time of ingestion.
# The names passed to withColumnRenamed are spelled exactly as declared in
# pit_stops_schema ("raceId", "driverId"): withColumnRenamed silently does
# nothing when the name does not resolve, so a differently-cased spelling would
# only work while spark.sql.caseSensitive is left at its default (false).
final_df = pit_stops_df \
    .withColumnRenamed("driverId", "driver_id") \
    .withColumnRenamed("raceId", "race_id") \
    .withColumn("ingestion_date", current_timestamp())

### Step 3. Save the transformed data to HDFS as Parquet

In [10]:
# Persist the transformed pit stops as Parquet; "overwrite" mode replaces any
# output already present at the target path, so re-running the cell is safe.
(
    final_df.write
    .mode("overwrite")
    .parquet("/user/jupyter/formula1/processed/pit_stops")
)

                                                                                

In [12]:
# Shut down the Spark session now that the ingestion has been written out.
spark.stop()