## Ingest qualifying multi-line JSON files

### Step 1. Read multiple multi-line JSON files using the DataFrame reader

In [1]:
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp

In [2]:
# Reuse the active SparkSession if there is one, otherwise create it,
# with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for the qualifying records, built field by field.
# Each .add() takes (column name, Spark type, nullable); qualifyId is the
# only column declared non-nullable.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [4]:
# Load every file matching the qualifying_split_*.json glob into one DataFrame.
# multiLine=True lets a single JSON record span several lines of a file;
# the explicit schema avoids schema inference.
qualifying_df = spark.read.json(
    "/user/jupyter/formula1/raw/qualifying/qualifying_split_*.json",
    schema=qualifying_schema,
    multiLine=True,
)

### Step 2. Rename columns to snake_case and add the ingestion_date column


In [5]:
# Normalise the camelCase id columns to snake_case and stamp each row with
# the time of ingestion.
# qualifyId is renamed as well: previously it was the only *Id column left
# in camelCase, which made the processed schema inconsistent.
final_qualifying_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [6]:
# Persist the result as Parquet; "overwrite" mode replaces any data already
# present at the target path.
(
    final_qualifying_df.write
    .mode("overwrite")
    .parquet("/user/jupyter/formula1/processed/qualifying")
)

                                                                                

In [7]:
# Stop the SparkSession created at the top of the notebook now that the write is done.
spark.stop()