## Ingest drivers.json file

### Step 1. Read the JSON file using the Spark DataFrame reader

In [18]:
import findspark
# Point findspark at the Spark installation under /opt/spark.
# NOTE(review): presumably this has to run before the pyspark imports below
# so they resolve against that installation — confirm for this environment.
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col, current_timestamp, concat, lit

In [7]:
# Obtain the notebook's SparkSession (reusing an existing one if present),
# with Hive support switched on.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
# Schema of the nested "name" object: two nullable string fields.
name_schema = StructType([
    StructField("forename", StringType(), nullable=True),
    StructField("surname", StringType(), nullable=True),
])

In [12]:
# Explicit schema for drivers.json. Only driverId is declared non-nullable;
# "name" is a nested struct described by name_schema.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), nullable=False),
    StructField("driverRef", StringType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("code", StringType(), nullable=True),
    StructField("name", name_schema, nullable=True),
    StructField("dob", DateType(), nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

In [13]:
# Load the raw drivers file, applying drivers_schema rather than letting
# the reader infer column types.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json("/user/jupyter/formula1/raw/drivers.json")
)

### Step 2. Rename columns and add new columns

In [25]:
# Derive the transformed frame from drivers_df:
#   * rename the camelCase id/ref columns to snake_case,
#   * add an ingestion_date column holding the current timestamp,
#   * replace the nested "name" struct with "<forename> <surname>".
drivers_with_new_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)

### Step 3. Drop unwanted columns

In [27]:
# Drop the "url" column from the transformed frame.
# The input must be drivers_with_new_columns_df (plural "drivers"), the frame
# built in Step 2; the previous spelling "driver_with_new_columns_df" is not
# defined anywhere and raises NameError on a fresh kernel.
drivers_final_df = drivers_with_new_columns_df.drop(col("url"))

### Step 4. Save the transformed data to HDFS as Parquet

In [30]:
# Write the final frame as parquet to the processed area, replacing any
# existing output at that path ("overwrite" mode).
(
    drivers_final_df.write
    .mode("overwrite")
    .parquet("/user/jupyter/formula1/processed/drivers")
)

                                                                                

In [33]:
# Shut down the SparkSession now that the write has finished; any later cell
# that uses `spark` needs the session to be created again first.
spark.stop()