## Ingest constructors.json file

### Step 1. Read the JSON file using the Spark `DataFrameReader`

In [1]:
# findspark.init is given the Spark installation directory and is called
# before the pyspark imports below; keep this ordering when editing the cell.
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, current_timestamp

In [2]:
# Obtain the SparkSession for this notebook (an existing session is reused
# if one is already active), with Hive support switched on.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for constructors.json: an integer id plus four string
# attributes. Only the id is declared non-nullable.
constructors_schema = StructType([
    StructField("constructorId", IntegerType(), nullable=False),
    StructField("constructorRef", StringType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

In [4]:
# Load the raw constructors file, applying the explicit schema defined above
# in this notebook rather than letting Spark infer one.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .json("/user/jupyter/formula1/raw/constructors.json")
)

### Step 2. Drop unwanted columns

In [6]:
# The url column is not carried into the processed output; every other
# column is kept as read.
constructors_selected_df = constructors_df.drop(col("url"))

### Step 3. Rename columns and add ingestion date

In [12]:
# Switch the camelCase source column names to snake_case and stamp each row
# with the timestamp of this ingestion run.
constructors_final_df = (
    constructors_selected_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("ingestion_date", current_timestamp())
)

### Step 4. Write the transformed data to HDFS in Parquet format

In [14]:
# Persist the final frame as Parquet. "overwrite" mode replaces whatever is
# already stored at the target path, so re-running this cell is safe.
(
    constructors_final_df.write
    .mode("overwrite")
    .parquet("/user/jupyter/formula1/processed/constructors")
)

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

In [16]:
# Shut down the SparkSession now that the write has finished; any cell run
# after this one needs the session to be created again first.
spark.stop()