## Ingest circuits.csv file

### Step 1. Read the CSV file using the Spark DataFrameReader

In [1]:
import findspark
findspark.init('/opt/spark')
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, current_timestamp

In [2]:
# Obtain the notebook's Spark session (an existing one is reused if present),
# with Hive support switched on.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
# Bare trailing expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for the circuits CSV: one .add(name, type, nullable) call
# per column, in file order.
# NOTE(review): "location" is declared non-nullable while every other
# non-key column is nullable — confirm this is intentional.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), nullable=False)
    .add("circuitRef", StringType(), nullable=True)
    .add("name", StringType(), nullable=True)
    .add("location", StringType(), nullable=False)
    .add("country", StringType(), nullable=True)
    .add("lat", DoubleType(), nullable=True)
    .add("lng", DoubleType(), nullable=True)
    .add("alt", IntegerType(), nullable=True)
    .add("url", StringType(), nullable=True)
)

In [4]:
# Read the raw circuits CSV: the first line is treated as a header and the
# column types come from circuits_schema.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv('/user/jupyter/formula1/raw/circuits.csv')
)

### Step 2. Select only required columns

In [5]:
# Keep every column from the schema except "url".
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

### Step 3. Rename columns

In [6]:
# Switch the camelCase identifiers to snake_case and spell out the
# abbreviated coordinate/altitude column names.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

### Step 4. Add ingestion date to the DataFrame

In [7]:
# Append an "ingestion_date" column populated by current_timestamp().
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

### Step 5. Save the transformed data to HDFS as Parquet

In [8]:
# Persist the final DataFrame as Parquet; "overwrite" mode replaces any
# output already present at the target path.
(
    circuits_final_df.write
    .mode("overwrite")
    .parquet("/user/jupyter/formula1/processed/circuits")
)

                                                                                

In [9]:
# Stop the Spark session at the end of the notebook.
spark.stop()