In [1]:
# Date of the raw-file drop to ingest; used as the source folder name and
# stamped onto every row as the file_date column.
file_date = "2021-03-21"

## Requirement

<div style="max-width:1400px;margin:0 auto">
<img src="images/circuit.png" width="600"/>
</div>

## Insert Data

In [2]:
# findspark.init() has to run before pyspark is imported.
import findspark
findspark.init()

import os

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# One local-mode session for the whole notebook, using the Hive catalog
# implementation (presumably so saveAsTable can register tables — confirm).
spark = (
    SparkSession.builder
    .master('local')
    .appName("circuit")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)

# NOTE(review): this changes into the directory the process is already in,
# so it has no visible effect; kept to preserve the original behaviour.
os.chdir(os.getcwd())

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [4]:
# Explicit schema for circuits.csv, one StructField per CSV column.
# Only circuitId is declared non-nullable; every other field relies on
# StructField's default of nullable=True.
circuits_schema = StructType(
    fields=[
        StructField("circuitId", IntegerType(), nullable=False),
        StructField("circuitRef", StringType()),
        StructField("name", StringType()),
        StructField("location", StringType()),
        StructField("country", StringType()),
        StructField("lat", DoubleType()),
        StructField("lng", DoubleType()),
        StructField("alt", IntegerType()),
        StructField("url", StringType()),
    ]
)
# Note: although circuitId is declared with nullable=False, the reader does not
# enforce it when loading the file; null checks have to be applied separately.

In [5]:
# Load the raw circuits CSV for the selected file_date with the explicit schema.
# The path is assembled with os.path.join (os is imported in the session-setup
# cell) instead of hard-coded backslashes: on Windows it yields the same string
# as before, and on other platforms it yields a path that actually resolves.
circuit_df = spark.read.csv(
    os.path.join('raw files', file_date, 'circuits.csv'),
    header=True,   # first line of the file holds the column names
    schema=circuits_schema,
)

### Select only the required columns

In [6]:
from pyspark.sql.functions import col,lit

In [7]:
# Keep every column from the raw frame except url.
circuits_selected_df = circuit_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [8]:
# Preview the first two rows of the column selection.
circuits_selected_df.show(n=2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 2 rows



### Rename the columns and add the file date

In [9]:
# Switch the camelCase / abbreviated source names to snake_case, then stamp
# each row with the file_date being processed.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    # lit() wraps the Python string as a constant column so that the same
    # value is assigned to every row.
    .withColumn('file_date', lit(file_date))
)

In [10]:
# Preview the first two rows after renaming and adding file_date.
circuits_renamed_df.show(n=2)

+----------+-----------+--------------------+------------+---------+--------+---------+--------+----------+
|circuit_id|circuit_ref|                name|    location|  country|latitude|longitude|altitude| file_date|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----------+
|         1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|2021-03-21|
|         2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|2021-03-21|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+----------+
only showing top 2 rows



### Add the ingestion date column

In [11]:
from pyspark.sql.functions import current_timestamp, lit

In [12]:
# withColumn takes the new column's name and the expression that fills it;
# here every row receives the current timestamp as ingestion_date.
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

In [13]:
# circuits_final_df.show(2)

In [14]:
# circuits_final_df.printSchema()

In [15]:
# circuits_final_df = circuits_renamed_df.withColumn("env", lit("prod")) 


In [16]:
# circuits_final_df.show(2)

### Write data as a Delta table

In [17]:
# Persist the finished frame. The original wrote circuits_renamed_df, which
# silently dropped the ingestion_date column added in circuits_final_df.
# overwriteSchema lets the overwrite replace a table that was previously
# written without ingestion_date instead of failing on a schema mismatch.
# NOTE(review): format('delta') needs the Delta Lake package on the Spark
# classpath and the f1_processed database to exist — confirm the environment.
(
    circuits_final_df.write
    .mode('overwrite')
    .format('delta')
    .option('overwriteSchema', 'true')
    .saveAsTable('f1_processed.circuits')
)