## Requirement

<div style="max-width:1400px;margin: 0 auto">
<img src="images/circuit.png" alt="Circuits ingestion requirement" width="600"/>
</div>

## Insert Data

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# conf = SparkConf().setAppName('DB_creation').setMaster('local')
# sc = SparkContext(conf = conf)
# spark = SparkSession(sc)
# Build (or reuse, via getOrCreate) a SparkSession that runs on a local master,
# is named "circuit", and uses the Hive catalog implementation for table metadata.
spark = (
    SparkSession.builder
    .master("local")
    .appName("circuit")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [3]:
# Explicit schema for circuits.csv: one StructField per column.
# StructField(name, dataType, nullable=True) -- only circuitId is declared non-nullable.
circuits_schema = StructType([
    StructField("circuitId", IntegerType(), nullable=False),
    StructField("circuitRef", StringType()),
    StructField("name", StringType()),
    StructField("location", StringType()),
    StructField("country", StringType()),
    StructField("lat", DoubleType()),
    StructField("lng", DoubleType()),
    StructField("alt", IntegerType()),
    StructField("url", StringType()),
])
# Note: although circuitId is declared with nullable=False, the flag is not applied
# when the CSV is read (printSchema() later reports nullable = true).
# Enforcing non-null values has to be done with other functions.

In [4]:
# Read the circuits CSV with the explicit schema held in circuits_schema.
# - header: defaults to False; unless it is set to True the first row is not
#   recognised as the header.
# - inferSchema: lets Spark work out the column data types by itself, but it has to
#   read through all the data to do so, which is not efficient for massive inputs.
# Alternative spellings of the same read:
#   circuit_df = spark.read.csv("circuits.csv", header=True, inferSchema=True)
#   circuit_df = spark.read.option("header", True).option("inferSchema", True).csv("circuits.csv")
#   circuit_df = spark.read.csv("circuits.csv", header=True, schema=circuits_schema)
# NOTE(review): the path below is an absolute local path; consider a configurable data directory.
circuit_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv(r"E:\unused\Udemy\Spark_practice\raw\raw_files\circuits.csv")
)

### Datatypes in Spark
**StructType**: a `StructType` consists of many `StructField`s and acts as a wrapper around all of them.
It basically represents a row: the `StructType` describes the row, while each `StructField` describes one field (an individual column) of that row.

In [5]:
# Check what kind of object the CSV read produced (a PySpark DataFrame).
type(circuit_df)

pyspark.sql.dataframe.DataFrame

In [6]:
# display(circuit_df)
# Print the leading rows as a text table; long string values are truncated with "...".
circuit_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [7]:
# Print the schema (column names, data types, nullability) of the DataFrame.
# Note: circuitId is reported as nullable = true even though circuits_schema
# declares it with nullable=False.
circuit_df.printSchema()

root
 |-- circuitId: integer (nullable = true)
 |-- circuitRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- url: string (nullable = true)



In [8]:
# Compute summary statistics (count, mean, stddev, min, ...) per column and print them.
# mean and stddev come back as null for the string columns.
circuit_df.describe().show()

+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|summary|         circuitId|circuitRef|   name| location|  country|               lat|              lng|              alt|                 url|
+-------+------------------+----------+-------+---------+---------+------------------+-----------------+-----------------+--------------------+
|  count|                77|        77|     77|       77|       77|                77|               77|               77|                  77|
|   mean|              39.0|      null|   null|     null|     null| 33.72035103896102|3.551302597402597|247.4935064935065|                null|
| stddev|22.371857321197094|      null|   null|     null|     null|22.885969000074535| 64.8766790440326|363.2672505910991|                null|
|    min|                 1|       BAK|A1-Ring|Abu Dhabi|Argentina|          -37.8497|         -118.189|               -7|http://en.wiki

In [9]:
# Without parentheses this only references the method: the cell displays the
# bound method object and no statistics are computed.
circuit_df.describe

<bound method DataFrame.describe of DataFrame[circuitId: int, circuitRef: string, name: string, location: string, country: string, lat: double, lng: double, alt: int, url: string]>

In [10]:
# Calling describe() returns a new DataFrame (a "summary" column plus every original
# column typed as string); its rows are only printed once show() is called on it.
circuit_df.describe()

DataFrame[summary: string, circuitId: string, circuitRef: string, name: string, location: string, country: string, lat: string, lng: string, alt: string, url: string]

### Select only the required columns

In [11]:
# Option 1: select columns by plain string names (url is dropped).
# Plain strings cannot have column-based functions such as alias() applied to them;
# the Column-object forms used in the following cells can.
circuits_selected_df = circuit_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [12]:
# Preview the first two rows of the string-name selection.
circuits_selected_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 2 rows



In [13]:
# Option 2: select columns through attribute access on the DataFrame.
# Each argument is a Column object, so column functions apply -- here location
# is renamed to "new_loc" with alias().
circuits_selected_df = circuit_df.select(
    circuit_df.circuitId,
    circuit_df.circuitRef,
    circuit_df.name,
    circuit_df.location.alias("new_loc"),
    circuit_df.country,
    circuit_df.lat,
    circuit_df.lng,
    circuit_df.alt,
)

In [14]:
# Preview two rows; the location column now appears under its alias, new_loc.
circuits_selected_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|     new_loc|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 2 rows



In [15]:
# Option 3: select columns through bracket indexing on the DataFrame
# (also yields Column objects, like attribute access).
circuits_selected_df = circuit_df.select(
    circuit_df["circuitId"],
    circuit_df["circuitRef"],
    circuit_df["name"],
    circuit_df["location"],
    circuit_df["country"],
    circuit_df["lat"],
    circuit_df["lng"],
    circuit_df["alt"],
)

In [16]:
# Preview the first two rows of the bracket-indexed selection.
circuits_selected_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 2 rows



In [17]:
from pyspark.sql.functions import col

In [18]:
# Option 4: select columns with the col() function from pyspark.sql.functions.
# This is the selection that the renaming step below works from.
circuits_selected_df = circuit_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

In [19]:
# Preview the first two rows of the col()-based selection.
circuits_selected_df.show(2)

+---------+-----------+--------------------+------------+---------+--------+-------+---+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|
+---------+-----------+--------------------+------------+---------+--------+-------+---+
only showing top 2 rows



### Rename The columns

In [20]:
# Rename the camelCase / abbreviated source columns to descriptive snake_case names.
# withColumnRenamed(existing_name, new_name) returns a new DataFrame each time.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [21]:
# Preview two rows to confirm the new column names.
circuits_renamed_df.show(2)

+----------+-----------+--------------------+------------+---------+--------+---------+--------+
|circuit_id|circuit_ref|                name|    location|  country|latitude|longitude|altitude|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+
|         1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|
|         2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|
+----------+-----------+--------------------+------------+---------+--------+---------+--------+
only showing top 2 rows



### Add the column

In [22]:
from pyspark.sql.functions import current_timestamp, lit

In [23]:
# withColumn takes the new column's name and a Column expression for its value;
# here an "ingestion_date" column is added, populated by current_timestamp().
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

In [24]:
# circuits_final_df.show(2)

In [25]:
# circuits_final_df.printSchema()

In [26]:
# circuits_final_df = circuits_renamed_df.withColumn("env", lit("prod")) 


In [27]:
# circuits_final_df.show(2)

### write data as parquet

In [28]:
# Persist the processed circuits data as a Parquet-backed table.
# Fix: write circuits_final_df (which carries the ingestion_date column) rather than
# circuits_renamed_df; otherwise the column added in the previous step is never saved.
# The target database is created first (idempotently) so the write does not fail
# when the notebook is run against a fresh metastore.
spark.sql("CREATE DATABASE IF NOT EXISTS f1_processed")
(
    circuits_final_df.write
    .mode("overwrite")  # replace any table left over from a previous run
    .format("parquet")
    .saveAsTable("f1_processed.circuits")
)