## INCREMENTAL LOAD

In [21]:
# Load date of the raw-file folder to ingest; it is interpolated into the
# input path when the pit-stops JSON is read.
file_date = "2021-04-18"

<div style="max-width:1400px;margin: 0 auto">
<img src="images/pitstop.png" alt="Pit stop" width="600"/>
</div>

In [22]:
import findspark
findspark.init()
import pyspark
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configure a SparkSession builder with the Delta Lake SQL extension and
# the Delta catalog registered as the default `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("dbcreation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through Delta's pip helper, then start the session
# (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [23]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [24]:
# Explicit schema for pit_stops.json, declared as (column, type, nullable)
# rows in the order they appear in the DataFrame. Only raceId is declared
# non-nullable.
pit_stops_schema = StructType(fields=[
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("raceId", IntegerType(), False),
        ("driverId", IntegerType(), True),
        ("stop", StringType(), True),
        ("lap", IntegerType(), True),
        ("time", StringType(), True),
        ("duration", StringType(), True),
        ("milliseconds", IntegerType(), True),
    )
])


In [25]:
# Read the pit-stops JSON for the current load date. The file is parsed in
# multi-line mode and the explicit schema is applied instead of inferring one.
pit_stops_df = (
    spark.read
    .schema(pit_stops_schema)
    .option("multiLine", True)
    .json(f'raw files\\{file_date}\\pit_stops.json')
)

# Preview the first two rows as a quick sanity check.
pit_stops_df.show(2)

+------+--------+----+---+--------+--------+------------+
|raceId|driverId|stop|lap|    time|duration|milliseconds|
+------+--------+----+---+--------+--------+------------+
|  1053|     839|   1|  1|15:05:16|  30.866|       30866|
|  1053|      20|   1|  3|15:10:09|  32.024|       32024|
+------+--------+----+---+--------+--------+------------+
only showing top 2 rows



In [26]:
from pyspark.sql.functions import current_timestamp,count,desc

In [27]:
# Rename the camelCase id columns to snake_case and stamp every row with
# the time it was ingested.
final_df = (
    pit_stops_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

In [28]:
# Execute the companion notebook in this kernel so the names it defines are
# available here. NOTE(review): presumably this is where `merge_delta_data`
# (called in the next cell) comes from — confirm.
%run "common_functions.ipynb"

In [29]:
# Predicate used to match incoming rows (src) to existing table rows (tgt):
# same race, same driver, same stop. Each column is compared exactly once;
# listing race_id a second time would not change which rows match.
merge_condition = (
    "tgt.race_id = src.race_id "
    "AND tgt.driver_id = src.driver_id "
    "AND tgt.stop = src.stop"
)

# Merge this load's rows into default.pit_stops via the shared helper.
# NOTE(review): argument meanings are inferred from the values (database,
# table, warehouse location, merge predicate, partition column) — verify
# against the helper's definition.
# NOTE(review): the location is an absolute, machine-specific path with a
# URL-encoded space ("%20"); confirm this is what the helper expects before
# running on another machine.
merge_delta_data(
    final_df,
    'default',
    'pit_stops',
    'E:/unused/Udemy/Spark_practice/raw/Delta%20lake/spark-warehouse',
    merge_condition,
    'race_id',
)

In [30]:
# Read the merged table back and show the row count per race, newest
# race_id first, to check what the load wrote.
df = spark.sql('SELECT * FROM default.pit_stops')

print('df_file1_count for each raceId : ')
(
    df
    .groupBy('race_id')
    .agg(count('*'))
    .orderBy(desc('race_id'))
    .show()
)

df_file1_count for each raceId : 
+-------+--------+
|race_id|count(1)|
+-------+--------+
|   1053|      56|
|   1052|      40|
|   1047|      23|
|   1046|      39|
|   1045|      57|
|   1044|      38|
|   1043|      30|
|   1042|      25|
|   1041|      33|
|   1040|      24|
|   1039|      66|
|   1038|      37|
|   1037|      20|
|   1036|      35|
|   1035|      41|
|   1034|      22|
|   1033|      45|
|   1032|      21|
|   1031|      38|
|   1030|      25|
+-------+--------+
only showing top 20 rows

