## INCREMENTAL LOAD

In [1]:
# Date of the raw-file drop processed by this run; it is interpolated into
# the input path and stamped onto every ingested row as `file_date`.
file_date = "2021-03-21"

<div style="max-width:1400px;margin: 0 auto">
<img src="images/laptime.png" width="600"/>
</div>

In [2]:
import findspark
findspark.init()
import pyspark
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Configure a SparkSession builder for Delta Lake: register the Delta SQL
# extension and use DeltaCatalog as the implementation of `spark_catalog`.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("dbcreation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through the Delta helper, then create the session
# (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType
from pyspark.sql.functions import col, current_timestamp,desc, lit, count

In [4]:
# Explicit schema for the lap_times CSV files, listed as
# (column name, Spark type, nullable). Only raceId is declared non-nullable.
lap_time_schema = StructType(
    fields=[
        StructField(name, dtype, nullable)
        for name, dtype, nullable in (
            ("raceId", IntegerType(), False),
            ("driverId", IntegerType(), True),
            ("lap", IntegerType(), True),
            ("position", IntegerType(), True),
            ("time", StringType(), True),
            ("milliseconds", IntegerType(), True),
        )
    ]
)

In [5]:
# Read the lap_times folder for this run's file_date using the explicit
# schema. Forward slashes are used as separators so the relative path is
# not tied to Windows; they are accepted on Windows as well.
lap_time_df = spark.read.csv(
    f"raw files/{file_date}/lap_times",
    schema=lap_time_schema,
)
# Preview the first two rows as a sanity check on the schema.
lap_time_df.show(2)

+------+--------+---+--------+--------+------------+
|raceId|driverId|lap|position|    time|milliseconds|
+------+--------+---+--------+--------+------------+
|   841|      20|  1|       1|1:38.109|       98109|
|   841|      20|  2|       1|1:33.006|       93006|
+------+--------+---+--------+--------+------------+
only showing top 2 rows



In [6]:
# Total number of rows read from the lap_times files for this file_date.
lap_time_df.count()

490904

In [7]:
# Rename the camelCase id columns to snake_case, then add two audit columns:
# the load timestamp and the file_date this batch came from.
lap_time_fnl_df = (
    lap_time_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("file_date", lit(file_date))
)
# Preview the transformed rows.
lap_time_fnl_df.show(2)

+-------+---------+---+--------+--------+------------+--------------------+----------+
|race_id|driver_id|lap|position|    time|milliseconds|      ingestion_date| file_date|
+-------+---------+---+--------+--------+------------+--------------------+----------+
|    841|       20|  1|       1|1:38.109|       98109|2024-02-11 15:13:...|2021-03-21|
|    841|       20|  2|       1|1:33.006|       93006|2024-02-11 15:13:...|2021-03-21|
+-------+---------+---+--------+--------+------------+--------------------+----------+
only showing top 2 rows



In [8]:
# Execute the shared helper notebook inside this kernel. Presumably this is
# where merge_delta_data (used by the merge cell) is defined — TODO confirm.
%run "common_functions.ipynb"

In [10]:
# Match target (tgt) rows to source (src) rows on race, driver and lap.
# Each key column is compared exactly once.
merge_condition = (
    "tgt.race_id = src.race_id "
    "AND tgt.driver_id = src.driver_id "
    "AND tgt.lap = src.lap"
)

# merge_delta_data is defined outside this cell. The arguments appear to be
# (source DataFrame, database, table, warehouse folder, merge condition,
# partition column) — confirm against its definition.
# NOTE(review): the warehouse folder is an absolute local path and contains a
# URL-encoded space ('%20'); confirm it matches the real directory name and
# consider moving it to a configurable setting.
merge_delta_data(
    lap_time_fnl_df,
    'default',
    'lap_times',
    'E:/unused/Udemy/Spark_practice/raw/Delta%20lake/spark-warehouse',
    merge_condition,
    'race_id',
)

In [None]:
# Read the merged table back and show the number of rows per race,
# highest race_id first, to verify the incremental load.
df = spark.sql('SELECT * FROM default.lap_times')
print('df_file1_count for each raceId : ')
(
    df
    .groupBy('race_id')
    .agg(count('*'))
    .orderBy(desc('race_id'))
    .show()
)