## INCREMENTAL LOAD

In [20]:
# Date of the raw-file drop to ingest; it selects the input folder and is stamped on every row.
file_date = "2021-03-21"

<div style="max-width:1400px;margin:0 auto">
<img src="images/qualifying.png" width="600" alt="qualifying"/>
</div>

In [21]:
import findspark
findspark.init()
import pyspark
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Session builder with Delta Lake enabled: the Delta SQL extension is registered
# and the Delta catalog is installed as the session catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("dbcreation")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the delta helper finish configuring the builder, then create the
# SparkSession (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [22]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp,count,desc,lit

In [23]:
# Explicit schema for the qualifying JSON files.
# qualifyId is declared non-nullable (third argument False); every other field is nullable.
qualifying_schema = StructType(fields=[
    StructField("qualifyId", IntegerType(), False),
    StructField("raceId", IntegerType(), True),
    # driverId is an integer key like the other *Id fields (the sample rows show 1 and 9);
    # it was previously declared as StringType, unlike every other key.
    # NOTE(review): a table already created with a string driver_id column keeps that
    # column type — confirm how the existing table should be handled before re-running.
    StructField("driverId", IntegerType(), True),
    StructField("constructorId", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("position", IntegerType(), True),
    # q1-q3 hold values formatted like "1:26.572", so they stay strings.
    StructField("q1", StringType(), True),
    StructField("q2", StringType(), True),
    StructField("q3", StringType(), True),
])

In [24]:
# Read the qualifying JSON for the selected file_date with the explicit schema.
# multiLine=True lets a single JSON record span several lines.
# Forward slashes are used as separators so the relative path resolves on
# Windows as well as Linux/macOS (backslashes are only separators on Windows).
qualifying_df = spark.read.json(
    f'raw files/{file_date}/qualifying',
    multiLine=True,
    schema=qualifying_schema,
)

# Quick visual check of the first two rows.
qualifying_df.show(2)

+---------+------+--------+-------------+------+--------+--------+--------+--------+
|qualifyId|raceId|driverId|constructorId|number|position|      q1|      q2|      q3|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
|        1|    18|       1|            1|    22|       1|1:26.572|1:25.187|1:26.714|
|        2|    18|       9|            2|     4|       2|1:26.103|1:25.315|1:26.869|
+---------+------+--------+-------------+------+--------+--------+--------+--------+
only showing top 2 rows



In [25]:
# Convert the camelCase id columns to snake_case, then append two audit columns:
# ingestion_date (timestamp of this run) and file_date (the source-file date).
final_df = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("file_date", lit(file_date))
)

In [26]:
# Execute the shared helper notebook so its definitions become available here.
# Presumably this is where merge_delta_data (called in the next cell) is defined — verify.
%run "common_functions.ipynb"

In [27]:
# Condition pairing target (tgt) rows with source (src) rows: same qualify_id and same race_id.
merge_condition = "tgt.qualify_id = src.qualify_id AND tgt.race_id = src.race_id"

# merge_delta_data is not defined in this notebook; presumably it comes from
# common_functions.ipynb and merges final_df into default.qualifying — verify.
# NOTE(review): the warehouse location is a hard-coded absolute path on one machine and
# contains a literal "%20"; confirm that this matches the real folder name on disk.
merge_delta_data(
    final_df,
    'default',
    'qualifying',
    'E:/unused/Udemy/Spark_practice/raw/Delta%20lake/spark-warehouse',
    merge_condition,
    'race_id',
)

In [28]:
# Read the table back and show the number of rows per race_id, highest race_id first.
df = spark.sql('SELECT * FROM default.qualifying')
print('df_file1_count for each raceId : ')
(
    df.groupBy('race_id')
    .agg(count('*'))
    .orderBy(desc('race_id'))
    .show()
)

df_file1_count for each raceId : 
+-------+--------+
|race_id|count(1)|
+-------+--------+
|   1053|      20|
|   1052|      20|
|   1047|      20|
|   1046|      20|
|   1045|      20|
|   1044|      20|
|   1043|      20|
|   1042|      20|
|   1041|      20|
|   1040|      20|
|   1039|      20|
|   1038|      20|
|   1037|      20|
|   1036|      20|
|   1035|      20|
|   1034|      20|
|   1033|      20|
|   1032|      20|
|   1031|      20|
|   1030|      20|
+-------+--------+
only showing top 20 rows

