### FULL LOAD

In [17]:
# Date of the raw-file drop to load: it names the input sub-folder read below
# and is stamped onto the rows as the file_date column during transformation.
file_date = "2021-04-18"

<div style="max-width:1400px;margin: auto">
<img src="images/races.png" width="600"/>
</div>

In [18]:
import findspark
findspark.init()
import pyspark
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Session builder for the "races" app with the Delta SQL extension registered
# and DeltaCatalog set as the implementation of spark_catalog.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("races")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through configure_spark_with_delta_pip (from the delta
# package), then create the SparkSession or reuse one that already exists.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [19]:
# List the databases known to the session catalog (displayed as the cell output).
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/E:/unused/Udemy/Spark_practice/raw/Delta%20lake/spark-warehouse')]

In [20]:
from pyspark.sql.types import StructType, StructField,IntegerType, StringType, DoubleType,DateType
from pyspark.sql.functions import current_timestamp, lit, to_timestamp, concat, col

In [21]:
# Explicit schema for races.csv, so column types are fixed up front rather than
# inferred. raceId is declared non-nullable here, but the printSchema() output
# recorded in this notebook still reports it as nullable after the CSV load.
race_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
)

In [22]:
# Read the races file from this run's file_date folder using the explicit
# schema; the first line of the CSV is treated as a header row.
# The path uses forward slashes so it resolves on every OS: hard-coded
# backslashes are only path separators on Windows.
races_csv_path = f'raw files/{file_date}/races.csv'
race_df = spark.read.csv(races_csv_path, header=True, schema=race_schema)

# Preview the first two rows to confirm the file parsed as expected.
race_df.show(2)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
only showing top 2 rows



In [23]:
# Print the schema Spark applied to the loaded frame, to compare against race_schema.
race_df.printSchema()

root
 |-- raceId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- circuitId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- url: string (nullable = true)



In [24]:
# Show summary statistics (count, mean, stddev, min, max) for the loaded columns.
race_df.describe().show()

+-------+------------------+-----------------+------------------+------------------+--------------------+--------+--------------------+
|summary|            raceId|             year|             round|         circuitId|                name|    time|                 url|
+-------+------------------+-----------------+------------------+------------------+--------------------+--------+--------------------+
|  count|              1058|             1058|              1058|              1058|                1058|    1058|                1058|
|   mean| 531.2315689981097|1990.780718336484| 8.382797731568997| 22.20132325141777|                null|    null|                null|
| stddev|308.16570918807656|19.73008802240494|5.0002806845260235|17.287816736617504|                null|    null|                null|
|    min|                 1|             1950|                 1|                 1|70th Anniversary ...|03:00:00|http://en.wikiped...|
|    max|              1073|             2021|  

In [25]:
# "<date> <time>" text built from the two source columns; parsed below with
# the pattern yyyy-MM-dd HH:mm:ss.
race_datetime_text = concat(col("date"), lit(" "), col('time'))

# Add audit and derived columns, in this order:
#   ingestion_date  - timestamp of this load
#   race_timestamp  - date and time columns combined into one timestamp
#   file_date       - the file_date this notebook run is loading
race_transformed_df = (
    race_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, 'yyyy-MM-dd HH:mm:ss'))
    .withColumn('file_date', lit(file_date))
)
race_transformed_df.show(2)

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+----------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|      ingestion_date|     race_timestamp| file_date|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+----------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2024-02-11 13:45:...|2009-03-29 06:00:00|2021-04-18|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2024-02-11 13:45:...|2009-04-05 09:00:00|2021-04-18|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+----------+
only showing top 2 rows



In [26]:
# Columns kept in the final frame; camelCase source names are renamed to
# snake_case. date, time and url are left out (date/time are already folded
# into race_timestamp).
# NOTE(review): file_date is added to race_transformed_df but is not selected
# here, so it never reaches the saved table - confirm this is intentional.
final_columns = [
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("race_timestamp"),
    col("ingestion_date"),
]

race_final_df = race_transformed_df.select(*final_columns)
race_final_df.show(2)

+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|race_id|race_year|round|circuit_id|                name|     race_timestamp|      ingestion_date|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29 06:00:00|2024-02-11 13:45:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05 09:00:00|2024-02-11 13:45:...|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
only showing top 2 rows



In [27]:
# Full load: overwrite the table default.races with this frame, stored in Delta format.
(
    race_final_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('default.races')
)

In [29]:
# Sanity check: count the rows now present in default.races.
spark.sql('SELECT count(*) FROM default.races').show()

+--------+
|count(1)|
+--------+
|    1058|
+--------+

