In [84]:
# Date-stamped folder of the raw file to ingest; it is also written to every row as the `file_date` column.
file_date = '2021-03-28'

<div style="max-width:1400px;margin:0 auto">
<img src="images/results.png" width="600"/>
</div>

In [85]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Local single-node session named "result", with the Hive catalog implementation enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName("result")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)
import os
# Note: the previous `os.chdir(os.getcwd())` call was dropped; changing into the
# directory the process is already in has no effect.

In [86]:
from pyspark.sql.types import IntegerType, FloatType, StringType, StructField, StructType
from pyspark.sql.functions import col, current_timestamp,count,max,lit,desc

In [87]:
# Explicit schema for results.json, declared as (column name, Spark type, nullable).
# Only resultId is declared non-nullable.
_results_fields = [
    ("resultId", IntegerType(), False),
    ("raceId", IntegerType(), True),
    ("driverId", IntegerType(), True),
    ("constructorId", IntegerType(), True),
    ("number", IntegerType(), True),
    ("grid", IntegerType(), True),
    ("position", IntegerType(), True),
    ("positionText", StringType(), True),
    ("positionOrder", IntegerType(), True),
    ("points", FloatType(), True),
    ("laps", IntegerType(), True),
    ("time", StringType(), True),
    ("milliseconds", IntegerType(), True),
    ("fastestLap", IntegerType(), True),
    ("rank", IntegerType(), True),
    ("fastestLapTime", StringType(), True),
    ("fastestLapSpeed", FloatType(), True),
    ("statusId", StringType(), True),
]
results_schema = StructType(
    fields=[StructField(name, data_type, nullable) for name, data_type, nullable in _results_fields]
)

In [88]:
# Build the path with os.path.join instead of hard-coded "\\" separators so the
# notebook also runs on non-Windows machines; on Windows the resulting string is
# identical to the previous literal.
results_path = os.path.join('raw files', file_date, 'results.json')

# Read the results file for `file_date` using the explicit schema (no schema inference).
results_df = spark.read.json(results_path, schema=results_schema)
results_df.show(2)

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|       time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|   24966|  1052|       1|          131|    44|   2|       1|           1|            1|  25.0|  56|1:32:03.897|     5523897|        44|   4|      1:34.015|        207.235|       1|
|   24967|  1052|     830|            9|    33|   1|       2|           2|            2|  18.0|  56|     +0.745|     5524642|        41|   2|      1:33.228|        208.984|       1|
+--------+------+--------+-------------+------+----+--------+------------+-------------+--

In [89]:
# Exploration: compare the cut-over file (2021-03-21) with the 2021-03-28 file
# to see how many races each one contains.
cut_over_df = spark.read.json('raw files\\2021-03-21\\results.json', schema=results_schema)
df_file1 = spark.read.json('raw files\\2021-03-28\\results.json', schema=results_schema)

# Number of distinct races per file.
cut_over_race_count = cut_over_df.select('raceId').distinct().count()
file1_race_count = df_file1.select('raceId').distinct().count()
print('cut_over count of distinct_race_id : ', cut_over_race_count)
print('df_file1 count of distinct_race_id  : ', file1_race_count)

# Row counts per race in each file.
cut_over_rows_per_race = cut_over_df.groupBy('raceId').agg(count('*'))
file1_rows_per_race = df_file1.groupBy('raceId').agg(count('*'))
print('cut_over_count for each raceId : ')
cut_over_rows_per_race.show(3)
print('df_file1_count for each raceId : ')
file1_rows_per_race.show()

# Highest raceId in the cut-over file (`max` here is pyspark.sql.functions.max, not the builtin).
print('cut_over_max of raceId : ')
cut_over_df.agg(max('raceId')).show()

cut_over count of distinct_race_id :  1035
df_file1 count of distinct_race_id  :  1
cut_over_count for each raceId : 
+------+--------+
|raceId|count(1)|
+------+--------+
|   148|      22|
|   463|      29|
|   471|      32|
+------+--------+
only showing top 3 rows

df_file1_count for each raceId : 
+------+--------+
|raceId|count(1)|
+------+--------+
|  1052|      20|
+------+--------+

cut_over_max of raceId : 
+-----------+
|max(raceId)|
+-----------+
|       1047|
+-----------+



In [90]:
# camelCase source columns -> snake_case names used in the processed table.
# statusId is intentionally absent: it is dropped in the next cell.
column_renames = {
    "resultId": "result_id",
    "raceId": "race_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
    "positionText": "position_text",
    "positionOrder": "position_order",
    "fastestLap": "fastest_lap",
    "fastestLapTime": "fastest_lap_time",
    "fastestLapSpeed": "fastest_lap_speed",
}

results_with_columns_df = results_df
for old_name, new_name in column_renames.items():
    results_with_columns_df = results_with_columns_df.withColumnRenamed(old_name, new_name)

# Audit columns: when the row was ingested and which file it came from.
results_with_columns_df = (
    results_with_columns_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("file_date", lit(file_date))
)
results_with_columns_df.show(2)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+--------------------+----------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|statusId|      ingestion_date| file_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------+--------------------+----------+
|    24966|   1052|        1|           131|    44|   2|       1|            1|             1|  25.0|  56|1:32:03.897|     5523897|         44|   4|        1:34.015|          207.235|       1|2024-01-31 19:44:...|2021-03-28|
|    24967|   1052|      830|             9|    33|   1|       2|            2|             2|  18.0

In [91]:
# Remove the statusId column (referenced by name) before the data is written out.
results_final_df = results_with_columns_df.drop("statusId")
results_final_df.show(2)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+----------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|      ingestion_date| file_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+----------+
|    24966|   1052|        1|           131|    44|   2|       1|            1|             1|  25.0|  56|1:32:03.897|     5523897|         44|   4|        1:34.015|          207.235|2024-01-31 19:44:...|2021-03-28|
|    24967|   1052|      830|             9|    33|   1|       2|            2|             2|  18.0|  56|     +0.745|     5524642|     

## Method 1

`collect()` brings all of the data into the driver node's memory, so use it carefully. Here it is only applied to the distinct `race_id` values.

In [92]:
# Method 1, step 1: drop the partitions for the race_ids in this file so that the
# append in the next cell does not duplicate rows when the notebook is re-run.
# The table-existence check does not depend on the loop variable, so it is
# evaluated once up front; when the table does not exist yet nothing is collected.
if spark._jsparkSession.catalog().tableExists("f1_processed.results"):
    # Only the distinct race_id values are brought to the driver.
    for race_row in results_final_df.select('race_id').distinct().collect():
        spark.sql(f"ALTER TABLE f1_processed.results DROP IF EXISTS PARTITION (race_id ={race_row.race_id})")

In [93]:
# Method 1, step 2: append this file's rows to the table as parquet, partitioned by race_id.
(
    results_final_df.write
    .mode("append")
    .partitionBy('race_id')
    .format('parquet')
    .saveAsTable('f1_processed.results')
)

## Method 2

There is no way to specify the order of the columns being inserted, and Spark expects the partition column to be the last column in the list when you use `insertInto`.

In [94]:
# Inspect the current column order; race_id (the partition column) is not last yet.
results_final_df.columns

['result_id',
 'race_id',
 'driver_id',
 'constructor_id',
 'number',
 'grid',
 'position',
 'position_text',
 'position_order',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastest_lap',
 'rank',
 'fastest_lap_time',
 'fastest_lap_speed',
 'ingestion_date',
 'file_date']

In [95]:
# insertInto expects the partition column (race_id) to be the last column.
# Derive the order from the DataFrame itself rather than from a hand-copied
# column list: every other column keeps its relative position and race_id is
# moved to the end. Re-running the cell yields the same order.
results_final_df = results_final_df.select(
    *[column for column in results_final_df.columns if column != 'race_id'],
    'race_id',
)

In [96]:
# With "dynamic", an overwrite only rewrites the partitions that receive new data;
# otherwise the overwrite would replace all partitions of the table.
spark.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

In [97]:
# Method 2 write: overwrite into the existing table, or create it on the first run.
if spark._jsparkSession.catalog().tableExists('f1_processed.results'):
    # Table already exists: overwrite via insertInto (race_id must be the last column).
    results_final_df.write.mode("overwrite").insertInto('f1_processed.results')
else:
    # Only runs on the first attempt, when the table does not exist yet.
    # The previous version referenced partition_column / db_name / table_name,
    # which are never defined in this notebook and raised NameError here.
    results_final_df.write.mode("overwrite").partitionBy('race_id').format("parquet").saveAsTable('f1_processed.results')

In [98]:
# Read the processed table back and show the row count per race, highest race_id first.
df = spark.sql('SELECT * FROM f1_processed.results')
rows_per_race = df.groupBy('race_id').agg(count('*'))
print('df_file1_count for each raceId : ')
rows_per_race.orderBy(desc('race_id')).show()

df_file1_count for each raceId : 
+-------+--------+
|race_id|count(1)|
+-------+--------+
|   1053|      20|
|   1052|      20|
|   1047|      20|
|   1046|      20|
|   1045|      20|
|   1044|      20|
|   1043|      20|
|   1042|      20|
|   1041|      20|
|   1040|      20|
|   1039|      20|
|   1038|      20|
|   1037|      20|
|   1036|      20|
|   1035|      20|
|   1034|      20|
|   1033|      20|
|   1032|      20|
|   1031|      20|
|   1030|      20|
+-------+--------+
only showing top 20 rows

