In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType, IntegerType, DateType

# Describe the Spark application: the standalone master to connect to, the
# application name, and the driver/executor resource settings.
# SparkConf setters return the conf object, so the calls can be chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Circuits_pipeline")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The Spark session is the entry point to the Spark SQL engine.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)

# Register the Google Cloud Storage connector classes on the Hadoop
# configuration so that paths using the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# ---- Circuits ----
# Explicit schema for the circuits file; every column is declared nullable.
circuit_schema = (
    StructType()
    .add("circuitID", StringType(), True)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", LongType(), True)
    .add("url", StringType(), True)
)

gsc_file_path_circuits = 'gs://data-group1-ass2/circuits.csv'

# Read the CSV (first line is a header) using the schema declared above,
# then preview the first three rows.
circuits = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(circuit_schema)
    .load(gsc_file_path_circuits)
)
circuits.show(3)

# ---- Drivers ----
# Explicit schema for the drivers file; every column is declared nullable.
drivers_schema = (
    StructType()
    .add("driverID", StringType(), True)
    .add("driverRef", StringType(), True)
    .add("number", LongType(), True)
    .add("code", StringType(), True)
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

gsc_file_path_drivers = 'gs://data-group1-ass2/drivers.csv'

# Read the CSV (first line is a header) using the schema declared above,
# then preview the first three rows.
drivers = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(drivers_schema)
    .load(gsc_file_path_drivers)
)
drivers.show(3)

# ---- Lap times ----
# Explicit schema for the lap-times file; every column is declared nullable.
# "time" is read as a plain string (an interval type was considered but is
# not used); the numeric duration lives in "miliseconds".
lap_times_schema = (
    StructType()
    .add("raceID", StringType(), True)
    .add("driverID", StringType(), True)
    .add("lap", LongType(), True)
    .add("position", LongType(), True)
    .add("time", StringType(), True)
    .add("miliseconds", LongType(), True)
)

gsc_file_path_lap_times = 'gs://data-group1-ass2/lap_times.csv'

# Read the CSV (first line is a header) using the schema declared above,
# then preview the first three rows.
lap_times = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(lap_times_schema)
    .load(gsc_file_path_lap_times)
)
lap_times.show(3)

# ---- Races ----
# Explicit schema for the races file; every column is declared nullable.
# The two time-of-day columns ("time", "fp1_time") are read as plain strings
# (an interval type was considered but is not used).
races_schema = (
    StructType()
    .add("raceID", StringType(), True)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitID", StringType(), True)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)
    .add("url", StringType(), True)
    .add("fp1_date", DateType(), True)
    .add("fp1_time", StringType(), True)
)

gsc_file_path_races = 'gs://data-group1-ass2/races.csv'

# Read the CSV (first line is a header) using the schema declared above,
# then preview the first three rows.
races = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(races_schema)
    .load(gsc_file_path_races)
)
races.show(3)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitID| circuitRef|                name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
|        3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|50.5106|  7|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showing top 3 rows

+--------+---------+------+----+--------+--------+----------+-----------+--------------------+
|driverID|driverRef|number|code|forename| surname|       dob|nationality|                 url|
+--------+-----

In [2]:
from pyspark.sql.functions import concat, lit

# Keep only the columns the pipeline needs and discard rows whose key columns
# are null. Spark DataFrames are immutable: na.drop() returns a NEW DataFrame,
# so its result must be assigned back -- calling it as a bare statement leaves
# the original frame (null keys included) untouched.
circuits = (
    circuits
    .select("circuitID", "name")
    .na.drop("any", subset=["circuitID"])
)

# Build a single display name for each driver from forename and surname.
drivers = (
    drivers
    .select("driverID", concat("forename", lit(" "), "surname").alias("full_name"))
    .na.drop("any", subset=["driverID"])
)

lap_times = (
    lap_times
    .select("raceID", "driverID", "time", "miliseconds")
    .na.drop("any", subset=["raceID", "driverID"])
)

races = (
    races
    .select("raceID", "circuitID", "date")
    .na.drop("any", subset=["raceID", "circuitID"])
)

DataFrame[raceID: string, circuitID: string, date: date]

Find the three fastest lap times per race

In [3]:
from pyspark.sql.functions import min, dense_rank,col
from pyspark.sql import Row, Window

# Rank the laps of each race by duration, shortest first.
window_find_fasest = Window.partitionBy("raceID").orderBy(col("miliseconds").asc())

# Laps without a recorded duration cannot be ranked meaningfully: ascending
# order places NULLs first, so such a lap would otherwise receive rank 1 and
# be reported as the fastest lap of its race. Exclude them before ranking.
lap_times_with_rank = (
    lap_times
    .where(col("miliseconds").isNotNull())
    .withColumn("rank_asc_per_race", dense_rank().over(window_find_fasest))
)

# Keep ranks 1-3. dense_rank gives tied laps the same rank without gaps, so a
# race can contribute more than three rows when lap times are equal.
lap_times_top3_per_race = lap_times_with_rank.where(col("rank_asc_per_race") <= 3)

Find the three fastest lap times per circuit

In [None]:
# Attach the race columns (circuitID, date) to each race's top-ranked laps.
races_with_top_3_laps = races.join(lap_times_top3_per_race, on="raceID")

# Rank the candidate laps again, this time across all races held on the same
# circuit; orderBy on a column name sorts ascending by default.
window_find_fasest_circuit = Window.partitionBy("circuitID").orderBy("miliseconds")

races_with_lap_rank = races_with_top_3_laps.withColumn(
    "rank_asc_per_circuit",
    dense_rank().over(window_find_fasest_circuit),
)

# Keep ranks 1, 2 and 3 per circuit (ties share a rank, so more than three
# rows per circuit are possible).
lap_times_top3_per_circuit = races_with_lap_rank.where(
    col("rank_asc_per_circuit").isin(1, 2, 3)
)

lap_times_top3_per_circuit.show(50)

Add the driver name and circuit name to the fastest laps

In [None]:
from pyspark.sql.functions import expr

# Enrich the per-circuit top laps with the driver's full name and the
# circuit's name via equi-joins on the respective ID columns.
fastest_laps_combined_data = (
    lap_times_top3_per_circuit
    .join(drivers, on="driverID")
    .join(circuits, on="circuitID")
)

# Project the report columns and give them their final names.
fastest_lap_final_data = fastest_laps_combined_data.selectExpr(
    "name as circuit_name",
    "rank_asc_per_circuit as rank",
    "full_name",
    "time as lap_time",
    "date as race_date",
)

# Preview, ordered by circuit and then by rank within the circuit.
fastest_lap_final_data.orderBy('circuit_name', 'rank').show(50)

In [None]:
# The BigQuery connector uses this Cloud Storage bucket for temporary export data.
bucket = "temp-group1-ass2"
spark.conf.set('temporaryGcsBucket', bucket)

# Save the result to BigQuery; "overwrite" mode replaces the existing table contents.
(
    fastest_lap_final_data
    .sort('circuit_name')
    .write
    .format('bigquery')
    .option('table', 'deassignment2.Output_processing_pipeline.fastest_laps')
    .mode("overwrite")
    .save()
)

In [None]:
# Stop the Spark session created at the top of the notebook; this also stops
# its underlying SparkContext. Any later cell that uses `spark` will need a
# new session.
spark.stop()