# Importing the necessary libraries

In [1]:
import yaml
import os
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType
import json

# Transforming the drivers table:

## getting all possible keys in all yaml files

In [19]:
def extractInfo(year, dataYaml, entrant):
    entryDrivers = []
    if 'constructorId' in dataYaml.keys():
        constructor = dataYaml['constructorId']
        entryDrivers.extend(drivers(year, dataYaml, entrant, constructor))
    else:
        for constructor in dataYaml['constructors']:
            entryDrivers.extend(drivers(year, constructor, entrant, constructor['constructorId']))
    return entryDrivers

def collectDriverInfo(year, entrant, constructor, dataYaml):
    data = {
            'year': int(year),
            'entrant' : entrant,
            'constructor' : constructor,
            'driver' : dataYaml['driverId'],
            'tookPart': False if dataYaml['rounds'] == None else True,
            'TestDriver' : dataYaml.get('testDriver', False)
        }
    return data

def drivers(year, dataYaml, entrant, constructor):
    driversList = []
    if 'driverId' in dataYaml.keys():
        driversList.append(collectDriverInfo(year, entrant, constructor, dataYaml))
    else:
        for driver in dataYaml['drivers']:
            driversList.append(collectDriverInfo(year, entrant, constructor, driver))
    return driversList


def getAllDrivers(folderPath = "/home/floppabox/f1/f1db/src/data/seasons"):
    allDrivers = []
    years = [year for year in os.listdir(folderPath)]
    for year in sorted(years, key=int):
        file_path = os.path.join(folderPath, year, 'entrants.yml')
        with open(file_path, 'r') as file:
            data=yaml.safe_load(file)
            for entry in data:
                allDrivers.extend(extractInfo(year, entry, entry['entrantId']))
    return allDrivers

print(getAllDrivers())


[{'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'juan-manuel-fangio', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'luigi-fagioli', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'nino-farina', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'reg-parnell', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'consalvo-sanesi', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'alfa-romeo-spa', 'constructor': 'alfa-romeo', 'driver': 'piero-taruffi', 'tookPart': True, 'TestDriver': False}, {'year': 1950, 'entrant': 'andy-granatelli', 'constructor': 'kurtis-kraft', 'driver': 'pat-flaherty', 'tookPart': True, 'TestDriver': False}, {'year': 1950,

In [None]:
folderPath = "/home/floppabox/f1/f1db/src/data/seasons"

years = [year for year in os.listdir(folderPath)]
keysSeen = {}

for fileName in fileNames:
    file_path = os.path.join(folderPath, fileName)
    with open(file_path, 'r') as file:
        data=yaml.safe_load(file)
        for key in data.keys():
            if key not in keysSeen:
                keysSeen[key]=None
print(keysSeen)


In [2]:
spark = SparkSession.builder.appName("YAML to CSV").getOrCreate()


25/01/02 20:53:10 WARN Utils: Your hostname, Obuntu resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/01/02 20:53:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/02 20:53:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, BooleanType

schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("entrantId", StringType(), True),
    StructField("constructorId", StringType(), True),
    StructField("driverId", StringType(), True),
    StructField("tookPart", BooleanType(), True),
    StructField("testDriver", BooleanType(), True)
])
#year,entrant,constructor,driver,tookPart,TestDriver

readDataset = spark.read.schema(schema).option("header", True).csv("/home/floppabox/f1/f1-data-project-gr/csv_datasets/driversAllYears")


In [59]:
#SQL query for debut years of each driver

readDataset.createOrReplaceGlobalTempView("driversAllYears")

spark.sql("""SELECT MIN(year) as debut_year, driverId
          FROM global_temp.driversAllYears
          WHERE tookPart == True
          GROUP BY driverId""").createOrReplaceGlobalTempView("debut")

In [5]:
#SQL query for reirement years (to adjust, since all the 2024 entries are considered as retired)

spark.sql("""SELECT *
          FROM (
            SELECT MAX(year) as retirement_year, driverId
            FROM global_temp.driversAllYears
            WHERE tookPart == True
            GROUP BY driverId
          ) AS temp_ret
          WHERE retirement_year < (SELECT MAX(year) as maximum FROM global_temp.driversAllYears)""").show(300)

                                                                                

+---------------+--------------------+
|retirement_year|            driverId|
+---------------+--------------------+
|           2015|       roberto-merhi|
|           2012|    pedro-de-la-rosa|
|           2012|    jerome-dambrosio|
|           2003|       nicolas-kiesa|
|           2013|         charles-pic|
|           2009|giancarlo-fisichella|
|           2009|    nelson-piquet-jr|
|           2006|            yuji-ide|
|           2021|      nikita-mazepin|
|           2007|   markus-winkelhock|
|           2000|      johnny-herbert|
|           2014|     kamui-kobayashi|
|           2024|        pierre-gasly|
|           2020|        daniil-kvyat|
|           2021|      kimi-raikkonen|
|           2013|         mark-webber|
|           2024|    daniel-ricciardo|
|           2024|      logan-sargeant|
|           2015|        will-stevens|
|           2003|        ralph-firman|
|           2018|   stoffel-vandoorne|
|           2024|        lance-stroll|
|           2006|  juan-p

In [19]:
#sql queries for transfers

transferDB = spark.sql("""
    SELECT TransferOut.driverId, const_out, transfer_out, const_in, transfer_in
    FROM (
        SELECT MAX(year) AS transfer_out, driverId, constructorId AS const_out
        FROM global_temp.driversAllYears
        WHERE tookPart = True
        GROUP BY driverId, constructorId
    ) AS TransferOut
    INNER JOIN (
        SELECT MIN(year) AS transfer_in, driverId, constructorId AS const_in
        FROM global_temp.driversAllYears
        WHERE tookPart = True
        GROUP BY driverId, constructorId
    ) AS TransIn
    ON TransferOut.driverId = TransIn.driverId 
       AND TransferOut.transfer_out = TransIn.transfer_in - 1
""")

transferDB.createOrReplaceGlobalTempView("transfer")

In [15]:
#sql queries for breaks (needing some updates)

spark.sql("""
    SELECT Break.driverId, const_out, break_year, const_in, return_year, return_year - break_year as gap
    FROM (
        SELECT MAX(year) AS break_year, driverId, constructorId AS const_out
        FROM global_temp.driversAllYears
        WHERE tookPart = True
        GROUP BY driverId, constructorId
    ) AS Break
    INNER JOIN (
        SELECT MIN(year) AS return_year, driverId, constructorId AS const_in
        FROM global_temp.driversAllYears
        WHERE tookPart = True
        GROUP BY driverId, constructorId
    ) AS Return
    ON Break.driverId = Return.driverId 
       AND Break.break_year < Return.return_year - 1
    GROUP BY Break.driverId, const_out, break_year, const_in, return_year
""").createOrReplaceGlobalTempView("gapBreak")


In [23]:
spark.catalog.listTables("global_temp")

[Table(name='driversAllYears', catalog=None, namespace=['global_temp'], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='gapBreak', catalog=None, namespace=['global_temp'], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='transfer', catalog=None, namespace=['global_temp'], description=None, tableType='TEMPORARY', isTemporary=True)]

In [43]:
spark.sql("SELECT * FROM global_temp.transfer WHERE driverId == 'nico-hulkenberg'").show(3000)

                                                                                

+---------------+------------+------------+------------+-----------+
|       driverId|   const_out|transfer_out|    const_in|transfer_in|
+---------------+------------+------------+------------+-----------+
|nico-hulkenberg|aston-martin|        2022|        haas|       2023|
|nico-hulkenberg| force-india|        2016|     renault|       2017|
|nico-hulkenberg|     renault|        2019|racing-point|       2020|
+---------------+------------+------------+------------+-----------+



In [41]:
spark.sql("""
    SELECT gb.*
    FROM global_temp.gapBreak gb
    LEFT ANTI JOIN global_temp.transfer t
    ON gb.driverId = t.driverId AND gb.const_in = t.const_in AND gb.return_year == t.transfer_in
""").createOrReplaceGlobalTempView("filteredGap")

In [51]:
spark.sql("""
    SELECT fg.*
    FROM global_temp.filteredGap fg 
        INNER JOIN (SELECT driverId, const_in, return_year, MIN(gap) AS act
                    FROM global_temp.filteredGap fg
                    GROUP BY driverId, const_in, return_year) AS ab
        ON ab.driverId == fg.driverId AND ab.const_in == fg.const_in AND ab.return_year == fg.return_year AND fg.gap == ab.act
""").createOrReplaceGlobalTempView("filteredGap")

spark.sql("""
    SELECT fg.*
    FROM global_temp.filteredGap fg 
        INNER JOIN (SELECT driverId, const_out, break_year, MIN(gap) AS act
                    FROM global_temp.filteredGap fg
                    GROUP BY driverId, const_out, break_year) AS ab
        ON ab.driverId == fg.driverId AND ab.const_out == fg.const_out AND ab.break_year == fg.break_year AND fg.gap == ab.act
""").show(3000)

                                                                                

+--------------------+--------------------+----------+-------------+-----------+---+
|            driverId|           const_out|break_year|     const_in|return_year|gap|
+--------------------+--------------------+----------+-------------+-----------+---+
|         jan-lammers|            theodore|      1982|        march|       1992| 10|
|       nigel-mansell|               lotus|      1984|      ferrari|       1989|  5|
|         luca-badoer|             minardi|      1999|      ferrari|       2009| 10|
|       olivier-panis|               prost|      1999|          bar|       2001|  2|
|       derek-warwick|               lotus|      1990|     footwork|       1993|  3|
|        troy-ruttman|            lesovsky|      1950|        kuzma|       1952|  2|
|        louis-rosier|         talbot-lago|      1951|     maserati|       1954|  3|
|       ron-flockhart|           connaught|      1956|       cooper|       1958|  2|
|   gabriele-tarquini|              coloni|      1988|    fondmet

In [61]:
spark.sql("""SELECT db.*, dr.constructorId
            FROM global_temp.driversAllYears dr
                INNER JOIN global_temp.debut db
                ON db.debut_year == dr.year AND db.driverId == dr.driverId""").show(3000)

                                                                                

+----------+--------------------+--------------------+
|debut_year|            driverId|       constructorId|
+----------+--------------------+--------------------+
|      2000|       jenson-button|            williams|
|      2000|       nick-heidfeld|               prost|
|      2000|       luciano-burti|              jaguar|
|      2000|    gaston-mazzacane|             minardi|
|      2001|  juan-pablo-montoya|            williams|
|      2001|     fernando-alonso|             minardi|
|      2001|          alex-yoong|             minardi|
|      2001|    enrique-bernoldi|              arrows|
|      2001|          tomas-enge|               prost|
|      2001|      kimi-raikkonen|              sauber|
|      2002|         takuma-sato|              jordan|
|      2002|         mark-webber|             minardi|
|      2002|    anthony-davidson|             minardi|
|      2002|        allan-mcnish|              toyota|
|      2002|        felipe-massa|              sauber|
|      200

In [None]:
drivers_data, drivers_relationships_data = [], []

# Extracting all drivers data

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)

    with open(file_path, 'r') as file:
        content_drivers=yaml.safe_load(file)
        record ={}

        for key in keys:
            if key != 'familyRelationships':
                record[key]= content_drivers.get(key)

        print(record)
        drivers_data.append(record)


In [None]:

id=0
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        content_drivers=yaml.safe_load(file)
        
        if 'familyRelationships' in content_drivers.keys():
            for rel in content_drivers['familyRelationships']:
                record ={}
                record['id']=id
                id=id+1
                record['driverId']=content_drivers.get('id')
                record['relationId']=rel.get('driverId')
                record['type']=rel.get('type')
                drivers_relationships_data.append(record)




In [None]:
real_keys= [x for x in keys if x != 'familyRelationships']

drivers= spark.createDataFrame(drivers_data).select(real_keys)

drivers_relationships = spark.createDataFrame(drivers_relationships_data).select(['id','driverId', 'relationId', 'type'])

drivers_relationships.show()