## Populate FASTianF1 RDF database

This notebook reports the main steps to download CSV files, process them and create an RDF dataset from them accordingly to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path

In [2]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [3]:
# CHECK DATE 
import datetime

# Paths

In [9]:
# parameters and URLs
print(str(Path(os.path.abspath(os.getcwd())).parent.absolute()))
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

circuitsUrl = path + '\FASTianF1\data\DatasetF1\circuits.csv'
constructor_resultsUrl = path + '\FASTianF1\data\DatasetF1\constructor_results.csv'
constructor_standingsUrl = path + '\FASTianF1\data\DatasetF1\constructor_standings.csv'
constructorsUrl = path + '\FASTianF1\data\DatasetF1\constructors.csv'
driver_standingsUrl = path + '\FASTianF1\data\DatasetF1\driver_standings.csv'
driversUrl = path + '\FASTianF1\data\DatasetF1\drivers.csv'
lap_timesUrl = path + '\FASTianF1\data\DatasetF1\lap_times.csv'
pit_stopsUrl = path + '\FASTianF1\data\DatasetF1\pit_stops.csv'
qualifyingUrl = path + '\FASTianF1\data\DatasetF1\qualifying.csv'
racesUrl = path + '\FASTianF1\data\DatasetF1\\races.csv'
resultsUrl = path + '\FASTianF1\data\DatasetF1\\results.csv'
sprint_resultsUrl = path + '\FASTianF1\data\DatasetF1\sprint_results.csv'
statusUrl = path + '\FASTianF1\data\DatasetF1\status.csv'
ratingsUrl = path + '\FASTianF1\data\DatasetF1\\ratings.csv'
seasonsUrl = path + '\FASTianF1\data\DatasetF1\seasons.csv'

# country codes and nationalities conversion
countriesURL = path + '\FASTianF1\data\countryCodes\wikipedia-iso-country-codes.csv'
nationalitiesURL = path + '\FASTianF1\data\countryCodes\\nationalities.csv'

# saving folder
savePath =  path + '\FASTianF1\data\\rdf\\'

C:\Users\chris\Documents\INGEGNERIA\ANNO 5\DB 2\FASTianF1


# Namespaces

In [5]:
# Construct the country and the movie ontology namespaces not known by RDFlib
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
FO = Namespace("https://www.dei.unipd.it/db2/groupProject/FASTianF1#")

# Countries

In [6]:
#load the country codes
# we need to convert NaN values to something else otherwise NA strings are converted to NaN -> problem with Namibia
countries = pd.read_csv(countriesURL, sep=',', index_col='English short name lower case', keep_default_na=False, na_values=['_'])

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\chris\\Documents\\INGEGNERIA\\ANNO 5\\DB 2\\FASTianF1\\FASTianF1\\data\\countryCodes\\wikipedia-iso-country-codes.csv'

# Drivers

In [7]:
# Load the CSV files in memory
drivers = pd.read_csv(driversUrl, sep=',', index_col='driverId')
nationalities = pd.read_csv(nationalitiesURL, sep=',', index_col='num_code')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [8]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [9]:
%%time 
#measure execution time

#iterate over the drivers dataframe
for index, row in drivers.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "driver" + the driver id as URI
    Driver = URIRef(FO["driver"+str(index)])
    # Add triples using store's add() method.
    g.add((Driver, RDF.type, FO.Driver))
    g.add((Driver, FO['hasDriverRef'], Literal(row['driverRef'], datatype=XSD.string)))
    if(str(row['number']) != '\\N'):
        g.add((Driver, FO['hasDriverNumber'], Literal(row['number'], datatype=XSD.integer)))
    if(str(row['code']) != '\\N'):
        g.add((Driver, FO['hasCode'], Literal(row['code'], datatype=XSD.string)))
    g.add((Driver, FO['hasForename'], Literal(row['forename'], datatype=XSD.string)))
    g.add((Driver, FO['hasSurname'], Literal(row['surname'], datatype=XSD.string)))
    g.add((Driver, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))
    
    #Check that the date has the year-month-day format otherwise print error
    try:
        datetime.datetime.strptime(str(row['dob']), '%Y-%m-%d')
        g.add((Driver, FO['hasDateOfBirth'], Literal(row['dob'], datatype=XSD.date)))
    except ValueError:
        print("Incorrect date format")

    ## handle nationality
    # there can be more than one nationality per driver
    for nationality in str(row['nationality']).split('-'):
        nationalityName = nationality.strip()
        # check if the nationality exists in the nationalities dataframe
        # str.contains() returns an array of booleans, thus we need to use the any() method
        if((nationalities['nationality'].str.contains(nationalityName, case=False)).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology
            #There are multiple countries that correspond to American nationality, then the code for Americans is manually set to "us"
            if(nationalityName != "American"):
                code = str(nationalities[nationalities['nationality'].str.contains(nationalityName, case=False)]['alpha_2_code'].values[0]).lower()
            else:
                code = "us"
            # create the RDF node for Country
            Country = URIRef(CNS[code])
            # add the edge connecting the Driver and the Country 
            g.add((Driver, FO['hasNation'], Country))

CPU times: total: 438 ms
Wall time: 1 s


In [10]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'drivers.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 46.9 ms
Wall time: 271 ms


# Circuits

In [11]:
# Load the CSV files in memory
circuits = pd.read_csv(circuitsUrl, sep=',', index_col='circuitId')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [12]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [13]:
%%time 
#measure execution time

substitutions = {'ó': 'o', 'ü': 'u', 'ã': 'a', ' ': ''}
substitutions2 = {'UAE': 'United Arab Emirates', 'USA': 'United States', 'UK': 'United Kingdom', 'Russia': 'Russian Federation', 'Korea': 'Korea, Republic of'}

#iterate over the circuits dataframe
for index, row in circuits.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "circuits" + the circuit id as URI
    Circuit = URIRef(FO["circuit"+str(index)])
    loc = str(row['location'])
    # substitution of special characters with standard characters
    # special characters are not allowed in URIs
    for old, new in substitutions.items():
        loc = loc.replace(old, new)
    # create the RDF node for location
    Location = URIRef(FO["location"+loc])
    # Add triples using store's add() method.
    g.add((Location, RDF.type, FO.Location))
    g.add((Circuit, RDF.type, FO.Circuit))
    g.add((Circuit, FO['hasCircuitRef'], Literal(row['circuitRef'], datatype=XSD.string)))
    g.add((Circuit, FO['hasName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Circuit, FO['hasLat'], Literal(row['lat'], datatype=XSD.float)))
    g.add((Circuit, FO['hasLng'], Literal(row['lng'], datatype=XSD.float)))
    if(str(row['alt']) != '\\N'):
        g.add((Circuit, FO['hasAlt'], Literal(row['alt'], datatype=XSD.float)))
    g.add((Circuit, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))
    # add the edge connecting the Circuit and the Location 
    g.add((Circuit, FO['hasLocation'], Location))

    ## handle country
    countryName = str(row['country'])
    # substitution of abbreviations in country names for full names
    for old, new in substitutions2.items():
        countryName = countryName.replace(old, new)
    # check if the country exists
    # str.contains() returns an array of booleans, thus we need to use the any() method
    if((countries.index == countryName).any() == True):
        #get the country code, convert to string and get the lower case to match the country codes in the ontology 
        code = str(countries[countries.index == countryName]['Alpha-2 code'][0]).lower()
        # create the RDF node for Country
        Country = URIRef(CNS[code])
        # add the edge connecting the Location and the Country 
        g.add((Location, FO['hasCountry'], Country))

CPU times: total: 0 ns
Wall time: 74.6 ms


In [14]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'circuits.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 52.7 ms


# Constructors

In [15]:
# Load the CSV files in memory
constructors = pd.read_csv(constructorsUrl, sep=',', index_col='constructorId')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [16]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [17]:
%%time 
#measure execution time

#iterate over the constructors dataframe
for index, row in constructors.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "constructor" + the constructor id as URI
    Constructor = URIRef(FO["constructor"+str(index)])
    # Add triples using store's add() method.
    g.add((Constructor, RDF.type, FO.Constructor))
    g.add((Constructor, FO['hasConstructorRef'], Literal(row['constructorRef'], datatype=XSD.string)))
    g.add((Constructor, FO['hasName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Constructor, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))

    ## handle nationality
    #there can be more than one nationality per constructor
    for nationality in str(row['nationality']).split('-'):
        nationalityName = nationality.strip()
        # check if the nationality exists
        # str.contains() returns an array of booleans, thus we need to use the any() method
        if((nationalities['nationality'].str.contains(nationalityName, case=False)).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology 
            #There are multiple countries that correspond to American nationality, then the code for Americans is manually set to "us"
            if(nationalityName != "American"):
                code = str(nationalities[nationalities['nationality'].str.contains(nationalityName, case=False)]['alpha_2_code'].values[0]).lower()
            else:
                code = "us"
            # create the RDF node for country
            Country = URIRef(CNS[code])
            # add the edge connecting the Constructor and the Country 
            g.add((Constructor, FO['hasNation'], Country))

CPU times: total: 62.5 ms
Wall time: 221 ms


In [18]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'constructors.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 72.5 ms


# Status and Season

In [19]:
# Load the CSV files in memory
status = pd.read_csv(statusUrl, sep=',', index_col='statusId')
seasons = pd.read_csv(seasonsUrl, sep=',', index_col='year')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [20]:
%%time 
#measure execution time

#iterate over the status dataframe
for index, row in status.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "status" + the status id as URI
    Status = URIRef(FO["status"+str(index)])
    # Add triples using store's add() method.
    g.add((Status, RDF.type, FO.Status))
    g.add((Status, FO['hasName'], Literal(row['status'], datatype=XSD.string)))
    
#iterate over the seasons dataframe
for index, row in seasons.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "season" + the season id as URI
    Season = URIRef(FO["season"+str(index)])
    # Add triples using store's add() method.
    g.add((Season, RDF.type, FO.Season))
    g.add((Season, FO['hasYear'], Literal(int(index), datatype=XSD.integer)))
    g.add((Season, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))

CPU times: total: 0 ns
Wall time: 38.5 ms


In [21]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'status_seasons.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 75 ms


# Race

In [22]:
# Load the CSV files in memory
races = pd.read_csv(racesUrl, sep=',', index_col='raceId')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [23]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [24]:
%%time 
#measure execution time

#iterate over the races dataframe
for index, row in races.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "race" + the race id as URI
    Race = URIRef(FO["race"+str(index)])
    # Add triples using store's add() method.
    g.add((Race, RDF.type, FO.Race))
    g.add((Race, FO['hasRound'], Literal(row['round'], datatype=XSD.integer)))
    g.add((Race, FO['hasName'], Literal(row['name'], datatype=XSD.string)))
    g.add((Race, FO['hasDate'], Literal(row['date'], datatype=XSD.date)))
    if(str(row['time']) != '\\N'):
        g.add((Race, FO['hasTime'], Literal(row['time'], datatype=XSD.time)))
    g.add((Race, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))
    
    if(str(row['fp1_date']) != '\\N'):
        g.add((Race, FO['hasFp1Date'], Literal(row['fp1_date'], datatype=XSD.date)))
    if(str(row['fp1_time']) != '\\N'):
        g.add((Race, FO['hasFp1Time'], Literal(row['fp1_time'], datatype=XSD.time)))
    if(str(row['fp2_date']) != '\\N'):
        g.add((Race, FO['hasFp2Date'], Literal(row['fp2_date'], datatype=XSD.date)))
    if(str(row['fp2_time']) != '\\N'):
        g.add((Race, FO['hasFp2Time'], Literal(row['fp2_time'], datatype=XSD.time)))
    if(str(row['fp3_date']) != '\\N'):
        g.add((Race, FO['hasFp3Date'], Literal(row['fp3_date'], datatype=XSD.date)))
    if(str(row['fp3_time']) != '\\N'):
        g.add((Race, FO['hasFp3Time'], Literal(row['fp3_time'], datatype=XSD.time)))
    
    # create the RDF node for circuit
    Circuit = URIRef(FO["circuit"+str(row['circuitId'])])
    # add the edge connecting the Race and the Circuit 
    g.add((Race, FO['hasCircuit'], Circuit))
    
    # create the RDF node for season
    Season = URIRef(FO["season"+str(row['year'])])
    # add the edge connecting the Race and the Season 
    g.add((Race, FO['inSeason'], Season))

CPU times: total: 156 ms
Wall time: 382 ms


In [25]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'race.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 188 ms
Wall time: 372 ms


# Race (partecipations)

In [26]:
# Load the CSV files in memory
r_partecipations = pd.read_csv(resultsUrl, sep=',', index_col='resultId')
constructor_results = pd.read_csv(constructor_resultsUrl, sep=',', index_col="constructorResultsId")
# join is a dataframe containing the outer join of participation results (r_partecipations) 
# and driver results (driver_standings)
# raceId and driverId are used as join keys
join = r_partecipations.merge(constructor_results, how='left', on=['raceId','constructorId'], suffixes=('', 'Constructor')).fillna('\\N')
laps = pd.read_csv(lap_timesUrl, sep=',')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [27]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [28]:
# Function that transforms a time to standard %H:%M:%S.%f format, 
# adds the zeros and the missing colon to the beginning and end of the string.
def time_formatter(splitted_time):
    return "00:00:00"[0:8-len(splitted_time[0])] + splitted_time[0] + "." + splitted_time[1].ljust(3,"0")

# Function that transforms a gap time to standard %H:%M:%S.%f format, 
# adds the zeros and the missing colon to the beginning and end of the string.
def gap_formatter(gap: str):
    splitted_gap = gap.strip().split('.')
    if ((':' not in splitted_gap[0]) and (int(splitted_gap[0])>59)):
        delta = datetime.timedelta(seconds = int(splitted_gap[0]))
        h, mod = divmod(delta.seconds, 3600)
        m, s = divmod(mod, 60)
        #print("{:02d}:{:02d}:{:02d}.{:03d}".format(h, m, s, int(splitted_gap[1].ljust(3,"0"))))
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(h, m, s, int(splitted_gap[1].ljust(3,"0")))
    else:
        return time_formatter(splitted_gap)

# Function that calculates a driver's actual arrival time using the winner's arrival time and the time distance from it.
def time_converter(time_gap: str,race: str,pos: str):
    
    #FIRST_TIME RETRIEVAL
    #print("FIRST_TIME", end=" ")
    tmp = join[(join['raceId'] == int(race)) & (join['positionText'] == '1')]
    #print("Error") if (tmp.shape[0]!=1) else None
    first_splitted = str(tmp.iloc[0]['time']).strip().split('.')
    first_time = datetime.datetime.strptime(time_formatter(first_splitted), "%H:%M:%S.%f")
    
    #DELTA RETRIEVAL
    #print("DELTA", end=" ")
    splitted_time_gap = time_gap.strip().split('.')
    if ':' in (splitted_time_gap[0]):
        formatted_time_gap = time_formatter(splitted_time_gap)
        (h, m, s) = formatted_time_gap.split('.')[0].split(':')
    else:
        (h, m, s) = (0,0, splitted_time_gap[0])
    delta = datetime.timedelta(hours=int(h), minutes=int(m), seconds=int(s), milliseconds=int(splitted_time_gap[1]))
    
    '''
    print("NEW_TIME")
    if(int(race) < 22):
        print("POSITION-->",pos,"FIRST TIME-->",first_time, end=" ")
        print("DELTA-->",delta,"NEW TIME-->",((first_time + delta).strftime("%H:%M:%S.%f"))[:12])
    '''
    
    return first_time + delta

In [29]:
%%time 
#measure execution time

#iterate over the join dataframe
for index, row in join.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "r_partecipation" + the partecipation id as URI
    R_partecipation = URIRef(FO["r_partecipation"+str(index)])
    # Add triples using store's add() method.
    g.add((R_partecipation, RDF.type, FO.RacePartecipation))
    if(str(row['number']) != '\\N'):
        g.add((R_partecipation, FO['hasCarNumber'], Literal(row['number'], datatype=XSD.integer)))
    if(str(row['grid']) != '\\N'):
        g.add((R_partecipation, FO['hasStartingGridPosition'], Literal(int(row['grid']), datatype=XSD.integer)))
    if(str(row['position']) != '\\N'):
        g.add((R_partecipation, FO['hasPosition'], Literal(row['position'], datatype=XSD.integer)))
    if(str(row['positionText']) != '\\N'):
        g.add((R_partecipation, FO['hasPositionText'], Literal(row['positionText'], datatype=XSD.string)))
    if(str(row['positionOrder']) != '\\N'):
        g.add((R_partecipation, FO['hasPositionOrder'], Literal(int(row['positionOrder']), datatype=XSD.integer)))
    if(str(row['points']) != '\\N'):
        g.add((R_partecipation, FO['hasPoints'], Literal(int(row['points']), datatype=XSD.integer)))
    if(str(row['laps']) != '\\N'):
        g.add((R_partecipation, FO['hasLaps'], Literal(int(row['laps']), datatype=XSD.integer)))
    if((str(row['time']) != '\\N') and (str(row['time']) != '+1:10')):
        #print(row['raceId'],row['driverId'], end="   ")
        if row['time'][0] == "+":
            new_time = time_converter(str(row['time'])[1:], str(row['raceId']),str(row['positionText']))
            g.add((R_partecipation, FO['hasResultTime'], Literal(new_time.strftime("%H:%M:%S.%f")[:12], datatype=XSD.time)))
            g.add((R_partecipation, FO['hasResultGap'], Literal(gap_formatter(str(row['time'])[1:]), datatype=XSD.time)))
        else:
            splitted = str(row['time']).strip().split('.')
            g.add((R_partecipation, FO['hasResultTime'], Literal(time_formatter(splitted), datatype=XSD.time)))
            g.add((R_partecipation, FO['hasResultGap'], Literal("00:00:00.000", datatype=XSD.time)))
            #if(int(row['raceId']) < 22):
                #print("POSITION-->",str(row['positionText']),"FIRST TIME-->",str(row['time']))
    if(str(row['milliseconds']) != '\\N'):
        g.add((R_partecipation, FO['hasMillisecondsResultTime'], Literal(row['milliseconds'], datatype=XSD.integer)))
    if(str(row['fastestLap']) != '\\N'):
        # Get the rows of the laps dataframe with the raceId, driverId and lap values matching those in the current row.
        tmp = laps[(laps['raceId'] == row['raceId']) & (laps['driverId'] == row['driverId']) & (laps['lap'] == int(row['fastestLap']))]
        #iterate over the rows found
        for index2, row2 in tmp.iterrows():
            # create the RDF node for lap
            Lap = URIRef(FO["lap"+str(index2)])
            # add the edge connecting the R_partecipation and the Lap 
            g.add((R_partecipation, FO['hasFastestLap'], Lap))
    if(str(row['rank']) != '\\N'):
        g.add((R_partecipation, FO['hasFastestLapRank'], Literal(row['rank'], datatype=XSD.integer)))
    if(str(row['fastestLapTime']) != '\\N'):
        g.add((R_partecipation, FO['hasFastestLapTime'], Literal("00:00:00.000"[0:12-len(str(row['fastestLapTime']))] + 
                                                                 str(row['fastestLapTime']), datatype=XSD.time)))
    if(str(row['fastestLapSpeed']) != '\\N'):
        g.add((R_partecipation, FO['hasFastestLapSpeed'], Literal(float(row['fastestLapSpeed']), datatype=XSD.decimal)))
    if(str(row['pointsConstructor']) != '\\N'):
        g.add((R_partecipation, FO['hasConstructorPoints'], Literal(int(row['pointsConstructor']), datatype=XSD.integer)))
    
    # create the RDF node for driver
    Driver = URIRef(FO["driver"+str(row['driverId'])])
    # add the edge connecting the Partecipation and the Driver 
    g.add((R_partecipation, FO['hasDriver'], Driver))
    
    if(str(row['constructorId']) != '\\N'):
        # create the RDF node for constructor
        Constructor = URIRef(FO["constructor"+str(int(row['constructorId']))])
        # add the edge connecting the Partecipation and the Constructor 
        g.add((R_partecipation, FO['hasConstructor'], Constructor))
    
    # create the RDF node for race
    Race = URIRef(FO["race"+str(row['raceId'])])
    # add the edge connecting the Partecipation and the Race 
    g.add((R_partecipation, FO['partecipatedInRace'], Race))
    
    if(str(row['statusId']) != '\\N'):
        # create the RDF node for status
        Status = URIRef(FO["status"+str(int(row['statusId']))])
        # add the edge connecting the Partecipation and the Status 
        g.add((R_partecipation, FO['hasStatus'], Status))

CPU times: total: 32.7 s
Wall time: 44.3 s


In [30]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'race_partecipations.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 8.86 s
Wall time: 11 s


# Qualifying

In [31]:
# Load the CSV files in memory
q_partecipations = pd.read_csv(qualifyingUrl, sep=',', index_col='qualifyId')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [32]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [33]:
%%time 
#measure execution time

#iterate over the q_partecipations dataframe
for index, row in q_partecipations.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "q_partecipation" + the qualifying partecipation id as URI
    Q_partecipation = URIRef(FO["q_partecipation"+str(index)])
    # Add triples using store's add() method.
    g.add((Q_partecipation, RDF.type, FO.QualifPartecipation))
    g.add((Q_partecipation, FO['hasCarNumber'], Literal(row['number'], datatype=XSD.integer)))
    g.add((Q_partecipation, FO['hasPosition'], Literal(row['position'], datatype=XSD.integer)))
    if(str(row['q1']) != '\\N'):
        g.add((Q_partecipation, FO['hasQ1Time'], Literal("00:00:00.000"[0:12-len(str(row['q1']))] + 
                                                         str(row['q1']), datatype=XSD.time)))
    if(str(row['q2']) != '\\N'):
        g.add((Q_partecipation, FO['hasQ2Time'], Literal("00:00:00.000"[0:12-len(str(row['q2']))] + 
                                                         str(row['q2']), datatype=XSD.time)))
    if(str(row['q3']) != '\\N'):
        g.add((Q_partecipation, FO['hasQ3Time'], Literal("00:00:00.000"[0:12-len(str(row['q3']))] + 
                                                         str(row['q3']), datatype=XSD.time)))

    # create the RDF node for driver
    Driver = URIRef(FO["driver"+str(row['driverId'])])
    # add the edge connecting the qualifyingPartecipation and the Driver 
    g.add((Q_partecipation, FO['hasDriver'], Driver))
    
    # create the RDF node for constructor
    Constructor = URIRef(FO["constructor"+str(row['constructorId'])])
    # add the edge connecting the Partecipation and the Constructor 
    g.add((Q_partecipation, FO['hasConstructor'], Constructor))
    
    # create the RDF node for qualifying
    Qualifying = URIRef(FO["qualifying"+str(row['raceId'])])
    # Add triples using store's add() method.
    g.add((Qualifying, RDF.type, FO.Qualifying))
    # add the edge connecting the qualifyingPartecipation and the Qualifying 
    g.add((Q_partecipation, FO['partecipatedInQualif'], Qualifying))
    
    # create the RDF node for race
    Race = URIRef(FO["race"+str(row['raceId'])])
    # add the edge connecting the Race and the Qualifying 
    g.add((Race, FO['hasA'], Qualifying))
    
    # Qualifying starting dates and times are stored in the races dataframe,
    # then we retrieve them using the matching raceId
    Q_date_time = races[races.index == row['raceId']]
    if(str(Q_date_time['quali_date'].values[0]) != "\\N"):
        g.add((Qualifying, FO['hasDate'], Literal(Q_date_time['quali_date'].values[0], datatype=XSD.date)))
    if(str(Q_date_time['quali_time'].values[0]) != "\\N"):
        g.add((Qualifying, FO['hasTime'], Literal(Q_date_time['quali_time'].values[0], datatype=XSD.time)))

CPU times: total: 4.94 s
Wall time: 5.77 s


In [34]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'qualifying.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 1.8 s
Wall time: 2.55 s


# Sprint

In [35]:
# Load the CSV files in memory
s_partecipations = pd.read_csv(sprint_resultsUrl, sep=',', index_col='resultId')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [36]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [37]:
%%time 
#measure execution time

#iterate over the s_partecipations dataframe
for index, row in s_partecipations.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "s_partecipation" + the sprint partecipation id as URI
    S_partecipation = URIRef(FO["s_partecipation"+str(index)])
    # Add triples using store's add() method.
    g.add((S_partecipation, RDF.type, FO.SprintPartecipation))
    g.add((S_partecipation, FO['hasCarNumber'], Literal(row['number'], datatype=XSD.integer)))
    g.add((S_partecipation, FO['hasStartingGridPosition'], Literal(row['grid'], datatype=XSD.integer)))
    if(str(row['position']) != '\\N'):
        g.add((S_partecipation, FO['hasPosition'], Literal(row['position'], datatype=XSD.integer)))
    g.add((S_partecipation, FO['hasPositionText'], Literal(row['positionText'], datatype=XSD.string)))
    g.add((S_partecipation, FO['hasPositionOrder'], Literal(row['positionOrder'], datatype=XSD.integer)))
    g.add((S_partecipation, FO['hasPoints'], Literal(row['points'], datatype=XSD.integer)))
    g.add((S_partecipation, FO['hasLaps'], Literal(row['laps'], datatype=XSD.integer)))
    if(str(row['time']) != '\\N'):
        if row['time'][0] == "+":
            new_time = time_converter(str(row['time'])[1:], str(row['raceId']),str(row['positionText']))
            g.add((S_partecipation, FO['hasResultTime'], Literal(new_time.strftime("%H:%M:%S.%f")[:12], datatype=XSD.time)))
            g.add((S_partecipation, FO['hasResultGap'], Literal(gap_formatter(str(row['time'])[1:]), datatype=XSD.time)))
        else:
            splitted = str(row['time']).strip().split('.')
            g.add((S_partecipation, FO['hasResultTime'], Literal(time_formatter(splitted), datatype=XSD.time)))
            g.add((S_partecipation, FO['hasResultGap'], Literal("00:00:00.000", datatype=XSD.time)))
    if(str(row['milliseconds']) != '\\N'):
        g.add((S_partecipation, FO['hasMillisecondsResultTime'], Literal(row['milliseconds'], datatype=XSD.integer)))
    if(str(row['fastestLap']) != '\\N'):
        g.add((S_partecipation, FO['hasFastestLap'], Literal(row['fastestLap'], datatype=XSD.integer)))
    if(str(row['fastestLapTime']) != '\\N'):
        g.add((S_partecipation, FO['hasFastestLapTime'], Literal("00:00:00.000"[0:12-len(str(row['fastestLapTime']))] + 
                                                                 str(row['fastestLapTime']), datatype=XSD.time)))
    
    # create the RDF node for driver
    Driver = URIRef(FO["driver"+str(row['driverId'])])
    # add the edge connecting the sprintPartecipation and the Driver
    g.add((S_partecipation, FO['hasDriver'], Driver))
    
    # create the RDF node for constructor
    Constructor = URIRef(FO["constructor"+str(row['constructorId'])])
    # add the edge connecting the sprintPartecipation and the Constructor
    g.add((S_partecipation, FO['hasConstructor'], Constructor))
    
    # create the RDF node for sprint
    Sprint = URIRef(FO["sprint"+str(row['raceId'])])
    # Add triples using store's add() method.
    g.add((Sprint, RDF.type, FO.Sprint))
    # add the edge connecting the sprintPartecipation and the Sprint 
    g.add((S_partecipation, FO['partecipatedInSprint'], Sprint))
    
    # create the RDF node for race
    Race = URIRef(FO["race"+str(row['raceId'])])
    # add the edge connecting the Race and the Sprint 
    g.add((Race, FO['hasA'], Sprint))
    
    # create the RDF node for status
    Status = URIRef(FO["status"+str(row['statusId'])])
    # add the edge connecting the sprintPartecipation and the Sprint 
    g.add((S_partecipation, FO['hasStatus'], Status))
    
    # Sprint starting dates and times are stored in the races dataframe,
    # then we retrieve them using the matching raceId
    S_date_time = races[races.index == row['raceId']]
    if(str(S_date_time['sprint_date'].values[0]) != "\\N"):
        g.add((Sprint, FO['hasDate'], Literal(S_date_time['sprint_date'].values[0], datatype=XSD.date)))
    if(str(S_date_time['sprint_time'].values[0]) != "\\N"):
        g.add((Sprint, FO['hasTime'], Literal(S_date_time['sprint_time'].values[0], datatype=XSD.time)))

CPU times: total: 391 ms
Wall time: 525 ms


In [38]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'sprint.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 93.8 ms
Wall time: 129 ms


# Standings

In [39]:
# Load the CSV files in memory
driver_standings = pd.read_csv(driver_standingsUrl, sep=',', index_col="driverStandingsId")
constructor_standings = pd.read_csv(constructor_standingsUrl, sep=',', index_col="constructorStandingsId")
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [40]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [41]:
%%time 
#measure execution time

#iterate over the driver_standings dataframe
for index, row in driver_standings.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "standing" + the standing id as URI
    Standing = URIRef(FO["d_standing"+str(index)])
    # Add triples using store's add() method.
    g.add((Standing, RDF.type, FO.DriverStanding))
    g.add((Standing, FO['hasTotalPoints'], Literal(int(row['points']), datatype=XSD.integer)))
    g.add((Standing, FO['hasTotalPosition'], Literal(int(row['position']), datatype=XSD.integer)))
    g.add((Standing, FO['hasTotalPositionText'], Literal(row['positionText'], datatype=XSD.string)))
    g.add((Standing, FO['hasDriversWins'], Literal(int(row['wins']), datatype=XSD.integer)))

    # create the RDF node for driver
    Driver = URIRef(FO["driver"+str(row['driverId'])])
    # add the edge connecting the Standing and the Driver 
    g.add((Standing, FO['hasDriver'], Driver))
    
    # create the RDF node for race
    Race = URIRef(FO["race"+str(row['raceId'])])
    # add the edge connecting the Standing and the Race 
    g.add((Standing, FO['hasRace'], Race))
    
#iterate over the constructor_standings dataframe
for index, row in constructor_standings.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "standing" + the standing id as URI
    Standing = URIRef(FO["c_standing"+str(index)])
    # Add triples using store's add() method.
    g.add((Standing, RDF.type, FO.ConstructorStanding))
    g.add((Standing, FO['hasTotalPoints'], Literal(int(row['points']), datatype=XSD.integer)))
    g.add((Standing, FO['hasTotalPosition'], Literal(int(row['position']), datatype=XSD.integer)))
    g.add((Standing, FO['hasTotalPositionText'], Literal(row['positionText'], datatype=XSD.string)))
    g.add((Standing, FO['hasWins'], Literal(int(row['wins']), datatype=XSD.integer)))

    # create the RDF node for constructor
    Constructor = URIRef(FO["constructor"+str(row['constructorId'])])
    # add the edge connecting the Standing and the Constructor
    g.add((Standing, FO['hasConstructor'], Constructor))
    
    # create the RDF node for race
    Race = URIRef(FO["race"+str(row['raceId'])])
    # add the edge connecting the Standing and the Race 
    g.add((Standing, FO['hasRace'], Race))

CPU times: total: 8.67 s
Wall time: 10.5 s


In [42]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'standings.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 7.88 s
Wall time: 13.3 s


# Laps

In [43]:
# Load the CSV files in memory
# Csv already uploaded in Race partecipation section

In [44]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [45]:
%%time 
#measure execution time

#iterate over the laps dataframe
for index, row in laps.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "lap" + the lap id as URI
    Lap = URIRef(FO["lap"+str(index)])
    # Add triples using store's add() method.
    g.add((Lap, RDF.type, FO.Lap))
    g.add((Lap, FO['hasLapNumber'], Literal(row['lap'], datatype=XSD.integer)))
    g.add((Lap, FO['hasLapPosition'], Literal(row['position'], datatype=XSD.integer)))
    g.add((Lap, FO['hasLapTime'], Literal("00:00:00.000"[0:12-len(str(row['time']))] + 
                                          str(row['time']), datatype=XSD.time)))
    g.add((Lap, FO['hasMillisecondsTime'], Literal(row['milliseconds'], datatype=XSD.integer)))
    
    # Get the rows of the join dataframe with the raceId and driverId values matching those in the current row.
    tmp = join[(join['raceId'] == row['raceId']) & (join['driverId'] == row['driverId'])]
    #iterate over the rows found
    for index2, row2 in tmp.iterrows():
        # create the RDF node for racePartecipation
        R_partecipation = URIRef(FO["r_partecipation"+str(index2)])
        # add the edge connecting the racePartecipation and the Lap 
        g.add((R_partecipation, FO['hasLap'], Lap))

CPU times: total: 6min 16s
Wall time: 7min 32s


In [46]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'laps.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 1min 31s
Wall time: 1min 53s


# Pit stop

In [47]:
# Load the CSV files in memory
pit_stops = pd.read_csv(pit_stopsUrl, sep=',')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [48]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [49]:
%%time 
#measure execution time

#iterate over the pit_stops dataframe
for index, row in pit_stops.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "stop" + the movie id as URI
    PitStop = URIRef(FO["stop"+str(index)])
    # Add triples using store's add() method.
    g.add((PitStop, RDF.type, FO.PitStop))
    g.add((PitStop, FO['hasStopNumber'], Literal(row['stop'], datatype=XSD.integer)))
    g.add((PitStop, FO['hasPitStopTimeOfDay'], Literal(str(row['time']), datatype=XSD.time)))
    g.add((PitStop, FO['hasMillisecondsTime'], Literal(row['milliseconds'], datatype=XSD.integer)))
    g.add((PitStop, FO['hasDuration'], Literal("00:00:00.000"[0:12-len(str(row['duration']))] + 
                                               str(row['duration']), datatype=XSD.time)))
    
    # Get the rows of the laps dataframe with the raceId, driverId and lap values matching those in the current row.
    tmp = laps[(laps['raceId'] == row['raceId']) & (laps['driverId'] == row['driverId']) & (laps['lap'] == row['lap'])]
    #iterate over the rows found
    for index2, row2 in tmp.iterrows():
        # create the RDF node for lap
        Lap = URIRef(FO["lap"+str(index2)])
        # add the edge connecting the PitStop and the Lap 
        g.add((PitStop, FO['hasPitStopLap'], Lap))
        
    # Get the rows of the join dataframe with the raceId and driverId values matching those in the current row.
    tmp2 = join[(join['raceId'] == row['raceId']) & (join['driverId'] == row['driverId'])]
    #iterate over the rows found
    for index2, row2 in tmp2.iterrows():
        # create the RDF node for racePartecipation
        R_partecipation = URIRef(FO["r_partecipation"+str(index2)])
        # add the edge connecting the racePartecipation and the PitStop 
        g.add((R_partecipation, FO['hasPitStop'], PitStop))

CPU times: total: 27.3 s
Wall time: 35 s


In [50]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'stops.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 2.08 s
Wall time: 3.53 s


# Ratings

In [51]:
# Load the CSV files in memory
ratings = pd.read_csv(ratingsUrl, sep=',')
# cast year to int. If type(year) = str --> Literal= year-01-01
# movies.astype({'year': 'int32'}).dtypes

In [52]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("fo", FO)

In [53]:
%%time 
#measure execution time

#iterate over the ratings dataframe
for index, row in ratings.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + "rating" + the movie id as URI
    Rating = URIRef(FO["rating"+str(index)])
    # Add triples using store's add() method.
    g.add((Rating, RDF.type, FO.Rating))
    g.add((Rating, FO['hasPeriod'], Literal(row['Period'], datatype=XSD.date)))
    g.add((Rating, FO['hasRating'], Literal(row['Rating'], datatype=XSD.integer)))
    g.add((Rating, FO['hasExperience'], Literal(row['Experience'], datatype=XSD.integer)))
    g.add((Rating, FO['hasRaceCraft'], Literal(row['Race Craft'], datatype=XSD.integer)))
    g.add((Rating, FO['hasAwareness'], Literal(row['Awareness'], datatype=XSD.integer)))
    g.add((Rating, FO['hasPace'], Literal(row['Pace'], datatype=XSD.integer)))
    g.add((Rating, FO['hasContractCost'], Literal(row['Contract Cost'], datatype=XSD.long)))
    g.add((Rating, FO['hasSalary'], Literal(row['Salary'], datatype=XSD.long)))
    g.add((Rating, FO['hasBuyout'], Literal(row['Buyout'], datatype=XSD.long)))
    
    # create the RDF node for season
    Season = URIRef(FO["season"+str(row['Year'])])
    # add the edge connecting the Rating and the Season 
    g.add((Rating, FO['inSeason'], Season))
    
    # in ratings csv names are full-names, then they are splitted in forename and surname
    name = str(row['Driver']).split(' ')
    forename = name[0].strip()
    surname = name[1].strip()
    # Get the rows of the drivers dataframe with the forename value (rating csv) contained in forename column of the current row (driver csv).
    subCsv = drivers[drivers['forename'].str.contains(forename, case=False)]
    # If exists at least one row in subCsv that contain also the surname of the rating
    if((subCsv['surname'].str.contains(surname, case=False)).any() == True):
        # Get the rows of the drivers dataframe with the forename and surname values contained in the matching column of the current row.
        dId = drivers[(drivers['forename'].str.contains(forename, case=False)) & (drivers['surname'].str.contains(surname, case=False))].index.values[0]
        # create the RDF node for driver
        Driver = URIRef(FO["driver"+str(dId)])
        # add the edge connecting the Rating and the Driver 
        g.add((Rating, FO['hasDriver'], Driver))

CPU times: total: 328 ms
Wall time: 448 ms


In [54]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'ratings.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 96.4 ms
