## Populate FASTianF1 RDF database

This notebook walks through the main steps to download the CSV files, process them, and create an RDF dataset from them according to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [None]:
# required libraries
import pandas as pd
import os
from pathlib import Path

In [None]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [None]:
# CHECK DATE 
import datetime

# Paths

In [None]:
# Project root: the parent directory of the notebook's current working directory.
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

# Kaggle CSV exports, one file per table.
circuitsUrl = f'{path}/data/kaggle/circuits.csv'
constructorsUrl = f'{path}/data/kaggle/constructors.csv'
constructor_resultsUrl = f'{path}/data/kaggle/constructor_results.csv'
constructor_standingsUrl = f'{path}/data/kaggle/constructor_standings.csv'
driversUrl = f'{path}/data/kaggle/drivers.csv'
driver_standingsUrl = f'{path}/data/kaggle/driver_standings.csv'
lap_timesUrl = f'{path}/data/kaggle/lap_times.csv'
pit_stopsUrl = f'{path}/data/kaggle/pit_stops.csv'
qualifyingUrl = f'{path}/data/kaggle/qualifying.csv'
racesUrl = f'{path}/data/kaggle/races.csv'
resultsUrl = f'{path}/data/kaggle/results.csv'
sprint_resultsUrl = f'{path}/data/kaggle/sprint_results.csv'

# ISO country-code lookup table.
countriesURL = f'{path}/data/countryCodes/wikipedia-iso-country-codes.csv'

# Output folder for the RDF serialisations.
savePath = f'{path}/data/rdf/'

# Construction of the graph

In [None]:
# Namespaces that rdflib does not ship: the countries vocabulary and the
# FASTianF1 ontology.
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
FO = Namespace("http://www.dei.unipd.it/database2/FASTianF1ontology#")

# The graph that the cells below add triples to.
g = Graph()

# Register a short prefix for every namespace so the output is more readable.
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("fo", FO),
):
    g.bind(prefix, namespace)

# Countries

In [None]:
# Load the country-code table, indexed by country name.
# pandas' default NA parsing is disabled because it would turn the string "NA"
# into NaN (a problem for Namibia); only "_" is treated as a missing value.
countries = pd.read_csv(
    countriesURL,
    sep=',',
    index_col='Name',
    keep_default_na=False,
    na_values=['_'],
)

# Drivers

In [None]:
# Read the drivers table into memory; the driverId column becomes the row index.
drivers = pd.read_csv(
    driversUrl,
    index_col='driverId',
    sep=',',
)

In [None]:
%%time 
#measure execution time

#iterate over the movies dataframe
for index, row in drivers.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the movie id as URI
    Driver = URIRef(FO[index])
    # Add triples using store's add() method.
    g.add((Driver, RDF.type, FO.Driver))
    g.add((Driver, FO['hasDriverRef'], Literal(row['driverRef'], datatype=XSD.string)))
    g.add((Driver, FO['hasNumber'], Literal(row['number'], datatype=XSD.integer)))
    g.add((Driver, FO['hasCode'], Literal(row['code'], datatype=XSD.string)))
    g.add((Driver, FO['hasForename'], Literal(row['forename'], datatype=XSD.string)))
    g.add((Driver, FO['hasSurname'], Literal(row['surname'], datatype=XSD.string)))
    g.add((Driver, FO['hasDOB'], Literal(row['dob'], datatype=XSD.string)))
    g.add((Driver, FO['hasURL'], Literal(row['url'], datatype=XSD.string)))
    g.add((Driver, FO['hasNationality'], Literal(row['nationality'], datatype=XSD.string)))
   
    try:
        datetime.datetime.strptime(str(row['dob']), '%Y-%m-%d')
        g.add((Driver, FO['hasDOB'], Literal(row['dob'], datatype=XSD.date)))
    except ValueError:
        # probably it's the year alone
        # check length
        if (len(row['dob'])==4):
            #it is a year
            g.add((Driver, FO['hasDOB'], Literal(row['dob']+"-01-01", datatype=XSD.date)))

    ## handle country
    #there can be more than one country per movie
    for c in str(row['nationality']).split('-'):
        cName = c.strip()
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology 
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
            # add the edge connecting the Movie and the Country 
            g.add((Movie, MO['hasNationality'], Country))