# Populating cricket ontology

In [1]:
!pip install rdflib
!pip install pyyaml

Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


In [2]:
# required libraries
import pandas as pd
import yaml
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD, SKOS
# CHECK DATE
import datetime

In [3]:
# saving folder
path = str(os.getcwd())
savePath =  path + '/data/rdf/'

In [4]:
# Construct the cricket ontology not known by RDFlib
CRI = Namespace("https://www.dei.unipd.it/lodb/cri/")

In [5]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("skos", SKOS)
g.bind("cri", CRI)

In [6]:
# path joining version for other paths
DIR = path + '/data/matches/'
print(len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))]))

1232


## Outcome

In [7]:
%%time
#measure execution time

# assign directory
matchDir = path + '/data/matches/'

# iterate over files in that directory
for filename in os.listdir(matchDir):
  # Load the YAML files in memory
  with open(os.path.join(matchDir, filename), 'r') as file:
    matchData = yaml.safe_load(file)
    #print(matchData['info']['outcome'])
    # Get the filename without the extension
    filename_without_extension = os.path.splitext(os.path.basename(filename))[0]

    # Create the node to add to the Graph
    # Creating an instance of a Match
    # the node has the namespace + the match id(filename) as URI
    idM = "match"+filename_without_extension
    Match = URIRef(CRI[idM])
    # Add triples using store's add() method.
    g.add((Match, RDF.type, CRI.Match))

    # Creating an instance of an Outcome
    # the node has the namespace + the outcome id(filename) as URI
    idO = "outcome"+filename_without_extension
    Outcome = URIRef(CRI[idO])
    # Add triples using store's add() method.
    #g.add((Outcome, RDF.type, CRI.Outcome))
    g.add((Match, CRI['hasOutcome'], Outcome))

    # Creating 2 instances of a Team
    # the nodes have the namespace + the team name as URI
    """idT0 = "team"+(matchData['info']['teams'][0]).replace(" ","")
    idT1 = "team"+(matchData['info']['teams'][1]).replace(" ","")
    Team0 = URIRef(CRI[idT0])
    Team1 = URIRef(CRI[idT1])
    # Add triples using store's add() method.
    g.add((Team0, RDF.type, CRI.Team))
    g.add((Team1, RDF.type, CRI.Team))"""

    # Checking if match has been won or not
    if "by" in matchData['info']['outcome']:
        # Creating an instance of a Win
        # the node has the namespace + the win id(filename) as URI
        #idW = "win"+filename_without_extension
        #Win = URIRef(CRI[idW])
        # Add triples stating that is a win
        #g.add((Win, RDF.type, CRI.Win))
        g.add((Outcome, CRI['outcomeDescription'], Literal("win", datatype=XSD.string)))
        # Adding triple that states which Team is the winner
        idWinner = "team"+(matchData['info']['outcome']['winner']).replace(" ","")
        Winner = URIRef(CRI[idWinner])
        g.add((Outcome, CRI['wonBy'], Winner))

        if "runs" in matchData['info']['outcome']['by']:
          # Creating an instance of a WinByRuns
          # the node has the namespace + the win id(filename) as URI
          #idRuns = "winByRuns"+filename_without_extension
          #WinByRuns = URIRef(CRI[idRuns])
          # Add triples using store's add() method.
          #g.add((WinByRuns, RDF.type, CRI.WinByRuns))
          g.add((Outcome, RDF.type, CRI.WinByRuns))
          #g.add((Outcome, CRI['wonBy'], Winner))
          g.add((Outcome, CRI['runs'], Literal(matchData['info']['outcome']['by']['runs'], datatype=XSD.integer)))
          #g.add((Match, CRI['hasOutcome'], WinByRuns))
        elif "wickets" in matchData['info']['outcome']['by']:
          # Creating an instance of a WinByRuns
          # the node has the namespace + the win id(filename) as URI
          #idWickets = "winByWickets"+filename_without_extension
          #WinByWickets = URIRef(CRI[idWickets])
          # Add triples using store's add() method.
          g.add((Outcome, RDF.type, CRI.WinByWickets))
          g.add((Outcome, CRI['wickets'], Literal(matchData['info']['outcome']['by']['wickets'], datatype=XSD.integer)))
          #g.add((Match, CRI['hasOutcome'], WinByWickets))

    elif "result" in matchData['info']['outcome']:
      if matchData['info']['outcome']['result'] == "tie":
        # Creating an instance of a Tie
        # the node has the namespace + the Tie id(filename) as URI
        #idTie = "tie"+filename_without_extension
        #Tie = URIRef(CRI[idTie])
        #g.add((Tie, RDF.type, CRI.Tie))
        g.add((Outcome, RDF.type, CRI.Tie))
        #g.add((Match, CRI['hasOutcome'], Tie))
        g.add((Outcome, CRI['outcomeDescription'], Literal(matchData['info']['outcome']['result'], datatype=XSD.string)))
        if "eliminator" in matchData['info']['outcome']:
          idEliminator = "team"+(matchData['info']['outcome']['eliminator']).replace(" ","")
          Eliminator = URIRef(CRI[idEliminator])
          #g.add((Tie, CRI['hasTieBreakWinner'], Eliminator))
          g.add((Outcome, CRI['hasTieBreakWinner'], Eliminator))
        elif "bowl_out" in matchData['info']['outcome']:
          idBowlOut = "team"+(matchData['info']['outcome']['bowl_out']).replace(" ","")
          BowlOut = URIRef(CRI[idBowlOut])
          #g.add((Tie, CRI['hasTieBreakWinner'], BowlOut))
          g.add((Outcome, CRI['hasTieBreakWinner'], BowlOut))
      elif matchData['info']['outcome']['result'] == "no result":
        g.add((Match, CRI['hasOutcome'], Outcome))
        g.add((Outcome, CRI['outcomeDescription'], Literal(matchData['info']['outcome']['result'], datatype=XSD.string)))


CPU times: user 6min 4s, sys: 1.28 s, total: 6min 5s
Wall time: 6min 15s


In [8]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
# may have to change to match
with open(savePath + 'newoutcomes.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))
    #.decode("utf-8")


--- saving serialization ---
CPU times: user 450 ms, sys: 994 µs, total: 451 ms
Wall time: 456 ms
