# Football Data

## Structure of the graph created

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<player_uri> 
    a schema:Person ;
    schema:name "Player Name" ;
    schema:nationality <country_uri> ;
    ex:position <position_uri>;
    schema:memberOf <team_uri> ;
    ex:hasTenure <tenure_uri> . # Multiple tenures

<position_uri> 
    a ex:PlayerPosition ;
    schema:name "Position Name" .

<country_uri> 
    a schema:Country ;
    schema:name "Country Name" .

<team_uri> 
    a schema:SportsTeam ;
    schema:name "Team Name" ;
    schema:location <country_uri> ;
    schema:partOf <league_uri> .

<tenure_uri> 
    a schema:Role ;
    ex:atTeam <team_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:endDate "YYYY-MM-DD"^^xsd:date. # Optional

<league_uri> 
    a schema:SportsOrganization ;
    schema:name "League Name" .

<tournament_uri> 
    a schema:SportsSeason ;
    schema:name "Tournament Name" ;
    schema:numParticipants "X"^^xsd:integer ;
    schema:location <country_uri> ;
    schema:winner <team_uri> ;
    schema:partOf <league_uri> ;
    schema:organizer <organizer_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:endDate "YYYY-MM-DD"^^xsd:date ;
    schema:participant <team_uri> . # Multiple participants

<organizer_uri> 
    a schema:Organization ;
    schema:name "Organizer Name" .

<match_uri> 
    a ex:Match ;
    schema:name "Match Name" ;
    schema:organizer <organizer_uri> ;
    schema:location <country_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:winner <team_uri> ;
    schema:partOf <tournament_uri> ;
    schema:participant <team_uri> ; # Multiple participants
    schema:participant <player_uri> . # Multiple participants

<goals_uri> 
    a ex:Score ;
    ex:player <player_uri> ;
    ex:goals "X"^^xsd:integer ;
    ex:match <match_uri> .
```

In [1]:
import requests
from rdflib import Graph, Literal, Namespace, RDF, URIRef, BNode
from rdflib.namespace import FOAF, XSD
import re
import hashlib

def fetch_query(query, timeout=120):
    """Send a SPARQL query to the Wikidata query endpoint and return the decoded JSON.

    Parameters
    ----------
    query : str
        Complete SPARQL query text.
    timeout : float, optional
        Seconds to wait for the endpoint before the request is abandoned, so a
        stalled connection cannot hang a long-running loop forever.

    Returns
    -------
    dict
        The parsed JSON response; callers read the rows from
        ``data["results"]["bindings"]``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status. Without this check an
        error page would only surface later as an opaque JSON decoding error.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    url = 'https://query.wikidata.org/sparql'
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        params={'query': query, 'format': 'json'},
        timeout=timeout,
    )
    # Fail loudly on rate limiting / server errors instead of trying to parse them.
    response.raise_for_status()
    return response.json()

def assert_iso8601_zulu(string):
    """Check that ``string`` is exactly a ``YYYY-MM-DDThh:mm:ssZ`` timestamp.

    Despite the name nothing is raised: the function returns the ``re.Match``
    object (truthy) when the whole string conforms and ``None`` otherwise, so
    it is meant to be used in a boolean context.

    ``re.fullmatch`` is used so that trailing characters after the final ``Z``
    are rejected; ``re.match`` would only anchor the start of the string.
    """
    pattern = r"\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T(0\d|1\d|2[0-3]):[0-5]\d:[0-5]\dZ"
    return re.fullmatch(pattern, string)


In [2]:
# Namespaces used to build every term added to the graph.
WD = Namespace("http://www.wikidata.org/entity/")
SCHEMA = Namespace("http://schema.org/")
EX = Namespace("http://example.org/")

# `g` receives all triples produced by this notebook; the prefixes are bound
# on it so they are available when the graph is serialized.
g = Graph()
for prefix, namespace in (("wd", WD), ("schema", SCHEMA), ("foaf", FOAF), ("ex", EX)):
    g.bind(prefix, namespace)

# `prev_g` is meant to hold an earlier export (football_data.ttl); the load is
# currently commented out, so it stays empty.
prev_g = Graph()
#prev_g.parse("football_data.ttl", format="turtle");

## Players
```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

# Definition of a player
<player_uri> a schema:Person ;
    schema:name "Player Name" ;
    schema:nationality <country_uri> ;
    ex:position <position_uri> .

# Definition of the player's position
<position_uri> a ex:PlayerPosition ;
    schema:name "Position Name" .

# Definition of the player's nationality
<country_uri> a schema:Country ;
    schema:name "Country Name" .
```

In [3]:
# Selects players together with their nationality and position (plus labels).
# A LIMIT/OFFSET clause is appended per page in the loop below.
player_query = """
SELECT ?player ?playerLabel ?nationality ?nationalityLabel ?position ?positionLabel WHERE {
    ?player wdt:P106 wd:Q937857; # Instance of soccer player
            wdt:P27 ?nationality; # Nationality
            wdt:P21 wd:Q6581097;
            wdt:P413 ?position.
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
player_uris = set()
for i in range(20):
    # Request page i: 100 rows starting at offset i*100.
    # NOTE(review): the query has no ORDER BY, so page contents may not be
    # stable between requests — confirm this is acceptable.
    player_results = fetch_query(f"{player_query}LIMIT 100 OFFSET {i * 100}")

    for row in player_results["results"]["bindings"]:
        player_iri = row["player"]["value"]
        player_uris.add(player_iri)

        player_uri = URIRef(player_iri)
        nationality_uri = URIRef(row["nationality"]["value"])
        position_uri = URIRef(row["position"]["value"])

        # Player, then the position and country resources it points to.
        for triple in (
            (player_uri, RDF.type, SCHEMA.Person),
            (player_uri, SCHEMA.name, Literal(row["playerLabel"]["value"])),
            (player_uri, SCHEMA.nationality, nationality_uri),
            (player_uri, EX.position, position_uri),
            (position_uri, RDF.type, EX.PlayerPosition),
            (position_uri, SCHEMA.name, Literal(row["positionLabel"]["value"])),
            (nationality_uri, RDF.type, SCHEMA.Country),
            (nationality_uri, SCHEMA.name, Literal(row["nationalityLabel"]["value"])),
        ):
            g.add(triple)

    # "\r" keeps the progress report on a single, continuously updated line.
    print(f"{i} - {len(g)} triples. {len(player_uris)} players.".ljust(50), end="\r")

19 - 4720 triples. 1059 players.                  

In [4]:
# Drop players that the previously exported graph already types as schema:Person
known_players = {str(person) for person in prev_g.subjects(RDF.type, SCHEMA.Person)}
player_uris = player_uris - known_players

print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)}")

Number of triples: 4720 
Number of players: 1059


## Teams

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

<team_uri> a schema:SportsTeam ;
    schema:name "Team Name" .

<player_uri> schema:memberOf <team_uri> ;
             ex:hasTenure <tenure_uri> . # Multiple tenures

<tenure_uri> a schema:Role ;
    ex:atTeam <team_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:endDate "YYYY-MM-DD"^^xsd:date. # Optional
```

In [5]:
# SPARQL template listing a player's P54 statements: the team, its label, the
# required start-time qualifier (P580) and the optional end-time qualifier (P582).
# `%s` is filled with the player's IRI wrapped in angle brackets.
team_query_template = """
SELECT ?team ?teamLabel ?startTime ?endTime WHERE {
  BIND(%s AS ?player)
  ?player p:P54 ?playerTeamStatement.
  ?playerTeamStatement ps:P54 ?team;
                    pq:P580 ?startTime;
OPTIONAL { ?playerTeamStatement pq:P582 ?endTime. }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

def generate_tenure_uri(player_uri, team_uri, start_date):
    """Build a deterministic URI for one player-at-team tenure.

    The three arguments are joined with ``-`` and hashed with SHA-256, which
    yields a fixed-length, URI-safe identifier: identical inputs always map to
    the same ``http://example.org/tenure/<hex digest>`` string.
    """
    tenure_key = "{}-{}-{}".format(player_uri, team_uri, start_date)
    digest = hashlib.sha256(tenure_key.encode()).hexdigest()
    return "http://example.org/tenure/" + digest

# For every player: fetch the teams played for and record team, membership and
# one tenure node (team + start date, optional end date) per result row.
team_uris = set()
player_uris = list(player_uris)
for i in range(0, len(player_uris)):
    player_uri = player_uris[i]
    player_ref = URIRef(player_uri)
    team_query = team_query_template % f"<{player_uri}>"
    team_results = fetch_query(team_query)
    # Process team_results and add data to the knowledge graph
    for result in team_results["results"]["bindings"]:
        team_uri = URIRef(result["team"]["value"])
        team_uris.add(result["team"]["value"])
        team_name = Literal(result["teamLabel"]["value"])
        # The graph schema stores "YYYY-MM-DD"^^xsd:date, while the values handled
        # here are full timestamps (the format assert_iso8601_zulu checks), so only
        # the part before "T" is kept. Values without a "T" pass through unchanged.
        start_time = Literal(result["startTime"]["value"].split("T", 1)[0], datatype=XSD.date)
        end_time = result.get("endTime", {}).get("value")  # Handle potential missing end time

        # Adding team information
        g.add((team_uri, RDF.type, SCHEMA.SportsTeam))
        g.add((team_uri, SCHEMA.name, team_name))

        g.add((player_ref, SCHEMA.memberOf, team_uri))

        # The tenure URI is derived from player, team and start date, so repeated
        # rows for the same tenure collapse onto one node.
        tenure_uri = generate_tenure_uri(player_uri, team_uri, start_time)
        tenure_ref = URIRef(tenure_uri)

        g.add((tenure_ref, RDF.type, SCHEMA.Role))
        g.add((tenure_ref, EX.atTeam, team_uri))
        g.add((player_ref, EX.hasTenure, tenure_ref))
        g.add((tenure_ref, SCHEMA.startDate, start_time))
        if end_time and assert_iso8601_zulu(end_time):  # If there's a well-formed end date
            end_time = Literal(end_time.split("T", 1)[0], datatype=XSD.date)
            g.add((tenure_ref, SCHEMA.endDate, end_time))

    print(f"{len(g)} triples. {len(team_uris)} teams. {100*(i+1)/len(player_uris):.2f}%".ljust(50), end="\r")
# Back to a set: the list form was only needed for indexed progress reporting.
player_uris = set(player_uris)

59032 triples. 2378 teams. 100.00%                

In [6]:
# Drop teams that the previously exported graph already types as schema:SportsTeam
known_teams = {str(known_team) for known_team in prev_g.subjects(RDF.type, SCHEMA.SportsTeam)}
team_uris = team_uris - known_teams

print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)} \nNumber of teams: {len(team_uris)}")

Number of triples: 59032 
Number of players: 1059 
Number of teams: 2378


#### Team details

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

# Team information expanded with country and league
<team_uri> a schema:SportsTeam ;
    schema:location <country_uri> ;
    schema:partOf <league_uri> .

# Definition of the country the team is based in
<country_uri> a schema:Country ;
    schema:name "Country Name" .

# Definition of the league the team competes in
<league_uri> a schema:SportsOrganization ;
    schema:name "League Name" .
```

In [7]:
# SPARQL template returning the country (P17) and league (P118) of one team.
# Both properties are required, so a team lacking either yields no rows.
# The literal text <TEAM_URI> is replaced with the team's IRI in angle brackets.
team_detail_query_template = """
SELECT ?teamLabel ?country ?league ?leagueLabel WHERE {
  BIND(<TEAM_URI> AS ?team)
  ?team wdt:P17 ?country; # Country the team is based in
        wdt:P118 ?league. # League the team competes in
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

def execute_team_detail_query(team_uri):
    """Run the team detail query for ``team_uri`` and return the raw JSON result.

    The ``<TEAM_URI>`` placeholder of ``team_detail_query_template`` is replaced
    with the team IRI wrapped in angle brackets before the query is sent.
    """
    team_term = "<{}>".format(team_uri)
    return fetch_query(team_detail_query_template.replace("<TEAM_URI>", team_term))

# Attach country and league information to every collected team.
team_uris = list(team_uris)
for i, team_uri in enumerate(team_uris):
    team_detail_results = execute_team_detail_query(team_uri)

    for result in team_detail_results["results"]["bindings"]:
        team_uri_ref = URIRef(team_uri)
        country_uri = URIRef(result["country"]["value"])
        league_uri = URIRef(result["league"]["value"])

        for triple in (
            # Country and league as typed resources (only the league gets a name here)
            (country_uri, RDF.type, SCHEMA.Country),
            (league_uri, RDF.type, SCHEMA.SportsOrganization),
            (league_uri, SCHEMA.name, Literal(result["leagueLabel"]["value"])),
            # Links from the team to both
            (team_uri_ref, SCHEMA.location, country_uri),
            (team_uri_ref, SCHEMA.partOf, league_uri),
        ):
            g.add(triple)
    print(f"{len(g)} triples. {100*(i+1)/len(team_uris):.2f}%".ljust(50), end="\r")
# Restore the set form used by the other cells.
team_uris = set(team_uris)

63670 triples. 100.00%                            

In [8]:
# Running totals after the team detail pass
print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)} \nNumber of teams: {len(team_uris)}")

Number of triples: 63670 
Number of players: 1059 
Number of teams: 2378


#### Team victories
    
```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

# Definition of a tournament (sports season) that a team has won
<tournament_uri> a schema:SportsSeason ;
    schema:name "Tournament Name" ;
    schema:winner <team_uri> ;
    schema:partOf <league_uri> ;
    schema:organizer <organizer_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:endDate "YYYY-MM-DD"^^xsd:date .

# Definition of the league associated with the tournament
<league_uri> a schema:SportsOrganization ;
    schema:name "League Name" .

# Definition of the organizer of the tournament
<organizer_uri> a schema:Organization ;
    schema:name "Organizer Name" .
```

In [9]:
# SPARQL template listing the sports seasons linked to one team via P2522,
# with the season's organizer (P664) and league (P3450), both required.
# Dates are optional: the first OPTIONAL binds start and end time together
# (P580 + P582), the second binds ?startTime from P585.
# NOTE(review): presumably the P585 block is a fallback for seasons that only
# carry a point in time — confirm.
# The literal text <TEAM_URI> is replaced with the team's IRI in angle brackets.
team_victory_query_template = """
SELECT ?team ?tournament ?tournamentLabel ?organizer ?organizerLabel ?league ?leagueLabel ?startTime ?endTime WHERE {
  BIND(<TEAM_URI> AS ?team)
  ?team wdt:P2522 ?tournament.
  ?tournament wdt:P31 wd:Q27020041; # Instance of sports season
              wdt:P664 ?organizer;
              wdt:P3450 ?league.
                
    OPTIONAL {
        ?tournament wdt:P580 ?startTime;
                    wdt:P582 ?endTime.
    }
    OPTIONAL {
        ?tournament wdt:P585 ?startTime;
    }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

def execute_team_victory_query(team_uri):
    """Run the victory query for ``team_uri`` and return the raw JSON result.

    The ``<TEAM_URI>`` placeholder of ``team_victory_query_template`` is replaced
    with the team IRI wrapped in angle brackets before the query is sent.
    """
    team_term = "<{}>".format(team_uri)
    return fetch_query(team_victory_query_template.replace("<TEAM_URI>", team_term))

# Initialize a set to store all unique tournament URIs encountered
tournament_uris = set()
team_uris = list(team_uris)
# Loop through each team URI to execute the victory query
for i in range(0, len(team_uris)):
    team_uri = team_uris[i]
    # Execute the query for the current team
    team_victory_results = execute_team_victory_query(team_uri)

    # Each row is one season won by the team, with its organizer and league
    for result in team_victory_results["results"]["bindings"]:
        tournament_uri = URIRef(result["tournament"]["value"])
        tournament_uris.add(result["tournament"]["value"])  # Collect tournament URIs

        tournament_label = Literal(result["tournamentLabel"]["value"])
        organizer_uri = URIRef(result["organizer"]["value"])
        organizer_label = Literal(result["organizerLabel"]["value"])
        league_uri = URIRef(result["league"]["value"])
        league_label = Literal(result["leagueLabel"]["value"])
        # Both dates come from OPTIONAL blocks, so either may be absent
        start_time = result.get("startTime", {}).get("value")
        end_time = result.get("endTime", {}).get("value")

        g.add((tournament_uri, RDF.type, SCHEMA.SportsSeason))
        g.add((tournament_uri, SCHEMA.name, tournament_label))
        g.add((tournament_uri, SCHEMA.winner, URIRef(team_uri)))
        g.add((tournament_uri, SCHEMA.partOf, league_uri))
        g.add((tournament_uri, SCHEMA.organizer, organizer_uri))

        # The schema stores "YYYY-MM-DD"^^xsd:date; keep only the part before "T"
        # so a full timestamp is not written as an xsd:date lexical value.
        if start_time:
            start_time = Literal(start_time.split("T", 1)[0], datatype=XSD.date)
            g.add((tournament_uri, SCHEMA.startDate, start_time))
        if end_time:
            end_time = Literal(end_time.split("T", 1)[0], datatype=XSD.date)
            g.add((tournament_uri, SCHEMA.endDate, end_time))

        g.add((league_uri, RDF.type, SCHEMA.SportsOrganization))
        g.add((league_uri, SCHEMA.name, league_label))

        g.add((organizer_uri, RDF.type, SCHEMA.Organization))
        g.add((organizer_uri, SCHEMA.name, organizer_label))

    print(f"{len(g)} triples. {len(tournament_uris)} tournaments. {100*(i+1)/len(team_uris):.2f}%".ljust(50), end="\r")
# Back to a set: the list form was only needed for indexed progress reporting.
team_uris = set(team_uris)

68378 triples. 689 tournaments. 100.00%           

In [10]:
# Drop tournaments that the previously exported graph already types as schema:SportsSeason
known_tournaments = {str(season) for season in prev_g.subjects(RDF.type, SCHEMA.SportsSeason)}
tournament_uris = tournament_uris - known_tournaments

print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)} \nNumber of teams: {len(team_uris)} \nNumber of tournaments: {len(tournament_uris)}")

Number of triples: 68378 
Number of players: 1059 
Number of teams: 2378 
Number of tournaments: 689


## Tournaments

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

<tournament_uri> a ex:Tournament ;
    schema:name "Tournament Name" ;
    schema:numParticipants <number_of_participants>;
    schema:location <country_uri> ;
    schema:organizer <organizer_uri> ;
    schema:winner <winning_team_uri> ;
    schema:participant <team_uri> . # Multiple participants

<organizer_uri> a schema:Organization ;
    schema:name "Organizer Name" .

<winning_team_uri> a schema:SportsTeam ;
    schema:name "Team Name" .

<team_uri> a schema:SportsTeam ;
    schema:name "Team Name" .
```

In [11]:
# SPARQL template for one tournament's details: country (P17), number of
# participants (P1132), organizer (P664) and winner (P1346). All four are
# required, so a tournament lacking any of them yields no rows.
# The literal text <TOURNAMENT_URI> is replaced with the tournament's IRI in angle brackets.
tournament_details_query_template = """
SELECT ?tournamentLabel ?country ?numParticipants ?organizer ?organizerLabel ?winner ?winnerLabel WHERE {
  BIND(<TOURNAMENT_URI> AS ?tournament)
  ?tournament wdt:P17 ?country; # Country the tournament took place in
              wdt:P1132 ?numParticipants;
              wdt:P664 ?organizer;
              wdt:P1346 ?winner.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

# SPARQL template listing the teams linked to one tournament via P1923, with labels.
# Uses the same <TOURNAMENT_URI> placeholder as the details template.
participating_teams_query_template = """
SELECT ?participatingTeam ?participatingTeamLabel WHERE {
  BIND(<TOURNAMENT_URI> AS ?tournament)
  ?tournament wdt:P1923 ?participatingTeam.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

def execute_query_for_tournament(tournament_uri, query_template):
    """Fill ``query_template`` for one tournament, run it and return the raw JSON result.

    Every occurrence of the ``<TOURNAMENT_URI>`` placeholder is replaced with
    the tournament IRI wrapped in angle brackets.
    """
    tournament_term = "<{}>".format(tournament_uri)
    return fetch_query(query_template.replace("<TOURNAMENT_URI>", tournament_term))

# For every tournament run the details query and the participants query, then
# record the answers in the graph.
tournament_uris = list(tournament_uris)
for i, tournament_iri in enumerate(tournament_uris):
    # tournament_uris holds plain strings; both queries take the string form
    tournament_details_results = execute_query_for_tournament(tournament_iri, tournament_details_query_template)
    participating_teams_results = execute_query_for_tournament(tournament_iri, participating_teams_query_template)

    tournament_uri = URIRef(tournament_iri)
    for result in tournament_details_results["results"]["bindings"]:
        organizer = URIRef(result["organizer"]["value"])
        winner = URIRef(result["winner"]["value"])
        country = URIRef(result["country"]["value"])

        for triple in (
            (tournament_uri, SCHEMA.location, country),
            (tournament_uri, RDF.type, EX.Tournament),
            (tournament_uri, SCHEMA.name, Literal(result["tournamentLabel"]["value"])),
            (tournament_uri, SCHEMA.numParticipants, Literal(result["numParticipants"]["value"], datatype=XSD.integer)),
            (tournament_uri, SCHEMA.organizer, organizer),
            (tournament_uri, SCHEMA.winner, winner),
            (organizer, RDF.type, SCHEMA.Organization),
            (organizer, SCHEMA.name, Literal(result["organizerLabel"]["value"])),
            (winner, RDF.type, SCHEMA.SportsTeam),
            (winner, SCHEMA.name, Literal(result["winnerLabel"]["value"])),
        ):
            g.add(triple)

    for result in participating_teams_results["results"]["bindings"]:
        # Rows without a team binding carry nothing to record
        if "participatingTeam" in result:
            team_uri = URIRef(result["participatingTeam"]["value"])

            g.add((team_uri, RDF.type, SCHEMA.SportsTeam))
            g.add((team_uri, SCHEMA.name, Literal(result["participatingTeamLabel"]["value"])))
            g.add((tournament_uri, SCHEMA.participant, team_uri))
    print(f"{len(g)} triples. {100*(i+1)/len(tournament_uris):.2f}%".ljust(50), end="\r")
# Restore the set form used by the other cells.
tournament_uris = set(tournament_uris)

77349 triples. 100.00%                            

In [12]:
# Running totals after the tournament pass
print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)} \nNumber of teams: {len(team_uris)} \nNumber of tournaments: {len(tournament_uris)}")

Number of triples: 77349 
Number of players: 1059 
Number of teams: 2378 
Number of tournaments: 689


## Matches

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

<match_uri> a ex:Match ;
    schema:name "Match Name" ;
    schema:organizer <organizer_uri> ;
    schema:location <country_uri> ;
    schema:startDate "YYYY-MM-DD"^^xsd:date ;
    schema:winner <winning_team_uri> ;
    schema:partOf <tournament_uri> ;
    schema:participant <team_uri> ; # Multiple participants
    schema:participant <player_uri> . # Multiple participants

<organizer_uri> a schema:Organization ;
    schema:name "Organizer Name" .

<country_uri> a schema:Country .

<winning_team_uri> a schema:SportsTeam .

<tournament_uri> a schema:SportsSeason .
```

In [13]:
# SPARQL template listing the matches linked to one player via P710, with
# organizer (P664), country (P17), date (P585), winner (P1346) and the
# tournament reached through P361 or P179. Every property is required, so a
# match lacking any of them is not returned.
# `%s` is filled with the player's IRI wrapped in angle brackets.
match_query = """
SELECT ?match ?matchLabel ?organizer ?location ?date ?winner ?tournament WHERE{
BIND(%s AS ?player)
?match wdt:P710 ?player;
       wdt:P664 ?organizer;
       wdt:P17 ?location;
       wdt:P585 ?date;
       wdt:P1346 ?winner;
       wdt:P361|wdt:P179 ?tournament.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""


def execute_match_query(player_uri):
    """Run ``match_query`` for one player and return the raw JSON result.

    The player IRI is wrapped in angle brackets and substituted for the
    ``%s`` placeholder of the template.
    """
    player_term = "<{}>".format(player_uri)
    return fetch_query(match_query % player_term)

# For every player: fetch the matches they took part in and record each match
# with its organizer, location, date, winner and tournament.
match_uris = set()
player_uris = list(player_uris)
for i in range(0, len(player_uris)):
    player_uri = player_uris[i]
    match_results = execute_match_query(player_uri)

    for match in match_results["results"]["bindings"]:
        match_uri = URIRef(match["match"]["value"])
        match_uris.add(match["match"]["value"])
        match_name = Literal(match["matchLabel"]["value"])
        organizer_uri = URIRef(match["organizer"]["value"])
        location_uri = URIRef(match["location"]["value"])
        # The schema stores "YYYY-MM-DD"^^xsd:date; keep only the part before "T"
        # so a full timestamp is not written as an xsd:date lexical value.
        date = Literal(match["date"]["value"].split("T", 1)[0], datatype=XSD.date)
        winner_uri = URIRef(match["winner"]["value"])
        tournament_uri = URIRef(match["tournament"]["value"])

        g.add((match_uri, RDF.type, EX.Match))
        g.add((match_uri, SCHEMA.name, match_name))
        g.add((match_uri, SCHEMA.organizer, organizer_uri))
        g.add((match_uri, SCHEMA.location, location_uri))
        g.add((match_uri, SCHEMA.startDate, date))
        g.add((match_uri, SCHEMA.winner, winner_uri))
        g.add((match_uri, SCHEMA.partOf, tournament_uri))

        # Type the referenced resources so they are usable even when no other
        # cell has described them.
        g.add((organizer_uri, RDF.type, SCHEMA.Organization))
        g.add((location_uri, RDF.type, SCHEMA.Country))
        g.add((winner_uri, RDF.type, SCHEMA.SportsTeam))
        g.add((tournament_uri, RDF.type, SCHEMA.SportsSeason))
    print(f"{len(g)} triples. {len(match_uris)} matches. {100*(i+1)/len(player_uris):.2f}%".ljust(50), end="\r")
# Back to a set: the list form was only needed for indexed progress reporting.
player_uris = set(player_uris)

106577 triples. 3496 matches. 100.00%             

In [14]:
# Remove matches already present in the previously exported graph.
# Matches are typed ex:Match when they are added to the graph, so that is the
# type to look for; schema:SportsEvent (the type originally checked here) is
# still accepted in case an older export used it.
known_matches = {
    str(known_match)
    for match_type in (EX.Match, SCHEMA.SportsEvent)
    for known_match in prev_g.subjects(RDF.type, match_type)
}
match_uris = match_uris - known_matches

print('Number of triples:', len(g), '\nNumber of players:', len(player_uris), '\nNumber of teams:', len(team_uris), '\nNumber of tournaments:', len(tournament_uris), '\nNumber of matches:', len(match_uris))

Number of triples: 106577 
Number of players: 1059 
Number of teams: 2378 
Number of tournaments: 689 
Number of matches: 3496


#### Get match participants and participating teams

In [15]:
# SPARQL template listing the teams linked to one match via P1923.
# `%s` is filled with the match IRI wrapped in angle brackets.
match_team_query = """
SELECT ?match ?team WHERE{
BIND(%s AS ?match)
?match wdt:P1923 ?team.
}
"""

# SPARQL template listing the players linked to one match via P710.
# `%s` is filled with the match IRI wrapped in angle brackets.
match_player_query = """
SELECT ?match ?player WHERE{
BIND(%s AS ?match)
?match wdt:P710 ?player.
}
"""

def execute_extra_match_query(query, match_uri):
    """Fill a per-match query template, run it and return the raw JSON result.

    ``query`` must contain a single ``%s`` placeholder; it receives the match
    IRI wrapped in angle brackets.
    """
    match_term = "<{}>".format(match_uri)
    return fetch_query(query % match_term)

# Link every match to its participating teams and players.
match_uris = list(match_uris)
for i, match_iri in enumerate(match_uris):
    match_team_results = execute_extra_match_query(match_team_query, match_iri)
    match_player_results = execute_extra_match_query(match_player_query, match_iri)

    match_uri = URIRef(match_iri)
    for team in match_team_results["results"]["bindings"]:
        team_uri = URIRef(team["team"]["value"])
        # Type the team as well: it may not have been seen by an earlier cell
        g.add((team_uri, RDF.type, SCHEMA.SportsTeam))
        g.add((match_uri, SCHEMA.participant, team_uri))

    for player in match_player_results["results"]["bindings"]:
        player_uri = URIRef(player["player"]["value"])
        # Same for players: only the type and the participation link are recorded
        g.add((player_uri, RDF.type, SCHEMA.Person))
        g.add((match_uri, SCHEMA.participant, player_uri))
    print(f"{len(g)} triples. {100*(i+1)/len(match_uris):.2f}%".ljust(50), end="\r")
# Restore the set form used by the other cells.
match_uris = set(match_uris)

223338 triples. 100.00%                           

#### Goals

```
@prefix wd: <http://www.wikidata.org/entity/> .
@prefix schema: <http://schema.org/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix ex: <http://example.org/> .

<goals_uri> a ex:Score ;
    ex:player <player_uri> ;
    ex:goals <number_of_goals> ;
    ex:match <match_uri> .

<player_uri> a schema:Person .
<match_uri> a ex:Match .
```

In [16]:
# SPARQL template counting, per player, the "point scored by" (P1363)
# statements of one match. `%s` is filled with the match IRI in angle brackets.
#
# The count is taken over statement nodes (p:/ps:) rather than over the direct
# "truthy" triples: a direct triple exists at most once per (match, player)
# pair, so counting those always gives 1 even when a player scored several
# times. Restricting to wikibase:BestRank keeps the same statements the direct
# triples are generated from.
goal_query = """
SELECT ?player (COUNT(?scoreStatement) AS ?goals) WHERE{
BIND(%s AS ?match)
?match p:P1363 ?scoreStatement. # one statement node per recorded goal
?scoreStatement a wikibase:BestRank;
                ps:P1363 ?player. # point scored by
}
GROUP BY ?player
"""

def generate_goals_uri(player_uri, match_uri):
    """Build a deterministic URI for one player's score record in one match.

    Player and match identifiers are joined with ``-`` and hashed with
    SHA-256, which yields a fixed-length, URI-safe identifier: identical
    inputs always map to the same ``http://example.org/scores/<hex digest>``.
    """
    score_key = "{}-{}".format(player_uri, match_uri)
    digest = hashlib.sha256(score_key.encode()).hexdigest()
    return "http://example.org/scores/" + digest

def execute_goal_query(match_uri):
    """Run ``goal_query`` for one match and return the raw JSON result.

    The match IRI is wrapped in angle brackets and substituted for the
    ``%s`` placeholder of the template.
    """
    match_term = "<{}>".format(match_uri)
    return fetch_query(goal_query % match_term)

# Record one ex:Score node per (player, match) pair returned by the goal query.
match_uris = list(match_uris)
for i, match_iri in enumerate(match_uris):
    goal_results = execute_goal_query(match_iri)
    match_uri = URIRef(match_iri)
    for goal in goal_results["results"]["bindings"]:
        player_uri = URIRef(goal["player"]["value"])
        goals = Literal(goal["goals"]["value"], datatype=XSD.integer)

        # Deterministic URI: the same player/match pair always maps to the same node
        goals_ref = URIRef(generate_goals_uri(player_uri, match_uri))

        for triple in (
            (goals_ref, RDF.type, EX.Score),
            (goals_ref, EX.player, player_uri),
            (goals_ref, EX.goals, goals),
            (goals_ref, EX.match, match_uri),
        ):
            g.add(triple)
    print(f"{len(g)} triples. {100*(i+1)/len(match_uris):.2f}%".ljust(50), end="\r")
# Restore the set form used by the other cells.
match_uris = set(match_uris)

259602 triples. 100.00%                           

In [17]:
# Final totals before merging and saving
print(f"Number of triples: {len(g)} \nNumber of players: {len(player_uris)} \nNumber of teams: {len(team_uris)} \nNumber of tournaments: {len(tournament_uris)} \nNumber of matches: {len(match_uris)}")

Number of triples: 259602 
Number of players: 1059 
Number of teams: 2378 
Number of tournaments: 689 
Number of matches: 3496


In [18]:
# Fold the previously exported triples into `g` by serializing `prev_g` to
# Turtle text and parsing that text into `g`.
previous_turtle = prev_g.serialize(format="turtle")
g.parse(data=previous_turtle, format="turtle")

<Graph identifier=Nff96ca215ea249d09627420db7ce8ba2 (<class 'rdflib.graph.Graph'>)>

In [19]:
# Write the combined graph to disk as Turtle
g.serialize(destination="football_data_small.ttl", format="turtle")

<Graph identifier=Nff96ca215ea249d09627420db7ce8ba2 (<class 'rdflib.graph.Graph'>)>