# Devoir maison pour le compte du cours de DBS

In [1]:
#!pip install pandas rdflib
#!pip install SPARQLWrapper

In [2]:
# Import la librarie rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace, XSD

## Partie 0:  Choix des jeux de données

### Les sources de données

#### 1) NBA games data, Nathan Lugan (Kaggle)
Link : https://www.kaggle.com/datasets/nathanlauga/nba-games?select=games.csv

Colonnes d'intéret : 

GAME_DATE_EST : Le jour du match

GAME_ID : L'identifiant du match

HOME_TEAM_ID : L'identifiant de l'équipe à domicile

VISITOR_TEAM_ID : L'identifiant de l'équipe à l'exterieur

SEASON : La saison

PTS_home : Nombre de points dans le match de l'équipe à domicile

PTS_away : Nombre de points dans le match de l'équipe à l'exterieur

HOME_TEAM_WINS : Valeur booléenne qui si l'équipe à domicile à gagné le match (1) ou non (0)

#### 2) NBA Database, Wyatt Walsh (Kaggle). Lien : https://www.kaggle.com/datasets/wyattowalsh/basketball

person_id : Identifiant de chaque joueur

first_name : Prénom du joueur

last_name : Nom du joueur

birthdate : Date d'anniversaire

school : L'université de laquelle le joueur vient

height : La taille du joueur

position : Sa position

team_id : L'identifiant de son équipe

team_name : Le nom de son équipe

team_city : Ville de son équipe

team_abbreviation : L'abréviation de son équipe

draft_year : L'année de draft du joueur

draft_round : Le tour auquel le joueur a été drafté



# Partie 1 : Préprocessing des données

In [3]:
import pandas as pd
pd.set_option("display.max_rows", None) # Afficher toutes les lignes
pd.set_option("display.max_columns", None) # Afficher toutes les colonnes

In [4]:
# Import the dataset of players

players = pd.read_csv("./csv_datasets/Players.csv")
players = players[["person_id", "first_name", "last_name", "birthdate", "school", "height", "position", "team_id", "team_name", "team_city", "team_abbreviation", "draft_year", "draft_round"]]
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4171 entries, 0 to 4170
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   person_id          4171 non-null   int64 
 1   first_name         4171 non-null   object
 2   last_name          4171 non-null   object
 3   birthdate          4171 non-null   object
 4   school             4156 non-null   object
 5   height             4075 non-null   object
 6   position           4108 non-null   object
 7   team_id            4171 non-null   int64 
 8   team_name          3469 non-null   object
 9   team_city          3469 non-null   object
 10  team_abbreviation  3469 non-null   object
 11  draft_year         4171 non-null   object
 12  draft_round        4007 non-null   object
dtypes: int64(2), object(11)
memory usage: 423.7+ KB


In [5]:
# Import the Dataset games
games = pd.read_csv("./csv_datasets/Games.csv")
games = games[["GAME_DATE_EST", "GAME_ID", "HOME_TEAM_ID", "VISITOR_TEAM_ID", "SEASON", "PTS_home", "PTS_away", "HOME_TEAM_WINS"]]
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26651 entries, 0 to 26650
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GAME_DATE_EST    26651 non-null  object 
 1   GAME_ID          26651 non-null  int64  
 2   HOME_TEAM_ID     26651 non-null  int64  
 3   VISITOR_TEAM_ID  26651 non-null  int64  
 4   SEASON           26651 non-null  int64  
 5   PTS_home         26552 non-null  float64
 6   PTS_away         26552 non-null  float64
 7   HOME_TEAM_WINS   26651 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 1.6+ MB


In [6]:
games.dropna(inplace=True)
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26552 entries, 0 to 26650
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GAME_DATE_EST    26552 non-null  object 
 1   GAME_ID          26552 non-null  int64  
 2   HOME_TEAM_ID     26552 non-null  int64  
 3   VISITOR_TEAM_ID  26552 non-null  int64  
 4   SEASON           26552 non-null  int64  
 5   PTS_home         26552 non-null  float64
 6   PTS_away         26552 non-null  float64
 7   HOME_TEAM_WINS   26552 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 1.8+ MB


In [7]:
players.dropna(inplace=True)
players.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3311 entries, 0 to 4169
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   person_id          3311 non-null   int64 
 1   first_name         3311 non-null   object
 2   last_name          3311 non-null   object
 3   birthdate          3311 non-null   object
 4   school             3311 non-null   object
 5   height             3311 non-null   object
 6   position           3311 non-null   object
 7   team_id            3311 non-null   int64 
 8   team_name          3311 non-null   object
 9   team_city          3311 non-null   object
 10  team_abbreviation  3311 non-null   object
 11  draft_year         3311 non-null   object
 12  draft_round        3311 non-null   object
dtypes: int64(2), object(11)
memory usage: 362.1+ KB


In [8]:
players["team_id"].nunique()    

45

In [9]:
# Sets de team_id présents dans games
games_teams = set(games["HOME_TEAM_ID"].unique()).union(set(games["VISITOR_TEAM_ID"].unique()))

# Filtrer players pour ne garder que les team_id présents dans games
players_cleaned = players[players["team_id"].isin(games_teams)]

# Remplacer les valeurs "undrafted" par None dans la colonne draft_round
players_cleaned["draft_round"] = players_cleaned["draft_round"].replace("Undrafted", "-1")

# Convertir la colonne birthday en datetime
players_cleaned["birthdate"] = pd.to_datetime(players_cleaned["birthdate"])

# Sauvegarder si besoin
players_cleaned.to_csv("players_cleaned.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_cleaned["draft_round"] = players_cleaned["draft_round"].replace("Undrafted", "-1")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_cleaned["birthdate"] = pd.to_datetime(players_cleaned["birthdate"])


In [10]:
players = pd.read_csv("players_cleaned.csv")

In [11]:
# Extraire les colonnes équipe depuis player.csv
teams_from_players = players[["team_id", "team_name", "team_city", "team_abbreviation"]].drop_duplicates()

# Extraire les équipes présentes dans game.csv
home_teams = games[["HOME_TEAM_ID"]].rename(columns={"HOME_TEAM_ID": "team_id"})
visitor_teams = games[["VISITOR_TEAM_ID"]].rename(columns={"VISITOR_TEAM_ID": "team_id"})
teams_from_games = pd.concat([home_teams, visitor_teams]).drop_duplicates()

# Fusionner pour avoir toutes les équipes
teams = pd.merge(teams_from_games, teams_from_players, on="team_id", how="left")

# Sauvegarder en CSV
teams_unique = teams.groupby("team_id").agg(lambda x: x.mode()[0]).reset_index()
teams_unique.to_csv("team.csv", index=False)


In [12]:
games.columns

Index(['GAME_DATE_EST', 'GAME_ID', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'SEASON',
       'PTS_home', 'PTS_away', 'HOME_TEAM_WINS'],
      dtype='object')

In [13]:
game = pd.read_csv("./csv_datasets/Games.csv")

In [14]:
game.isna().sum() 

GAME_DATE_EST        0
GAME_ID              0
GAME_STATUS_TEXT     0
HOME_TEAM_ID         0
VISITOR_TEAM_ID      0
SEASON               0
TEAM_ID_home         0
PTS_home            99
FG_PCT_home         99
FT_PCT_home         99
FG3_PCT_home        99
AST_home            99
REB_home            99
TEAM_ID_away         0
PTS_away            99
FG_PCT_away         99
FT_PCT_away         99
FG3_PCT_away        99
AST_away            99
REB_away            99
HOME_TEAM_WINS       0
dtype: int64

In [15]:
player = pd.read_csv("./csv_datasets/player.csv")


In [16]:
players.isna().sum()

person_id            0
first_name           0
last_name            0
birthdate            0
school               0
height               0
position             0
team_id              0
team_name            0
team_city            0
team_abbreviation    0
draft_year           0
draft_round          0
dtype: int64

In [17]:
player.isna().sum()

id            0
full_name     0
first_name    6
last_name     0
is_active     0
dtype: int64

In [18]:
games.isna().sum()

GAME_DATE_EST      0
GAME_ID            0
HOME_TEAM_ID       0
VISITOR_TEAM_ID    0
SEASON             0
PTS_home           0
PTS_away           0
HOME_TEAM_WINS     0
dtype: int64

In [19]:
player.head()

Unnamed: 0,id,full_name,first_name,last_name,is_active
0,76001,Alaa Abdelnaby,Alaa,Abdelnaby,0
1,76002,Zaid Abdul-Aziz,Zaid,Abdul-Aziz,0
2,76003,Kareem Abdul-Jabbar,Kareem,Abdul-Jabbar,0
3,51,Mahmoud Abdul-Rauf,Mahmoud,Abdul-Rauf,0
4,1505,Tariq Abdul-Wahad,Tariq,Abdul-Wahad,0


In [20]:
games.to_csv("games_cleaned.csv", index=False)
players.to_csv("players_cleaned.csv", index=False)

# Part 2 : Conversion de CSV à RDF

L'objectif de cette partie est de definir un graphe RDF noté g, qui va capturer toutes les données de nos tables csv Games, Players et Teams qui seront représentées en RDF comme des classes ayant des propriétés.

Chaque individu aura un identifiant unique appelé URI formé de l'url du namespace et de son id unique, et des valeurs des propriétés de sa classe

In [21]:
games.info()
games.head()

<class 'pandas.core.frame.DataFrame'>
Index: 26552 entries, 0 to 26650
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GAME_DATE_EST    26552 non-null  object 
 1   GAME_ID          26552 non-null  int64  
 2   HOME_TEAM_ID     26552 non-null  int64  
 3   VISITOR_TEAM_ID  26552 non-null  int64  
 4   SEASON           26552 non-null  int64  
 5   PTS_home         26552 non-null  float64
 6   PTS_away         26552 non-null  float64
 7   HOME_TEAM_WINS   26552 non-null  int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 1.8+ MB


Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,PTS_away,HOME_TEAM_WINS
0,2022-12-22,22200477,1610612740,1610612759,2022,126.0,117.0,1
1,2022-12-22,22200478,1610612762,1610612764,2022,120.0,112.0,1
2,2022-12-21,22200466,1610612739,1610612749,2022,114.0,106.0,1
3,2022-12-21,22200467,1610612755,1610612765,2022,113.0,93.0,1
4,2022-12-21,22200468,1610612737,1610612741,2022,108.0,110.0,0


In [22]:
players.info()
players.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3130 entries, 0 to 3129
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   person_id          3130 non-null   int64 
 1   first_name         3130 non-null   object
 2   last_name          3130 non-null   object
 3   birthdate          3130 non-null   object
 4   school             3130 non-null   object
 5   height             3130 non-null   object
 6   position           3130 non-null   object
 7   team_id            3130 non-null   int64 
 8   team_name          3130 non-null   object
 9   team_city          3130 non-null   object
 10  team_abbreviation  3130 non-null   object
 11  draft_year         3130 non-null   object
 12  draft_round        3130 non-null   int64 
dtypes: int64(3), object(10)
memory usage: 318.0+ KB


Unnamed: 0,person_id,first_name,last_name,birthdate,school,height,position,team_id,team_name,team_city,team_abbreviation,draft_year,draft_round
0,76001,Alaa,Abdelnaby,1968-06-24,Duke,6-10,Forward,1610612757,Trail Blazers,Portland,POR,1990,1
1,76003,Kareem,Abdul-Jabbar,1947-04-16,UCLA,7-2,Center,1610612747,Lakers,Los Angeles,LAL,1969,1
2,1505,Tariq,Abdul-Wahad,1974-11-03,San Jose State,6-6,Forward-Guard,1610612758,Kings,Sacramento,SAC,1997,1
3,949,Shareef,Abdur-Rahim,1976-12-11,California,6-9,Forward,1610612763,Grizzlies,Vancouver,VAN,1996,1
4,76005,Tom,Abernethy,1954-05-06,Indiana,6-7,Forward,1610612744,Warriors,Golden State,GOS,1976,3


In [23]:
players["birthdate"] = pd.to_datetime(players["birthdate"])

In [24]:
teams_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   team_id            30 non-null     int64 
 1   team_name          30 non-null     object
 2   team_city          30 non-null     object
 3   team_abbreviation  30 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.1+ KB


In [25]:
# Definir un namespace pour notre graphe
NBA = Namespace("https://example.org/nba/")

# Créer un graphe RDF
g = Graph()

# Ajouter des namespaces au graphe
g.bind("nba", NBA)
g.bind("rdf", RDF)
g.bind("xsd", XSD)

# Ajouter des équipes au graphe
for _, row in teams_unique.iterrows():
    # Definir l'URI de l'équipe
    teams_uri = URIRef(NBA[f"{row["team_id"]}"]) 
    
    # Ajouter le type de l'équipe
    g.add((teams_uri, RDF.type, NBA.Team)) 
    
    # Ajouter les propriétés de l'équipe
    g.add((teams_uri, NBA.teamName, Literal(row["team_name"], datatype=XSD.string))) # Ajouter le nom de l'équipe
    g.add((teams_uri, NBA.teamCity, Literal(row["team_city"], datatype=XSD.string))) # Ajouter la ville de l'équipe
    g.add((teams_uri, NBA.teamAbbreviation, Literal(row["team_abbreviation"], datatype=XSD.string))) # Ajouter l'abréviation de l'équipe
    

In [26]:
# Ajout des joueurs au graphe
for _, row in players.iterrows():
    # Definir l'URI du joueur
    player_uri = URIRef(NBA[f"{row["person_id"]}"])
    
    # Ajouter le type du joueur
    g.add((player_uri, RDF.type, NBA.Player))
    
    # Ajouter les propriétés du joeur
    g.add((player_uri, NBA.firstName, Literal(row["first_name"], datatype=XSD.string))) # Ajouter le prénom du joueur
    g.add((player_uri, NBA.lastName, Literal(row["last_name"], datatype=XSD.string))) # Ajouter le nom du joueur
    g.add((player_uri, NBA.birthdate, Literal(row["birthdate"], datatype=XSD.date))) # Ajouter la date de naissance du joueur
    g.add((player_uri, NBA.school, Literal(row["school"], datatype=XSD.string))) # Ajouter l'école du joueur
    g.add((player_uri, NBA.height, Literal(row["height"], datatype=XSD.string))) # Ajouter la taille du joueur
    g.add((player_uri, NBA.position, Literal(row["position"], datatype=XSD.string))) # Ajouter la position du joueur
    g.add((player_uri, NBA.draftYear, Literal(row["draft_year"], datatype=XSD.gYear))) # Ajouter l'année de draft du joueur
    g.add((player_uri, NBA.draftRound, Literal(row["draft_round"], datatype=XSD.integer))) # Ajouter le round de draft du joueur
    g.add((player_uri, NBA.playsFor, URIRef(NBA[f"{row['team_id']}"]))) # Lier le joueur à son équipe
    
    

In [27]:
# Ajout the matchs au graphe
for _, row in games.iterrows():
    # Definir l'URI du match
    game_uri = URIRef(NBA[f"{row["GAME_ID"]}"])
    
    # Ajouter le type du match
    g.add((game_uri, RDF.type, NBA.Game))
    
    # Ajouter les propriétés du match
    g.add((game_uri, NBA.gameDate, Literal(row["GAME_DATE_EST"], datatype=XSD.date))) # Ajouter la date du match
    g.add((game_uri, NBA.season, Literal(row["SEASON"], datatype=XSD.gYear))) # Ajouter la saison du match
    g.add((game_uri, NBA.homeTeam, URIRef(NBA[f"{row["HOME_TEAM_ID"]}"]))) # Ajouter l'équipe à domicile
    g.add((game_uri, NBA.visitorTeam, URIRef(NBA[f"{row["VISITOR_TEAM_ID"]}"]))) # Ajouter l'équipe à l'extérieur
    g.add((game_uri, NBA.ptsHome, Literal(row["PTS_home"], datatype=XSD.integer))) # Ajouter les points de l'équipe à domicile
    g.add((game_uri, NBA.ptsAway, Literal(row["PTS_away"], datatype=XSD.integer))) # Ajouter les points de l'équipe à l'extérieur
    g.add((game_uri, NBA.homeTeamWins, Literal(row["HOME_TEAM_WINS"], datatype=XSD.boolean))) # Ajouter si l'équipe à domicile a gagné
    

KeyboardInterrupt: 

In [None]:
# Sauvegarder le graphe RDF dans un fichier Turtle
g.serialize(destination='./turtles_files/nba_graph.ttl', format='turtle')
print("RDF intégré sauvegardé dans nba_graph.ttl")

# Partie 3 : REQUETES SPARQL

Le but de cette partie est d'extraire du graphe nouvellement crée certaines informations sur nos données grace à des requetes Sparql

## Requete 1 : Afficher les pages wikipedia des équipes en fonction de leur nombre de points lors de la saison 2018

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON, XML
from rdflib import Graph

# Initialize your graph
g.parse("./turtles_files/nba_graph.ttl", format="turtle")

# Use SPARQLWrapper instead of direct SERVICE clause
query_local = """
PREFIX nba: <https://example.org/nba/>
SELECT ?teamName ?maxPoints
WHERE {
  {
    SELECT ?team (MAX(?points) AS ?maxPoints)
    WHERE {
      ?game nba:gameDate ?date ;
            (nba:homeTeam | nba:visitorTeam) ?team ;
            (nba:ptsHome | nba:ptsAway) ?points .
      FILTER(YEAR(?date) = 2018)
    }
    GROUP BY ?team
  
  }
  ?team nba:teamName ?teamName .
}
ORDER BY DESC(?maxPoints)

"""

# Execute local query first
results_local = g.query(query_local)

# Then query DBpedia separately
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)

results1 = []

for row in results_local:
    team_name = str(row.teamName)
    
    # Query DBpedia for this team
    dbpedia_query = """
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT ?dbpTeam ?wikiPage
    WHERE {
        ?dbpTeam foaf:name "%s"@en ;
                 foaf:isPrimaryTopicOf ?wikiPage .
    }
    """ % team_name.replace('"', '\\"')
    
    sparql.setQuery(dbpedia_query)
    try:
        dbpedia_results = sparql.query().convert()
        for dbp_row in dbpedia_results["results"]["bindings"]:
            results1.append({
                "teamName": team_name,
                "maxPoints": row.maxPoints,
                "wikiPage": dbp_row["wikiPage"]["value"]
            })
    except Exception as e:
        print(f"Error querying DBpedia for {team_name}: {e}")

In [None]:
print(pd.DataFrame(results1))

## Requete 2 : Afficher les joueurs qui jouent au poste de center et qui ont une équipe ou non

In [None]:
g.parse("./turtles_files/nba_graph.ttl", format="turtle")
query2 = """
SELECT ?firstname ?lastname ?playerPosition ?teamName
WHERE {
    {
        SELECT ?firstname ?lastname ?teamID ?playerPosition
        WHERE {
            ?player nba:firstName ?firstname ; 
                    nba:lastName ?lastname ; 
                    nba:position ?playerPosition ; 
                    nba:playsFor ?teamID .
            FILTER(?playerPosition="Center")
        }
    }
   OPTIONAL{?teamID nba:teamName ?teamName} .
}
"""
results2 = g.query(query2)


In [None]:
# Show the results
for row in results2:
    print(f"{row.firstname} {row.lastname} - Position: {row.playerPosition} - Team: {row.teamName}")

## Requete 3 : Afficher tous les joueurs qui n'ont pas étudié à l'université de californie

In [28]:
query3 = """
PREFIX nba: <https://example.org/nba/>

SELECT ?firstname ?lastname ?school
WHERE {
  ?player nba:firstName ?firstname ;
          nba:lastName ?lastname ;
          nba:school ?school .

  FILTER ( !CONTAINS(LCASE(STR(?school)), "chicago") )
}

"""
results3 = g.query(query3)

In [29]:
# Show the results
for row in results3:
    print(f"{row.firstname} {row.lastname} - School: {row.school}")

Alaa Abdelnaby - School: Duke
Kareem Abdul-Jabbar - School: UCLA
Tariq Abdul-Wahad - School: San Jose State
Shareef Abdur-Rahim - School: California
Tom Abernethy - School: Indiana
Tom Barker - School: Hawaii
Tom Black - School: South Dakota State
Tom Boswell - School: South Carolina
Tom Burleson - School: North Carolina State
Tom Copa - School: Marquette
Tom Garrick - School: Rhode Island
Tom Gugliotta - School: North Carolina State
Tom Hawkins - School: Notre Dame
Tom Henderson - School: Hawaii
Tom Hoover - School: Villanova
Tom Hovasse - School: Penn State
Tom Ingelsby - School: Villanova
Tom Kozelko - School: Toledo
Tom Kron - School: Kentucky
Tom LaGarde - School: North Carolina
Tom Marshall - School: Western Kentucky
Tom McMillen - School: Maryland
Tom Meschery - School: St. Mary's (CA)
Tom Owens - School: South Carolina
Tom Payne - School: Kentucky
Tom Piotrowski - School: La Salle
Tom Riker - School: South Carolina
Tom Scheffler - School: Purdue
Tom Sewell - School: Lamar
Tom S

## Requete 4 : Afficher les équipes qui n'ont pas pour villes : Utah et New Orleans (résultat attendu: 28)

In [None]:
query4="""
PREFIX nba: <https://example.org/nba/>

SELECT DISTINCT ?teamName ?teamCity
WHERE {
  ?team nba:teamName ?teamName ;
        nba:teamCity ?teamCity .

  MINUS { ?team nba:teamCity "Utah" . }
  MINUS { ?team nba:teamCity "New Orleans" . }
}

"""
results4 = g.query(query4)

In [None]:
# Show the results
for row in results4:
    print(f"Team Name : {row.teamName} - Team City: {row.teamCity}")

# Requete 5 : Afficher les équipes qui ont joué plus de 82 matchs lors de la saison 2020

In [54]:
query5 = """
PREFIX nba: <https://example.org/nba/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?team ?teamName (COUNT(?game) AS ?nbMatchs)
WHERE {
  ?game (nba:homeTeam | nba:visitorTeam) ?team ;
        nba:season ?seasonRaw .

  # Conversion de la saison en entier
  BIND(STRDT(STR(?seasonRaw), xsd:integer) AS ?season)

  FILTER(?season = 2020)

  ?team nba:teamName ?teamName .
}
GROUP BY ?team ?teamName
HAVING (COUNT(?game) > 82)
ORDER BY DESC(?nbMatchs)
"""
results5 = g.query(query5)

In [55]:
# Show the results
for row in results5:
    print(f"Team ID : {row.team} - Team Name : {row.teamName} - Number of game: {row.nbMatchs}")

Team ID : https://example.org/nba/1610612756 - Team Name : Suns - Number of game: 98
Team ID : https://example.org/nba/1610612749 - Team Name : Bucks - Number of game: 98
Team ID : https://example.org/nba/1610612737 - Team Name : Hawks - Number of game: 94
Team ID : https://example.org/nba/1610612746 - Team Name : Clippers - Number of game: 94
Team ID : https://example.org/nba/1610612762 - Team Name : Jazz - Number of game: 86
Team ID : https://example.org/nba/1610612755 - Team Name : 76ers - Number of game: 86
Team ID : https://example.org/nba/1610612751 - Team Name : Nets - Number of game: 86
Team ID : https://example.org/nba/1610612743 - Team Name : Nuggets - Number of game: 85
Team ID : https://example.org/nba/1610612747 - Team Name : Lakers - Number of game: 83
Team ID : https://example.org/nba/1610612763 - Team Name : Grizzlies - Number of game: 83


# Requete 6 : Matchs où l'équipe à domicile a gagné

In [79]:
query6 = """
PREFIX nba: <https://example.org/nba/>

SELECT ?game ?w
WHERE {
  ?game a nba:Game ;
        nba:homeTeamWins ?w .
  FILTER( STR(?w) = "1")
}
LIMIT 20


"""
results6 = g.query(query6)
for row in results6:
    print(f"Game ID : {row.game} - Home Team Wins : {row.w}")


Game ID : https://example.org/nba/22200477 - Home Team Wins : 1
Game ID : https://example.org/nba/22200478 - Home Team Wins : 1
Game ID : https://example.org/nba/22200466 - Home Team Wins : 1
Game ID : https://example.org/nba/22200467 - Home Team Wins : 1
Game ID : https://example.org/nba/22200470 - Home Team Wins : 1
Game ID : https://example.org/nba/22200474 - Home Team Wins : 1
Game ID : https://example.org/nba/22200475 - Home Team Wins : 1
Game ID : https://example.org/nba/22200476 - Home Team Wins : 1
Game ID : https://example.org/nba/22200463 - Home Team Wins : 1
Game ID : https://example.org/nba/22200465 - Home Team Wins : 1
Game ID : https://example.org/nba/22200452 - Home Team Wins : 1
Game ID : https://example.org/nba/22200453 - Home Team Wins : 1
Game ID : https://example.org/nba/22200454 - Home Team Wins : 1
Game ID : https://example.org/nba/22200456 - Home Team Wins : 1
Game ID : https://example.org/nba/22200458 - Home Team Wins : 1
Game ID : https://example.org/nba/222004

# Requête 7: faire un classement des équipes sur une saison à partir des matchs, en calculant des indicateurs statistiques avancés.

In [77]:
query7 = """
PREFIX nba: <https://example.org/nba/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teamName
       (COUNT(?game) AS ?gamesPlayed)
       (SUM(?win) AS ?wins)
       ((SUM(?win) * 100.0) / COUNT(?game) AS ?winPct)
       (AVG(?pf) AS ?avgPointsFor)
       (AVG(?pa) AS ?avgPointsAgainst)
       ((AVG(?pf) - AVG(?pa)) AS ?avgPointDiff)
WHERE {
  {
    # Cas 1 : équipe à domicile
    ?game a nba:Game ;
          nba:season ?season ;
          nba:homeTeam ?team ;
          nba:ptsHome ?pf ;
          nba:ptsAway ?pa ;
          nba:homeTeamWins ?w .
    FILTER(STR(?season) = "2020")
    BIND(xsd:integer(?w) AS ?win)
  }
  UNION
  {
    # Cas 2 : équipe à l'extérieur
    ?game a nba:Game ;
          nba:season ?season ;
          nba:visitorTeam ?team ;
          nba:ptsAway ?pf ;
          nba:ptsHome ?pa ;
          nba:homeTeamWins ?w .
    FILTER(STR(?season) = "2020")
    BIND(1 - xsd:integer(?w) AS ?win)
  }

  ?team nba:teamName ?teamName .
}
GROUP BY ?teamName
HAVING (COUNT(?game) >= 50)
ORDER BY DESC(?winPct) DESC(?avgPointDiff)
LIMIT 5




""" 

In [78]:
results7 = g.query(query7)

print("Team | Games | Wins | Win% | Avg PF | Avg PA | Avg Diff")
print("-" * 65)

for row in results7:
    teamName = row.teamName
    games    = int(row.gamesPlayed)
    wins     = int(row.wins)
    winPct   = float(row.winPct)
    avgPF    = float(row.avgPointsFor)
    avgPA    = float(row.avgPointsAgainst)
    avgDiff  = float(row.avgPointDiff)

    print(f"{teamName:5} {games:5} {wins:5} {winPct:6.1f}% {avgPF:8.1f} {avgPA:8.1f} {avgDiff:8.1f}")


Team | Games | Wins | Win% | Avg PF | Avg PA | Avg Diff
-----------------------------------------------------------------
Jazz     86    61   70.9%    116.5    108.3      8.3
76ers    86    58   67.4%    113.9    108.0      5.9
Suns     98    65   66.3%    113.4    108.5      4.9
Nets     86    57   66.3%    117.7    112.7      5.0
Bucks    98    62   63.3%    117.5    112.4      5.1


## Requete 8: Requete renvoyant les équipes ayant participé à aumoins un match que ce soit à domicile ou à l'extérieur(property path |)

In [80]:
query8 = """
PREFIX nba: <https://example.org/nba/>

SELECT DISTINCT ?team ?teamName
WHERE {
  ?game a nba:Game ;
        (nba:homeTeam | nba:visitorTeam) ?team .
  ?team nba:teamName ?teamName .
}
ORDER BY ?teamName

""" 
result8 = g.query(query8)
for row in result8:
    print(f"Team ID : {row.team} - Team Name : {row.teamName}")

Team ID : https://example.org/nba/1610612755 - Team Name : 76ers
Team ID : https://example.org/nba/1610612749 - Team Name : Bucks
Team ID : https://example.org/nba/1610612764 - Team Name : Bullets
Team ID : https://example.org/nba/1610612741 - Team Name : Bulls
Team ID : https://example.org/nba/1610612739 - Team Name : Cavaliers
Team ID : https://example.org/nba/1610612738 - Team Name : Celtics
Team ID : https://example.org/nba/1610612746 - Team Name : Clippers
Team ID : https://example.org/nba/1610612763 - Team Name : Grizzlies
Team ID : https://example.org/nba/1610612737 - Team Name : Hawks
Team ID : https://example.org/nba/1610612748 - Team Name : Heat
Team ID : https://example.org/nba/1610612740 - Team Name : Hornets
Team ID : https://example.org/nba/1610612766 - Team Name : Hornets
Team ID : https://example.org/nba/1610612762 - Team Name : Jazz
Team ID : https://example.org/nba/1610612758 - Team Name : Kings
Team ID : https://example.org/nba/1610612752 - Team Name : Knicks
Team ID

# 4-inférence: Schéma dans schema_rdfs.ttl et inférence avec la librairie owlrl

In [None]:
!pip install owlrl


In [None]:
from rdflib import Graph
from owlrl import RDFSClosure
from owlrl.AxiomaticTriples import RDFS_Axiomatic_Triples, RDFS_D_Axiomatic_Triples

DATA_TTL = "./turtles_files/nba_graph.ttl"
SCHEMA_TTL = "./turtles_files/schema_rdfs_complet.ttl"

 
g_base = Graph()
g_base.parse(DATA_TTL, format="turtle")
g_base.parse(SCHEMA_TTL, format="turtle")

 
g_inf = Graph()
for t in g_base:
    g_inf.add(t)

RDFSClosure.RDFS_Semantics(g_inf, RDFS_Axiomatic_Triples, RDFS_D_Axiomatic_Triples).closure()





# Vérifier qu'il y a de nouveaux triplets 

In [None]:
print("Triples RAW:", len(g_base))
print("Triples INF:", len(g_inf))
print("Triples ajoutés:", len(g_inf) - len(g_base))

# Tester les inférences via des requêtes

In [None]:
def run_query(title: str, query: str, limit_rows: int = 10):
    print(f"\n===== {title} =====")
    print("---- base ----")
    raw_res = list(g_base.query(query))
    if not raw_res:
        print("(aucun résultat)")
    else:
        for i, row in enumerate(raw_res[:limit_rows], start=1):
            print(f"{i}. {row}")
        if len(raw_res) > limit_rows:
            print(f"... ({len(raw_res)} lignes au total)")

    print("---- INF ----")
    inf_res = list(g_inf.query(query))
    if not inf_res:
        print("(aucun résultat)")
    else:
        for i, row in enumerate(inf_res[:limit_rows], start=1):
            print(f"{i}. {row}")
        if len(inf_res) > limit_rows:
            print(f"... ({len(inf_res)} lignes au total)")

 

queries = [
    (
        "Q1 - subPropertyOf : compter involvesTeam (attendu RAW=0, INF>0)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT (COUNT(*) AS ?n)
        WHERE {
          ?g a nba:Game ;
             nba:involvesTeam ?t .
        }
        """,
        10
    ),
    (
        "Q2 - subPropertyOf : afficher quelques triples inférés involvesTeam",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT ?game ?team
        WHERE {
          ?game a nba:Game ;
                nba:involvesTeam ?team .
        }
        LIMIT 10
        """,
        10
    ),
    (
        "Q3 - subClassOf : compter les Person (Player => Person)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT (COUNT(DISTINCT ?p) AS ?n)
        WHERE { ?p a nba:Person . }
        """,
        10
    ),
    (
        "Q4 - subClassOf : joueurs qui sont à la fois Player et Person (preuve)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT ?player
        WHERE {
          ?player a nba:Player ;
                  a nba:Person .
        }
        LIMIT 10
        """,
        10
    ),
    (
        "Q5 - domain : playsFor => Player (si domain défini sur playsFor)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT (COUNT(DISTINCT ?p) AS ?n)
        WHERE {
          ?p nba:playsFor ?t .
          ?p a nba:Player .
        }
        """,
        10
    ),
    (
        "Q6 - range : playsFor => Team (si range défini sur playsFor)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT (COUNT(DISTINCT ?t) AS ?n)
        WHERE {
          ?p nba:playsFor ?t .
          ?t a nba:Team .
        }
        """,
        10
    ),
    (
        "Q7 - combiné : équipes impliquées dans des matchs (via involvesTeam inféré)",
        """
        PREFIX nba: <https://example.org/nba/>
        SELECT DISTINCT ?team
        WHERE {
          ?game a nba:Game ;
                nba:involvesTeam ?team .
        }
        LIMIT 10
        """,
        10
    ),
]
 
for title, q, lim in queries:
    run_query(title, q, limit_rows=lim)

 


# Requête 9: faire une requête sur des grqphes nommés

In [2]:
from rdflib import Dataset
from owlrl import RDFSClosure
from owlrl.AxiomaticTriples import RDFS_Axiomatic_Triples, RDFS_D_Axiomatic_Triples

DATA_TTL = "./turtles_files/nba_graph.ttl"
SCHEMA_TTL = "./turtles_files/schema_rdfs_complet.ttl"

ds = Dataset()

g_base = ds.graph("urn:graph:base")   # graphe nommé BASE
g_inf  = ds.graph("urn:graph:inf")    # graphe nommé INF

# Charger BASE
g_base.parse(DATA_TTL, format="turtle")
g_base.parse(SCHEMA_TTL, format="turtle")

# Copier BASE -> INF puis inférer
for t in g_base:
    g_inf.add(t)

RDFSClosure.RDFS_Semantics(
    g_inf,
    RDFS_Axiomatic_Triples,
    RDFS_D_Axiomatic_Triples
).closure()

print("Triples BASE:", len(g_base))
print("Triples INF :", len(g_inf))
print("Ajoutés     :", len(g_inf) - len(g_base))


Triples BASE: 243669
Triples INF : 445121
Ajoutés     : 201452


# Lister les graphes nommés présents

In [83]:
q_list = """
SELECT DISTINCT ?g
WHERE { GRAPH ?g { ?s ?p ?o } }
ORDER BY ?g
"""
for row in ds.query(q_list):
    print(row.g)


urn:graph:base
urn:graph:inf


# Requête 10: compter le nombre de relations nba:involvesTeam dans chaque graphe (0 attendu dans base)

In [3]:
q_compare = """
PREFIX nba: <https://example.org/nba/>

SELECT ?source (COUNT(*) AS ?n)
WHERE {
  VALUES (?source ?gname) {
    ("BASE" <urn:graph:base>)
    ("INF"  <urn:graph:inf>)
  }
  GRAPH ?gname {
    ?game a nba:Game ;
          nba:involvesTeam ?team .
  }
}
GROUP BY ?source
"""
for row in ds.query(q_compare):
    print(row.source, row.n)


INF 53046
