## Create the __Star Wars__ RDF database
__Key points:__
- The code was run with __Python 3.11__.
- The project must be located in the __Desktop__ folder. If you downloaded it to another location, remember to update the paths in the file-path cell below.

In [1]:
# import libraries
from pathlib import Path
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# regular expression
import re
# api requests
import requests

In [2]:
# Input locations: every CSV is expected under <home>/Desktop/MovieDB/dataset
homePath = str(Path.home())
datasetPath = "/Desktop/MovieDB/dataset"
charactersPath = f"{homePath}{datasetPath}/characters.csv"
planetsPath = f"{homePath}{datasetPath}/planets.csv"
speciesPath = f"{homePath}{datasetPath}/species.csv"
starshipsPath = f"{homePath}{datasetPath}/starships.csv"
vehiclesPath = f"{homePath}{datasetPath}/vehicles.csv"

# Output location: directory prefix (note the trailing slash) for the generated .ttl files
savePath = f"{homePath}/Desktop/MovieDB/ttl/"

In [3]:
# Define namespace for ontology: the base IRI under which every class, property
# and resource created in this notebook is minted (e.g. namespace.Starship,
# namespace['name']).
# NOTE(review): the IRI says "IMDbOntology" although the data is Star Wars — confirm this is intended.
namespace = Namespace("http://www.dei.unipd.it/database2/IMDbOntology/")

In [4]:
# Build a URI-safe token from free text
def create_uri_component(string):
    """Return `string` with spaces turned into underscores and every other
    non-word character (anything outside letters, digits and '_') removed."""
    underscored = "_".join(string.split(" "))
    return re.sub(r'\W+', '', underscored)

# Extract number from url
# https://swapi.dev/api/planets/1/ -> 1
def extract_number(string):
    """Return the last run of digits found in `string`, as a string.

    Returns None when `string` contains no digits, and also when it is not a
    string at all (e.g. None, or the float NaN that pandas produces for cells
    matching `na_values`), so callers can test the result for truthiness
    instead of hitting a TypeError from `re.findall`.
    """
    if not isinstance(string, str):
        return None
    numbers = re.findall(r'\d+', string)
    return numbers[-1] if numbers else None

## Starships

In [5]:
# Read the starships table.
# keep_default_na=False keeps empty cells as '' (rather than NaN) so the
# truthiness checks in the conversion loop can skip them; only a literal '_'
# is parsed as missing.
starships = pd.read_csv(
    starshipsPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
starships.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    37 non-null     object
 1   model                   37 non-null     object
 2   manufacturer            37 non-null     object
 3   cost_in_credits         37 non-null     object
 4   length                  37 non-null     object
 5   max_atmosphering_speed  37 non-null     object
 6   crew                    37 non-null     object
 7   passengers              37 non-null     object
 8   cargo_capacity          37 non-null     object
 9   consumables             37 non-null     object
 10  hyperdrive_rating       37 non-null     object
 11  MGLT                    37 non-null     object
 12  starship_class          37 non-null     object
 13  created                 37 non-null     object
 14  edited                  37 non-null     object
 15  url     

In [6]:
# Create a new, empty in-memory graph that will collect the starship triples
g = Graph()

In [7]:
# Iterate over the starships dataframe: one ontology resource per CSV row
for index, row in starships.iterrows():

    # Extract id from starship api url
    starshipId = extract_number(row['url'])

    StarshipURI = URIRef(namespace + create_uri_component("starship" + str(starshipId)))

    # Add triples using store's add() method.
    # Empty cells are read as '' and are skipped by the truthiness checks below.
    # NOTE(review): numeric columns are passed through unvalidated; a non-numeric
    # string would produce an ill-typed literal — confirm the dataset is clean.
    g.add((StarshipURI, RDF.type, namespace.Starship))
    if row['name']:
        g.add((StarshipURI, RDFS.label, Literal(row['name'])))
        g.add((StarshipURI, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    if row['model']:
        g.add((StarshipURI, namespace['model'], Literal(row['model'], datatype=XSD.string)))
    if row['manufacturer']:
        g.add((StarshipURI, namespace['manufacturer'], Literal(row['manufacturer'], datatype=XSD.string)))
    if row['cost_in_credits']:
        g.add((StarshipURI, namespace['costInCredits'], Literal(row['cost_in_credits'], datatype=XSD.integer)))
    if row['length']:
        g.add((StarshipURI, namespace['length'], Literal(row['length'])))
    if row['max_atmosphering_speed']:
        # Keep only the digits (drops unit suffixes and any other text)
        maxSpeedInt = re.sub('[^0-9]', '', row['max_atmosphering_speed'])
        # A value without any digit leaves '', which is not a valid xsd:integer:
        # skip it instead of emitting an empty integer literal.
        if maxSpeedInt:
            g.add((StarshipURI, namespace['maxAtmospheringSpeed'], Literal(maxSpeedInt, datatype=XSD.integer)))
    if row['crew']:
        g.add((StarshipURI, namespace['crew'], Literal(row['crew'], datatype=XSD.integer)))
    if row['passengers']:
        g.add((StarshipURI, namespace['passengers'], Literal(row['passengers'], datatype=XSD.integer)))
    if row['cargo_capacity']:
        g.add((StarshipURI, namespace['cargoCapacity'], Literal(row['cargo_capacity'], datatype=XSD.integer)))
    if row['consumables']:
        g.add((StarshipURI, namespace['consumables'], Literal(row['consumables'], datatype=XSD.string)))
    if row['hyperdrive_rating']:
        g.add((StarshipURI, namespace['hyperdriveRating'], Literal(row['hyperdrive_rating'], datatype=XSD.float)))
    if row['MGLT']:
        g.add((StarshipURI, namespace['MGLT'], Literal(row['MGLT'], datatype=XSD.integer)))
    if row['starship_class']:
        g.add((StarshipURI, namespace['starshipClass'], Literal(row['starship_class'], datatype=XSD.string)))

In [8]:
# create starships ttl file
# Turtle documents are UTF-8: set the encoding explicitly instead of relying
# on the platform default, which may not be able to encode every character.
with open(savePath + 'starships.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Vehicles

In [9]:
# Read the vehicles table.
# keep_default_na=False keeps empty cells as '' (rather than NaN) so the
# truthiness checks in the conversion loop can skip them; only a literal '_'
# is parsed as missing.
vehicles = pd.read_csv(
    vehiclesPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    39 non-null     object
 1   model                   39 non-null     object
 2   manufacturer            39 non-null     object
 3   cost_in_credits         39 non-null     object
 4   length                  39 non-null     object
 5   max_atmosphering_speed  39 non-null     object
 6   crew                    39 non-null     int64 
 7   passengers              39 non-null     object
 8   cargo_capacity          39 non-null     object
 9   consumables             39 non-null     object
 10  vehicle_class           39 non-null     object
 11  created                 39 non-null     object
 12  edited                  39 non-null     object
 13  url                     39 non-null     object
 14  desc                    39 non-null     object
dtypes: int64

In [10]:
# Create a new, empty graph: `g` is rebound so the vehicle triples are
# collected separately from whatever the previous graph contained
g = Graph()

In [11]:
# Iterate over the vehicles dataframe: one ontology resource per CSV row
for index, row in vehicles.iterrows():

    # Extract id from vehicle api url
    vehicleId = extract_number(row['url'])

    VehicleURI = URIRef(namespace + create_uri_component("vehicle" + str(vehicleId)))

    # Add triples using store's add() method.
    # Empty cells are read as '' and are skipped by the truthiness checks below.
    g.add((VehicleURI, RDF.type, namespace.Vehicle))
    if row['name']:
        g.add((VehicleURI, RDFS.label, Literal(row['name'])))
        g.add((VehicleURI, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    if row['model']:
        g.add((VehicleURI, namespace['model'], Literal(row['model'], datatype=XSD.string)))
    if row['manufacturer']:
        g.add((VehicleURI, namespace['manufacturer'], Literal(row['manufacturer'], datatype=XSD.string)))
    if row['cost_in_credits']:
        g.add((VehicleURI, namespace['costInCredits'], Literal(row['cost_in_credits'], datatype=XSD.integer)))
    if row['length']:
        g.add((VehicleURI, namespace['length'], Literal(row['length'], datatype=XSD.string)))
    if row['max_atmosphering_speed']:
        g.add((VehicleURI, namespace['maxAtmospheringSpeed'], Literal(row['max_atmosphering_speed'], datatype=XSD.string)))
    # `crew` is parsed as an integer column here, so a plain truthiness test
    # would silently drop a legitimate crew of 0; only skip an empty cell.
    if str(row['crew']) != '':
        g.add((VehicleURI, namespace['crew'], Literal(row['crew'], datatype=XSD.integer)))
    if row['passengers']:
        g.add((VehicleURI, namespace['passengers'], Literal(row['passengers'], datatype=XSD.integer)))
    if row['cargo_capacity']:
        g.add((VehicleURI, namespace['cargoCapacity'], Literal(row['cargo_capacity'], datatype=XSD.string)))
    if row['consumables']:
        g.add((VehicleURI, namespace['consumables'], Literal(row['consumables'], datatype=XSD.string)))
    # NOTE(review): this predicate is capitalised, unlike the other camelCase
    # properties (e.g. starshipClass) — confirm against the ontology.
    if row['vehicle_class']:
        g.add((VehicleURI, namespace['VehicleClass'], Literal(row['vehicle_class'], datatype=XSD.string)))

In [12]:
# create vehicle ttl file
# Turtle documents are UTF-8: set the encoding explicitly instead of relying
# on the platform default, which may not be able to encode every character.
with open(savePath + 'vehicles.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Planets

In [13]:
# Read the planets table.
# keep_default_na=False keeps empty cells as '' (rather than NaN) so the
# truthiness checks in the conversion loop can skip them; only a literal '_'
# is parsed as missing.
planets = pd.read_csv(
    planetsPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             61 non-null     object
 1   rotation_period  61 non-null     object
 2   orbital_period   61 non-null     object
 3   diameter         61 non-null     object
 4   climate          61 non-null     object
 5   gravity          61 non-null     object
 6   terrain          61 non-null     object
 7   surface_water    61 non-null     object
 8   population       61 non-null     object
 9   created          61 non-null     object
 10  edited           61 non-null     object
 11  url              61 non-null     object
 12  desc             61 non-null     object
dtypes: object(13)
memory usage: 6.3+ KB


In [14]:
# Create a new, empty graph: `g` is rebound so the planet triples are
# collected separately from whatever the previous graph contained
g = Graph()

In [15]:
# Iterate over the planets dataframe: one ontology resource per CSV row
for index, row in planets.iterrows():

    # Extract id from planet api url
    planetId = extract_number(row['url'])

    PlanetURI = URIRef(namespace + create_uri_component("planet" + str(planetId)))

    # Add triples using store's add() method.
    # Empty cells are read as '' and are skipped by the truthiness checks below.
    g.add((PlanetURI, RDF.type, namespace.Planet))
    # Only label the planet when a name is present, so that an empty cell
    # does not produce an empty rdfs:label.
    if row['name']:
        g.add((PlanetURI, RDFS.label, Literal(row['name'])))
        g.add((PlanetURI, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    if row['rotation_period']:
        g.add((PlanetURI, namespace['rotationPeriod'], Literal(row['rotation_period'], datatype=XSD.integer)))
    if row['orbital_period']:
        g.add((PlanetURI, namespace['orbitalPeriod'], Literal(row['orbital_period'], datatype=XSD.integer)))
    if row['diameter']:
        g.add((PlanetURI, namespace['diameter'], Literal(row['diameter'], datatype=XSD.integer)))
    if row['climate']:
        g.add((PlanetURI, namespace['climate'], Literal(row['climate'], datatype=XSD.string)))
    if row['gravity']:
        g.add((PlanetURI, namespace['gravity'], Literal(row['gravity'], datatype=XSD.string)))

    # `terrain` is a comma-separated list: emit one triple per trimmed entry
    if row['terrain']:
        terrainsSeparated = [value.strip() for value in row['terrain'].split(',')]
        for terrain in terrainsSeparated:
            g.add((PlanetURI, namespace['terrain'], Literal(terrain, datatype=XSD.string)))

    if row['surface_water']:
        g.add((PlanetURI, namespace['surfaceWater'], Literal(row['surface_water'], datatype=XSD.float)))
    if row['population']:
        g.add((PlanetURI, namespace['population'], Literal(row['population'], datatype=XSD.integer)))

In [16]:
# create planets ttl file
# Turtle documents are UTF-8: set the encoding explicitly instead of relying
# on the platform default, which may not be able to encode every character.
with open(savePath + 'planets.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Species

In [17]:
# Read the species table.
# keep_default_na=False keeps empty cells as '' (rather than NaN) so the
# truthiness checks in the conversion loop can skip them; only a literal '_'
# is parsed as missing.
species = pd.read_csv(
    speciesPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              37 non-null     object
 1   classification    37 non-null     object
 2   designation       37 non-null     object
 3   average_height    37 non-null     object
 4   skin_colors       37 non-null     object
 5   hair_colors       37 non-null     object
 6   eye_colors        37 non-null     object
 7   average_lifespan  37 non-null     object
 8   homeworld         37 non-null     object
 9   language          37 non-null     object
 10  created           37 non-null     object
 11  edited            37 non-null     object
 12  url               37 non-null     object
 13  desc              37 non-null     object
dtypes: object(14)
memory usage: 4.2+ KB


In [18]:
# Create a new, empty graph: `g` is rebound so the species triples are
# collected separately from whatever the previous graph contained
g = Graph()

In [19]:
# Iterate over the species dataframe: one ontology resource per CSV row
for index, row in species.iterrows():

    # Extract id from specie api url
    specieId = extract_number(row['url'])

    Specie = URIRef(namespace + create_uri_component("specie" + str(specieId)))

    # Add triples using store's add() method.
    # Empty cells are read as '' and are skipped by the truthiness checks below.
    g.add((Specie, RDF.type, namespace.Specie))
    if row['name']:
        g.add((Specie, RDFS.label, Literal(row['name'])))
        g.add((Specie, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    if row['classification']:
        g.add((Specie, namespace['classification'], Literal(row['classification'], datatype=XSD.string)))
    if row['average_height']:
        g.add((Specie, namespace['averageHeight'], Literal(row['average_height'], datatype=XSD.integer)))

    # The colour columns are comma-separated lists: one triple per trimmed entry
    if row['skin_colors']:
        skinColorsSeparated = [value.strip() for value in row['skin_colors'].split(',')]
        for skinColor in skinColorsSeparated:
            g.add((Specie, namespace['skinColor'], Literal(skinColor)))

    if row['hair_colors']:
        hairColorsSeparated = [value.strip() for value in row['hair_colors'].split(',')]
        for hairColor in hairColorsSeparated:
            g.add((Specie, namespace['hairColor'], Literal(hairColor)))

    if row['eye_colors']:
        eyeColorsSeparated = [value.strip() for value in row['eye_colors'].split(',')]
        for eyeColor in eyeColorsSeparated:
            g.add((Specie, namespace['eyeColor'], Literal(eyeColor)))

    # The column is read as text (object dtype), so an isinstance(..., int)
    # test can never succeed and the property would never be emitted.
    # Accept the value when it is a plain run of digits; anything else
    # (empty or non-numeric text) is skipped.
    lifespan = str(row['average_lifespan']).strip()
    if lifespan.isdigit():
        g.add((Specie, namespace['averageLifespan'], Literal(lifespan, datatype=XSD.integer)))
    if row['language']:
        g.add((Specie, namespace['language'], Literal(row['language'], datatype=XSD.string)))

    # Extract id from homeworld api url
    planetId = extract_number(row['homeworld'])

    # Link to the planet resource only when the homeworld URL carried an id
    if planetId:
        planetName = create_uri_component("planet" + planetId)
        g.add((Specie, namespace['homeworld'], URIRef(namespace[planetName])))

In [20]:
# create species ttl file
# Turtle documents are UTF-8: set the encoding explicitly instead of relying
# on the platform default, which may not be able to encode every character.
with open(savePath + 'species.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Characters

In [21]:
# Read the characters table.
# keep_default_na=False keeps empty cells as '' (rather than NaN) so the
# truthiness checks in the conversion loop can skip them; only a literal '_'
# is parsed as missing.
characters = pd.read_csv(
    charactersPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
characters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        87 non-null     object
 1   height      87 non-null     object
 2   mass        87 non-null     object
 3   hair_color  87 non-null     object
 4   skin_color  87 non-null     object
 5   eye_color   87 non-null     object
 6   birth_year  87 non-null     object
 7   gender      87 non-null     object
 8   homeworld   87 non-null     object
 9   created     87 non-null     object
 10  edited      87 non-null     object
 11  url         87 non-null     object
 12  desc        87 non-null     object
dtypes: object(13)
memory usage: 9.0+ KB


In [22]:
# Create a new, empty graph: `g` is rebound so the character triples are
# collected separately from whatever the previous graph contained
g = Graph()

In [23]:
# Iterate over the characters dataframe: one ontology resource per CSV row,
# enriched with the starship / vehicle / species links returned by the API
for index, row in characters.iterrows():

    # Extract id from character api url
    characterId = extract_number(row['url'])

    Character = URIRef(namespace + create_uri_component("character" + str(characterId)))

    # Add triples using store's add() method.
    # Empty cells are read as '' and are skipped by the truthiness checks below.
    g.add((Character, RDF.type, namespace.Character))
    if row['name']:
        g.add((Character, RDFS.label, Literal(row['name'])))
        g.add((Character, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    if row['height']:
        g.add((Character, namespace['height'], Literal(row['height'], datatype=XSD.integer)))
    if row['mass']:
        g.add((Character, namespace['mass'], Literal(row['mass'], datatype=XSD.float)))

    # The colour columns are comma-separated lists: one triple per trimmed entry
    if row['skin_color']:
        skinColorsSeparated = [value.strip() for value in row['skin_color'].split(',')]
        for skinColor in skinColorsSeparated:
            g.add((Character, namespace['skinColor'], Literal(skinColor, datatype=XSD.string)))

    if row['hair_color']:
        hairColorsSeparated = [value.strip() for value in row['hair_color'].split(',')]
        for hairColor in hairColorsSeparated:
            g.add((Character, namespace['hairColor'], Literal(hairColor, datatype=XSD.string)))

    if row['eye_color']:
        eyeColorsSeparated = [value.strip() for value in row['eye_color'].split(',')]
        for eyeColor in eyeColorsSeparated:
            g.add((Character, namespace['eyeColor'], Literal(eyeColor, datatype=XSD.string)))

    if row['birth_year']:
        g.add((Character, namespace['birthYear'], Literal(row['birth_year'], datatype=XSD.string)))
    if row['gender']:
        g.add((Character, namespace['gender'], Literal(row['gender'], datatype=XSD.string)))

    # Extract id from homeworld api url. A row without a homeworld id is
    # simply left unlinked (same guard as the species loop) instead of
    # failing on "planet" + None.
    planetId = extract_number(row['homeworld'])
    if planetId:
        planetName = create_uri_component("planet" + planetId)
        g.add((Character, namespace['homeworld'], URIRef(namespace[planetName])))

    # Get additional information from the character api url.
    # The whole loop takes about 2 minutes: one HTTP request per character.
    # The timeout (seconds) keeps a stalled connection from hanging the
    # notebook, and a network error is reported like any other failed request
    # rather than aborting the remaining characters.
    try:
        response = requests.get(row['url'], timeout=30)
    except requests.RequestException as error:
        print(f"Failed to retrieve data: {error}")
        continue

    # Fail
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        continue

    # Success
    data = response.json()
    # (JSON key, URI prefix of the linked resource, predicate).
    # Loop-local names are used on purpose: binding `starships`, `vehicles`
    # or `species` here would overwrite the DataFrames loaded earlier and
    # break any re-run of the cells that iterate over them.
    for jsonKey, uriPrefix, predicate in (
        ('starships', 'starship', 'hasStarship'),
        ('vehicles', 'vehicle', 'hasVehicle'),
        ('species', 'specie', 'type'),
    ):
        for relatedUrl in data.get(jsonKey) or []:
            relatedId = extract_number(relatedUrl)
            # Ignore entries that do not carry a numeric id
            if relatedId:
                relatedName = create_uri_component(uriPrefix + relatedId)
                g.add((Character, namespace[predicate], URIRef(namespace[relatedName])))
    print(f"Successfully data retrieved for: {Character}")

Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character1
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character2
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character3
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character4
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character5
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character6
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character7
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character8
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character9
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character10
Successfully data retrieved for: http://www.dei.unipd.it/database2/IMDbOntology/character

In [24]:
# create characters ttl file
# Turtle documents are UTF-8: set the encoding explicitly instead of relying
# on the platform default, which may not be able to encode every character.
with open(savePath + 'characters.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))