## Create __Star Wars__ RDF database
__Key points:__
- The code was run with __Python 3.11__.
- The project folder (`MovieDB`) must be located in the __Desktop__ directory of your home folder. If you saved it somewhere else, don't forget to update the paths in the "file path" cell below.

In [35]:
# import libraries
from pathlib import Path
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# regular expression
import re
# api requests
import requests

In [9]:
# Input locations: every CSV lives in the dataset folder under the user's home directory
homePath = str(Path.home())
datasetPath = "/Desktop/MovieDB/dataset"
datasetRoot = homePath + datasetPath
(charactersPath, planetsPath, speciesPath, starshipsPath, vehiclesPath) = (
    datasetRoot + csvName
    for csvName in (
        "/characters.csv",
        "/planets.csv",
        "/species.csv",
        "/starships.csv",
        "/vehicles.csv",
    )
)

# Output location: kept as a string ending in "/" because later cells append the file name to it
savePath = "".join((homePath, "/Desktop/MovieDB/ttl/"))

In [10]:
# Base IRI of the project ontology; resource, class and property IRIs in this notebook are built from it
ontologyBaseIri = "http://www.semanticweb.org/ontologies/database2/project/starwars/example10/"
namespace = Namespace(ontologyBaseIri)

In [11]:
# Turn an arbitrary label into a string that is safe to append to the namespace IRI
def create_uri_component(string):
    """Return ``string`` with spaces turned into underscores and all other non-word characters removed."""
    underscored = string.replace(" ", "_")
    nonWordRuns = re.compile(r'\W+')
    return nonWordRuns.sub('', underscored)

# Extract number from url
# https://swapi.dev/api/planets/1/ -> 1
def extract_number(string):
    """Return the last run of digits found in ``string``.

    Args:
        string: Text to search, typically an API url whose last path segment is an id.

    Returns:
        The last group of consecutive digits as a ``str`` (not an ``int``),
        or ``None`` when ``string`` contains no digit or is not a string at all.
    """
    # Missing CSV cells are delivered by pandas as NaN (a float), not as text;
    # treat anything that is not a string the same way as "no number found".
    if not isinstance(string, str):
        return None
    numbers = re.findall(r'\d+', string)
    return numbers[-1] if numbers else None

## Starships

In [12]:
# Read the starships table. pandas' default missing-value markers are switched off,
# so only a bare "_" is parsed as missing and everything else stays text.
starships = pd.read_csv(
    starshipsPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
starships.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    37 non-null     object
 1   model                   37 non-null     object
 2   manufacturer            37 non-null     object
 3   cost_in_credits         37 non-null     object
 4   length                  37 non-null     object
 5   max_atmosphering_speed  37 non-null     object
 6   crew                    37 non-null     object
 7   passengers              37 non-null     object
 8   cargo_capacity          37 non-null     object
 9   consumables             37 non-null     object
 10  hyperdrive_rating       37 non-null     object
 11  MGLT                    37 non-null     object
 12  starship_class          37 non-null     object
 13  created                 37 non-null     object
 14  edited                  37 non-null     object
 15  url     

In [13]:
# Create a fresh, empty graph that collects the starship triples added by the next cell.
# `g` is rebound in every section of this notebook, so run the cells in order.
g = Graph()

In [14]:
# Emit one Starship resource per row, with every listed column attached as an xsd:string literal.
# Each pair is (predicate name in the ontology namespace, column name in the CSV).
starshipFields = (
    ('name', 'name'),
    ('model', 'model'),
    ('manufacturer', 'manufacturer'),
    ('costInCredits', 'cost_in_credits'),
    ('length', 'length'),
    ('maxAtmospheringSpeed', 'max_atmosphering_speed'),
    ('crew', 'crew'),
    ('passengers', 'passengers'),
    ('cargoCapacity', 'cargo_capacity'),
    ('consumables', 'consumables'),
    ('hyperdriveRating', 'hyperdrive_rating'),
    ('MGLT', 'MGLT'),
    ('starshipClass', 'starship_class'),
)

for index, row in starships.iterrows():

    # The last number in the api url identifies the starship
    starshipId = extract_number(row['url'])
    StarshipURI = URIRef(namespace + create_uri_component("starship" + str(starshipId)))

    g.add((StarshipURI, RDF.type, namespace.Starship))
    for predicateName, columnName in starshipFields:
        g.add((StarshipURI, namespace[predicateName], Literal(row[columnName], datatype=XSD.string)))

In [15]:
# Create the starships ttl file.
# Make sure the output folder exists, and write UTF-8 explicitly: Turtle files are UTF-8,
# while open() would otherwise fall back to the platform's default encoding.
Path(savePath).mkdir(parents=True, exist_ok=True)
with open(savePath + 'starships.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Vehicles

In [16]:
# Read the vehicles table. pandas' default missing-value markers are switched off,
# so only a bare "_" is parsed as missing and everything else stays text.
vehicles = pd.read_csv(
    vehiclesPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    39 non-null     object
 1   model                   39 non-null     object
 2   manufacturer            39 non-null     object
 3   cost_in_credits         39 non-null     object
 4   length                  39 non-null     object
 5   max_atmosphering_speed  39 non-null     object
 6   crew                    39 non-null     int64 
 7   passengers              39 non-null     object
 8   cargo_capacity          39 non-null     object
 9   consumables             39 non-null     object
 10  vehicle_class           39 non-null     object
 11  created                 39 non-null     object
 12  edited                  39 non-null     object
 13  url                     39 non-null     object
 14  desc                    39 non-null     object
dtypes: int64

In [17]:
# Rebind `g` to a fresh, empty graph so the vehicle triples added by the next cell
# are not mixed with the triples of the previous section.
g = Graph()

In [18]:
# Emit one Vehicle resource per row, with every listed column attached as an xsd:string literal.
# Each pair is (predicate name in the ontology namespace, column name in the CSV).
# NOTE(review): 'VehicleClass' is capitalised unlike the other predicates (the starships
# cell uses 'starshipClass'); it is kept as-is here - confirm the spelling against the ontology.
vehicleFields = (
    ('name', 'name'),
    ('model', 'model'),
    ('manufacturer', 'manufacturer'),
    ('costInCredits', 'cost_in_credits'),
    ('length', 'length'),
    ('maxAtmospheringSpeed', 'max_atmosphering_speed'),
    ('crew', 'crew'),
    ('passengers', 'passengers'),
    ('cargoCapacity', 'cargo_capacity'),
    ('consumables', 'consumables'),
    ('VehicleClass', 'vehicle_class'),
)

for index, row in vehicles.iterrows():

    # The last number in the api url identifies the vehicle
    vehicleId = extract_number(row['url'])
    VehicleURI = URIRef(namespace + create_uri_component("vehicle" + str(vehicleId)))

    g.add((VehicleURI, RDF.type, namespace.Vehicle))
    for predicateName, columnName in vehicleFields:
        g.add((VehicleURI, namespace[predicateName], Literal(row[columnName], datatype=XSD.string)))

In [19]:
# Create the vehicles ttl file.
# Make sure the output folder exists, and write UTF-8 explicitly: Turtle files are UTF-8,
# while open() would otherwise fall back to the platform's default encoding.
Path(savePath).mkdir(parents=True, exist_ok=True)
with open(savePath + 'vehicles.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Planets

In [20]:
# Read the planets table. pandas' default missing-value markers are switched off,
# so only a bare "_" is parsed as missing and everything else stays text.
planets = pd.read_csv(
    planetsPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             61 non-null     object
 1   rotation_period  61 non-null     object
 2   orbital_period   61 non-null     object
 3   diameter         61 non-null     object
 4   climate          61 non-null     object
 5   gravity          61 non-null     object
 6   terrain          61 non-null     object
 7   surface_water    61 non-null     object
 8   population       61 non-null     object
 9   created          61 non-null     object
 10  edited           61 non-null     object
 11  url              61 non-null     object
 12  desc             61 non-null     object
dtypes: object(13)
memory usage: 6.3+ KB


In [21]:
# Rebind `g` to a fresh, empty graph so the planet triples added by the next cell
# are not mixed with the triples of the previous section.
g = Graph()

In [22]:
# Emit one Planet resource per row. The name is written as an xsd:string literal;
# the remaining columns are written as literals without an explicit datatype.
# Each pair is (predicate name in the ontology namespace, column name in the CSV).
planetPlainFields = (
    ('rotationPeriod', 'rotation_period'),
    ('orbitalPeriod', 'orbital_period'),
    ('diameter', 'diameter'),
    ('climate', 'climate'),
    ('gravity', 'gravity'),
    ('terrain', 'terrain'),
    ('surfaceWater', 'surface_water'),
    ('population', 'population'),
)

for index, row in planets.iterrows():

    # The last number in the api url identifies the planet
    planetId = extract_number(row['url'])
    PlanetURI = URIRef(namespace + create_uri_component("planet" + str(planetId)))

    g.add((PlanetURI, RDF.type, namespace.Planet))
    g.add((PlanetURI, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    for predicateName, columnName in planetPlainFields:
        g.add((PlanetURI, namespace[predicateName], Literal(row[columnName])))

In [23]:
# Create the planets ttl file.
# Make sure the output folder exists, and write UTF-8 explicitly: Turtle files are UTF-8,
# while open() would otherwise fall back to the platform's default encoding.
Path(savePath).mkdir(parents=True, exist_ok=True)
with open(savePath + 'planets.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Species

In [24]:
# Read the species table. pandas' default missing-value markers are switched off,
# so only a bare "_" is parsed as missing and everything else stays text.
species = pd.read_csv(
    speciesPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              37 non-null     object
 1   classification    37 non-null     object
 2   designation       37 non-null     object
 3   average_height    37 non-null     object
 4   skin_colors       37 non-null     object
 5   hair_colors       37 non-null     object
 6   eye_colors        37 non-null     object
 7   average_lifespan  37 non-null     object
 8   homeworld         37 non-null     object
 9   language          37 non-null     object
 10  created           37 non-null     object
 11  edited            37 non-null     object
 12  url               37 non-null     object
 13  desc              37 non-null     object
dtypes: object(14)
memory usage: 4.2+ KB


In [25]:
# Rebind `g` to a fresh, empty graph so the species triples added by the next cell
# are not mixed with the triples of the previous section.
g = Graph()

In [26]:
# Iterate over the species dataframe and emit one species resource per row
for index, row in species.iterrows():

    # Extract id from specie api url
    specieId = extract_number(row['url'])

    Specie = URIRef(namespace + create_uri_component("specie"+ str(specieId)))

    # Add triples using store's add() method.
    # A species gets its own class: typing it as Character would make every species
    # show up in queries for characters.
    # NOTE(review): confirm that the class is spelled "Species" in the ontology.
    g.add((Specie, RDF.type, namespace.Species))
    g.add((Specie, namespace['name'], Literal(row['name'], datatype=XSD.string)))
    g.add((Specie, namespace['classification'], Literal(row['classification'])))
    g.add((Specie, namespace['averageHeight'], Literal(row['average_height'])))

    g.add((Specie, namespace['skinColors'], Literal(row['skin_colors'])))
    g.add((Specie, namespace['hairColors'], Literal(row['hair_colors'])))
    g.add((Specie, namespace['eyeColors'], Literal(row['eye_colors'])))
    g.add((Specie, namespace['averageLifespan'], Literal(row['average_lifespan'])))
    g.add((Specie, namespace['language'], Literal(row['language'])))

    # Extract id from homeworld api url.
    # A missing cell is read by pandas as NaN (a float); only search real strings.
    homeworldUrl = row['homeworld']
    planetId = extract_number(homeworldUrl) if isinstance(homeworldUrl, str) else None

    # Species without a known homeworld simply get no homeworld triple
    if planetId:
        planetName = create_uri_component("planet" + planetId)
        g.add((Specie, namespace['homeworld'], URIRef(namespace[planetName])))

In [27]:
# Create the species ttl file.
# Make sure the output folder exists, and write UTF-8 explicitly: Turtle files are UTF-8,
# while open() would otherwise fall back to the platform's default encoding.
Path(savePath).mkdir(parents=True, exist_ok=True)
with open(savePath + 'species.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Characters

In [45]:
# Read the characters table. pandas' default missing-value markers are switched off,
# so only a bare "_" is parsed as missing and everything else stays text.
characters = pd.read_csv(
    charactersPath,
    sep=',',
    keep_default_na=False,
    na_values=['_'],
)
characters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        87 non-null     object
 1   height      87 non-null     object
 2   mass        87 non-null     object
 3   hair_color  87 non-null     object
 4   skin_color  87 non-null     object
 5   eye_color   87 non-null     object
 6   birth_year  87 non-null     object
 7   gender      87 non-null     object
 8   homeworld   87 non-null     object
 9   created     87 non-null     object
 10  edited      87 non-null     object
 11  url         87 non-null     object
 12  desc        87 non-null     object
dtypes: object(13)
memory usage: 9.0+ KB


In [46]:
# Rebind `g` to a fresh, empty graph so the character triples added by the next cell
# are not mixed with the triples of the previous section.
g = Graph()

In [47]:
# Iterate over the characters dataframe and emit one Character resource per row.
# One HTTP request per character fetches its starships, vehicles and species,
# so this cell needs network access and takes a while (about 2 minutes).

# Seconds to wait for one response; keeps a single stalled request from hanging the cell
requestTimeout = 30

# (key in the API response, predicate in the ontology namespace, URI prefix of the linked resource)
characterLinks = (
    ('starships', 'hasStarship', 'starship'),
    ('vehicles', 'hasVehicle', 'vehicle'),
    ('species', 'type', 'specie'),
)

# A single session reuses the connection across all the requests
with requests.Session() as session:
    for index, row in characters.iterrows():

        # Extract id from character api url
        characterId = extract_number(row['url'])

        Character = URIRef(namespace + create_uri_component("character"+ str(characterId)))

        # Add triples using store's add() method.
        g.add((Character, RDF.type, namespace.Character))
        g.add((Character, namespace['name'], Literal(row['name'], datatype=XSD.string)))
        g.add((Character, namespace['height'], Literal(row['height'])))
        g.add((Character, namespace['mass'], Literal(row['mass'])))
        g.add((Character, namespace['hairColor'], Literal(row['hair_color'])))
        g.add((Character, namespace['skinColor'], Literal(row['skin_color'])))
        g.add((Character, namespace['eyeColor'], Literal(row['eye_color'])))
        g.add((Character, namespace['birthYear'], Literal(row['birth_year'])))
        g.add((Character, namespace['gender'], Literal(row['gender'])))

        # Extract id from homeworld api url. Skip the triple when there is no usable
        # url (missing cell or no number in it) instead of failing on "planet" + None.
        homeworldUrl = row['homeworld']
        planetId = extract_number(homeworldUrl) if isinstance(homeworldUrl, str) else None
        if planetId:
            planetName = create_uri_component("planet" + planetId)
            g.add((Character, namespace['homeworld'], URIRef(namespace[planetName])))

        # Get additional information from character api url.
        # Network errors and unreadable responses are reported and the character is
        # skipped, the same way a non-200 status is handled, so one bad request does
        # not abort the whole run.
        try:
            response = session.get(row['url'], timeout=requestTimeout)
        except requests.RequestException as error:
            print(f"Failed to retrieve data: {error}")
            continue

        # Fail
        if response.status_code != 200:
            print(f"Failed to retrieve data: {response.status_code}")
            continue

        try:
            data = response.json()
        except ValueError as error:
            print(f"Failed to retrieve data: {error}")
            continue

        # Success: link the character to each referenced resource.
        # The url lists are held in loop-local names so that the `starships`,
        # `vehicles` and `species` dataframes loaded earlier are not overwritten.
        for responseKey, predicateName, uriPrefix in characterLinks:
            for linkedUrl in data.get(responseKey) or []:
                linkedId = extract_number(linkedUrl)
                if not linkedId:
                    continue
                linkedName = create_uri_component(uriPrefix + linkedId)
                g.add((Character, namespace[predicateName], URIRef(namespace[linkedName])))
        print(f"Successfully data retrieved for: {Character}")

Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character1
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character2
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character3
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character4
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character5
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character6
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character7
Successfully data retrieved for: http://www.semanticweb.org/ontologies/database2/project/starwars/example10/character8
Successfully data retrieved for: http://www.sema

In [48]:
# Create the characters ttl file.
# Make sure the output folder exists, and write UTF-8 explicitly: Turtle files are UTF-8,
# while open() would otherwise fall back to the platform's default encoding.
Path(savePath).mkdir(parents=True, exist_ok=True)
with open(savePath + 'characters.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))