## Create __Star Wars__ RDF database
__Key points:__
- The code was run with __Python 3.11__.
- The project must be located in the __Desktop__ folder. If you downloaded it to another location, don't forget to update the paths in the file-path cell below.

In [56]:
# import libraries
from pathlib import Path
import pandas as pd
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# regular expression
import re

In [57]:
# Input locations: every CSV lives in the dataset folder under the user's home
homePath = str(Path.home())
datasetPath = "/Desktop/MovieDB/dataset"
charactersPath = f"{homePath}{datasetPath}/characters.csv"
planetsPath = f"{homePath}{datasetPath}/planets.csv"
speciesPath = f"{homePath}{datasetPath}/species.csv"
starshipsPath = f"{homePath}{datasetPath}/starships.csv"
vehiclesPath = f"{homePath}{datasetPath}/vehicles.csv"

# Output location: folder prefix (with trailing slash) for the generated .ttl files
savePath = f"{homePath}/Desktop/MovieDB/ttl/"

In [58]:
# Base IRI of the ontology; terms are built from it via namespace[...] / namespace.<name>
namespace = Namespace(
    "http://www.semanticweb.org/ontologies/database2/project/starwars/example4/"
)

## Characters

In [59]:
# Read the characters CSV; pandas' default NA markers are disabled and
# only "_" is listed as a missing-value marker
characters = pd.read_csv(
    charactersPath,
    sep=',',
    na_values=['_'],
    keep_default_na=False,
)
# Print a column/dtype summary as a quick sanity check
characters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        87 non-null     object
 1   height      87 non-null     object
 2   mass        87 non-null     object
 3   hair_color  87 non-null     object
 4   skin_color  87 non-null     object
 5   eye_color   87 non-null     object
 6   birth_year  87 non-null     object
 7   gender      87 non-null     object
 8   homeworld   87 non-null     object
 9   species     87 non-null     object
dtypes: object(10)
memory usage: 6.9+ KB


In [60]:
# Start from an empty graph that will hold the character triples
g = Graph()

# Register the prefixes so the serialized output is more readable
for prefix, ns in (("foaf", FOAF), ("xsd", XSD), ("namespace", namespace)):
    g.bind(prefix, ns)

In [61]:
def create_uri_component(string):
    """Return *string* reduced to characters usable as a URI component.

    Each space is first turned into an underscore; every remaining run of
    non-word characters (anything not matched by the regex class ``\\w``)
    is then dropped.
    """
    underscored = "_".join(string.split(" "))
    return re.sub(r'\W+', '', underscored)

# (CSV column, ontology property) pairs whose values are stored as literals
# without an explicit datatype
character_fields = (
    ('height', 'height'),
    ('mass', 'mass'),
    ('hair_color', 'hairColor'),
    ('skin_color', 'skinColor'),
    ('eye_color', 'eyeColor'),
    ('birth_year', 'birthYear'),
    ('gender', 'gender'),
    ('homeworld', 'homeworld'),
    ('species', 'species'),
)

# Turn every row of the characters dataframe into one Character resource
for _, record in characters.iterrows():
    # The resource IRI is the namespace followed by the sanitised name
    slug = create_uri_component(record['name'])
    subject = URIRef(namespace + slug)

    # Type the resource and keep the original name as an xsd:string literal
    g.add((subject, RDF.type, namespace['Character']))
    g.add((subject, namespace['name'], Literal(record['name'], datatype=XSD.string)))

    # Remaining columns are attached under their camelCase property names
    for column, prop in character_fields:
        g.add((subject, namespace[prop], Literal(record[column])))

In [62]:
# Write the characters graph to characters.ttl
# Create the output folder if it is missing so open() does not fail on a fresh setup
Path(savePath).mkdir(parents=True, exist_ok=True)
# Turtle documents are UTF-8 encoded; pin the encoding rather than relying on the
# platform default, which would corrupt non-ASCII literals on e.g. Windows
with open(savePath + 'characters.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

## Planets

In [63]:
# Read the planets CSV; pandas' default NA markers are disabled and
# only "_" is listed as a missing-value marker
planets = pd.read_csv(
    planetsPath,
    sep=',',
    na_values=['_'],
    keep_default_na=False,
)
# Print a column/dtype summary as a quick sanity check
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             61 non-null     object
 1   rotation_period  61 non-null     object
 2   orbital_period   61 non-null     object
 3   diameter         61 non-null     object
 4   climate          61 non-null     object
 5   gravity          61 non-null     object
 6   terrain          61 non-null     object
 7   surface_water    61 non-null     object
 8   population       61 non-null     object
dtypes: object(9)
memory usage: 4.4+ KB


In [64]:
# Replace g with a fresh, empty graph that will hold the planet triples
g = Graph()

# Register the prefixes so the serialized output is more readable
prefix_bindings = {"foaf": FOAF, "xsd": XSD, "namespace": namespace}
for prefix, ns in prefix_bindings.items():
    g.bind(prefix, ns)

In [65]:
# CSV column -> ontology property, for values stored as literals without an
# explicit datatype (dict order is the order in which triples are added)
planet_fields = {
    'rotation_period': 'rotationPeriod',
    'orbital_period': 'orbitalPeriod',
    'diameter': 'diameter',
    'climate': 'climate',
    'gravity': 'gravity',
    'terrain': 'terrain',
    'surface_water': 'surfaceWater',
    'population': 'population',
}

# Turn every row of the planets dataframe into one Planet resource
for _, record in planets.iterrows():
    # The resource IRI is the namespace followed by the sanitised name
    slug = create_uri_component(record['name'])
    subject = URIRef(namespace + slug)

    # Type the resource and keep the original name as an xsd:string literal
    g.add((subject, RDF.type, namespace['Planet']))
    g.add((subject, namespace['name'], Literal(record['name'], datatype=XSD.string)))

    # Remaining columns are attached under their camelCase property names
    for column, prop in planet_fields.items():
        g.add((subject, namespace[prop], Literal(record[column])))

In [66]:
# Write the planets graph to planets.ttl
# Create the output folder if it is missing so open() does not fail on a fresh setup
Path(savePath).mkdir(parents=True, exist_ok=True)
# Turtle documents are UTF-8 encoded; pin the encoding rather than relying on the
# platform default, which would corrupt non-ASCII literals on e.g. Windows
with open(savePath + 'planets.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))