In [68]:
import numpy as np
import pandas as pd
import os
import time
import networkx as nx
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, XSD
from pyvis.network import Network
import urllib.parse
import re
import rdflib
import spotipy
import uuid
from SPARQLWrapper import SPARQLWrapper, POST
from spotipy.oauth2 import SpotifyClientCredentials

## Helper Functions

In [69]:
def sanitize_for_uri(name):
    """Return ``str(name)`` with every disallowed character removed.

    Only ASCII letters, digits, underscore, space, ``;``, ``/`` and ``-`` are
    kept; everything else (punctuation, accented letters, newlines, ...) is
    dropped rather than replaced. Non-string inputs are stringified first.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_ ;/-]')
    text = str(name)
    return disallowed.sub('', text)

## Spotify API 

In [70]:
# Credentials must never be committed with the notebook. They are read from the
# environment instead (the same variable names spotipy itself looks for), so a
# missing variable fails immediately with a KeyError naming it.
# NOTE(review): the client secret that was previously hardcoded here should be
# treated as leaked and rotated in the Spotify developer dashboard.
client_credentials_manager = SpotifyClientCredentials(
    client_id=os.environ["SPOTIPY_CLIENT_ID"],
    client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Converting Spotify API Data to CSV file

In [71]:
# Source dataset: only the `track_id` column is loaded, since the IDs are the
# only thing needed to query the Spotify API below.
source_dataset_path = "../data/dataset.csv"
df = pd.read_csv(source_dataset_path, usecols=["track_id"])

In [72]:
# Resume support: if a previous run already wrote results to disk, load them and
# remember which track IDs are done; otherwise start from an empty state.
saved_file_path = "../data/spotify_data.csv"
has_saved_results = os.path.exists(saved_file_path)

saved_df = pd.read_csv(saved_file_path) if has_saved_results else pd.DataFrame()
successfully_retrieved = (
    set(saved_df['track_id'].tolist()) if has_saved_results else set()
)

In [73]:
# Track IDs from the source dataset that have not been fetched yet.
remaining_ids = set(df['track_id']) - successfully_retrieved

# Column layout of the fetched-track table. The names must match the keys of the
# per-track records built in the fetch loop (note the `*_spotify_uri` names);
# otherwise the saved CSV ends up with extra, permanently empty columns.
columns = [
    'track_id', 'track_name', 'track_popularity', 'track_duration_ms', 'track_explicit', 'track_isrc',
    'album_id', 'album_name', 'album_type', 'album_release_date', 'album_total_tracks', 'album_spotify_uri',
    'artist_id', 'artist_name', 'artist_spotify_uri', 'track_spotify_uri'
]
all_tracks_df = pd.DataFrame(columns=columns)

In [74]:
# Fetch the remaining tracks from the Spotify API in batches of `batch_size` IDs
# and checkpoint the results to `saved_file_path` after every batch, so an
# interrupted run can be resumed later.
batch_size = 50
batch_count = 1
# Ceiling division: an exact multiple of batch_size must not report an extra batch.
total_batches = (len(remaining_ids) + batch_size - 1) // batch_size

while remaining_ids:
    batch = list(remaining_ids)[:batch_size]
    print(f"Processing batch {batch_count} of {total_batches}...")

    try:
        response = sp.tracks(batch)

        # Build all records of the batch first and append them in one go; this
        # avoids one DataFrame concat per track and keeps a failing batch from
        # leaving partial rows behind.
        batch_records = []
        for track in response['tracks']:
            if not track:
                # Falsy entries in the response are skipped; their IDs are still
                # removed from the queue below but not marked as retrieved.
                continue
            album = track['album']
            artists = track['artists']
            batch_records.append({
                'track_id': track['id'],
                'track_name': track['name'],
                'track_popularity': track['popularity'],
                'track_duration_ms': track['duration_ms'],
                'track_explicit': track['explicit'],
                'track_isrc': track['external_ids'].get('isrc'),
                'album_id': album['id'],
                'album_name': album['name'],
                'album_type': album['album_type'],
                'album_release_date': album['release_date'],
                'album_total_tracks': album['total_tracks'],
                'album_spotify_uri': album['uri'],
                # Multi-artist tracks are flattened into '; '-joined strings.
                'artist_id': '; '.join([artist['id'] for artist in artists]),
                'artist_name': '; '.join([artist['name'] for artist in artists]),
                'artist_spotify_uri': '; '.join([artist['uri'] for artist in artists]),
                'track_spotify_uri': track['uri']
            })

        if batch_records:
            all_tracks_df = pd.concat([all_tracks_df, pd.DataFrame(batch_records)], ignore_index=True)
        successfully_retrieved.update(record['track_id'] for record in batch_records)
        remaining_ids -= set(batch)

        # Checkpoint: write previously saved rows together with the newly fetched
        # ones. Writing only the new rows would overwrite (and lose) the results
        # of earlier runs. Duplicates are dropped in case this cell is re-run.
        checkpoint_df = pd.concat([saved_df, all_tracks_df], ignore_index=True)
        checkpoint_df = checkpoint_df.drop_duplicates(subset='track_id', keep='last')
        checkpoint_df.to_csv(saved_file_path, index=False)

    except Exception as e:
        # Stop on the first failure; everything up to the previous batch is
        # already on disk and the failed batch stays in `remaining_ids`.
        print(f"Error processing batch {batch_count}: {e}")
        break

    # Pause between requests.
    time.sleep(1)
    batch_count += 1

In [75]:
# Set TEST_MODE to False to build the full ontology. When True, only the first
# TEST_SAMPLE_SIZE tracks are used and the output goes to a separate test file.
TEST_MODE = True
TEST_SAMPLE_SIZE = 100

spotify_data = pd.read_csv(saved_file_path)
turtle_file_name = "../turtle/media_ontology.ttl"

if TEST_MODE:
    # .copy() so that later column assignments do not operate on a slice of the
    # full frame.
    spotify_data = spotify_data.head(TEST_SAMPLE_SIZE).copy()
    turtle_file_name = "../turtle/media_ontology_TEST.ttl"

In [76]:
# Drop columns that contain no values at all.
spotify_data = spotify_data.dropna(axis=1, how='all')

# Strip characters outside the allowed set from the human-readable names.
for name_column in ('artist_name', 'track_name', 'album_name'):
    spotify_data[name_column] = spotify_data[name_column].apply(sanitize_for_uri)

# Reduce each Spotify URI column to the bare ID by removing its own prefix.
# Each column must be derived from itself: deriving the artist and album columns
# from `track_spotify_uri` would overwrite them with the track ID.
for uri_column, uri_prefix in (
    ('track_spotify_uri', 'spotify:track:'),
    ('artist_spotify_uri', 'spotify:artist:'),
    ('album_spotify_uri', 'spotify:album:'),
):
    spotify_data[uri_column] = spotify_data[uri_column].str.replace(uri_prefix, '', regex=False)

## Initialize Graph

In [77]:
# Project namespace used below for the ontology classes and for the IRIs of
# individual tracks, albums and artists.
MEDIA = Namespace("http://mediafinder.org/media/")
# Project namespace used below for all predicates.
PROPERTY = Namespace("http://mediafinder.org/properties/")
# RDF Schema vocabulary (Class, subClassOf, domain, range, Literal).
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

In [78]:
# Create the graph and register the prefixes used when serializing it.
g = Graph()
for prefix, namespace in (("media", MEDIA), ("property", PROPERTY), ("rdfs", RDFS)):
    g.bind(prefix, namespace)

## Classes

In [79]:
# Declare every ontology class as an rdfs:Class.
for media_class in (
    MEDIA.Media,
    MEDIA.Track,
    MEDIA.Album,
    MEDIA.Person,
    MEDIA.Artist,
    MEDIA.Image,
):
    g.add((media_class, RDF.type, RDFS.Class))

<Graph identifier=N22caeb5a38bd487b9ac8be149ac771c7 (<class 'rdflib.graph.Graph'>)>

## Subclasses

In [80]:
# Class hierarchy as (subclass, superclass) pairs: tracks, albums and images are
# media; artists are persons.
for subclass, superclass in (
    (MEDIA.Track, MEDIA.Media),
    (MEDIA.Album, MEDIA.Media),
    (MEDIA.Image, MEDIA.Media),
    (MEDIA.Artist, MEDIA.Person),
):
    g.add((subclass, RDFS.subClassOf, superclass))

<Graph identifier=N22caeb5a38bd487b9ac8be149ac771c7 (<class 'rdflib.graph.Graph'>)>

## Properties

In [81]:
# Declare every predicate used in the graph as an rdf:Property. This includes
# the two linking properties `trackBelongsToAlbum` and `trackHasArtist`, which
# are used when the graph is populated and therefore need a declaration too.
for property_name in (
    "trackName",
    "trackPopularity",
    "trackDuration",
    "trackExplicit",
    "trackISRC",
    "albumName",
    "albumType",
    "albumReleaseDate",
    "albumTotalTracks",
    "artistName",
    "albumSpotifyURI",
    "artistSpotifyURI",
    "trackSpotifyURI",
    "containsObject",
    "base64Value",
    "trackBelongsToAlbum",
    "trackHasArtist",
):
    g.add((PROPERTY[property_name], RDF.type, RDF.Property))

<Graph identifier=N22caeb5a38bd487b9ac8be149ac771c7 (<class 'rdflib.graph.Graph'>)>

## Domains and Ranges

In [82]:
# Literal-valued properties: property name -> domain class. The range of each
# of these is rdfs:Literal.
literal_property_domains = {
    "containsObject": MEDIA.Image,
    "base64Value": MEDIA.Image,
    "trackName": MEDIA.Track,
    "trackPopularity": MEDIA.Track,
    "trackDuration": MEDIA.Track,
    "trackExplicit": MEDIA.Track,
    "trackISRC": MEDIA.Track,
    "albumName": MEDIA.Album,
    "albumType": MEDIA.Album,
    "albumReleaseDate": MEDIA.Album,
    "albumTotalTracks": MEDIA.Album,
    "artistName": MEDIA.Artist,
    "albumSpotifyURI": MEDIA.Album,
    "artistSpotifyURI": MEDIA.Artist,
    "trackSpotifyURI": MEDIA.Track,
}
for property_name, domain_class in literal_property_domains.items():
    g.add((PROPERTY[property_name], RDFS.domain, domain_class))
    g.add((PROPERTY[property_name], RDFS.range, RDFS.Literal))

# Linking properties: property name -> (domain class, range class). These are
# the predicates that connect a track to its album and to its artists.
object_property_signatures = {
    "trackBelongsToAlbum": (MEDIA.Track, MEDIA.Album),
    "trackHasArtist": (MEDIA.Track, MEDIA.Artist),
}
for property_name, (domain_class, range_class) in object_property_signatures.items():
    g.add((PROPERTY[property_name], RDFS.domain, domain_class))
    g.add((PROPERTY[property_name], RDFS.range, range_class))

<Graph identifier=N22caeb5a38bd487b9ac8be149ac771c7 (<class 'rdflib.graph.Graph'>)>

In [83]:
def _add_literal(graph, subject, predicate, value):
    """Add ``value`` as a literal on ``subject`` unless the value is missing.

    Missing values (None / NaN) are skipped so that the graph does not end up
    with literals such as "nan" for fields the source data does not have.
    """
    if pd.notna(value):
        graph.add((subject, predicate, Literal(value)))


def _mint_iri(key):
    """Return a MEDIA IRI whose local name is the UUIDv5 of ``str(key)``."""
    return URIRef(MEDIA[f"{uuid.uuid5(uuid.NAMESPACE_DNS, str(key))}"])


# Turn every row of `spotify_data` into a track, its album and its artists.
for index, row in spotify_data.iterrows():
    track_uri = _mint_iri(row['track_id'])
    g.add((track_uri, RDF.type, MEDIA.Track))
    _add_literal(g, track_uri, PROPERTY.trackName, row['track_name'])
    _add_literal(g, track_uri, PROPERTY.trackPopularity, row['track_popularity'])
    _add_literal(g, track_uri, PROPERTY.trackDuration, row['track_duration_ms'])
    _add_literal(g, track_uri, PROPERTY.trackExplicit, row['track_explicit'])
    _add_literal(g, track_uri, PROPERTY.trackISRC, row['track_isrc'])
    _add_literal(g, track_uri, PROPERTY.trackSpotifyURI, row['track_spotify_uri'])

    album_uri = _mint_iri(row['album_id'])
    g.add((album_uri, RDF.type, MEDIA.Album))
    _add_literal(g, album_uri, PROPERTY.albumName, row['album_name'])
    _add_literal(g, album_uri, PROPERTY.albumType, row['album_type'])
    _add_literal(g, album_uri, PROPERTY.albumReleaseDate, row['album_release_date'])
    if pd.notna(row['album_total_tracks']):
        # Stored as the string form of the integer count; int() would raise on
        # a missing value, hence the guard.
        g.add((album_uri, PROPERTY.albumTotalTracks, Literal(str(int(row['album_total_tracks'])))))
    _add_literal(g, album_uri, PROPERTY.albumSpotifyURI, row['album_spotify_uri'])
    g.add((track_uri, PROPERTY.trackBelongsToAlbum, album_uri))

    # Artist fields are '; '-joined strings with one entry per artist. Pair each
    # name with its own Spotify URI instead of attaching the whole joined string
    # to every artist of the track.
    artist_names = row['artist_name'].split('; ')
    artist_spotify_uris = str(row['artist_spotify_uri']).split('; ')
    if len(artist_spotify_uris) != len(artist_names):
        # The two lists cannot be aligned (e.g. a name that itself contains
        # '; '); fall back to the unsplit value for every artist.
        artist_spotify_uris = [row['artist_spotify_uri']] * len(artist_names)

    for artist_name, artist_spotify_uri in zip(artist_names, artist_spotify_uris):
        # NOTE(review): the artist IRI is derived from the (sanitized) name, so
        # two different artists with the same name share one node. Deriving it
        # from `artist_id` would avoid that but changes the generated IRIs.
        artist_uri = _mint_iri(artist_name)
        g.add((artist_uri, RDF.type, MEDIA.Artist))
        g.add((artist_uri, PROPERTY.artistName, Literal(artist_name)))
        _add_literal(g, artist_uri, PROPERTY.artistSpotifyURI, artist_spotify_uri)
        g.add((track_uri, PROPERTY.trackHasArtist, artist_uri))

# Write the complete graph (schema + data) as Turtle.
g.serialize(turtle_file_name, format="turtle")

<Graph identifier=N22caeb5a38bd487b9ac8be149ac771c7 (<class 'rdflib.graph.Graph'>)>