In [32]:
import networkx as nx
import timeit
import json
import random

In [33]:
# Pin the global `random` generator to a fixed seed so any later random draws repeat
# from run to run. (No call that consumes randomness is visible in this part of the notebook.)
random.seed(a=0)

In [34]:
start = timeit.default_timer()

# Build one undirected graph from the first `num_files` slice files of the dataset.
#   Nodes: playlists are keyed by their "pid"; tracks, artists and albums by their URI.
#   Edges: track-artist, track-album, album-artist and track-playlist.
# todo? add different weights to connections between song and artist/album/playlist in the graph??
G = nx.Graph()
num_files = 5
slice_size = 1000  # slice files are named by a 1000-wide range: 0-999, 1000-1999, ...
progress = ""
for i in range(num_files):
    lb = i * slice_size
    ub = lb + slice_size - 1
    filename = f"spotify_million_playlist_dataset/data/mpd.slice.{lb}-{ub}.json"
    # Explicit encoding: without it open() falls back to the locale default, which can
    # mis-decode the non-ASCII artist/album names on some platforms.
    with open(filename, encoding="utf-8") as read_file:
        mpd_slice = json.load(read_file)

    for playlist in mpd_slice["playlists"]:
        pid = playlist["pid"]
        G.add_node(
            pid,
            playlist_name=playlist["name"],
            collaborative=playlist["collaborative"],
            modified_at=playlist["modified_at"],
            num_tracks=playlist["num_tracks"],
            num_albums=playlist["num_albums"],
        )
        for track in playlist["tracks"]:
            track_uri = track["track_uri"]
            artist_uri = track["artist_uri"]
            album_uri = track["album_uri"]

            G.add_node(
                track_uri,
                track_name=track["track_name"],
                duration=track["duration_ms"],
                artist_name=track["artist_name"],
                album_name=track["album_name"],
            )
            G.add_node(artist_uri, artist_name=track["artist_name"])
            G.add_node(album_uri, album_name=track["album_name"])

            G.add_edge(track_uri, artist_uri)
            G.add_edge(track_uri, album_uri)
            G.add_edge(album_uri, artist_uri)
            G.add_edge(track_uri, pid)

    # "\r" returns the cursor to the line start so the next status overwrites this one.
    progress = f"{i+1}/{num_files}; n:{G.number_of_nodes()}, m:{G.number_of_edges()}"
    print(progress, end="\r")

# The summary is shorter than the progress line it overwrites; pad it to the same width,
# otherwise the tail of the progress text is left behind and reads as extra digits of `m`.
summary = f"n:{G.number_of_nodes()}, m:{G.number_of_edges()}"
print(summary.ljust(len(progress)))

stop = timeit.default_timer()
print('Time: ', stop - start)

n:193865, m:605495
Time:  4.53984850004781


In [35]:
# Seed tracks for the personalized PageRank run; both in this example are from playlist 0.
# Every seed gets the same weight of 0.1.
personalization_dict = dict.fromkeys(
    (
        'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',  # Toxic - Britney Spears
        'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',  # Lose Control - Missy Elliott
    ),
    0.1,
)

In [36]:
# Personalized PageRank over the whole graph, using the seed tracks in
# `personalization_dict` as the personalization vector.
# `results` maps every node key (playlist pid or URI) to its score.
results = nx.pagerank(
    G,
    personalization=personalization_dict,
)
results

{0: 0.0031482297322574116,
 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI': 0.07636469022769218,
 'spotify:artist:2wIVse2owClT7go1WT98tk': 0.0031704716176044227,
 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K': 0.0019349808511729227,
 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak': 0.07644561163014336,
 'spotify:artist:26dSoYclwsYLMAKD3tpOr4': 0.002105626853089296,
 'spotify:album:0z7pVBGOD7HCIB7S8eLkLI': 0.001326151080959247,
 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv': 0.0006022023565278384,
 'spotify:artist:6vWDO969PvNqNYHIOW5v0m': 0.00011392286363072708,
 'spotify:album:25hVFAxTlDvXbx2X2QkUkE': 1.365895771224168e-05,
 'spotify:track:1AWQoqb9bSvzTjaLralEkT': 0.00026634468656771073,
 'spotify:artist:31TPClRtHm23RisEBtV3X7': 7.855502266477394e-05,
 'spotify:album:6QPkyl04rXwTGlGlcYaRoW': 1.808936500892755e-05,
 'spotify:track:1lzr43nnXAijIGYnCT8M8H': 0.00046262312229823567,
 'spotify:artist:5EvFsr3kj42KNv97ZEnqij': 3.854760909971099e-05,
 'spotify:album:6NmFmPX56pcLBOFMhIiKvF': 1.7929239664159284e-05,
 'spotify:

In [37]:
# Keep only track nodes; keys are cast with str() because playlist nodes are keyed by
# their integer pid. `rec_songs` is lazy and is consumed by the sort below.
rec_songs = (
    entry
    for entry in results.items()
    if str(entry[0]).startswith("spotify:track:")
)
# Top 25 (uri, score) pairs, highest score first.
reccs = sorted(rec_songs, key=lambda entry: entry[1], reverse=True)[:25]

In [38]:
# Print every recommendation as its (uri, score) pair followed by the attributes
# stored on that node in the graph.
for uri, score in reccs:
    print((uri, score), G.nodes[uri])

('spotify:track:6I9VzXrHxO9rA9A5euc8Ak', 0.07644561163014336) {'track_name': 'Toxic', 'duration': 198800, 'artist_name': 'Britney Spears', 'album_name': 'In The Zone'}
('spotify:track:0UaMYEvWZi0ZqiDOoHU3YI', 0.07636469022769218) {'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)', 'duration': 226863, 'artist_name': 'Missy Elliott', 'album_name': 'The Cookbook'}
('spotify:track:2gam98EZKrF9XuOkU13ApN', 0.0006255381071306575) {'track_name': 'Promiscuous', 'duration': 242293, 'artist_name': 'Nelly Furtado', 'album_name': 'Loose'}
('spotify:track:5dNfHmqgr128gMY2tc5CeJ', 0.0006171327196380769) {'track_name': 'Ignition - Remix', 'duration': 186066, 'artist_name': 'R. Kelly', 'album_name': 'Chocolate Factory'}
('spotify:track:0WqIKmW4BTrj3eJFmnCKMv', 0.0006022023565278384) {'track_name': 'Crazy In Love', 'duration': 235933, 'artist_name': 'Beyoncé', 'album_name': 'Dangerously In Love (Alben für die Ewigkeit)'}
('spotify:track:4z5fkIflIBvSG9elVNmiOJ', 0.000601447630248173) {'track_na

In [39]:
# Attributes of the third-ranked track. In this example this song is also in the first
# playlist, just like the two songs in the personalization vector.
third_track_uri = reccs[2][0]
G.nodes[third_track_uri]

{'track_name': 'Promiscuous',
 'duration': 242293,
 'artist_name': 'Nelly Furtado',
 'album_name': 'Loose'}