In [1]:
import networkx as nx
import timeit
import json
import random
import pickle

In [2]:
random.seed(0)

In [3]:
start = timeit.default_timer()

# todo? add different weights to connections between song and artist/album/playlist in the graph??
G = nx.Graph()
num_files = 999 # 999 == all files
trim_start=len("spotify:")
for i in range(num_files):
    lb = i*1000
    ub = lb+999
    #print(f"mpd.slice.{lb}-{ub}.json")
    filename = f"spotify_million_playlist_dataset/data/mpd.slice.{lb}-{ub}.json"
    with open(filename) as read_file:
        tmp = json.load(read_file)
        for playlist in tmp["playlists"]:

            G.add_node(playlist["pid"], playlist_name=playlist["name"])#, num_tracks=playlist["num_tracks"], num_albums=playlist["num_albums"])#, collaborative=playlist["collaborative"], modified_at=playlist["modified_at"])
            for track in playlist["tracks"]:
                track_id = track["track_uri"][trim_start:]
                artist_id = track["artist_uri"][trim_start:]
                album_id = track["album_uri"][trim_start:]
                G.add_node(track_id, track_name=track["track_name"], artist_name=track["artist_name"])#, album_name=track["album_name"])#, duration=track["duration_ms"])
                G.add_node(artist_id, artist_name=track["artist_name"])
                G.add_node(album_id, album_name=track["album_name"])
                # G.add_edge(track_id, artist_id) # dont need since songs are connected to artists via albums
                G.add_edge(track_id, album_id)
                G.add_edge(album_id, artist_id)
                G.add_edge(track_id, playlist["pid"])
    time_elapsed = timeit.default_timer() - start
    mins_elapsed = (time_elapsed)//60
    secs_elapsed = time_elapsed % 60
    print(f"{i+1}/{num_files}; n:{G.number_of_nodes()}, m:{G.number_of_edges()}; time elapsed: {mins_elapsed:02} min {secs_elapsed:02.1f} sec", end="\r")
print()
print(f"n:{G.number_of_nodes()}, m:{G.number_of_edges()}")
    
stop = timeit.default_timer()
print('Time elapsed (minutes):', (stop - start)/60)  

999/999; n:4290426, m:68512319; time elapsed: 53.0 min 45.5 sec
n:4290426, m:68512319
Time elapsed (minutes): 53.85183885166577


In [5]:
# save the read graph into a pickle file to read it in faster next time
pickle.dump(G, open("./spotify_million_playlist_dataset/data/graphrep/spotify.pickle", 'wb'))

In [6]:
n100=3260226 # num nodes in full data
m100=67662843 # num edges in full data
f"{G.number_of_nodes()/n100} % nodes, {G.number_of_edges()/m100} % links"

'1.315990363858211 % nodes, 1.0125545419367021 % links'

In [6]:
personalization_dict = { # both in this example from playlist 0
    'track:6I9VzXrHxO9rA9A5euc8Ak':6, #toxic britney spears
    'track:0UaMYEvWZi0ZqiDOoHU3YI':1 #lose control missy elliott
}

In [7]:
start = timeit.default_timer()
results = nx.pagerank(G, personalization=personalization_dict)
stop = timeit.default_timer()
print('Time elapsed (minutes):', (stop - start)/60)  

Time elapsed (minutes): 77.11673877166662


In [8]:
rec_songs = filter(lambda x: str.startswith(str(x[0]), "track:"), results.items())
reccs = sorted(rec_songs, key=lambda x: x[1], reverse=True)[:25]

In [9]:
for i in range(len(reccs)):
    print(reccs[i], G.nodes[reccs[i][0]])

('track:6I9VzXrHxO9rA9A5euc8Ak', 0.12862125190395463) {'track_name': 'Toxic', 'artist_name': 'Britney Spears'}
('track:0UaMYEvWZi0ZqiDOoHU3YI', 0.02145303126342739) {'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)', 'artist_name': 'Missy Elliott'}
('track:7KXjTSCq5nL1LoYtL7XAwS', 0.00020681104969854356) {'track_name': 'HUMBLE.', 'artist_name': 'Kendrick Lamar'}
('track:1xznGGDReH1oQq0xzbwXa3', 0.0001879775427511055) {'track_name': 'One Dance', 'artist_name': 'Drake'}
('track:7BKLCZ1jbUBVqRi2FVlTVw', 0.00018789789105930901) {'track_name': 'Closer', 'artist_name': 'The Chainsmokers'}
('track:7yyRTcZmCiyzzJlNzGC9Ol', 0.00018401327432690954) {'track_name': 'Broccoli (feat. Lil Yachty)', 'artist_name': 'DRAM'}
('track:3a1lNhkSLSkpJE4MSHpDu9', 0.00017952633261077231) {'track_name': 'Congratulations', 'artist_name': 'Post Malone'}
('track:152lZdxL1OR0ZMW6KquMif', 0.00016694119133353453) {'track_name': 'Location', 'artist_name': 'Khalid'}
('track:2EEeOnHehOozLq4aS0n6SL', 0.0001588243

In [10]:
G.nodes[reccs[2][0]]

{'track_name': 'HUMBLE.', 'artist_name': 'Kendrick Lamar'}