In [41]:
!pip install networkx


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import os
import json
import networkx as nx
import torch
from torch_geometric.data import Data
from tqdm import tqdm

# Example pipeline parameters.
# N: how many dataset files are used to build the dataset.
N = 100
# K: the K passed to the K-core decomposition.
K = 5

In [9]:

def _slice_start(filename):
    '''
    Returns the integer sort key embedded in a dataset file name, or None.

    The key is the number in front of the first "-" of the third
    dot-separated field (1000 for "mpd.slice.1000-1999.json"). None is
    returned when the name has no third field or it is not an integer.
    '''
    try:
        return int(filename.split(".")[2].split("-")[0])
    except (IndexError, ValueError):
        return None


def getFiles(N, data_dir=None):
    '''
    Returns directory and files to use for dataset

    N        -- maximum number of files to return (the first N in numeric order).
    data_dir -- directory to list; when None the original hard-coded dataset
                location is used, so existing getFiles(N) calls are unchanged.

    Returns (directory, file_names). Names are ordered by their numeric slice
    start; entries whose name carries no such number (README files, OS
    metadata, ...) are skipped instead of aborting the sort.
    Raises whatever os.listdir raises when the directory cannot be listed.
    '''
    if data_dir is None:
        # Raw string: the backslashes are path separators, not escapes
        # ('\p', '\g' and '\s' are invalid escape sequences in a normal literal).
        data_dir = r'C:\python\projects\gnn-recommender-main\spotify_million_playlist_dataset'

    keyed = []
    for name in os.listdir(data_dir):
        start = _slice_start(name)
        if start is not None:
            keyed.append((start, name))

    # Sort on the numeric key only; the sort is stable, so ties keep listing order.
    keyed.sort(key=lambda pair: pair[0])
    files = [name for _, name in keyed]
    return data_dir, files[:N]


  dir = 'C:\python\projects\gnn-recommender-main\spotify_million_playlist_dataset'


In [10]:

def makeGraph(dir, files):
    '''
    Returns a graph, number of original playlists, and hashmap of PIDs/URIs

    dir   -- directory containing the JSON files (the name shadows the
             built-in, but is kept so keyword callers keep working).
    files -- file names inside `dir`; each file must hold a top-level
             'playlists' list whose items have 'pid', 'name' and 'tracks'.

    Returns (G, orig_playlists, last_sid, p_meta, uris):
      G              -- graph with one node per playlist (type='playlist',
                        keyed by pid) and one per distinct track URI
                        (type='song'), plus an edge for every playlist/track
                        membership.
      orig_playlists -- number of distinct playlists read.
      last_sid       -- the last song node ID assigned (-1 if nothing was read).
      p_meta         -- {pid: {'name': ...}}.
      uris           -- {track_uri: {'SID', 'track_name', 'artist_name',
                        'artist_uri'}}.

    Song node IDs are assigned after the largest pid. Starting them at 0 would
    put songs and playlists in the same integer namespace, so song i and
    playlist i would become one node (overwriting its 'type') and a playlist
    containing "its own" ID would produce a self-loop.
    '''
    G = nx.Graph()
    p_meta, uris = {}, {}
    tracks_by_pid = {}   # pid -> provisional (0-based) song indices, in playlist order
    next_index = 0

    for file in files:
        # Explicit UTF-8: playlist and track names are not limited to ASCII and
        # the platform default encoding is not UTF-8 everywhere.
        with open(os.path.join(dir, file), 'r', encoding='utf-8') as f:
            playlists = json.load(f)['playlists']

        for playlist in playlists:
            pid = playlist['pid']
            G.add_node(pid, type='playlist')
            p_meta[pid] = {'name': playlist['name']}
            song_indices = tracks_by_pid.setdefault(pid, [])
            for song in playlist['tracks']:
                uri = song['track_uri']
                if uri not in uris:
                    uris[uri] = {'SID': next_index, 'track_name': song['track_name'],
                                 'artist_name': song['artist_name'], 'artist_uri': song['artist_uri']}
                    next_index += 1
                song_indices.append(uris[uri]['SID'])

    # All pids are known only now, so this is the first point at which song IDs
    # can be shifted clear of them.
    offset = max(p_meta) + 1 if p_meta else 0
    for meta in uris.values():
        meta['SID'] += offset
        G.add_node(meta['SID'], type='song')
    for pid, song_indices in tracks_by_pid.items():
        for index in song_indices:
            G.add_edge(pid, index + offset)

    orig_playlists = len(p_meta)
    return G, orig_playlists, offset + next_index - 1, p_meta, uris


In [11]:


def getKCore(G, K):
    '''
    Returns K-core graph, number of playlists, songs, and edges

    G -- graph whose nodes all carry a 'type' attribute ('playlist' or 'song').
    K -- order of the core passed to nx.k_core.

    Returns (G_kcore, num_playlists, num_songs, num_edges) where the counts
    describe G_kcore.

    The caller's graph is left untouched: self-loops have to be removed before
    the K-core is computed, and when G has any, that is done on a copy rather
    than in place.
    Raises KeyError if a node of the K-core has no 'type' attribute.
    '''
    # Materialise the self-loops first so the graph is only copied when needed.
    loops = list(nx.selfloop_edges(G))
    if loops:
        G = G.copy()
        G.remove_edges_from(loops)

    # Compute the k-core of the (self-loop free) graph
    G_kcore = nx.k_core(G, k=K)

    # Count playlists and songs in a single pass over the k-core's nodes
    num_playlists = 0
    num_songs = 0
    for _, attrs in G_kcore.nodes(data=True):
        node_type = attrs['type']
        if node_type == 'playlist':
            num_playlists += 1
        elif node_type == 'song':
            num_songs += 1
    num_edges = G_kcore.number_of_edges()

    return G_kcore, num_playlists, num_songs, num_edges


In [12]:

def reindexGraph(G, orig_playlists, num_playlists, num_songs, p_meta, uris):
    '''
    Placeholder for reindexing the graph's nodes.

    Not implemented: every argument is ignored, nothing is modified and None
    is returned. The step was specific to the original snap application and
    may not carry over directly to NetworkX, which handles node indexing
    internally; it might be simplified depending on what the application
    actually needs from reindexing.
    '''
    return None


In [13]:

def createPyObject(G):
    '''
    Converts a graph into a PyG Data object holding its edge_index.

    G -- graph with integer node labels; nodes may carry a 'type' attribute.

    Returns Data(edge_index, num_nodes, orig_id) where
      edge_index -- long tensor of shape (2, num_edges); one column per edge
                    yielded by G.edges(), with every entry < num_nodes.
      num_nodes  -- number of nodes in G.
      orig_id    -- long tensor; orig_id[i] is the label node i had in G.

    Node labels are used as they are when they already are exactly
    0..num_nodes-1. Otherwise (e.g. after nodes were pruned from the graph)
    they are renumbered to that range, nodes whose type is 'playlist' first,
    each group in G's node order, because raw labels would point past
    num_nodes.
    '''
    nodes = list(G.nodes(data=True))
    num_nodes = len(nodes)
    labels = [label for label, _ in nodes]

    if set(labels) == set(range(num_nodes)):
        # Already contiguous: keep the caller's numbering.
        ordered = list(range(num_nodes))
    else:
        playlists = [label for label, attrs in nodes if attrs.get('type') == 'playlist']
        others = [label for label, attrs in nodes if attrs.get('type') != 'playlist']
        ordered = playlists + others
    index_of = {label: i for i, label in enumerate(ordered)}

    # NOTE(review): each edge is stored once, in the direction G.edges() yields
    # it, as before -- confirm whether downstream code adds the reverse edges.
    edge_list = [(index_of[u], index_of[v]) for u, v in G.edges()]
    if edge_list:
        edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
    else:
        # torch.tensor([]).t() has shape (0,); keep the (2, 0) layout instead.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    orig_id = torch.tensor(ordered, dtype=torch.long)
    return Data(edge_index=edge_index, num_nodes=num_nodes, orig_id=orig_id)


In [14]:

def saveObject(data, p_meta, num_playlists, num_songs, num_edges, K, N):
    '''
    Placeholder for saving the data object and its metadata.

    Not implemented: every argument is ignored, nothing is written and None
    is returned. The saving logic is meant to stay the same as in the
    original code.
    '''
    return None


In [15]:

# Main execution
# Runs the pipeline end to end: choose input files -> build the graph ->
# take its K-core -> convert to a PyG Data object -> hand it to saveObject.
# NOTE(review): inside a notebook kernel __name__ is "__main__", so this guard
# always passes there; every name bound below becomes a notebook global.
if __name__ == "__main__":
    # NOTE(review): `dir` rebinds the built-in dir() for the rest of the session.
    dir, files = getFiles(N)
    # The third value makeGraph returns is the last song ID it assigned, not a
    # playlist ID, despite the name `lastPID`; it is not used afterwards.
    G, orig_playlists, lastPID, p_meta, uris = makeGraph(dir, files)
    G_kcore, num_playlists, num_songs, num_edges = getKCore(G, K)
    # Reindexing might not be necessary or needs adaptation
    # (reindexGraph is not called here; orig_playlists and uris go unused.)
    data = createPyObject(G_kcore)
    saveObject(data, p_meta, num_playlists, num_songs, num_edges, K, N)
