In [1]:
# Imports
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Directory containing JSON files
# directory = './spotify_million_playlist_dataset/data'
directory = './Test Playlist'
# Initialize data structures
playlists_data = []
unique_songs = {}  # Dictionary to store unique songs with URI as key
playlist_song_map = {}  # Mapping of playlist name to song URIs

# Read each JSON file and extract data
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as file:
            data = json.load(file)
            for playlist in data['playlists']:
                # Extract playlist-level features
                playlist_info = {
                    'name': playlist['name'],
                    'num_tracks': playlist['num_tracks'],
                    'num_albums': playlist['num_albums'],
                    'num_followers': playlist['num_followers'],
                    'num_edits': playlist['num_edits'],
                    'duration_ms': playlist['duration_ms'],
                    'num_artists': playlist['num_artists']
                }
                playlists_data.append(playlist_info)

                # Initialize song list for this playlist
                playlist_song_map[playlist['name']] = []

                # Extract and store unique track-level features
                for track in playlist['tracks']:
                    track_uri = track['track_uri']
                    if track_uri not in unique_songs:
                        unique_songs[track_uri] = {
                            'artist_name': track['artist_name'],
                            'track_name': track['track_name'],
                            'duration_ms': track['duration_ms']
                        }
                    playlist_song_map[playlist['name']].append(track_uri)

# Convert playlists data and unique songs to DataFrames
playlists_df = pd.DataFrame(playlists_data)
unique_songs_df = pd.DataFrame(unique_songs.values(), index=unique_songs.keys())

# Now, playlists_df, unique_songs_df, and playlist_song_map are ready for further processing

In [4]:
# Assuming `playlists_df`, `unique_songs_df`, and `playlist_song_map` are already defined as per the user's code

# Step 1: Tokenize Playlist Names
tfidf_vectorizer = TfidfVectorizer()
playlist_name_tfidf = tfidf_vectorizer.fit_transform(playlists_df['name'])

# Step 2: Create Artist and Song Features Efficiently
# Create a list of all unique artists and songs
all_artists = set(artist for song in unique_songs.values() for artist in song['artist_name'])
all_songs = set(song['track_name'] for song in unique_songs.values())

# Convert sets to lists
all_artists_list = list(all_artists)
all_songs_list = list(all_songs)

# Ensure unique playlist names (if necessary, adjust this based on your data)
playlists_df['unique_playlist_id'] = playlists_df['name'] + '_' + playlists_df.index.astype(str)

# Create artist presence data
artist_presence = {(playlist_id, unique_songs[track_uri]['artist_name']): 1
                   for playlist_id, tracks in playlist_song_map.items()
                   for track_uri in tracks}
multi_index_artists = pd.MultiIndex.from_tuples(artist_presence.keys(), names=['playlist_id', 'artist'])
artist_series = pd.Series(artist_presence, index=multi_index_artists)
artist_matrix = artist_series.unstack(fill_value=0)
artist_matrix = artist_matrix.reindex(columns=all_artists_list, fill_value=0)

# Create song presence data
song_presence = {(playlist_id, unique_songs[track_uri]['track_name']): 1
                 for playlist_id, tracks in playlist_song_map.items()
                 for track_uri in tracks}
multi_index_songs = pd.MultiIndex.from_tuples(song_presence.keys(), names=['playlist_id', 'song'])
song_series = pd.Series(song_presence, index=multi_index_songs)
song_matrix = song_series.unstack(fill_value=0)
song_matrix = song_matrix.reindex(columns=all_songs_list, fill_value=0)

# Combine Features
combined_features = pd.concat([
    pd.DataFrame(playlist_name_tfidf.toarray(), index=playlists_df['unique_playlist_id'], columns=tfidf_vectorizer.get_feature_names_out()),
    artist_matrix,
    song_matrix
], axis=1)

# The `combined_features` DataFrame is now ready for unsupervised learning
combined_features.head()

Unnamed: 0,000,00s,100,123,14,15,16,17,18,1967,...,End Credits (Aladdin And The King Of Thieves),Desperado - (Live),Cheapest Flight,Policeman - Radio Edit,Alive - Oh Snap It’s Luke! Remix,Hair,Find Love (feat. Dboy),17 (feat. Packy),Generation Away,Latkes
party party_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
summer_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
Rap_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
#tb_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
Disney_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [6]:
song_playlist_matrix = combined_features.values

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [7]:
from sklearn.decomposition import NMF

# number of factors
n_factors = 20

# apply matrix factorisation using Non-negative Matrix Factorisation (NMF)
model = NMF(n_components=n_factors)
playlist_factors = model.fit_transform(song_playlist_matrix) # playlist_factors holds the 'tastes' of our playlists
song_factors = model.components_.T  # song_factors holds the 'features' of our songs

ValueError: Input X contains NaN.
NMF does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values