In [None]:
import csv
import json
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import spotipy
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
# Read the Spotify credentials from the environment; load_dotenv() runs first
# so that values supplied through a .env file are visible to os.getenv
load_dotenv()
CLIENT_ID, CLIENT_SECRET = (os.getenv(var_name) for var_name in ('CLIENT_ID', 'CLIENT_SECRET'))

# Build the client-credentials manager and the Spotify client that uses it
client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# ------------------------------ GET USER PLAYLIST DATA ------------------------------
# Audio-feature keys copied from each Spotify response entry onto the track record
features = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
user_playlist = pd.read_csv('../Playlists/hedgehogs_dilemma.csv')
# user_playlist = pd.read_csv('../Playlists/you_wanna_rock.csv')

TRACK_URI_PREFIX = 'spotify:track:'  # 14 characters; sliced off the URI to get the bare track id
BATCH_SIZE = 100                     # number of track ids sent per sp.audio_features call
BATCH_PAUSE_SECONDS = 2              # pause between consecutive requests

# One record per playlist row; audio features are filled in batch by batch below
track_records = []
for index, row in user_playlist.iterrows():
    track_records.append({
        '': index,
        'artist_name': row['Artist Name(s)'],
        'track_name': row['Track Name'],
        'track_id': row['Track URI'][len(TRACK_URI_PREFIX):],
        # ... audio-feature attributes are added once the API responds
    })

user_tracks = [] # Output of list of track dicts
# Chunk by position instead of comparing the DataFrame index label with the
# last row number, so the final partial batch is always sent exactly once.
for batch_start in range(0, len(track_records), BATCH_SIZE):
    if batch_start:
        # Only pause *between* requests; there is nothing to wait for after the last one
        time.sleep(BATCH_PAUSE_SECONDS)
    track_info_100 = track_records[batch_start:batch_start + BATCH_SIZE]
    track_ids_100 = [track_info['track_id'] for track_info in track_info_100]
    response = sp.audio_features(track_ids_100)
    print(f"Successful response with {len(response)} entries")
    # Update track info for each song; entries without audio features are skipped
    for track_info, audio_features in zip(track_info_100, response):
        if audio_features:  # Check if audio_features is not None
            for feature in features:
                track_info[feature] = audio_features[feature]
            user_tracks.append(track_info)
    print(f"Total songs stored so far: {len(user_tracks) = }")

user_df = pd.DataFrame(user_tracks)
# The last printed index equals len(user_playlist) - 1 only when no track was skipped above
print(user_df)

In [None]:
# Keep only the audio-feature columns (drops track name, artist name, track id, index, ...)
user_features = user_df[features]
# Column-wise mean of those features is the centroid of the user's playlist;
# it is reshaped into a one-row DataFrame whose columns are the feature names
centroid_vec = user_features.mean(axis=0)
user_centroid = centroid_vec.to_frame().T
print(user_centroid)

In [None]:
# ------------------------------ GET CENTROIDS OF 1K PLAYLIST DATASET ------------------------------
slices = ["0-999", "1000-1999", "2000-2999", "3000-3999", "4000-4999", "5000-5999", "6000-6999", "7000-7999", "8000-8999", "9000-9999"]
# ---------- YOUR FILEPATHS HERE ----------
FILENAME = "mpd.slice.0-999.json"
FOLDER = r"../spotify_million_playlist_dataset/data"
PATH = FOLDER + "/" + FILENAME
# ---------- YOUR FILEPATHS HERE ----------
TRACK_URI_PREFIX_LEN = len("spotify:track:")  # == 14, the offset sliced off each track URI

playlists = []  # one DataFrame of matched track rows per playlist that had at least one match
for index, slice_range in enumerate(slices, start=1):
    # Rebuild the JSON path for every slice. PATH used to be computed only once
    # above the loop, so the first slice's JSON was re-read on every iteration
    # while the CSV kept changing.
    FILENAME = "mpd.slice.{}.json".format(slice_range)
    PATH = FOLDER + "/" + FILENAME
    dataset_tracks = pd.read_csv('../mpd.slice.{}.csv'.format(slice_range))
    # Map each track id to its first CSV row (as a dict). This replaces a full
    # boolean scan of the slice CSV for every single playlist track and keeps
    # the same "take the first match" rule.
    tracks_by_id = {
        record['track_id']: record
        for record in dataset_tracks.drop_duplicates(subset='track_id').to_dict('records')
    }
    with open(PATH, "r", encoding="utf-8") as playlist_file:
        batch = json.load(playlist_file)
    print(f"Loading complete! Beginning parsing...")
    # Loop through all playlists
    for playlist_index, playlist in enumerate(batch["playlists"]):
        print(f"\nScanned playlist {playlist_index} ({len(playlist['tracks'])} songs)")
        playlist_tracks = []
        # Keep only the playlist tracks that also appear in this slice's CSV
        for track in playlist["tracks"]:
            track_features = tracks_by_id.get(track["track_uri"][TRACK_URI_PREFIX_LEN:])
            if track_features is not None:
                playlist_tracks.append(track_features)

        if playlist_tracks: # Non-empty number of tracks in playlist
            playlists.append(pd.DataFrame(playlist_tracks))
    print(f"Considering {len(playlists)} out of {index*1000} playlists in {slice_range} json.")

In [None]:
# playlists is a list holding each playlist
# each playlist is a DataFrame with one row per matched track
print(len(playlists))
# Show the column layout of the first playlist only. The statements that used
# to follow the `break` never ran (and iterating a DataFrame yields column
# labels, not track rows), so they have been removed.
if playlists:
    print(playlists[0].columns)

In [None]:
# ------------------------------ CALCULATE PLAYLIST CENTROIDS ------------------------------
features = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']

# One single-row DataFrame per playlist: the column-wise mean of its feature
# columns (track name, artist name, track id, index, ... are left out by
# selecting `features`)
centroids = [
    playlist[features].mean(axis=0).to_frame().T
    for playlist in playlists
]

# Stack the per-playlist rows on top of each other (fresh 0..n-1 index) for clustering
centroids_df = pd.concat(centroids, ignore_index=True)
print(centroids_df)
# found = centroids_df.iloc[0] # Get first row centroid

In [None]:
def find_optimal_clusters(data, max_k=20):
    """Plot the KMeans inertia curve (elbow method) for k = 1..max_k.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in the notebook-level ``features``
        list. A 'cluster' column, if present, is ignored.
    max_k : int, default 20
        Largest number of clusters to try.

    Returns
    -------
    None
        The function only displays a matplotlib figure.
    """
    # Drop the 'cluster' column so previously assigned labels are not part of the input
    if 'cluster' in data.columns:
        data = data.drop(columns=['cluster'])

    # Use a scaler local to this function: the notebook-level `scaler` is only
    # created in a later cell, and refitting it here would silently change the
    # scaling used by the clustering cells.
    scaled_data = StandardScaler().fit_transform(data[features])

    cluster_counts = range(1, max_k + 1)
    inertia = []
    for k in cluster_counts:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(scaled_data)
        inertia.append(kmeans.inertia_)

    plt.figure(figsize=(8, 4))
    plt.plot(cluster_counts, inertia, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.show()

find_optimal_clusters(centroids_df)

In [None]:
# ------------------------------ CLUSTER PLAYLISTS ------------------------------
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler  # was used below without ever being imported

# Drop the 'cluster' column from centroids_df to exclude it from the input
# (it is present when this cell is re-run after labels were attached)
if 'cluster' in centroids_df.columns:
    centroids_df = centroids_df.drop(columns=['cluster'])

# Perform k-means clustering on the playlist centroids
k = 5
# Fixed seed, matching find_optimal_clusters, so cluster ids are reproducible between runs
kmeans = KMeans(n_clusters=k, random_state=42)
# Standardize Feature Columns; `scaler` stays fitted so later cells can apply
# the same transformation to other data
scaler = StandardScaler()
scaled_centroids_df = scaler.fit_transform(centroids_df[features])
# KMeans Cluster
kmeans.fit(scaled_centroids_df)

In [None]:
# print("\nClustered Playlist Centroids:")
# print(centroids_df.head(5))
# Header line followed by the standardized centroid matrix on the next line
print("\nScaled Clustered Playlist Centroids:", scaled_centroids_df, sep="\n")

In [None]:
# ------------------------------ ASSIGN USER PLAYLIST ------------------------------
# kmeans was fitted on standardized centroids, so the user's centroid has to go
# through the same fitted scaler before prediction; predicting on the raw
# values compares numbers on a different scale than the cluster centres.
scaled_user_centroid = scaler.transform(user_centroid[features])
user_cluster = kmeans.predict(scaled_user_centroid)
print(f"Predicted cluster: {user_cluster}")

In [None]:
# Add cluster labels to the centroids DataFrame.
# Take the labels from the model that is already fitted rather than calling
# fit_predict again: a refit can assign different cluster ids, which would no
# longer correspond to the cluster predicted for the user's playlist.
centroids_df['cluster'] = kmeans.labels_

# print("\nClustered Playlist Centroids:")
print(centroids_df.head(5))


In [None]:
# ------------------------------ GET USER CLUSTER ------------------------------
# Columns kept for the candidate tracks once the merge helper columns are dropped
factors = ['Unnamed: 0', 'artist_name', 'track_name', 'track_id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

# Centroid rows that carry the same cluster label as the user's playlist
relevant = centroids_df[centroids_df['cluster'] == float(user_cluster[0])]
# Their index labels are used as positions into the `playlists` list
indices = relevant.index
relevant_playlists = [playlists[pl_index] for pl_index in indices]
relevant_tracks = pd.concat(relevant_playlists, ignore_index=True)
print(len(relevant_tracks))

# Anti-join on 'track_id': a left merge with an indicator column, keeping only
# the rows that found no partner in user_df (tracks the user already has are dropped)
merged_tracks = pd.merge(
    relevant_tracks,
    user_df,
    on='track_id',
    how='left',
    suffixes=('', '_a'),
    indicator=True,
)
relevant_tracks = merged_tracks.query('_merge == "left_only"')
relevant_tracks = relevant_tracks[factors] # Discard the '_a' duplicates and the '_merge' column
relevant_features = relevant_tracks[features] # Feature columns only (no track_name, artist_name, id, ...)

In [None]:
# Measure distances in the same standardized feature space that the clustering
# used. On raw values the result is dominated by the columns with the largest
# magnitude (duration_ms, tempo), so the other features barely matter.
scaled_user_centroid = scaler.transform(user_centroid[features])
scaled_relevant_features = scaler.transform(relevant_features[features])
distances = euclidean_distances(scaled_user_centroid, scaled_relevant_features).flatten()
# Attach distances via assign() so a new frame is produced instead of writing
# a column into a filtered selection of another frame
relevant_tracks = relevant_tracks.assign(distance=distances)

# Sort by Euclidean distance and select top n_recs (one row per track id)
n_recs = 5
recommendations = (
    relevant_tracks
    .sort_values(by='distance')
    .drop_duplicates(subset=['track_id'])
    .head(n_recs)
)

# We should always be able to find enough recs unless it's an absurdly large amount

print("Here are some songs I recommend adding to your playlist!")
print(recommendations)

In [None]:
# What was our user playlist centroid?
# (one-row DataFrame of per-feature means computed from user_df earlier in the notebook)
print(user_centroid)

In [None]:
# Graphing stuff?
import matplotlib.pyplot as plt

# Project the standardized centroids (the data k-means actually clustered) onto
# two principal components. Fitting on centroids_df would feed the 'cluster'
# label column and the unscaled features into the projection.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(scaled_centroids_df)

Df = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'])
# Components side by side with each playlist's cluster label, for inspection
finalDf = pd.concat([Df, centroids_df[['cluster']]], axis=1)

plt.figure(figsize=(8,6))
plt.scatter(Df['principal component 1'], Df['principal component 2'], c=centroids_df['cluster'].values, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Cluster')
plt.title('PCA Visualization of Tracks')

plt.show()

In [None]:
# Perform PCA on the standardized centroids, i.e. the space k-means was fitted in
pca = PCA(n_components=2)
centroids_pca = pca.fit_transform(scaled_centroids_df)
# Cluster centres live in the standardized feature space, so they must be
# projected with the same PCA; plotting their first two raw columns would put
# them on different axes than the points.
centers_pca = pca.transform(kmeans.cluster_centers_)

# Plot PCA-transformed data points
plt.figure(figsize=(8, 6))
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], c=kmeans.labels_, cmap='viridis', s=100, alpha=0.8)

# Plot centroids
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, label='Centroids')

plt.title('PCA Plot with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Write the playlist centroids (including any columns added so far, e.g. 'cluster')
# to a UTF-8 CSV file; missing values are written as the literal 'NA'
centroids_df.to_csv('centroids_with_clusters.csv', na_rep='NA', encoding='utf-8')

In [None]:
import pandas as pd
import json

# Function to convert the Playlists (list of DFs) to a JSON serializable structure
def playlists_to_json(playlists_list):
    """Turn a list of playlist DataFrames into a JSON-serializable list.

    Each playlist becomes ``{'Index': <position in the list>, 'Tracks': [...]}``
    where 'Tracks' holds one dict per DataFrame row. An 'Unnamed: 0' column, if
    present, is left out of the row dicts; the input frames are not modified.
    """
    def _track_records(frame):
        # Drop the leftover CSV index column before converting rows to dicts
        if 'Unnamed: 0' in frame.columns:
            frame = frame.drop(columns='Unnamed: 0')
        return frame.to_dict(orient='records')

    return [
        {'Index': position, 'Tracks': _track_records(frame)}
        for position, frame in enumerate(playlists_list)
    ]

# Convert Playlists (list of DFs) into plain lists/dicts and serialize them once
json_wb = playlists_to_json(playlists)
serialized_playlists = json.dumps(json_wb, indent=4)

# Write the serialized structure back to disk
with open('output.json', 'w') as json_file:
    json_file.write(serialized_playlists)

# Print the JSON structure for verification
print(serialized_playlists)