In [1]:
#Now that we have the following data on her playlist:
#Top 10 Artists:             |       Top 10 Genres: 
# Angel Olsen: 7.41%         |       indie rock: 6.85%
# Alabama Shakes: 5.56%      |       chamber pop: 6.36%
# Shakey Graves: 3.70%       |       stomp and holler: 3.91%
# Big Thief: 3.70%           |       art pop: 3.18%
# Vulfpeck: 2.78%            |       alternative rock: 2.69%
# Dr. Dog: 2.78%             |       indie folk: 2.44%
# Scary Pockets: 2.78%       |       modern rock: 2.44%
# Perfume Genius: 2.78%      |       indie pop: 2.44%
# DeVotchKa: 2.78%           |       noise pop: 2.44%
# Lake Street Dive: 1.85%    |       rock: 2.20%

#Let's try to find the playlists that best match her music taste

In [2]:
import json
import os
from collections import Counter
from functools import lru_cache

import numpy as np
import spotipy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spotipy.oauth2 import SpotifyClientCredentials

In [3]:
# Load Spotify API credentials from the environment rather than hardcoding
# them in the notebook. A missing variable raises KeyError immediately.
# NOTE(review): the client secret that used to be committed in this cell is
# exposed and should be rotated in the Spotify developer dashboard.
SPOTIFY_CLIENT_ID = os.environ['SPOTIPY_CLIENT_ID']
SPOTIFY_CLIENT_SECRET = os.environ['SPOTIPY_CLIENT_SECRET']

# Initialize Spotipy client
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=SPOTIFY_CLIENT_ID,
        client_secret=SPOTIFY_CLIENT_SECRET,
    )
)

# Load vetting playlist and data; the context managers close the file
# handles as soon as the JSON has been parsed.
with open('Vetting_Playlist.json') as vetting_file:
    vetting_playlist = json.load(vetting_file)
with open("1 Million Playlists/mpd.slice.0-999.json") as slice_file:
    data = json.load(slice_file)
playlists = data["playlists"]

# Preprocess data: Extract features (e.g., genres/artists frequencies) from playlists
def extract_features(playlist):
    """Return the genre and artist multisets of a playlist as flat lists.

    Args:
        playlist: Mapping with a ``'tracks'`` list; each track is a mapping
            whose optional ``'artist_name'`` names the performing artist.
            Tracks without that key are counted under ``'Unknown'``.

    Returns:
        A ``(genres, artists)`` tuple of lists. Each artist appears once per
        track it performs, and each of an artist's genres appears once per
        track by that artist. ``'Unknown'`` contributes no genres.
    """
    artist_counter = Counter(
        track.get('artist_name', 'Unknown') for track in playlist['tracks']
    )

    # Resolve genres once per distinct artist (not once per track) and weight
    # them by that artist's track count. The resulting counts and their
    # first-seen ordering are the same as a per-track update would produce.
    genre_counter = Counter()
    for artist_name, track_count in artist_counter.items():
        if artist_name == 'Unknown':
            continue
        for genre in get_artist_genres(artist_name) or ():
            genre_counter[genre] += track_count

    return list(genre_counter.elements()), list(artist_counter.elements())


# Get artist genres using Spotify API
@lru_cache(maxsize=None)
def _search_artist_genres(artist_name):
    """Query Spotify once per artist name and memoise the genres as a tuple.

    Failed lookups that raise are not cached, so they are retried on the
    next call.
    """
    # Only the top hit is used, so ask the API for a single result.
    results = sp.search(q='artist:' + artist_name, type='artist', limit=1)
    items = results['artists']['items']
    return tuple(items[0]['genres']) if items else ()


def get_artist_genres(artist_name):
    """Return the genres of the first Spotify artist hit for ``artist_name``.

    Results are cached per name, so each artist triggers at most one API
    call however many playlists or tracks mention them.

    Args:
        artist_name: Artist name to search for.

    Returns:
        A new list of genre strings, or an empty list when the search
        returns no artists. Callers may mutate the list without affecting
        the cache.
    """
    return list(_search_artist_genres(artist_name))

# Define distance metric
def distance(playlist1, playlist2):
    """Cosine distance between the genre word-count profiles of two playlists.

    Each playlist's genres are joined into one document and tokenised by
    ``CountVectorizer``, so multi-word genres share weight on common words
    (e.g. "indie rock" and "indie pop" overlap on "indie").

    Args:
        playlist1: First playlist, in the form ``extract_features`` accepts.
        playlist2: Second playlist, in the same form.

    Returns:
        A float in ``[0, 1]``: 0 for identical profiles, 1 for no overlap.
        Returns 1.0 when either playlist has no genre information.
    """
    genres1, _ = extract_features(playlist1)
    genres2, _ = extract_features(playlist2)

    # With no genres on one side the cosine similarity is 0 by convention;
    # with none on both sides CountVectorizer would raise "empty vocabulary".
    if not genres1 or not genres2:
        return 1.0

    vectorizer = CountVectorizer()
    try:
        vectorized = vectorizer.fit_transform([", ".join(genres1), ", ".join(genres2)])
    except ValueError:
        # Every token was filtered out by the tokenizer (empty vocabulary).
        return 1.0

    vec1, vec2 = vectorized.toarray()
    similarity = cosine_similarity([vec1], [vec2])[0, 0]
    # Clamp so floating-point rounding cannot produce a tiny negative distance.
    return max(0.0, 1.0 - float(similarity))


# Create distance matrix
num_playlists = len(playlists)
distance_matrix = np.zeros((num_playlists, num_playlists))
# The distance is symmetric and a playlist is at distance 0 from itself, so
# only the upper triangle is evaluated and mirrored. The diagonal keeps the
# zeros from np.zeros. This roughly halves the number of distance() calls.
for i in range(num_playlists):
    for j in range(i + 1, num_playlists):
        pair_distance = distance(playlists[i], playlists[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Perform hierarchical clustering
# scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed `affinity`.
# Try the current keyword first and fall back for older installations, where
# the unknown `metric` keyword raises TypeError.
try:
    cluster = AgglomerativeClustering(n_clusters=10, metric='precomputed', linkage='complete')
except TypeError:
    cluster = AgglomerativeClustering(n_clusters=10, affinity='precomputed', linkage='complete')
cluster.fit(distance_matrix)

# Find the closest match to the vetting playlist.
# AgglomerativeClustering has no predict(), so the vetting playlist is
# assigned to the cluster of its nearest playlist. That nearest playlist is
# also the closest match inside that cluster.
vetting_distances = [distance(vetting_playlist, playlist) for playlist in playlists]

vetting_cluster = None
closest_match = None
closest_distance = float('inf')
if vetting_distances:
    # argmin returns the first minimum, i.e. the earliest playlist on ties.
    closest_index = int(np.argmin(vetting_distances))
    closest_distance = vetting_distances[closest_index]
    closest_match = playlists[closest_index]
    vetting_cluster = cluster.labels_[closest_index]

print("Closest match:", closest_match)


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Build the linkage matrix the dendrogram needs from the precomputed
# distances. squareform() condenses the square matrix to the vector form
# linkage() expects; checks=False tolerates floating-point noise on the
# diagonal or in the symmetry. 'complete' matches the clustering linkage.
Z = linkage(squareform(distance_matrix, checks=False), method='complete')

plt.figure(figsize=(100, 50))
dendrogram(Z, labels=[f"Playlist {i}" for i in range(num_playlists)], leaf_rotation=90)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Playlist Index')
plt.ylabel('Distance')
plt.show()