In [8]:
import json
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt

# Load vetting playlist and data
vetting_playlist = json.load(open('Vetting_Playlist.json'))
data = json.load(open("1 Million Playlists/mpd.slice.0-999.json"))
playlists = data["playlists"]

# Preprocess data: Extract features (e.g., genres/artists frequencies) from playlists
def extract_features(playlist):
    genre_counter = Counter()
    artist_counter = Counter()
    for track in playlist['tracks']:
        artist_name = track.get('artist_name', 'Unknown')
        artist_counter[artist_name] += 1
        if artist_name != 'Unknown':
            artist_genres = get_artist_genres(artist_name)
            genre_counter.update(artist_genres)
    return list(genre_counter), list(artist_counter)

# Get artist genres using Spotify API
def get_artist_genres(artist_name):
    # Assume you have implemented this function
    pass

# Define distance metric
def distance(playlist1, playlist2):
    features1 = np.concatenate(list(extract_features(playlist1)))
    features2 = np.concatenate(list(extract_features(playlist2)))
    return 1 - cosine_similarity([features1], [features2])[0, 0]

# Create feature matrix
all_features = set()
for playlist in playlists:
    genre_features, artist_features = extract_features(playlist)
    all_features.update(genre_features)
    all_features.update(artist_features)
all_features = list(all_features)
X = np.zeros((len(playlists), len(all_features)))
for i, playlist in enumerate(playlists):
    genre_features, artist_features = extract_features(playlist)
    for genre in genre_features:
        X[i, all_features.index(genre)] = genre_features.count(genre)
    for artist in artist_features:
        X[i, all_features.index(artist)] = artist_features.count(artist)

# Perform MiniBatchKMeans clustering
k = 10  # Number of clusters
mbk = MiniBatchKMeans(n_clusters=k, batch_size=100, random_state=42)
mbk.fit(X)

# Find the closest match to the vetting playlist
vetting_features = np.zeros(len(all_features))
vetting_genre_features, vetting_artist_features = extract_features(vetting_playlist)
for genre in vetting_genre_features:
    vetting_features[all_features.index(genre)] = vetting_genre_features.count(genre)
for artist in vetting_artist_features:
    vetting_features[all_features.index(artist)] = vetting_artist_features.count(artist)
vetting_cluster = mbk.predict([vetting_features])[0]
closest_match = None
closest_distance = float('inf')
for i, playlist in enumerate(playlists):
    if mbk.labels_[i] == vetting_cluster:
        d = distance(vetting_playlist, playlist)
        if d < closest_distance:
            closest_distance = d
            closest_match = playlist

print("Closest match:", closest_match)


AttributeError: 'str' object has no attribute 'get'