In [1]:
import pandas as pd
import json
from hdbscan import HDBSCAN
import umap

import numpy as np
import plotly.express as px

from collections import Counter

In [2]:
def get_most_similar(artists, cosine_matrix, artist_to_idx, unique_artists, n):
    similarities = np.zeros_like(cosine_matrix[0])

    print(similarities.shape)

    for artist in artists:
        similarities += cosine_matrix[artist_to_idx[artist]]
    
    print(similarities.shape)

    sim_artists = list(sorted(enumerate(similarities), key=lambda x: x[1], reverse=True))
    # n+1 to exclude similarity with self

    artist_idxs = [artist_to_idx[artist] for artist in artists]

    most_sim = [(idx, sim)

                for idx,sim in sim_artists if idx not in artist_idxs] 

    print("| Most Similar Artists to " + ', '.join(artists) + " | Cosine Similarity |")

    print("| --------------- | --------------- |")

    for related_artist, score in most_sim[:n]:

        print('| %s | %.3f |' % (unique_artists[related_artist], score))

In [3]:
with open('c:/Dev/Programming/python/Spotify Liked Songs/data/output.json', mode='r') as data:
    json_obj = json.load(data)

# how many times an artist occurs in all songs in data

num_playlists = len(json_obj)

print(f'{num_playlists} unique playlists')

6159 unique playlists


In [4]:
artist_occurences_per_playlist = Counter()
        # how many unique playlists an artist occurs in

reformatted = []

for playlist_id in json_obj:
    playlist = json_obj[playlist_id]

    reformatted_playlist = []

    current_playlist_artists = set()
    # all unique artists that occur in this playlist

    for song in playlist:
        if (song['id'] is None):
            continue

        artists = [artist for artist in song['artists']
                   if not (artist is None or artist == '')]
        
        reformatted_playlist.append(artists)

        if (len(artists) > 0):
            current_playlist_artists.add(artists[0])
    
    reformatted.append(reformatted_playlist)

    artist_occurences_per_playlist.update(current_playlist_artists)

In [5]:
unique_artists = [x[0] for x in artist_occurences_per_playlist.most_common(5000)]

num_artists = len(unique_artists)

print(f'{num_artists} unique artists')

artist_to_idx = {artist: idx for idx,
                    artist in enumerate(unique_artists)}

5000 unique artists


In [6]:
term_frequencies = np.zeros((num_playlists, num_artists))
document_frequencies = Counter()

for idx, playlist in enumerate(reformatted):

    for song in playlist:
        if (len(song) > 0):
            artist = song[0]
            if (artist in unique_artists):  # if occurs suitably often
                artist_idx = artist_to_idx[artist]

                term_frequencies[idx][artist_idx] = 1 #we dont care much abt how many times an artist occurs in the playlist

                document_frequencies.update([artist])

idf = np.log(num_artists / np.fromiter(document_frequencies.values(), dtype=float))

tf_idf = term_frequencies * idf

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tf_idf.T)

In [8]:
for artist in ["Taylor Swift", "Queen", "AJR", "Neutral Milk Hotel"]:
    get_most_similar([artist], cosine_sim,
                 artist_to_idx, unique_artists, 5)
    print()

(5000,)
(5000,)
| Most Similar Artists to Taylor Swift | Cosine Similarity |
| --------------- | --------------- |
| Olivia Rodrigo | 0.334 |
| Ariana Grande | 0.310 |
| Sabrina Carpenter | 0.284 |
| Harry Styles | 0.283 |
| Miley Cyrus | 0.282 |

(5000,)
(5000,)
| Most Similar Artists to Queen | Cosine Similarity |
| --------------- | --------------- |
| AC/DC | 0.329 |
| Guns N' Roses | 0.290 |
| Bon Jovi | 0.277 |
| Journey | 0.265 |
| Survivor | 0.255 |

(5000,)
(5000,)
| Most Similar Artists to AJR | Cosine Similarity |
| --------------- | --------------- |
| BoyWithUke | 0.219 |
| Imagine Dragons | 0.216 |
| Stellar | 0.208 |
| Panic! At The Disco | 0.175 |
| Andy Grammer | 0.173 |

(5000,)
(5000,)
| Most Similar Artists to Neutral Milk Hotel | Cosine Similarity |
| --------------- | --------------- |
| Townes Van Zandt | 0.333 |
| Gregory Alan Isakov | 0.226 |
| Björk | 0.218 |
| Kero Kero Bonito | 0.198 |
| Fleet Foxes | 0.198 |



In [62]:
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import silhouette_score

reduced = umap.UMAP(n_neighbors=100, n_components=10,
                                    n_jobs=-1, metric='cosine', min_dist=0).fit_transform(tf_idf.T)



In [63]:
from sklearn.cluster import SpectralClustering

labels_numeric = list(

    SpectralClustering(n_clusters=10, random_state=5).fit_predict(reduced))

labels = list(map(str, labels_numeric))

In [64]:
genre_artist_freq = [Counter() for _ in range(np.min(labels_numeric), np.max(
    labels_numeric)+1)]  # dict of artists partitioned by genre and their popularity
for artist, label in zip(unique_artists, labels_numeric):
    genre_artist_freq[label][artist] = artist_occurences_per_playlist[artist]

for idx, counter in enumerate(genre_artist_freq):
    print(str(idx+1) + '. ' + ', '.join([x[0] for x in counter.most_common(10)]))
    print()

1. The Weeknd, Drake, Eminem, Kendrick Lamar, Kanye West, Travis Scott, Tyler, The Creator, Juice WRLD, XXXTENTACION, Lil Uzi Vert

2. Queen, Elton John, The Beatles, AC/DC, Michael Jackson, Billy Joel, Bon Jovi, Guns N' Roses, Fleetwood Mac, ABBA

3. Taylor Swift, Billie Eilish, Imagine Dragons, Post Malone, Ed Sheeran, Bruno Mars, Rihanna, Coldplay, Maroon 5, Ariana Grande

4. Pritam, A.R. Rahman, Vishal-Shekhar, Arijit Singh, Sachin-Jigar, Anirudh Ravichander, Tanishk Bagchi, Atif Aslam, Vishal Mishra, Darshan Raval

5. Hillsong UNITED, Elevation Worship, Hillsong Worship, TobyMac, Lauren Daigle, Chris Tomlin, for KING & COUNTRY, Bethel Music, Brandon Lake, Maverick City Music

6. Bad Bunny, KAROL G, J Balvin, Daddy Yankee, Eslabon Armado, Don Omar, Luis Fonsi, Becky G, Fuerza Regida, Peso Pluma

7. BTS, NewJeans, BLACKPINK, Jung Kook, FIFTY FIFTY, Stray Kids, ENHYPEN, TWICE, TOMORROW X TOGETHER, aespa

8. Morgan Wallen, Luke Combs, Zach Bryan, Jason Aldean, Luke Bryan, Rascal Flatt

In [65]:
genres = {0: "Hip-Hop", 1: "Rock", 2: "Pop", 3: "South Asian", 4: "CCM", 5: "Spanish", 6: "K-Pop", 7: "Country", 8: "Alt", 9: "Indie"}

#make sure to fix this manually each time you run the model

named_genres = [genres[x] for x in labels_numeric]



In [66]:
reducer = umap.UMAP(n_neighbors=50, n_components=2,
                            n_jobs=-1, metric='cosine', min_dist=0)
        # 30 is a good value for this

umap_embedded = reducer.fit_transform(tf_idf.T)

print(umap_embedded.shape)

(5000, 2)


In [67]:
def show_plot(data):
    if ('labels' in data):
        fig = px.scatter(data, x='x', y='y',
                         color='labels', hover_data='names', width=1000, height=500)
    else:
        fig = px.scatter(data, x='x', y='y', hover_data='names')

    fig.update_layout(title='Spotify Genre Mapping')
    fig.write_html("interactive_genre_plot.html")
    fig.show()

In [68]:
df = pd.DataFrame(
    {'x': umap_embedded[:, 0], 'y': umap_embedded[:, 1], 'names': unique_artists, "labels": named_genres})

show_plot(df)



In [16]:
my_artists = sorted(list(set(
    ["Louis Tomlinson", "Taylor Swift", "One Direction", "Harry Styles", "Niall Horan", "Lewis Capaldi", "U2", "Gracie Abrams", "James Arthur", "Dean Lewis", "Troye Sivan", "Arctic Monkeys"])))

get_most_similar(my_artists, cosine_sim, artist_to_idx, unique_artists, 5)

(5000,)
(5000,)
| Most Similar Artists to Arctic Monkeys, Dean Lewis, Gracie Abrams, Harry Styles, James Arthur, Lewis Capaldi, Louis Tomlinson, Niall Horan, One Direction, Taylor Swift, Troye Sivan, U2 | Cosine Similarity |
| --------------- | --------------- |
| Ed Sheeran | 1.892 |
| Olivia Rodrigo | 1.804 |
| Shawn Mendes | 1.722 |
| Billie Eilish | 1.462 |
| Conan Gray | 1.403 |
