In [1]:
import re
import os
import json
import pickle
import config
import spotipy
import numpy as np
import pandas as pd
from spotipy.oauth2 import SpotifyOAuth
from scipy.spatial.distance import cdist


In [2]:
# Credentials: copy the values from the local `config` module into the
# environment variables that are set here for the Spotify client.
os.environ.update({
    "SPOTIPY_CLIENT_ID": config.SPOTIPY_CLIENT_ID,
    "SPOTIPY_CLIENT_SECRET": config.SPOTIPY_CLIENT_SECRET,
    'SPOTIPY_REDIRECT_URI': config.SPOTIPY_REDIRECT_URI,  # Needed for user authorization
})

# Scope string requested for the token: read the user's library/follows and
# write playlists on their behalf.
# NOTE(review): 'playlist-modify' does not look like a currently documented
# Spotify scope name (cf. 'playlist-modify-public') - confirm before relying on it.
scope = 'user-library-read user-follow-read playlist-modify-private playlist-modify'
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(scope=scope),
)

In [3]:
# Audio-feature columns used to describe a track / the user's taste vector.
feat_cols_user = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
            'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

# Pickled models
model_path = 'models/KMeans_K17_20000_sample_model.sav'
tsne_path = 'models/openTSNETransformer.sav'
scaler_path = 'models/StdScaler.sav'


def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code while loading; only point
    # this at artefacts produced by a trusted source.
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


# Load models (each file handle is closed as soon as the object is read)
model = _load_pickle(model_path)
tsne_transformer = _load_pickle(tsne_path)
scaler = _load_pickle(scaler_path)

with open("data/playlists.json", "r") as read_file:
    playlists = json.load(read_file)

# Scaled training data; wrapped in a DataFrame with default integer labels.
scaled_data = np.loadtxt('data/scaled_data.csv', delimiter=',')
feats_df = pd.DataFrame(scaled_data)



In [4]:
def get_current_user_fav_tracks():
    """Fetch every track saved in the current user's library.

    Walks all result pages returned by ``sp.current_user_saved_tracks()``
    (``sp`` is the module-level Spotify client) and collects one row per
    saved track.

    Returns:
        pd.DataFrame: columns ``song`` (track name), ``artist`` (name of the
        first listed artist) and ``uri`` (the bare ID extracted from a
        ``spotify:track:<id>`` URI). Empty, with the same columns, when no
        track is found.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    track_uri_pattern = re.compile(r'^spotify:track:(\S+)')

    rows = []
    page = sp.current_user_saved_tracks()
    while page:
        for item in page['items']:
            track = item['track']
            match = track_uri_pattern.search(track['uri'])
            if match is None:
                # URIs that are not "spotify:track:<id>" carry no track ID to
                # extract; skip them instead of failing on a None match.
                continue
            rows.append({
                'song': track['name'],
                'artist': track['artists'][0]['name'],
                'uri': match.group(1),
            })
        # Saved tracks are paginated: keep going while a 'next' link exists.
        page = sp.next(page) if page['next'] else None

    # Explicit columns keep the frame's layout stable even with zero rows.
    return pd.DataFrame(rows, columns=['song', 'artist', 'uri'])
# Fetch the current user's saved tracks; fav_songs_df feeds the audio-feature lookup below.
fav_songs_df = get_current_user_fav_tracks()


In [5]:
def get_current_user_audio_features(fav_songs_df):
    """Attach Spotify audio features to each favourite track.

    Parameters:
        - fav_songs_df (dataframe): must contain a ``uri`` column holding
          track IDs, as produced by ``get_current_user_fav_tracks``.

    Returns:
        pd.DataFrame: ``fav_songs_df`` inner-joined on ``uri`` with the audio
        features returned by ``sp.audio_features``. Tracks for which the API
        returns no features are left out. An empty copy of ``fav_songs_df``
        is returned when no features were retrieved at all.
    """
    batch_size = 100  # audio_features takes a list of at most 100 track IDs

    # De-duplicate while keeping order so the join below cannot multiply rows.
    track_ids = list(dict.fromkeys(fav_songs_df['uri']))

    records = []
    for start in range(0, len(track_ids), batch_size):
        batch = track_ids[start:start + batch_size]
        # One request per batch instead of one per track; results are paired
        # with the requested IDs by position.
        for track_id, feats in zip(batch, sp.audio_features(batch)):
            if feats is None:
                # No audio features available for this track.
                continue
            # Replace the API's own 'uri' field with the ID used as join key.
            records.append({**feats, 'uri': track_id})

    if not records:
        return fav_songs_df.iloc[0:0].copy()

    # Build the feature table once rather than concatenating one-row frames.
    uri_df = pd.DataFrame(records)
    return pd.merge(fav_songs_df, uri_df, on="uri")
# Saved tracks joined with their audio features (calls the Spotify API via `sp`).
fav_songs_feats_df = get_current_user_audio_features(fav_songs_df)

In [11]:
# Unscaled mean of the user's audio features over feat_cols_user.
# get_songs_recommendations reads this module-level `y` as a global, so this
# cell must run before that function is called.
y = fav_songs_feats_df[feat_cols_user].mean()

In [6]:
def get_user_scaled_y_vector(fav_songs_feats_df):
    """Return the user's mean feature vector after scaling.

    Averages the ``feat_cols_user`` columns of ``fav_songs_feats_df``,
    reshapes the result into a single row and passes it through the
    module-level ``scaler``.

    Returns:
        Whatever ``scaler.transform`` returns for the ``(1, n_features)`` row.
    """
    mean_features = fav_songs_feats_df[feat_cols_user].mean()
    # transform() is given a 2-D input: one row, one column per feature.
    single_row = np.array(mean_features).reshape(1, -1)
    return scaler.transform(single_row)

# Scaled single-row mean vector of the user's saved tracks; passed as `y` to get_top_n_playlists.
scaled_y = get_user_scaled_y_vector(fav_songs_feats_df)

In [7]:
def get_top_n_playlists(model, feats_df, playlists, y, n, metric, similar, printing):
    """
    Compute the most similar or dissimilar playlists for a target vector y, which represents the mean
    features of the user's favorite songs. y is first assigned to a cluster with ``model.predict``; only the rows of
    ``feats_df`` belonging to that cluster are ranked, by their ``cdist`` distance to y.

    Side effect: a ``cluster`` column holding ``model.labels_`` is added to (or overwritten in) ``feats_df`` in place.

    Parameters:
        - model: Trained clustering model exposing ``labels_`` and ``predict``.
        - feats_df (dataframe): Dataframe with scaled data for all the training data; its row labels are used
          to index ``playlists``.
        - playlists: Collection of playlists from the .json files, indexed by the row labels of ``feats_df``.
        - y (np.array): user's favorite songs scaled vector, as a single-row 2-D array
        - n (int): top n playlists to retrieve
        - metric (str): metric to use, recommended 'cityblock', 'euclidean', 'cosine'.
        - similar (bool): whether to calculate most similar or most dissimilar
        - printing (bool): whether to print the results or not
    Output:
        - indices (np.array): row labels of ``feats_df`` for the selected playlists. With ``similar=True`` they are
          ordered nearest first; with ``similar=False`` they are the n farthest rows in ascending distance order.

    """
    # Kept for compatibility: callers may rely on the 'cluster' column being present afterwards.
    feats_df['cluster'] = pd.Categorical(model.labels_)
    user_cluster = model.predict(y)[0]

    # Filter once, then derive both the feature matrix and the row labels from the same slice.
    cluster_rows = feats_df[feats_df['cluster'] == user_cluster].drop(['cluster'], axis=1)
    indices = cluster_rows.index.to_numpy()

    # Positions within the cluster slice, sorted by increasing distance to y.
    order = cdist(cluster_rows.to_numpy(), y, metric=metric).argsort(axis=None)
    if similar:
        chosen = order[:n]
    else:
        # order[-n:] would return every row when n == 0; an explicit start keeps n == 0 empty.
        chosen = order[max(len(order) - n, 0):]
    simi = indices[chosen]

    if printing:
        for idx in simi:
            print('Playlist: {}\tpid:{}'.format(playlists[idx]['name'], playlists[idx]['pid']))
            # Show the first three tracks of each playlist as a preview.
            for song in playlists[idx]['tracks'][0:3]:
                print('Artist: {}\t Song:{}'.format(song['artist_name'], song['track_name']))
            print('\n')

    return simi
# Example call: 3 playlists from the user's cluster using 'cityblock' distance,
# with similar=False (farthest rows) and printing=False.
# The name similar_playlists_indices is reassigned again in a later cell.
similar_playlists_indices = get_top_n_playlists(model, feats_df, playlists, scaled_y, 3, 'cityblock', False, False)

In [14]:
def get_songs_recommendations(similar_playlists_indices, n, printing):
    """
    Rank every song of the given playlists by its squared distance to the user's mean audio features.

    Reads the module-level names ``playlists``, ``feat_cols_user``, ``sp`` and ``y`` (the unscaled mean feature
    vector of the user's favorite songs).

    Parameters:
        - similar_playlists_indices (np.array): indices into ``playlists`` of the playlists to draw songs from
        - n (int): number of song IDs to return
        - printing (bool): whether to print song and artist for (at most) the first 10 ranked songs
    Output:
        - song_uris (pd.Series): up to n unique track IDs, closest to ``y`` first. Empty when no audio
          features could be retrieved.
    """
    batch_size = 100  # audio_features takes a list of at most 100 track IDs

    # Collect each track ID once, in playlist order, so no song is requested twice.
    track_ids = list(dict.fromkeys(
        song['track_uri'].split(":")[-1]
        for playlist in similar_playlists_indices
        for song in playlists[playlist]['tracks']
    ))

    playlist_audio_features = []
    for start in range(0, len(track_ids), batch_size):
        batch = track_ids[start:start + batch_size]
        # Tracks without audio features come back as None; leave them out.
        playlist_audio_features.extend(
            feats for feats in sp.audio_features(batch) if feats is not None
        )

    if not playlist_audio_features:
        return pd.Series([], dtype=object, name='id')

    playlist_audio_features_df = pd.DataFrame(playlist_audio_features)
    array_audio_feats = playlist_audio_features_df[feat_cols_user].to_numpy()

    # NOTE(review): both sides are raw (unscaled) features, so columns with large
    # magnitudes presumably dominate this distance - confirm that is intended.
    y_vector = np.array(y).reshape(1, -1)
    squared_distances = np.sum(np.square(y_vector - array_audio_feats), axis=1)
    closest_first = squared_distances.argsort(kind='stable')
    song_uris = playlist_audio_features_df['id'].iloc[closest_first]

    if printing:
        for uri in song_uris.iloc[:10]:
            track = sp.track(uri)  # one lookup per song for both fields
            print('Song: {}'.format(track['name']))
            print('Artist: {}\n'.format(track['artists'][0]['name']))

    return song_uris.iloc[:n]

In [16]:
def build_spotify_playlist(playlist_name, description, items):
    """Create a playlist for the current user and fill it with tracks.

    Parameters:
        - playlist_name (str): name of the playlist to create
        - description (str): description stored with the playlist
        - items: iterable of track IDs/URIs to add, e.g. the pd.Series
          returned by ``get_songs_recommendations`` or a plain list

    Uses the module-level Spotify client ``sp``. Returns nothing.
    """
    max_items_per_request = 100  # add-items requests are capped at 100 entries

    # list() accepts a Series as well as any other iterable of IDs.
    track_ids = list(items)
    user_id = sp.current_user()['id']
    new_playlist = sp.user_playlist_create(user_id, playlist_name, description=description)

    # Send the tracks in chunks; nothing is sent when there are no tracks.
    for start in range(0, len(track_ids), max_items_per_request):
        chunk = track_ids[start:start + max_items_per_request]
        sp.playlist_add_items(new_playlist['id'], items=chunk)


In [17]:
# Name and description intended for the playlist to be created on Spotify.
playlist_name = 'KMeans Euclidean Similar 10 playlists'
description = 'Machine Learning playlist'

# The 10 nearest playlists (euclidean distance) within the user's cluster.
similar_playlists_indices = get_top_n_playlists(
    model,
    feats_df,
    playlists,
    scaled_y,
    n=10,
    metric='euclidean',
    similar=True,
    printing=False,
)

# Rank the songs of those playlists and keep the 30 closest to the user's profile.
song_uris = get_songs_recommendations(
    similar_playlists_indices,
    n=30,
    printing=True,
)

Song: We Can't Stop
Artist: Miley Cyrus

Song: Burn
Artist: Ellie Goulding

Song: New Slang
Artist: The Shins

Song: Payphone
Artist: Maroon 5

Song: Save Me
Artist: Gotye

Song: Low (feat. T-Pain)
Artist: Flo Rida

Song: Animals
Artist: Maroon 5

Song: The Kill (Bury Me)
Artist: Thirty Seconds To Mars

Song: Summer
Artist: Paris Jones

Song: No Way No
Artist: MAGIC!

