In [0]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.neighbors import KDTree

import os
import sys

In [0]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast numeric columns to the smallest dtype that can hold their range.

    Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    Only plain NumPy signed-integer and floating-point columns are converted.
    Every other column (object/string, bool, unsigned integer, datetime,
    categorical, pandas extension dtypes) is left exactly as it is; casting
    those through the float branch would either raise or turn, e.g., booleans
    into floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place and the same
        object is returned.
    verbose : bool, default True
        Print the memory usage before and after, and the relative saving.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as float16,
        which keeps only about three significant decimal digits. Pass False
        to use float32 as the smallest float type.

    Returns
    -------
    df : pandas.DataFrame
        The same frame, with downcast numeric columns.
    """

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, smallest first.
    int_types = (np.int8, np.int16, np.int32)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # dtype.kind: 'i' = signed integer, 'f' = floating point. Anything
        # else (including non-NumPy extension dtypes) is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates = [(t, np.iinfo(t)) for t in int_types]
            target = None  # does not fit int32: keep the current integer type
        else:
            candidates = [(t, np.finfo(t)) for t in float_types]
            target = np.float64  # does not fit (or min/max is NaN/inf)

        for candidate, info in candidates:
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # Comparisons with NaN are False, so empty/all-NaN columns fall
            # through to the default target chosen above.
            if info.min <= c_min and c_max <= info.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(
            100 * (start_mem - end_mem) / start_mem))

    return df

def load_dataset():
    """Download the three yearly Spotify CSV files and merge them into one frame.

    The files are read with latin1 encoding, concatenated with a fresh
    RangeIndex and passed through reduce_mem_usage(). A summary of the
    individual and merged shapes is printed.

    Returns
    -------
    df : pandas.DataFrame
        All rows of the 2018, 2019 and 2020 files, in that order.

    Raises
    ------
    Exception
        Whatever pandas raised while downloading/parsing; it is reported on
        stdout and then re-raised unchanged.
    """
    fpaths = ['https://raw.githubusercontent.com/Okocha76/Okocha76.github.io/master/spotify_2018.csv',
              'https://raw.githubusercontent.com/Okocha76/Okocha76.github.io/master/spotify_2019.csv',
              'https://raw.githubusercontent.com/Okocha76/Okocha76.github.io/master/spotify_2020.csv']

    try:
        yearly_frames = [pd.read_csv(fpath, encoding='latin1', low_memory=False)
                         for fpath in fpaths]
        df = pd.concat(yearly_frames, ignore_index=True)
        df = reduce_mem_usage(df)  # Reduces Memory Usage

    except Exception as e:
        # The placeholder is named, so the value must be passed by keyword;
        # a positional argument makes format() raise KeyError and hides the
        # original error.
        print(
            'Error Occurred while reading the Spotify dataset: {e}'.format(e=e))
        raise  # bare raise keeps the original traceback

    df1, df2, df3 = yearly_frames

    # Check total sum of rows match
    assert df.shape[0] == (df1.shape[0] + df2.shape[0] + df3.shape[0])

    file_names = [fpath.split('/')[-1] for fpath in fpaths]
    print(
        f'''
        -------------------- SHAPE ---------------------
        DF1 {file_names[0]}: {df1.shape}
        DF2 {file_names[1]}: {df2.shape}
        DF3 {file_names[2]}: {df3.shape}
        MERGED DF df: {df.shape}
        ------------------------------------------------
        ''')

    return df


def wrangle(df):
    """Return a cleaned copy of the merged Spotify frame; the input is not modified.

    Steps, in order:
      1. Remove fully duplicated rows, keeping the first occurrence.
      2. Blank out 'speechiness' values that are not below 0.66 (such tracks
         are generally spoken-word recordings rather than music), so that
         those rows are removed in step 4.
      3. Drop 'popularity', 'duration_ms' and 'genre', which are not acoustic
         features.
      4. Drop every row that contains a missing value.

    'key', 'mode' and 'time_signature' are kept as numbers, although they
    could arguably be treated as categorical features.
    """
    deduplicated = df.drop_duplicates(keep='first')

    # Values failing the condition become NaN and are removed by dropna().
    is_music = deduplicated['speechiness'] < .66
    masked = deduplicated.assign(
        speechiness=deduplicated['speechiness'].where(is_music))

    non_acoustic = ['popularity', 'duration_ms', 'genre']
    return masked.drop(columns=non_acoustic).dropna()


# Download and merge the three yearly CSV files (prints the memory and shape
# summaries shown below), then preview the first rows.
df = load_dataset()
df.head()

Memory usage of dataframe is 56.38 MB
Memory usage after optimization is: 22.71 MB
Decreased by 59.7%

        -------------------- SHAPE ---------------------
        DF1 spotify_2018.csv: (158885, 18)
        DF2 spotify_2019.csv: (155645, 18)
        DF3 spotify_2020.csv: (96035, 18)
        MERGED DF df: (410565, 18)
        ------------------------------------------------
        


Unnamed: 0,artist_name,track_name,track_id,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,72,acoustic,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,201933,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,64,acoustic,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,214240,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,66,acoustic,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,226107,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,61,acoustic,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,224744,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,60,acoustic,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,226827,4


In [0]:
# Wrangle Data
# NOTE(review): this rebinds `df`, so the cell cannot simply be run twice:
# on a second run wrangle() is asked to drop 'popularity', 'duration_ms' and
# 'genre' from a frame that no longer has them. Re-run the load cell first.
df = wrangle(df)

print(f'After Wrangling, Shape: {df.shape}')
df.head()

After Wrangling, Shape: (405526, 15)


Unnamed: 0,artist_name,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,4


In [0]:
# Check cardinality of non-numeric features: describe() is restricted to the
# non-numeric columns, transposed to one row per column and sorted by the
# number of distinct values.
df.describe(exclude='number').T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
artist_name,405526,35974,Armin van Buuren,1292
track_name,405526,253086,Home,160
track_id,405526,309448,5eJjDgREURzIEO3OzIYmum,10


In [0]:
# Identifying information (artist, track ID, track name), kept apart from the
# numeric features.
# NOTE(review): df_artists is not referenced by any later cell visible here.
df_artists = df[['artist_name', 'track_id', 'track_name']]

In [0]:
# Feature frame: drop the two text columns but keep 'track_id', which the
# following cells group on and use as the row identifier.
X_train = df.drop(['artist_name', 'track_name'], axis=1)

In [0]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [0]:
# Some track_ids occur more than once, because they fall into more than one genre.
# Collapse them to one row per track_id by taking the column-wise maximum.
# NOTE(review): max() is taken per column, so if two rows of the same track
# ever carry different feature values the result mixes values from both rows.
# Presumably duplicates are identical apart from genre — TODO confirm.
X_train_grouped = X_train.groupby('track_id', as_index=False).max()

print(X_train_grouped.shape)
X_train_grouped.head()

(309448, 13)


Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,0004MqBaviNnr5YGhiJIP8,0.800781,0.894043,11,-2.949219,0,0.05719,0.000578,0.856934,0.129028,0.189941,128.0,4
1,000AWPHTOpbAx3wyxeUhBa,0.703125,0.944824,2,-6.535156,0,0.056396,0.000883,0.75293,0.188965,0.440918,124.0,4
2,000H1aSPTxP6k0Fn8DD0Ob,0.673828,0.712891,1,-5.613281,0,0.031097,0.055603,0.0,0.468994,0.667969,130.0,4
3,000TF50F5TTFTgxJUYxK3Z,0.302979,0.059814,3,-17.65625,1,0.036499,0.992188,0.831055,0.117981,0.141968,58.59375,4
4,000jcQGuywRFvo5rGqBJQB,0.774902,0.854004,8,-4.101562,1,0.046387,0.309082,0.0,0.487061,0.902832,132.0,4


In [0]:
# NOTE(review): this import would be better placed in the first (imports) cell;
# MinMaxScaler is not used in any cell visible here.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Every column except the identifier is a feature.
identifier = 'track_id'
features = X_train_grouped.columns.drop(identifier).tolist()

# Fit the scaler on the feature columns only.
scaler = StandardScaler().fit(X_train_grouped[features])

# Scaled features as a frame indexed by track id; the row order is the same as
# in X_train_grouped, so positional indices are interchangeable between the two.
X_train_scaled = pd.DataFrame(scaler.transform(X_train_grouped[features]),
                        columns=X_train_grouped[features].columns,
                        index=X_train_grouped['track_id'].values)

print(X_train_scaled.shape)
X_train_scaled.head()

(309448, 12)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0004MqBaviNnr5YGhiJIP8,1.344727,0.991211,1.598633,1.045898,-1.271484,-0.321777,-0.911621,1.666992,-0.434814,-0.962891,0.197754,0.223145
000AWPHTOpbAx3wyxeUhBa,0.805664,1.182617,-0.922363,0.423096,-1.271484,-0.331299,-0.911133,1.382812,-0.117859,-0.014267,0.061981,0.223145
000H1aSPTxP6k0Fn8DD0Ob,0.644043,0.306641,-1.202148,0.583008,-1.271484,-0.632324,-0.752441,-0.671387,1.363281,0.844238,0.265625,0.223145
000TF50F5TTFTgxJUYxK3Z,-1.402344,-2.158203,-0.64209,-1.508789,0.786621,-0.568359,1.947266,1.59668,-0.493408,-1.144531,-2.158203,0.223145
000jcQGuywRFvo5rGqBJQB,1.202148,0.839355,0.758301,0.846191,0.786621,-0.450439,-0.021942,-0.671387,1.458984,1.732422,0.333496,0.223145


In [0]:
# Train tree on data: build a KD-tree over the scaled feature rows.
# Query results refer to rows by their position in X_train_scaled.
tree = KDTree(X_train_scaled, leaf_size=10)

In [0]:
# Tree can be queried for nearest indices (notice the nearest neighbor is the input itself).
# The row at position 100000 is turned back into a one-row frame for the query;
# the result is a (distances, positional indices) pair for its 11 nearest rows.
tree.query(pd.DataFrame(X_train_scaled.iloc[100000]).T, k=11, return_distance=True)

(array([[0.        , 0.45894288, 0.45894288, 0.48227059, 0.49403652,
         0.49569438, 0.50429337, 0.51298478, 0.523136  , 0.52493448,
         0.54472622]]),
 array([[100000, 244696,  17208, 146783,  12461,  92153, 262992, 263094,
         193378, 282485, 276454]]))

In [0]:
# Function to query based on ID
def nearest_by_id(id, k=11):
    """Return the positional indices of the k rows nearest to track `id`.

    `id` must be an index label of X_train_scaled. The query uses the
    module-level `tree`; only the index array of its result is returned,
    the distances are discarded.
    """
    query_rows = X_train_scaled.loc[[id]]
    _distances, neighbour_positions = tree.query(query_rows, k=k)
    return neighbour_positions

In [0]:
# Track id used for the spot checks in the following cells.
test_track_id = '2W12VIuXFF4Ir4NiHL1lLP'

In [0]:
# Scaled feature vector of the row at position 100000; the Series name in the
# output is that row's track id.
X_train_scaled.iloc[100000]

danceability        0.848633
energy              0.832031
key                 1.038086
loudness            0.935059
mode               -1.271484
speechiness        -0.575195
acousticness       -0.708984
instrumentalness   -0.671387
liveness           -0.302490
valence             1.418945
tempo              -0.039795
time_signature      0.223145
Name: 2W12VIuXFF4Ir4NiHL1lLP, dtype: float16

In [0]:
# Lookup table: track ID -> positional row number in X_train_grouped
val_ind_map = {
    track_id: position
    for position, track_id in enumerate(X_train_grouped['track_id'])
}

In [0]:
# Show results to check
# nearest_by_id returns one row of positional indices per queried track;
# row 0 holds the neighbours of the single track queried here.
results = nearest_by_id(test_track_id)

# Look the neighbours up by position to see their unscaled feature values.
output = X_train_grouped.iloc[results[0]]

output

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
100000,2W12VIuXFF4Ir4NiHL1lLP,0.710938,0.852051,9,-3.587891,0,0.035889,0.070801,0.0,0.154053,0.819824,121.0,4
244696,6AHFWW3TV95wcZGKQ4LQ6Z,0.730957,0.849121,9,-3.183594,0,0.063721,0.031403,0.00019,0.177002,0.879883,119.0,4
17208,0Qq9NYaNDH23fxFucOXugr,0.730957,0.849121,9,-3.183594,0,0.063721,0.031403,0.00019,0.177002,0.879883,119.0,4
146783,3gTGxduS5wEqTq4M3roLrO,0.664062,0.848145,9,-2.589844,0,0.060303,0.032806,6e-06,0.171997,0.783203,124.0625,4
12461,0JSotxr4YFcp5e8c5yaMKv,0.741211,0.937012,9,-2.826172,0,0.031799,0.059113,0.0004,0.097107,0.823242,120.0,4
92153,2JoupvfoFX1C8gv2tvig2J,0.745117,0.855957,9,-2.955078,0,0.0495,0.088013,0.0,0.215942,0.868164,116.0,4
262992,6ckFK2ksVUNpfeUqRDtirQ,0.744141,0.850098,9,-3.822266,0,0.03421,0.11499,2e-06,0.136963,0.810059,108.0625,4
263094,6crkptUrimTb6Z0t199uvA,0.707031,0.86084,9,-5.015625,0,0.0354,0.189941,6.8e-05,0.20105,0.846191,118.0,4
193378,4rsOg8Q9S368Y54c24r3NT,0.701172,0.803223,8,-5.335938,0,0.041107,0.145996,0.010803,0.162964,0.791992,120.0,4
282485,76temC5Qb1fBXMsvICfPpH,0.694824,0.779785,9,-5.027344,0,0.050812,0.025604,1e-06,0.175049,0.788086,127.9375,4


In [0]:
# Track ids of the neighbours found above, as a one-column frame.
# NOTE(review): `test` is not used by any later cell visible here.
test =  pd.DataFrame(output['track_id'])

# Look up artist and track name for one hard-coded track id in the wrangled data.
output2 = df[df['track_id'] == '2ktqqycUethpMN82GPQP2N']

output2

Unnamed: 0,artist_name,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
103273,Spektre,Chant Rush - Beico & MT93 Remix,2ktqqycUethpMN82GPQP2N,0.628906,0.977051,11,-6.785156,0,0.044403,0.010696,0.904785,0.11499,0.120972,129.0,4


In [0]:
# Dump our tree model (serialized with joblib) into this file:
dump(tree, 'SpotifyKDTree.joblib')

# Dump the fitted scaler, so that features of new tracks can be transformed
# the same way as the training data
dump(scaler, 'SpotifyScaled.joblib')

['SpotifyScaled.joblib']

In [0]:
# Export standardized features for later use (the track-id index is written
# as the first CSV column)
X_train_scaled.to_csv("spotify_standardized.csv")

In [0]:
# Import standardized data (the first CSV column becomes the index)
df_import = pd.read_csv("spotify_standardized.csv", index_col=0)

# Import pickled model
# NOTE(review): `load` is already imported in the first cell; this repeat is redundant.
# joblib.load unpickles the file, so only load artefacts from a trusted source.
from joblib import load
Spot_KDTree = load('SpotifyKDTree.joblib')

# Import scaling (the fitted scaler dumped earlier)
Spot_Scaled = load('SpotifyScaled.joblib')

# Make functions using model
def new_nearest_by_id(id, k=11):
    """Return the positional indices of the k rows nearest to track `id`.

    Same lookup as nearest_by_id, but driven entirely by the reloaded
    artefacts: the row is taken from `df_import` (indexed by track id) and
    the search runs on `Spot_KDTree`. Distances are discarded.
    """
    query_rows = df_import.loc[[id]]
    query_result = Spot_KDTree.query(query_rows, k=k)
    return query_result[1]

In [0]:
# Second test track, used to check the reloaded tree and data.
test_track_id = '6Wosx2euFPMT14UXiWudMy'

In [0]:
# Query the reloaded tree and show the neighbours' unscaled features.
# NOTE: in the output below the queried track is listed second, after another
# track with identical feature values — the first neighbour is therefore not
# guaranteed to be the query track itself.
results = new_nearest_by_id(test_track_id)

output = X_train_grouped.iloc[results[0]]

output

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
53898,1Lqh2RUDCEtf689GKUCjfV,0.603027,0.723145,9,-5.890625,0,0.04541,0.024994,0.0,0.082397,0.38208,114.9375,4
259273,6Wosx2euFPMT14UXiWudMy,0.603027,0.723145,9,-5.890625,0,0.04541,0.024994,0.0,0.082397,0.38208,114.9375,4
275590,6wJ7316jrcKtVpqunNFr5L,0.619141,0.71582,9,-7.003906,0,0.053589,0.023804,8.8e-05,0.071899,0.312988,124.0,4
241661,65cgIP8nQy2ztJeAVMjDyQ,0.63916,0.744141,9,-4.652344,0,0.028,0.020798,0.046692,0.093079,0.447998,118.0,4
302663,7pBsquIkbED6W6uSQJGbkn,0.604004,0.73584,10,-5.597656,0,0.055511,0.006081,0.109009,0.109985,0.344971,118.0,4
238746,615DSA0aTZFswhBoPljJLA,0.625977,0.646973,9,-7.648438,0,0.028305,0.011101,0.000318,0.104004,0.353027,118.125,4
116587,2vbBkAz6M69parLwokAiID,0.638184,0.692871,8,-6.398438,0,0.031403,0.125,4e-06,0.09021,0.429932,114.9375,4
101554,2YMX3uHZa5mNe25jphPzNA,0.578125,0.816895,10,-5.085938,0,0.029907,2.2e-05,0.000141,0.087891,0.378906,116.0,4
82638,24uO5OR1A9ogthA4MF284p,0.573242,0.71582,9,-4.65625,0,0.035614,0.0013,0.0,0.091003,0.36499,102.125,4
261175,6Znneb7vooaukFTckfChPm,0.604004,0.719238,9,-5.597656,0,0.046204,0.004139,1e-06,0.058807,0.355957,100.0,4


In [0]:
!pip install spotipy

Collecting spotipy
  Downloading https://files.pythonhosted.org/packages/d5/da/f6f71a33c99af2a22b3f885d290116d0e963afa095bf77aba4226f88a876/spotipy-2.9.0-py3-none-any.whl
Installing collected packages: spotipy
Successfully installed spotipy-2.9.0


In [0]:
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2

In [0]:
def get_features(id):
    """Fetch the audio features of one track from the Spotify Web API.

    Parameters
    ----------
    id : str
        Spotify track id.

    Returns
    -------
    dict
        The twelve features used by the model. Key order is significant:
        callers pass ``.values()`` straight to the fitted scaler, so it must
        match the column order the scaler was trained on.

    Raises
    ------
    ValueError
        If the API returns no audio features for `id`.
    """
    # Credentials come from the environment (the variable names spotipy itself
    # looks for) so that real keys never have to be written into the notebook.
    # The "XXX" placeholders are kept as fallback for backward compatibility.
    cid = os.environ.get("SPOTIPY_CLIENT_ID", "XXX")  # Spotify Client ID
    secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "XXX")  # Spotify Secret Key

    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    # The client obtains its access token through the credentials manager, so
    # no separate token request is made here.
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    feat = sp.audio_features(tracks=id)

    # Guard against an empty result or a null entry (unknown / unavailable
    # track) instead of failing later with an opaque TypeError.
    if not feat or feat[0] is None:
        raise ValueError('No audio features available for track id {!r}'.format(id))

    i = feat[0]
    feature_order = ('danceability', 'energy', 'key', 'loudness',
                     'mode', 'speechiness', 'acousticness',
                     'instrumentalness', 'liveness', 'valence',
                     'tempo', 'time_signature')
    return {name: i[name] for name in feature_order}

In [0]:
# Track id used for the Spotify API based checks below.
test_track_id = '7iN1s7xHE4ifF5povM6A48'

In [0]:
# Raw (unscaled) audio features of the test track, fetched from the Spotify API.
get_features(test_track_id)





  


{'acousticness': 0.631,
 'danceability': 0.443,
 'energy': 0.403,
 'instrumentalness': 0,
 'key': 0,
 'liveness': 0.111,
 'loudness': -8.339,
 'mode': 1,
 'speechiness': 0.0322,
 'tempo': 143.462,
 'time_signature': 4,
 'valence': 0.41}

In [0]:
# Scale the test track's features with the reloaded scaler.
# NOTE: the values are passed positionally, so this relies on get_features()
# returning its keys in the same order as the columns the scaler was fitted on.
Spot_Scaled.transform([list(get_features(test_track_id).values())])





  


array([[-0.62982533, -0.86361255, -1.48287915,  0.10978544,  0.78695082,
        -0.61953   ,  0.90642975, -0.67157944, -0.53025636, -0.13116017,
         0.72224821,  0.22311778]])

In [0]:
def get_recommendations(id, k=11):
    """Recommend the tracks whose audio features are closest to those of `id`.

    The features are fetched from the Spotify API, scaled with the reloaded
    scaler and looked up in the reloaded KD-tree.

    Parameters
    ----------
    id : str
        Spotify track id of the seed track. It does not have to be part of
        the training data.
    k : int, default 11
        Number of neighbours requested from the tree. At most ``k - 1`` ids
        are returned (10 by default), as before.

    Returns
    -------
    list of str
        Track ids ordered from nearest to farthest, never containing `id`.
    """
    # One-row frame: the dict keys become column names in the same order as
    # the columns the scaler was fitted on.
    feats = pd.DataFrame([get_features(id)])
    scaled_feats = Spot_Scaled.transform(feats)

    # Positional indices of the k nearest training rows, nearest first.
    neighbour_positions = Spot_KDTree.query(scaled_feats, k=k)[1][0]

    # The tree returns positions, so use a positional lookup, done in one go.
    neighbour_ids = X_train_grouped['track_id'].iloc[neighbour_positions].tolist()

    # Remove the seed track by id rather than by assuming it is the first hit:
    # the seed may be missing from the training data (then the nearest hit is
    # a genuine recommendation), or tie with another track at equal distance.
    top_k_id = [track_id for track_id in neighbour_ids if track_id != id]

    return top_k_id[:k - 1]

In [0]:
# Recommended track ids for the test track (k=11 neighbours are requested).
get_recommendations(test_track_id)





  


['22PcfdynDcGFHDrZK9b0Tr',
 '7k5lxS3hvATTW4ngag4PNs',
 '6JXlccLNpK7F2XFwHuMy23',
 '4ngjZxPSN9MhnVJeBvPv5P',
 '7kHLzMFQGgBB2Kavdj47vW',
 '16F5WwdmTEYkHe9VoM8K1p',
 '0ICWLdBQR09jm2cQO9FPX0',
 '5HM7jV3IuyYXjOzlLxMy8P',
 '0c8NfaTT7nyz0TDmNK85lq',
 '0DTzNcTTUZRFhJqMcBh34s']

In [0]:
# Sanity check: compare the seed track's raw features with those of the
# recommended tracks.
results = get_recommendations(test_track_id)

print(get_features(test_track_id))

# isin() keeps the row order of X_train_grouped, so the table below is not
# sorted from nearest to farthest.
X_train_grouped[X_train_grouped['track_id'].isin(results)]





  






  


{'danceability': 0.443, 'energy': 0.403, 'key': 0, 'loudness': -8.339, 'mode': 1, 'speechiness': 0.0322, 'acousticness': 0.631, 'instrumentalness': 0, 'liveness': 0.111, 'valence': 0.41, 'tempo': 143.462, 'time_signature': 4}


Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
8613,0DTzNcTTUZRFhJqMcBh34s,0.48999,0.406006,0,-8.679688,1,0.033203,0.73584,0.001,0.180054,0.384033,148.0,4
11637,0ICWLdBQR09jm2cQO9FPX0,0.436035,0.467041,0,-7.386719,1,0.036407,0.62793,0.0,0.107971,0.478027,155.125,4
24401,0c8NfaTT7nyz0TDmNK85lq,0.479004,0.450928,0,-7.160156,1,0.0298,0.720215,0.0,0.128052,0.340088,136.25,4
43862,16F5WwdmTEYkHe9VoM8K1p,0.429932,0.505859,0,-7.414062,1,0.027695,0.598145,0.0,0.109009,0.332031,148.625,4
81058,22PcfdynDcGFHDrZK9b0Tr,0.449951,0.504883,0,-7.574219,1,0.033386,0.537109,0.0,0.13501,0.396973,145.125,4
190724,4ngjZxPSN9MhnVJeBvPv5P,0.509766,0.429932,0,-9.226562,1,0.029602,0.736816,5.7e-05,0.128052,0.381104,143.75,4
209615,5HM7jV3IuyYXjOzlLxMy8P,0.427002,0.381104,1,-8.5625,1,0.046112,0.527832,0.0,0.07782,0.476074,140.0,4
250793,6JXlccLNpK7F2XFwHuMy23,0.457031,0.48999,0,-6.808594,1,0.03479,0.633789,2e-06,0.117981,0.355957,137.75,4
299333,7k5lxS3hvATTW4ngag4PNs,0.452881,0.417969,0,-8.601562,1,0.0289,0.708008,1.1e-05,0.059814,0.316895,146.125,4
299443,7kHLzMFQGgBB2Kavdj47vW,0.477051,0.409912,0,-6.726562,1,0.028198,0.72998,0.0,0.089172,0.441895,152.0,4
