In [0]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.neighbors import KDTree

import os
import sys

In [0]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of a dataframe to smaller dtypes to save memory.

    Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

    Only plain NumPy signed-integer and floating-point columns are downcast.
    Every other column (object/string, bool, unsigned integer, datetime,
    categorical, pandas extension dtypes, ...) is left exactly as it is, so
    such columns are neither turned into floats nor cause a comparison error.

    Note that floats are downcast as far as float16 when their range allows,
    which reduces their precision.

    Parameters
    ----------
    df : Pandas DataFrame
        Modified in place (columns are reassigned) and also returned.
    verbose : bool, default True
        When True, prints the memory usage before and after, and the
        percentage saved.

    Returns
    -------
    df : Reduced Memory Pandas DataFrame
        The same object that was passed in.
    """

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, smallest first; the first one that fits wins.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    small_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; `kind` is 'i' for
        # signed integers and 'f' for floats. Anything else is skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            # Strict bounds: a value equal to a type's limit moves on to the
            # next larger type. If nothing fits, the column is left as is.
            for candidate in signed_ints:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in small_floats:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (all comparisons False).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(
            100 * (start_mem - end_mem) / start_mem))

    return df

def load_dataset():
    """Download the three yearly Spotify CSV files and merge them into one frame.

    Each file is read with latin1 encoding, the frames are concatenated with a
    fresh index, and the result is passed through reduce_mem_usage().
    A summary of the individual and merged shapes is printed.

    Returns
    -------
    df : Pandas DataFrame
        The merged, memory-reduced dataframe.

    Raises
    ------
    Exception
        Any error raised while reading or merging is reported with print()
        and then re-raised unchanged.
    AssertionError
        If the merged row count differs from the sum of the parts.
    """
    fpaths = ['https://raw.githubusercontent.com/Build-Week-SpotifySong4/DataScience/master/data/spotify_2018.csv',
              'https://raw.githubusercontent.com/Build-Week-SpotifySong4/DataScience/master/data/spotify_2019.csv',
              'https://raw.githubusercontent.com/Build-Week-SpotifySong4/DataScience/master/data/spotify_2020.csv']

    try:
        frames = [pd.read_csv(fpath, encoding='latin1', low_memory=False)
                  for fpath in fpaths]
        df = pd.concat(frames, ignore_index=True)
        df = reduce_mem_usage(df)  # Reduces Memory Usage

    except Exception as e:
        # The template names its field `{e}`, so the value must be passed by
        # keyword; a positional argument raises KeyError here and hides the
        # real error.
        print(
            'Error Occurred while reading the Spotify dataset: {e}'.format(e=e))
        raise

    df1, df2, df3 = frames

    # Check total sum of rows match
    assert df.shape[0] == (df1.shape[0] + df2.shape[0] + df3.shape[0])

    file_names = [fpath.split('/')[-1] for fpath in fpaths]
    print(
        f'''
        -------------------- SHAPE ---------------------
        DF1 {file_names[0]}: {df1.shape}
        DF2 {file_names[1]}: {df2.shape}
        DF3 {file_names[2]}: {df3.shape}
        MERGED DF df: {df.shape}
        ------------------------------------------------
        ''')

    return df


def wrangle(df):
    """Clean the merged Spotify dataframe and return a new dataframe.

    Steps, in order:
      1. drop fully duplicated rows (keeping the first occurrence);
      2. null out 'speechiness' values that are not below 0.66, so those rows
         are removed in step 4;
      3. drop the 'popularity' and 'duration_ms' columns;
      4. drop every row that still has a missing value;
      5. cast 'key', 'mode' and 'time_signature' to strings.

    Missing values are dropped BEFORE the string cast. Casting first would
    turn a missing key/mode/time_signature into the literal text 'nan', which
    dropna() no longer recognises as missing.

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain the columns named above. It is not modified.

    Returns
    -------
    df : Pandas DataFrame
        The cleaned copy.
    """
    categorical_cols = ['key', 'mode', 'time_signature']

    # Drop Duplicates (returns a new frame, so the caller's frame is untouched)
    df = df.drop_duplicates(keep='first')

    # Tracks with a high level of 'speechiness' are generally not music but
    # other types of recordings: turn those values into nulls so that the
    # rows are removed by dropna() below.
    df = df.assign(speechiness=df['speechiness'].where(df['speechiness'] < .66))

    # Drop 'popularity' and 'duration_ms', as these are not acoustic features.
    # Done before dropna() so that nulls in these columns do not remove rows.
    df = df.drop(['popularity', 'duration_ms'], axis=1)

    # Drop missing values
    df = df.dropna()

    # 'key', 'mode' and 'time_signature' are numeric columns, but arguably
    # should be categorical features, so convert them from numbers to strings.
    df = df.astype({col: str for col in categorical_cols})

    return df


# Download and merge the raw yearly CSV files, then preview the first rows.
df = load_dataset()
df.head()

Memory usage of dataframe is 56.38 MB
Memory usage after optimization is: 22.71 MB
Decreased by 59.7%

        -------------------- SHAPE ---------------------
        DF1 spotify_2018.csv: (158885, 18)
        DF2 spotify_2019.csv: (155645, 18)
        DF3 spotify_2020.csv: (96035, 18)
        MERGED DF df: (410565, 18)
        ------------------------------------------------
        


Unnamed: 0,artist_name,track_name,track_id,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,72,acoustic,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,201933,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,64,acoustic,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,214240,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,66,acoustic,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,226107,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,61,acoustic,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,224744,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,60,acoustic,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,226827,4


In [0]:
# Wrangle Data
# NOTE: `df` is rebound to the cleaned frame. wrangle() drops 'popularity' and
# 'duration_ms', so running this cell a second time without re-running the
# load cell fails because those columns are already gone.
df = wrangle(df)

print(f'After Wrangling, Shape: {df.shape}')
df.head()

After Wrangling, Shape: (405526, 16)


Unnamed: 0,artist_name,track_name,track_id,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Kina Grannis,Can't Help Falling In Love,6lfxq3CG4xtTiEg7opyCyx,acoustic,0.266113,0.059601,0,-18.515625,1,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,3
1,Ben Rector,Love Like This,06JmNnH3iXKENNRKifqu0v,acoustic,0.637207,0.129028,2,-11.890625,1,0.040497,0.902832,0.0,0.106995,0.36792,139.75,4
2,Jason Mraz,Have It All,7BXW1QCg56yzEBV8pW8pah,acoustic,0.643066,0.638184,1,-4.945312,0,0.04071,0.203979,0.0,0.070679,0.600098,82.0,4
3,Ben Rector,Old Friends,4MZQ3lHA1TYO6yyedtmBYg,acoustic,0.468994,0.403076,8,-10.046875,1,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,4
4,Ben Rector,I Will Always Be Yours,4m1lB7qJ78VPYsQy7RoBcU,acoustic,0.445068,0.770996,0,-4.605469,1,0.05011,0.133057,0.0,0.272949,0.447998,147.875,4


In [0]:
# Check cardinality of non-numeric features:
# one row per non-numeric column, sorted by its number of distinct values.
df.describe(exclude='number').T.sort_values(by='unique')

Unnamed: 0,count,unique,top,freq
mode,405526,2,1,250858
time_signature,405526,5,4,361469
key,405526,12,7,47835
genre,405526,106,techno,5738
artist_name,405526,35974,Armin van Buuren,1292
track_name,405526,253086,Home,160
track_id,405526,309448,5BdZbBsD8WsHICUfuSF0la,10


In [0]:
# Identifying information (artist, track ID, track name), kept aside from the
# model features; it shares the row index of `df`.
df_artists = df[['artist_name', 'track_id', 'track_name']]

In [0]:
# Split the columns by dtype.
numerics = df.select_dtypes(include='number').columns.tolist()
categoricals = df.select_dtypes(exclude='number').columns.tolist()
# Keep only categorical columns with at most 150 distinct values; this drops
# the identifying columns (artist_name, track_name, track_id), which would
# otherwise each expand into a huge number of one-hot columns.
low_cardinality = [col for col in categoricals if df[col].nunique() <= 150]

# Model features: all numeric columns plus the low-cardinality categoricals.
features = numerics + low_cardinality

X_train = df[features]

In [0]:
!pip install category_encoders

In [0]:
# Remove the limit on how many columns pandas shows when displaying a frame.
pd.set_option('display.max_columns', None)

In [0]:
import category_encoders as ce

# One-hot encode the categorical features. With use_cat_names=True the new
# columns are named '<column>_<category>' (e.g. genre_acoustic, key_0).
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train)

print(X_train_encoded.shape)
X_train_encoded.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_alternative,genre_ambient,genre_anime,genre_black-metal,genre_bluegrass,genre_blues,genre_breakbeat,genre_british,genre_cantopop,genre_chicago-house,genre_chill,genre_classical,genre_club,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death-metal,genre_deep-house,genre_detroit-techno,genre_disco,genre_disney,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,...,genre_rockabilly,genre_romance,genre_salsa,genre_samba,genre_sertanejo,genre_show-tunes,genre_singer-songwriter,genre_ska,genre_sleep,genre_songwriter,genre_soul,genre_spanish,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music,genre_rock-n-roll,key_0,key_2,key_1,key_8,key_7,key_5,key_10,key_11,key_3,key_9,key_6,key_4,mode_1,mode_0,time_signature_3,time_signature_4,time_signature_5,time_signature_1,time_signature_0
0,0.266113,0.059601,-18.515625,0.036285,0.904785,7.1e-05,0.131958,0.142944,181.75,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,0.637207,0.129028,-11.890625,0.040497,0.902832,0.0,0.106995,0.36792,139.75,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2,0.643066,0.638184,-4.945312,0.04071,0.203979,0.0,0.070679,0.600098,82.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,0.468994,0.403076,-10.046875,0.048309,0.131958,4.7e-05,0.116028,0.208008,147.375,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
4,0.445068,0.770996,-4.605469,0.05011,0.133057,0.0,0.272949,0.447998,147.875,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [0]:
# Group by track_id
# NOTE: this adds a 'track_id' column to X_train_encoded in place.
X_train_encoded['track_id'] = df_artists['track_id']

# Some track_ids occur more than once, because they fall into more than one genre.
# Taking the max per track_id collapses them into one row per track; for the
# one-hot columns this keeps a 1 for every genre the track appears under.
X_train_grouped = X_train_encoded.groupby('track_id', as_index=False).max()

print(X_train_grouped.shape)
X_train_grouped.head()

(309448, 135)


Unnamed: 0,track_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_alternative,genre_ambient,genre_anime,genre_black-metal,genre_bluegrass,genre_blues,genre_breakbeat,genre_british,genre_cantopop,genre_chicago-house,genre_chill,genre_classical,genre_club,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death-metal,genre_deep-house,genre_detroit-techno,genre_disco,genre_disney,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,...,genre_rockabilly,genre_romance,genre_salsa,genre_samba,genre_sertanejo,genre_show-tunes,genre_singer-songwriter,genre_ska,genre_sleep,genre_songwriter,genre_soul,genre_spanish,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music,genre_rock-n-roll,key_0,key_2,key_1,key_8,key_7,key_5,key_10,key_11,key_3,key_9,key_6,key_4,mode_1,mode_0,time_signature_3,time_signature_4,time_signature_5,time_signature_1,time_signature_0
0,0004MqBaviNnr5YGhiJIP8,0.800781,0.894043,-2.949219,0.05719,0.000578,0.856934,0.129028,0.189941,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
1,000AWPHTOpbAx3wyxeUhBa,0.703125,0.944824,-6.535156,0.056396,0.000883,0.75293,0.188965,0.440918,124.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
2,000H1aSPTxP6k0Fn8DD0Ob,0.673828,0.712891,-5.613281,0.031097,0.055603,0.0,0.468994,0.667969,130.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
3,000TF50F5TTFTgxJUYxK3Z,0.302979,0.059814,-17.65625,0.036499,0.992188,0.831055,0.117981,0.141968,58.59375,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0
4,000jcQGuywRFvo5rGqBJQB,0.774902,0.854004,-4.101562,0.046387,0.309082,0.0,0.487061,0.902832,132.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [0]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Min-max scale every feature column so that each contributes on a comparable
# scale to the distances computed by the nearest-neighbour search.
scaler = MinMaxScaler()

# Everything except the identifier column is a feature.
identifier = 'track_id'
features = X_train_grouped.columns.drop(identifier).tolist()

# Rebuild a labelled frame from the scaled array, indexed by track ID so that
# rows can later be looked up with .loc[track_id].
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_grouped[features]),
    columns=features,
    index=X_train_grouped[identifier].values,
)

print(X_train_scaled.shape)
X_train_scaled.head()

(309448, 134)


Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_alternative,genre_ambient,genre_anime,genre_black-metal,genre_bluegrass,genre_blues,genre_breakbeat,genre_british,genre_cantopop,genre_chicago-house,genre_chill,genre_classical,genre_club,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death-metal,genre_deep-house,genre_detroit-techno,genre_disco,genre_disney,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,...,genre_rockabilly,genre_romance,genre_salsa,genre_samba,genre_sertanejo,genre_show-tunes,genre_singer-songwriter,genre_ska,genre_sleep,genre_songwriter,genre_soul,genre_spanish,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music,genre_rock-n-roll,key_0,key_2,key_1,key_8,key_7,key_5,key_10,key_11,key_3,key_9,key_6,key_4,mode_1,mode_0,time_signature_3,time_signature_4,time_signature_5,time_signature_1,time_signature_0
0004MqBaviNnr5YGhiJIP8,0.810677,0.894041,0.879045,0.086759,0.00058,0.856934,0.124345,0.190127,0.512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
000AWPHTOpbAx3wyxeUhBa,0.711814,0.944823,0.822681,0.085556,0.000887,0.75293,0.184604,0.441349,0.496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
000H1aSPTxP6k0Fn8DD0Ob,0.682155,0.712885,0.837171,0.047176,0.055821,0.0,0.466139,0.668622,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
000TF50F5TTFTgxJUYxK3Z,0.306723,0.059795,0.647879,0.05537,0.996078,0.831055,0.113238,0.142107,0.234375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
000jcQGuywRFvo5rGqBJQB,0.784478,0.854001,0.860932,0.07037,0.310294,0.0,0.484302,0.903715,0.528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [0]:
# Train tree on data: build a KD-tree over the scaled feature rows so that
# nearest neighbours can be queried by row position.
tree = KDTree(X_train_scaled, leaf_size=10)

In [0]:
# Query the tree with the first row as a one-row frame. The result is a
# (distances, positions) pair; the closest match is the query row itself,
# at distance 0.
tree.query(X_train_scaled.iloc[[0]], k=11, return_distance=True)

(array([[0.        , 0.10767239, 0.11587443, 0.11830413, 0.1334649 ,
         0.14530887, 0.15438645, 0.17069818, 0.17205643, 0.1748833 ,
         0.17635612]]),
 array([[     0,  43717, 224018, 152458, 292876, 168691,  85153, 297865,
         144132, 249897,  39530]]))

In [0]:
# Function to query based on ID
def nearest_by_id(id, k=11):
    """Return the row positions of the k nearest neighbours of a track.

    The track is looked up by ID in the global X_train_scaled frame and its
    scaled features are used to query the global `tree`. The queried track is
    part of the tree, so it normally appears among its own neighbours.

    Parameters
    ----------
    id : track ID present in the index of X_train_scaled
    k : number of neighbours to return (default 11)

    Returns
    -------
    Array of neighbour row positions, one row per row selected by `id`.
    """
    query_row = X_train_scaled.loc[[id]]
    _, neighbor_positions = tree.query(query_row, k=k)
    return neighbor_positions

In [0]:
# Track ID used to spot-check the nearest-neighbour lookup below.
test_track_id = '3cMwLCYsiRCiQpXBlPaRl7'

In [0]:
# Lookup table: track ID -> positional row number in X_train_grouped
val_ind_map = {track_id: position
               for position, track_id in enumerate(X_train_grouped['track_id'])}

In [0]:
# Show results to check
results = nearest_by_id(test_track_id)

# The tree returns row positions; X_train_scaled was built row for row from
# X_train_grouped, so the same positions select the matching unscaled rows.
X_train_grouped.iloc[results[0]]

Unnamed: 0,track_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_alternative,genre_ambient,genre_anime,genre_black-metal,genre_bluegrass,genre_blues,genre_breakbeat,genre_british,genre_cantopop,genre_chicago-house,genre_chill,genre_classical,genre_club,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death-metal,genre_deep-house,genre_detroit-techno,genre_disco,genre_disney,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,...,genre_rockabilly,genre_romance,genre_salsa,genre_samba,genre_sertanejo,genre_show-tunes,genre_singer-songwriter,genre_ska,genre_sleep,genre_songwriter,genre_soul,genre_spanish,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music,genre_rock-n-roll,key_0,key_2,key_1,key_8,key_7,key_5,key_10,key_11,key_3,key_9,key_6,key_4,mode_1,mode_0,time_signature_3,time_signature_4,time_signature_5,time_signature_1,time_signature_0
144132,3cMwLCYsiRCiQpXBlPaRl7,0.681152,0.973145,-5.453125,0.056793,0.000292,0.916992,0.0979,0.151001,136.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
249897,6I7c8PjAARr5WEw8hwGYXp,0.673828,0.937012,-7.058594,0.039093,0.000123,0.901855,0.091309,0.128052,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
109778,2ktqqycUethpMN82GPQP2N,0.628906,0.977051,-6.785156,0.044403,0.010696,0.904785,0.11499,0.120972,129.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
168691,4F5X8pnhC2vk2RGNlC4Aqz,0.684082,0.942871,-6.457031,0.043915,5.4e-05,0.869141,0.096375,0.199951,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
224018,5dz2txpcTF7ATLTh8cmNSg,0.795898,0.974121,-7.367188,0.040985,4e-05,0.882812,0.101013,0.18396,130.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
180175,4XDa5b4jseOqKoe6J9Iigt,0.743164,0.938965,-9.078125,0.05661,0.0002,0.938965,0.11499,0.046906,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
43717,161yxPAyWtqfMgCXZLtNz9,0.806152,0.951172,-7.070312,0.043304,0.007462,0.907227,0.101013,0.204956,126.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
39530,0zWS1visLUnYzWCBNi7sRb,0.714844,0.875977,-8.546875,0.079895,0.007111,0.868164,0.107971,0.073792,131.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
292931,7a5ivZFEOldzrvCr7gO1Zd,0.63916,0.949219,-10.390625,0.049591,0.000688,0.915039,0.103027,0.271973,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
181522,4ZHi42Dbsh8mqrJ4U10SVq,0.629883,0.943848,-13.09375,0.039398,0.036285,0.842773,0.109985,0.140015,125.8125,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0


In [0]:
# Persist the fitted KD-tree to this file with joblib (pickle-based), so it
# can be reloaded later without rebuilding it:
dump(tree, 'SpotifyKDTree.joblib')

['SpotifyKDTree.joblib']

In [0]:
# Export the scaled features for later use. Despite the file name, the values
# are min-max scaled (MinMaxScaler), not standardized. The track_id index is
# written as the first CSV column.
X_train_scaled.to_csv("spotify_standardized.csv")

In [0]:
# Import the exported scaled data, restoring track_id (first column) as the index
df_import = pd.read_csv("spotify_standardized.csv", index_col=0)

# Import pickled model
# NOTE: joblib.load unpickles the file; only load files from a trusted source.
from joblib import load
Spot_KDTree = load('SpotifyKDTree.joblib')

# Make functions using model
def new_nearest_by_id(id, k=11):
    """Return the row positions of the k nearest neighbours of a track.

    Same lookup as nearest_by_id, but using only the reloaded artefacts: the
    track's features come from the global df_import frame and the query runs
    against the global Spot_KDTree.

    Parameters
    ----------
    id : track ID present in the index of df_import
    k : number of neighbours to return (default 11)

    Returns
    -------
    Array of neighbour row positions, one row per row selected by `id`.
    """
    query_row = df_import.loc[[id]]
    _, neighbor_positions = Spot_KDTree.query(query_row, k=k)
    return neighbor_positions

In [0]:
# A different track ID, used to spot-check the reloaded model.
test_track_id = '6Wosx2euFPMT14UXiWudMy'

In [0]:
# Query the reloaded tree and show the matching unscaled rows.
results = new_nearest_by_id(test_track_id)

X_train_grouped.iloc[results[0]]

Unnamed: 0,track_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_afrobeat,genre_alt-rock,genre_alternative,genre_ambient,genre_anime,genre_black-metal,genre_bluegrass,genre_blues,genre_breakbeat,genre_british,genre_cantopop,genre_chicago-house,genre_chill,genre_classical,genre_club,genre_comedy,genre_country,genre_dance,genre_dancehall,genre_death-metal,genre_deep-house,genre_detroit-techno,genre_disco,genre_disney,genre_drum-and-bass,genre_dub,genre_dubstep,genre_edm,genre_electro,genre_electronic,genre_emo,genre_folk,genre_forro,genre_french,genre_funk,genre_garage,genre_german,genre_gospel,genre_grindcore,genre_groove,genre_grunge,genre_guitar,genre_happy,genre_hard-rock,genre_hardcore,genre_hardstyle,genre_heavy-metal,genre_hip-hop,genre_honky-tonk,genre_house,genre_idm,genre_indian,genre_indie,genre_indie-pop,genre_industrial,genre_iranian,genre_j-dance,genre_j-idol,genre_j-pop,genre_j-rock,genre_jazz,genre_k-pop,genre_kids,genre_latin,genre_latino,genre_mandopop,genre_metal,genre_metalcore,genre_minimal-techno,genre_mpb,genre_new-age,genre_opera,genre_pagode,genre_party,genre_piano,genre_pop,genre_power-pop,genre_progressive-house,genre_psych-rock,genre_punk,genre_punk-rock,genre_reggae,genre_reggaeton,genre_rock,genre_rockabilly,genre_romance,genre_salsa,genre_samba,genre_sertanejo,genre_show-tunes,genre_singer-songwriter,genre_ska,genre_sleep,genre_songwriter,genre_soul,genre_spanish,genre_swedish,genre_synth-pop,genre_tango,genre_techno,genre_trance,genre_trip-hop,genre_turkish,genre_world-music,genre_rock-n-roll,key_0,key_2,key_1,key_8,key_7,key_5,key_10,key_11,key_3,key_9,key_6,key_4,mode_1,mode_0,time_signature_3,time_signature_4,time_signature_5,time_signature_1,time_signature_0
259273,6Wosx2euFPMT14UXiWudMy,0.603027,0.723145,-5.890625,0.04541,0.024994,0.0,0.082397,0.38208,114.9375,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
148053,3iWGoe4u0x3Jf2X96sbAJm,0.570801,0.940918,-4.9375,0.046387,0.00247,0.203979,0.067078,0.470947,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
244745,6ALOVWJZcJO70uojmNHtb6,0.707031,0.914062,-4.328125,0.034698,0.088196,0.083313,0.112,0.706055,135.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
42031,13Ls4Uc1tSAIaCVhz0H8hm,0.602051,0.86084,-6.84375,0.0625,0.076721,0.336914,0.620117,0.135986,135.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
244768,6ANbhzNG4DNWI1kZDDprQw,0.592773,0.942871,-1.917969,0.117004,0.00326,0.720215,0.292969,0.626953,150.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
308401,7yKMGeJM3EjNrS6QXnY6xP,0.578125,0.823242,-9.1875,0.037292,0.004059,0.895996,0.090698,0.025803,110.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
234643,5ufaAwg46yyNGdgkDz987H,0.562012,0.916992,-6.195312,0.035095,0.00027,0.934082,0.065308,0.068176,124.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
159350,40UbcJWyWPqOWFvv4Dpvfc,0.654785,0.726074,-4.613281,0.03241,0.058685,0.003111,0.14502,0.291992,128.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
231592,5puKxbSkahwtI2XJR1xZgr,0.679199,0.826172,-6.152344,0.053894,0.020996,0.00113,0.093018,0.48999,122.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
106275,2fXpPy5gtGLj9Wx0qGDeKy,0.65918,0.902832,-3.138672,0.038696,0.048889,0.00252,0.074219,0.384033,127.9375,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0
