# Import Packages & Read in Data

In [18]:
import pandas as pd
# Load the Spotify audio-features CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='data/SpotifyAudioFeatures2019.csv')
# Preview the first five rows to confirm the columns parsed as expected.
df.head(n=5)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


# Data Wrangling

In [19]:
# Drop every row that contains a missing value, then renumber the index 0..n-1.
# The renumbering matters: KDTree.query() reports *positional* row numbers and
# later cells use those numbers to look track ids up in `df`, so the index
# labels must coincide with row positions.
df = df.dropna().reset_index(drop=True)
# Sanity check: total number of remaining nulls across all columns (expected 0).
df.isnull().sum().sum()

0

In [20]:
# Model inputs: everything except the three text/identifier columns.
df_features = df.drop(['artist_name', 'track_id', 'track_name'], axis='columns')
df_features.head(n=5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


# Create Prediction Model

In [21]:
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
# Fit the scaler on the feature table, then apply it, so that all features are
# on a comparable scale before distances are computed.
scaler = RobustScaler().fit(df_features)
X = scaler.transform(df_features)
# Spatial index over the scaled rows, used for nearest-neighbour queries below.
model = KDTree(X)

# Sample X For 1 Prediction Only

In [22]:
import numpy as np
# Use the first scaled row as the query. ndmin=2 yields a fresh (1, n_features)
# array, i.e. a batch containing a single sample.
X_Prediction = np.array(X[0], ndmin=2)
X_Prediction

array([[-3.26240900e-01,  5.14925373e-01,  4.72894652e-01,
        -6.96569921e-01, -3.38636364e-04, -6.66666667e-01,
        -3.09025271e-01,  4.84390087e-02,  0.00000000e+00,
         3.91897891e+00,  1.92307692e+00,  0.00000000e+00,
        -7.29468599e-01, -2.25806452e-01]])

In [23]:
# Retrieve the six nearest rows to the query: `dist` holds the distances and
# `ind` the matching row positions, one row of results per query sample.
dist, ind = model.query(X_Prediction, k=6, return_distance=True)
# Keep the distances as a DataFrame so they can be displayed in the next cell.
df_dist = pd.DataFrame(data=dist)
pd.DataFrame(data=ind).head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,0,43436,51256,92348,24902,53729


In [24]:
# df_dist is already a DataFrame, so it can be displayed directly.
df_dist.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,0.0,1.144224,1.274368,1.304762,1.316577,1.331322


# Convert neighbor indices back to song IDs

In [25]:
# Translate each row of neighbour positions into the corresponding track ids.
# KDTree.query() returns positional row numbers, so they must be resolved with
# .iloc; the label-based `df.track_id[i]` silently picks the wrong row (or
# raises KeyError) whenever the index is not exactly 0..n-1, e.g. after
# dropna() has removed rows.
prediction_list = [df['track_id'].iloc[row].tolist() for row in ind]

In [26]:
# Column labels: the query track's id followed by its five neighbours.
columns = ['id'] + [str(rank) for rank in range(1, 6)]
# One row per query; when the same query id occurs more than once only its
# first row is kept.
final_prediction = (
    pd.DataFrame(prediction_list, columns=columns)
    .drop_duplicates(subset=['id'], keep='first')
)
final_prediction

Unnamed: 0,id,1,2,3,4,5
0,2RM4jf1Xa9zPgMGRDiht8O,1rG9U7m2wqM0AVmZeIsdtz,56GWUZPELOP1nK4hQwfVVV,3UDOPqusSZZ7NqyWyMMk50,02s1Voowwhr0qTSOrMVEXk,2ogGsvbGjJz71Ppc9mTExA


# Test Query

In [11]:
# Look up one track by id and return its recommendations, i.e. every column
# after 'id' of the first matching row.
final_prediction.loc[final_prediction['id'] == "2RM4jf1Xa9zPgMGRDiht8O"].to_numpy().tolist()[0][1:]

['1rG9U7m2wqM0AVmZeIsdtz',
 '56GWUZPELOP1nK4hQwfVVV',
 '3UDOPqusSZZ7NqyWyMMk50',
 '02s1Voowwhr0qTSOrMVEXk',
 '2ogGsvbGjJz71Ppc9mTExA']

# Test with JSON from dataframe

In [91]:
# `json` is used below but is not imported in any visible cell, so on a fresh
# kernel (Restart & Run All) this cell failed with NameError.
import json

# Serialise the first track (identifier + features) as a JSON object string.
# to_json(orient='records') produces a one-element JSON array; slicing off the
# surrounding brackets leaves the single object. (The former
# .replace('},{', '} {') was a no-op for one record and would have produced
# invalid JSON for several, so it has been dropped.)
out = df.drop(columns=['artist_name', 'track_name']).head(1).to_json(orient='records')[1:-1]
print(out)
# Confirm that the string parses back into a plain dict.
type(json.loads(out))

{"track_id":"2RM4jf1Xa9zPgMGRDiht8O","acousticness":0.00582,"danceability":0.743,"duration_ms":238373,"energy":0.339,"instrumentalness":0.0,"key":1,"liveness":0.0812,"loudness":-7.678,"mode":1,"speechiness":0.409,"tempo":203.927,"time_signature":4,"valence":0.118,"popularity":15}


dict

In [92]:
# Parse the JSON string into a Series (one entry per field), widen it into a
# single-column frame, and display that frame transposed as one row.
s = pd.Series(data=json.loads(out))
json_df = s.to_frame()
json_df.T

Unnamed: 0,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,2RM4jf1Xa9zPgMGRDiht8O,0.00582,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


In [93]:
# Catalogue rows (identifier + features), computed once and reused below.
catalog_df = df.drop(columns=['artist_name', 'track_name'])
# json_df was built from a Series of mixed Python objects, so every column of
# its transpose is object dtype. infer_objects() restores int/float dtypes;
# without it the concat below degrades all numeric catalogue columns to object.
query_row = json_df.transpose().infer_objects()
print(catalog_df.shape)
print(query_row.shape)
# ignore_index renumbers the rows 0..n-1, which leaves the query as the last row.
df2 = pd.concat([catalog_df, query_row], sort=False, ignore_index=True)
df2.tail()

(130663, 15)
(1, 15)


Unnamed: 0,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
130659,43MP9F7UzvfilSrw2SqZGJ,0.918,0.387,194583,0.249,0,9,0.103,-13.233,1,0.0437,94.039,4,0.346,60
130660,4TWlUuFk81NGUNKwndyS5Q,0.33,0.717,139191,0.532,0,8,0.0997,-8.351,0,0.206,156.977,4,0.546,47
130661,5iGBXzOoRo4sBTy8wdzMyK,0.0079,0.772,180013,0.51,0,4,0.131,-9.67,0,0.12,120.049,4,0.0755,50
130662,7LNtyuekYHiZ99UxkrfCQR,0.549,0.715,145827,0.734,0,3,0.108,-3.244,1,0.0367,130.128,3,0.976,55
130663,2RM4jf1Xa9zPgMGRDiht8O,0.00582,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


In [94]:
# Remove the identifier column so that only model inputs remain.
dynamic_df = df2.drop('track_id', axis='columns')
print(dynamic_df.shape)
dynamic_df.tail(n=5)

(130664, 14)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
130659,0.918,0.387,194583,0.249,0,9,0.103,-13.233,1,0.0437,94.039,4,0.346,60
130660,0.33,0.717,139191,0.532,0,8,0.0997,-8.351,0,0.206,156.977,4,0.546,47
130661,0.0079,0.772,180013,0.51,0,4,0.131,-9.67,0,0.12,120.049,4,0.0755,50
130662,0.549,0.715,145827,0.734,0,3,0.108,-3.244,1,0.0367,130.128,3,0.976,55
130663,0.00582,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


## Make Prediction for test song from original dataframe

In [96]:
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
# Scale the catalogue and the appended query row together so the query is
# expressed in the same scaled units as every other track, then index all rows.
scaler = RobustScaler()
X = scaler.fit_transform(dynamic_df)
model = KDTree(X)

# The query is the last row of dynamic_df / df2 (appended by the concat above).
query_pos = len(X) - 1
X_Prediction = np.array([X[query_pos]])
# k=6 because the query row itself is in the tree: one slot for it plus the
# five recommendations wanted.
dist, ind = model.query(X_Prediction, k=6)

# Do not assume the first hit is the query row: when the catalogue contains an
# exact duplicate of the query, both rows tie at distance 0 and their order is
# arbitrary. Take the id from the known query position and filter that position
# out of the neighbours instead. Positions are resolved with .iloc because the
# tree reports positional row numbers.
track_ids = df2['track_id']
prediction_list = []
for row in ind:
    neighbours = [i for i in row if i != query_pos][:5]
    prediction_list.append([track_ids.iloc[query_pos]] + track_ids.iloc[neighbours].tolist())
columns = ['id', '1', '2', '3', '4', '5']
final_prediction = pd.DataFrame(prediction_list, columns=columns)
final_prediction = final_prediction.drop_duplicates(subset=['id'], keep='first')
final_prediction

Unnamed: 0,id,1,2,3,4,5
0,2RM4jf1Xa9zPgMGRDiht8O,2RM4jf1Xa9zPgMGRDiht8O,1rG9U7m2wqM0AVmZeIsdtz,56GWUZPELOP1nK4hQwfVVV,3UDOPqusSZZ7NqyWyMMk50,02s1Voowwhr0qTSOrMVEXk


# Test with custom defined dictionary

In [101]:
# Hand-written record with a placeholder id. The feature values mirror the
# first catalogue track except for acousticness (0.00583 instead of 0.00582).
dict1 = {
    "track_id": "any_song_id",
    "acousticness": 0.00583,
    "danceability": 0.743,
    "duration_ms": 238373,
    "energy": 0.339,
    "instrumentalness": 0.0,
    "key": 1,
    "liveness": 0.0812,
    "loudness": -7.678,
    "mode": 1,
    "speechiness": 0.409,
    "tempo": 203.927,
    "time_signature": 4,
    "valence": 0.118,
    "popularity": 15,
}
# Series (one entry per field) -> single-column frame -> displayed as one row.
s = pd.Series(data=dict1)
json_df = s.to_frame()
json_df.T

Unnamed: 0,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,any_song_id,0.00583,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


In [102]:
# Catalogue rows (identifier + features), computed once and reused below.
catalog_df = df.drop(columns=['artist_name', 'track_name'])
# json_df was built from a Series of mixed Python objects, so every column of
# its transpose is object dtype. infer_objects() restores int/float dtypes;
# without it the concat below degrades all numeric catalogue columns to object.
query_row = json_df.transpose().infer_objects()
print(catalog_df.shape)
print(query_row.shape)
# ignore_index renumbers the rows 0..n-1, which leaves the query as the last row.
df2 = pd.concat([catalog_df, query_row], sort=False, ignore_index=True)
df2.tail()

(130663, 15)
(1, 15)


Unnamed: 0,track_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
130659,43MP9F7UzvfilSrw2SqZGJ,0.918,0.387,194583,0.249,0,9,0.103,-13.233,1,0.0437,94.039,4,0.346,60
130660,4TWlUuFk81NGUNKwndyS5Q,0.33,0.717,139191,0.532,0,8,0.0997,-8.351,0,0.206,156.977,4,0.546,47
130661,5iGBXzOoRo4sBTy8wdzMyK,0.0079,0.772,180013,0.51,0,4,0.131,-9.67,0,0.12,120.049,4,0.0755,50
130662,7LNtyuekYHiZ99UxkrfCQR,0.549,0.715,145827,0.734,0,3,0.108,-3.244,1,0.0367,130.128,3,0.976,55
130663,any_song_id,0.00583,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


In [103]:
# Remove the identifier column so that only model inputs remain.
dynamic_df = df2.drop('track_id', axis='columns')
print(dynamic_df.shape)
dynamic_df.tail(n=5)

(130664, 14)


Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
130659,0.918,0.387,194583,0.249,0,9,0.103,-13.233,1,0.0437,94.039,4,0.346,60
130660,0.33,0.717,139191,0.532,0,8,0.0997,-8.351,0,0.206,156.977,4,0.546,47
130661,0.0079,0.772,180013,0.51,0,4,0.131,-9.67,0,0.12,120.049,4,0.0755,50
130662,0.549,0.715,145827,0.734,0,3,0.108,-3.244,1,0.0367,130.128,3,0.976,55
130663,0.00583,0.743,238373,0.339,0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15


## Make Prediction for custom defined song

In [104]:
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KDTree
# Scale the catalogue and the appended custom row together so the query is
# expressed in the same scaled units as every other track, then index all rows.
scaler = RobustScaler()
X = scaler.fit_transform(dynamic_df)
model = KDTree(X)

# The query is the last row of dynamic_df / df2 (appended by the concat above).
query_pos = len(X) - 1
X_Prediction = np.array([X[query_pos]])
# k=6 because the query row itself is in the tree: one slot for it plus the
# five recommendations wanted.
dist, ind = model.query(X_Prediction, k=6)

# Do not assume the first hit is the query row: if a catalogue track has exactly
# the same features, both rows tie at distance 0 and their order is arbitrary.
# Take the id from the known query position and filter that position out of the
# neighbours instead. Positions are resolved with .iloc because the tree
# reports positional row numbers.
track_ids = df2['track_id']
prediction_list = []
for row in ind:
    neighbours = [i for i in row if i != query_pos][:5]
    prediction_list.append([track_ids.iloc[query_pos]] + track_ids.iloc[neighbours].tolist())
columns = ['id', '1', '2', '3', '4', '5']
final_prediction = pd.DataFrame(prediction_list, columns=columns)
final_prediction = final_prediction.drop_duplicates(subset=['id'], keep='first')
final_prediction

Unnamed: 0,id,1,2,3,4,5
0,any_song_id,2RM4jf1Xa9zPgMGRDiht8O,1rG9U7m2wqM0AVmZeIsdtz,56GWUZPELOP1nK4hQwfVVV,3UDOPqusSZZ7NqyWyMMk50,02s1Voowwhr0qTSOrMVEXk


In [105]:
# Recommendations for the first (only) query: every column after 'id'.
final_prediction.iloc[0, 1:].tolist()

['2RM4jf1Xa9zPgMGRDiht8O',
 '1rG9U7m2wqM0AVmZeIsdtz',
 '56GWUZPELOP1nK4hQwfVVV',
 '3UDOPqusSZZ7NqyWyMMk50',
 '02s1Voowwhr0qTSOrMVEXk']