# Nearest Neighbors with Keras

[Source github repo](https://github.com/sorenlind/keras-knn)

[Source medium article](https://medium.com/@sorenlind/nearest-neighbors-with-keras-and-coreml-755e76fedf36)

In [1]:
# Imports
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

## Get Data

In [8]:
# Columns that identify or describe a track but are not used as k-NN features.
meta_columns = ['track_name', 'artist_name', 'track_id', 'duration_ms']

df = pd.read_csv('kaggle_data/SpotifyAudioFeaturesApril2019.csv')
# Split the table in two: `songs` keeps the descriptive columns for looking up
# results later, `features` keeps every remaining column.
songs = df[meta_columns]
features = df.drop(columns=meta_columns)

In [9]:
# Inspect the raw dataset as loaded from the CSV.
df

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.005820,0.743,238373,0.339,0.000,1,0.0812,-7.678,1,0.4090,203.927,4,0.1180,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.024400,0.846,214800,0.557,0.000,8,0.2860,-7.259,1,0.4570,159.009,4,0.3710,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025000,0.603,138913,0.723,0.000,9,0.0824,-5.890,0,0.0454,114.966,4,0.3820,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.029400,0.800,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.6410,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,0.000035,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.9280,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130658,Calum Scott,0cvfSKcm9VeduwyYPrxtLx,Come Back Home,0.006780,0.601,190539,0.801,0.000,11,0.0991,-5.174,1,0.0323,131.049,4,0.2890,57
130659,Saint Claire,43MP9F7UzvfilSrw2SqZGJ,Enough for You,0.918000,0.387,194583,0.249,0.000,9,0.1030,-13.233,1,0.0437,94.039,4,0.3460,60
130660,Mike Stud,4TWlUuFk81NGUNKwndyS5Q,Do It,0.330000,0.717,139191,0.532,0.000,8,0.0997,-8.351,0,0.2060,156.977,4,0.5460,47
130661,D Savage,5iGBXzOoRo4sBTy8wdzMyK,No Smoke,0.007900,0.772,180013,0.510,0.000,4,0.1310,-9.670,0,0.1200,120.049,4,0.0755,50


## Build the k-NN and Joined Model
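Here we build the actual k-NN and combine it with the existing network. (Note: in this notebook there is no upstream network, since the song features are used directly, so the model below consists of the k-NN layer alone.) All there is to the k-NN is the `Dense` layer: once its weight matrix holds one reference vector per song, its output is the dot product between the input vector and every reference vector. We do not need a bias, so we specify `use_bias=False`. It's also important to note that we don't need an activation function, hence `activation='linear'`.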

In [10]:
# The k-NN is a single bias-free, linear Dense layer: once its kernel holds one
# reference vector per column, the layer outputs the dot product between the
# input vector and every reference vector.
# Both dimensions are derived from the data so the model keeps matching
# `features` if columns are added or removed (previously the width was a
# hard-coded 13).
n_songs, n_features = features.shape
model = Sequential([
    # Dot product between feature vector and reference vectors
    Dense(input_shape=(n_features,),  # length of one feature vector
          units=n_songs,              # one output score per reference song
          activation='linear',
          name='dense_1',
          use_bias=False)
])

In [11]:
# Sanity check: a single Dense layer with n_features * n_songs weights and no bias terms.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 130663)            1698619   
Total params: 1,698,619
Trainable params: 1,698,619
Non-trainable params: 0
_________________________________________________________________


### Normalize Encodings

In [16]:
def normalize(vectors):
    """Scale every row of `vectors` to unit Euclidean (L2) length.

    Parameters
    ----------
    vectors : 2-D NumPy array or pandas DataFrame
        Each row is treated as one vector.

    Returns
    -------
    The row-normalized vectors: a DataFrame if a DataFrame was passed in,
    otherwise an ndarray. Rows whose norm is zero are returned unchanged
    (all zeros) instead of becoming NaN through a division by zero.
    """
    # keepdims=True keeps the norms as a column of shape (n_rows, 1) so the
    # division broadcasts across each row.
    ref_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Dividing a zero row by 1 leaves it as zeros and avoids 0/0 -> NaN.
    safe_norms = np.where(ref_norms == 0, 1.0, ref_norms)
    return vectors / safe_norms

In [17]:
# Scale each row of the feature table to unit length, so the dot products
# computed by the Dense layer compare direction rather than magnitude.
feat_norm = normalize(features)

In [18]:
# Inspect the normalized feature table.
feat_norm

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,2.843628e-05,0.003630,0.001656,0.000000,0.004886,0.000397,-0.037514,0.004886,0.001998,0.996379,0.019544,0.000577,0.073289
1,1.530420e-04,0.005306,0.003494,0.000000,0.050178,0.001794,-0.045530,0.006272,0.002866,0.997338,0.025089,0.002327,0.000000
2,1.947079e-04,0.004696,0.005631,0.000000,0.070095,0.000642,-0.045873,0.000000,0.000354,0.895391,0.031153,0.002975,0.436146
3,2.375311e-04,0.006463,0.004678,0.007368,0.040396,0.000803,-0.097905,0.000000,0.000566,0.993777,0.032317,0.005179,0.000000
4,2.914593e-07,0.006483,0.006558,0.007270,0.057961,0.000275,-0.085095,0.008280,0.000547,0.994000,0.033120,0.007684,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130658,4.725145e-05,0.004189,0.005582,0.000000,0.076662,0.000691,-0.036059,0.006969,0.000225,0.913312,0.027877,0.002014,0.397247
130659,8.140284e-03,0.003432,0.002208,0.000000,0.079807,0.000913,-0.117342,0.008867,0.000388,0.833883,0.035470,0.003068,0.532045
130660,2.008250e-03,0.004363,0.003238,0.000000,0.048685,0.000607,-0.050821,0.000000,0.001254,0.955300,0.024342,0.003323,0.286023
130661,6.052234e-05,0.005914,0.003907,0.000000,0.030644,0.001004,-0.074082,0.000000,0.000919,0.919702,0.030644,0.000578,0.383053


### Set Weights to Extracted Features

In [25]:
# Welcome to the neighborhood
# The Dense kernel is laid out as (n_features, n_songs), so the transposed
# normalized feature table drops in as the layer's last (and only) weight
# tensor: one column per reference song.
temp_weights = model.get_weights()[:-1] + [feat_norm.T]
model.set_weights(temp_weights)

In [26]:
# The kernel should be (n_features, n_songs): one column per reference song.
model.get_weights()[0].shape
# Perfect

(13, 130663)

## Predict

In [27]:
# Will need better system, this is just an example
# Row position of the query song, defined once so the metadata lookup and the
# feature lookup below cannot drift apart.
example_idx = 46518
# Use positional indexing for both lookups (previously label-based .loc was
# mixed with positional .values[...], which only agree on a default index).
example_song = songs.iloc[example_idx]
print(example_song)
# `predict` expects a batch, i.e. a 2-D array. Selecting with a list of
# positions keeps the row 2-D, giving shape (1, n_features) without
# hard-coding the number of features.
prediction = model.predict(feat_norm.iloc[[example_idx]].to_numpy())

track_name             Breaking Waves
artist_name           Ruslan Radriges
track_id       6w6N7XQrGAVMxGT6ZoQfHC
duration_ms                    208221
Name: 46518, dtype: object


In [47]:
# Rank all songs by similarity score once (ascending) instead of sorting twice.
# `prediction` holds a single row of scores, one per reference song.
ranking = prediction[0].argsort()
# argsort yields row positions, so index with positional .iloc rather than
# label-based .loc (the two only coincide for a default RangeIndex).
results = songs.iloc[ranking[-10:]]       # 10 highest scores, best match last
wrong_results = songs.iloc[ranking[:10]]  # 10 lowest scores, worst match first

In [45]:
results
# Are these songs? They look like songs
# Rows are in ascending order of similarity, so the closest match is at the
# bottom; that last row is the query song itself (index 46518).

Unnamed: 0,track_name,artist_name,track_id,duration_ms
68361,Deluge,hélix,29tCZrIvZxe8EP9lr3sOsG,178303
19984,Invisible Friend,Jared Moreno,1icCpXG7AtSmXelyB4wmhg,188760
26452,c.ronaldo,Olamide,0uhu7BaZLPfDYHaTE5LEva,283638
114363,Finest Hour (feat. Abir) - Zookëper Remix,Cash Cash,2NbVwt2xbVlToVsujmi0wA,216799
44393,Пролетая над коттеджами Барвихи,Мот,67Sbenpw6K0nzibDyU2FnR,229871
65935,If I Needed Someone - Live,Eric Clapton,4SPQ7tRKiIvQzAvijz8OZh,148867
88563,Came to Annihilate,Hollywood Burns,745mbCMlvPLSl5zW9QnIpG,383853
54588,Break It Down,Quiet Disorder,6hSSxa8S9amTBPTnK1N5dU,180015
66378,Gorgeous - Muttonheads Radio Edit,Colorblast,2yuBT5YgnmPQAEPp9HBofY,184000
46518,Breaking Waves,Ruslan Radriges,6w6N7XQrGAVMxGT6ZoQfHC,208221


In [48]:
wrong_results
# ouch -- these are the 10 lowest-scoring tracks for the query song

Unnamed: 0,track_name,artist_name,track_id,duration_ms
79475,"Looped, No Fade White Noise",White Noise Sound,3ZYHKWCGWuQpfoNT5o4SEn,74880
79580,Found The Answer,Ghost Flows,4vkOgrvJUsLkm5BItrLUb7,61875
53883,what (Silent Track),modii,2q2KTvUpl3KLrHMWBAmNI8,78000
44115,Continuous Fan,White Noise Therapy,3ctTeQ8UxUtpuRdW6oFD5Y,163898
36511,Pink Wave Noise,Babysounds,2bl2qH3kkV8awK8LxRmwJv,231999
69522,Pink Wave Noise,White Noise Meditation,51BC7OgE1HJvnOK9gtpkCF,231999
37407,Pink Wave Noise,White Noise Baby Sleep,4zTQew86qLC5JO13G7aoQq,231999
104944,"St Matthew Passion, BWV 244: Part II, No. 56: ...",Johann Sebastian Bach,4zz1OIdsxmecenIElM3fnY,14533
43614,5 Min Brown (Tremelo),Deep Sleep Brown Noise,24KKz0CvBA8xWdVgdRsLFp,300000
14643,Forgien Wit Mans,Neighborhood Crook,1ni7jwIjWdDZkDYQO0V6Ji,98664
