In [1]:
import os
import sys

import numpy as np
import pandas as pd
import requests
import tensorflow as tf

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Seed every random number generator the notebook relies on so that a
# Restart & Run All produces comparable results.
SEED = 42
np.random.seed(SEED)      # NumPy's global RNG
# Keras draws the initial layer weights from TensorFlow's RNG, which the NumPy
# seed does not control; seed it as well.
tf.random.set_seed(SEED)


In [3]:
# The dataset lives in ../data/data.csv relative to the current working directory.
df_path = os.path.join(os.getcwd(), os.pardir, 'data', 'data.csv')
df = pd.read_csv(df_path)

# Columns used as model input; the remaining columns of `df` are not part of
# the training frame.
features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'valence',
]
df_train = df[features]

df_train.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,0.995,0.708,0.195,0.563,10,0.151,-12.428,1,0.0506,118.469,0.779
1,0.994,0.379,0.0135,0.901,8,0.0763,-28.454,1,0.0462,83.972,0.0767
2,0.604,0.749,0.22,0.0,5,0.119,-19.924,0,0.929,107.177,0.88
3,0.995,0.781,0.13,0.887,1,0.111,-14.734,0,0.0926,108.003,0.72
4,0.99,0.21,0.204,0.908,11,0.098,-16.829,1,0.0424,62.149,0.0693


In [4]:
# Standardise the training features and wrap the resulting array in a
# DataFrame that carries the feature names as column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_train)
df_train_scaled = pd.DataFrame(scaled_values, columns=features)

df_train_scaled.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,1.332319,0.968662,-1.097999,1.296562,1.365333,-0.314998,-0.186652,0.641344,-0.28984,0.0495,0.940924
1,1.329664,-0.907636,-1.776785,2.389253,0.796383,-0.737519,-3.014729,0.641344,-0.319186,-1.073199,-1.735454
2,0.294154,1.202486,-1.004503,-0.523513,-0.057043,-0.495997,-1.509457,-1.559227,5.568626,-0.317996,1.325822
3,1.332319,1.384983,-1.341091,2.343994,-1.194943,-0.541247,-0.593587,-1.559227,-0.009722,-0.291114,0.716082
4,1.319044,-1.871449,-1.064341,2.411883,1.649808,-0.614778,-0.963288,0.641344,-0.34453,-1.783425,-1.763655


### Autoencoder

In [11]:
# Number of input units: one per column of the scaled training frame.
n = df_train_scaled.shape[1]

# Encoder: a single dense layer mapping the n inputs to n // 2 units.
encoder = Sequential()
encoder.add(Dense(n // 2, name='encode_2', input_shape=(n,)))

encoder.compile(loss='mse', optimizer='adam')
encoder.summary()


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encode_2 (Dense)             (None, 5)                 60        
Total params: 60
Trainable params: 60
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Decoder: a single dense layer mapping the n // 2 encoded units back to n outputs.
decoder = Sequential()
decoder.add(Dense(n, name='decode_2', input_shape=(n // 2,)))

decoder.compile(loss='mse', optimizer='adam')
decoder.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decode_2 (Dense)             (None, 11)                66        
Total params: 66
Trainable params: 66
Non-trainable params: 0
_________________________________________________________________


### Build the full autoencoder and train it

In [14]:
# Chain the two sub-models: input -> encoder -> decoder.
input_layer = Input(shape=(n,))
encoder_output = encoder(input_layer)
decoder_output = decoder(encoder_output)

# The full autoencoder shares its layers with `encoder` and `decoder`.
autoencoder = Model(inputs=input_layer, outputs=decoder_output)
autoencoder.compile(loss='mse', optimizer='adam')

autoencoder.summary()


Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 11)]              0         
_________________________________________________________________
sequential_5 (Sequential)    (None, 5)                 60        
_________________________________________________________________
sequential_6 (Sequential)    (None, 11)                66        
Total params: 126
Trainable params: 126
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Early stopping watches the training loss (no validation data is passed to
# fit) with a patience of 5 epochs and restores the best weights seen.
stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# The scaled features serve as both the input and the reconstruction target.
history = autoencoder.fit(
    x=df_train_scaled,
    y=df_train_scaled,
    batch_size=128,
    epochs=20,
    callbacks=[stop],
)

Train on 169909 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


### Save the trained models

In [16]:
# Write each trained model to its own HDF5 file in the working directory.
for trained_model, filename in (
    (encoder, 'encoder.h5'),
    (decoder, 'decoder.h5'),
    (autoencoder, 'autoencoder.h5'),
):
    trained_model.save(filename)

### Build an end-to-end model with an encoder and nearest neighbors

We chain a Keras encoder with a scikit-learn nearest-neighbors model. The encoder reduces each item from 11 features to 5, and the nearest-neighbors model then searches for similar items in that 5-dimensional encoded space.

In [31]:
class End2EndModel():
    """Similarity-search pipeline: standard scaling -> encoder -> nearest neighbours.

    The pipeline owns its own ``StandardScaler``, which is fitted in
    :meth:`fit`; it does not reuse any scaler fitted elsewhere.

    Attributes:
        scaler: ``StandardScaler`` applied to raw feature rows.
        encode: encoder model loaded from ``encoder_file``; its ``predict``
            output is what the neighbour search is run on.
        nearest_n: ``NearestNeighbors`` index built on the encoded rows.
    """

    def __init__(self, n_examples, encoder_file):
        """Create an unfitted pipeline.

        Args:
            n_examples: number of neighbours returned per query.
            encoder_file: path of the saved encoder passed to ``load_model``.
        """
        self.scaler = StandardScaler()
        self.encode = load_model(encoder_file)

        # Pass n_neighbors by keyword: scikit-learn deprecated positional
        # constructor arguments in 0.23 and rejects them from 1.0 onwards.
        self.nearest_n = NearestNeighbors(n_neighbors=n_examples)

    def fit(self, X):
        """Fit the scaler and the neighbour index on ``X``.

        Args:
            X: 2-D array-like of raw feature rows, one row per item.

        Returns:
            Whatever ``NearestNeighbors.fit`` returns (the fitted estimator).
        """
        X_scaled = self.scaler.fit_transform(X)
        encoded = self.encode.predict(X_scaled)
        return self.nearest_n.fit(encoded)

    def predict(self, x):
        """Return the nearest neighbours of the query row(s) ``x``.

        Args:
            x: 2-D array-like with one row per query, or a single 1-D sample,
                which is treated as one query row.

        Returns:
            ``(scores, indices)`` exactly as produced by
            ``NearestNeighbors.kneighbors`` on the encoded queries: the
            distances to the neighbours and their row positions in the data
            passed to :meth:`fit`.
        """
        # A lone 1-D sample would be rejected by the scaler; promote it to a
        # single-row 2-D array so callers do not have to reshape by hand.
        if np.ndim(x) == 1:
            x = np.asarray(x).reshape(1, -1)
        x_scaled = self.scaler.transform(x)
        encoded = self.encode.predict(x_scaled)
        scores, indices = self.nearest_n.kneighbors(encoded)
        return scores, indices
                

In [32]:
# Pipeline returning 10 neighbours per query, using the encoder saved to encoder.h5.
model = End2EndModel(n_examples=10, encoder_file='encoder.h5')



In [33]:
# Fit on the unscaled training features; the pipeline applies its own scaler.
model.fit(X=df_train)

NearestNeighbors(n_neighbors=10)

### Try it out with an item from the dataset

In [34]:
# Take the row at position 3764 as the query and shape it as a single-row 2-D array.
query_row = df_train.iloc[3764]
test = np.array(query_row).reshape(1, -1)

test


array([[ 1.3900e-01,  3.1600e-01,  7.7200e-01,  5.3200e-01,  1.1000e+01,
         2.3800e-01, -1.1605e+01,  0.0000e+00,  6.4900e-02,  8.8648e+01,
         6.6000e-01]])

In [36]:
# Look up the neighbours of the query row with the fitted pipeline.
scores, indices = model.predict(x=test)

(scores, indices)

(array([[8.42936970e-08, 3.09293462e-01, 3.32585634e-01, 4.00838659e-01,
         4.18509141e-01, 4.19979617e-01, 4.35221213e-01, 5.10055541e-01,
         5.10543625e-01, 5.13623751e-01]]),
 array([[  3764,   4432, 125668, 149375,  55051,  73453,  46726,  46445,
           5257, 151981]], dtype=int64))

### Retrieve our suggestions

In [37]:
# Build one record per returned neighbour of the single query (first row of
# `indices` / `scores`), pairing each row position with its score.
results = []
for index, score in zip(indices[0], scores[0]):
    row = df.iloc[index]  # metadata of the neighbour in the full dataset
    results.append({
        'index': index,
        'track_id': row['id'],
        'artists': row['artists'],
        'title': row['name'],
        'score': score,
    })

result_table = pd.DataFrame(results)
result_table.sort_values(by='score')

Unnamed: 0,index,track_id,artists,title,score
0,3764,1qPSxRyMfES52PbpxCzWcd,['Al Di Meola'],Race With Devil On Spanish Highway,8.42937e-08
1,4432,0lFQqCxJiT0pvUPms0bzSg,['Thin Lizzy'],Thunder and Lightning,0.3092935
2,125668,7g3htkaLz4ETFn0cifwM3y,['Lana Del Rey'],High By The Beach,0.3325856
3,149375,7cophW0FqeDEEW9i57WBvL,['Whitesnake'],Love Ain't No Stranger,0.4008387
4,55051,0M8MPqaoGgkRjkG9kgZ3n4,"['Justine Skye', 'Tyga']",Collide (feat. Tyga),0.4185091
5,73453,2MOurmmosbkle2QinkSZAz,['Willie Hutch'],Out There,0.4199796
6,46726,57HwKH3pHLeelTkckr94qf,['The Dodos'],Horny Hippies,0.4352212
7,46445,3oLullgZ9FsDo6O93nppku,['Underoath'],"Some Will Seek Forgiveness, Others Escape",0.5100555
8,5257,2PAol2oDdGSHys8hc0gtLX,['Tori Amos'],Precious Things,0.5105436
9,151981,26gexilnrxUTfBhiudNBrT,['Los Temerarios'],Te Hice Mal,0.5136238


### Explore ways to persist the scikit-learn layer

In [45]:
import pickle

# Persist only the nearest-neighbours estimator; the pipeline's scaler and
# encoder are not written by this cell.
with open('nearest.pickle', mode='wb') as pickle_file:
    pickle.dump(model.nearest_n, pickle_file)

In [46]:
from joblib import dump, load

# Same estimator as above, written with joblib instead of pickle.
dump(value=model.nearest_n, filename='nearest.joblib')

['nearest.joblib']