In [1]:
# Importing necessary libraries

# Nearest neighbors will be the v1 model for sound drip
# will give logistic regression and ... (TODO: this note was left unfinished)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from joblib import dump
from joblib import load
import pickle
import numpy as np

import pandas as pd
from pandas.io.json import json_normalize

# Loading Latest PKL

In [2]:
# Load the pickled song table that the rest of the notebook works on.
# The context manager guarantees the file handle is closed once unpickling
# finishes (the bare open(...) call used before left it open).
# NOTE: pickle.load can execute arbitrary code while loading; only use this
# with a file that comes from a trusted source.
with open("./data/song_list_v5_hashed.pkl", "rb") as pkl_file:
    X = pickle.load(pkl_file)

# Preparing for Scaling

In [3]:
# Discard the columns this notebook treats as unnecessary for the model.
X = X.drop(
    columns=[
        "songid",
        "artist",
        "track",
        "duration_ms",
        "genres",
        "genres_stripped",
    ]
)

# Show the remaining dimensions to confirm the columns are gone.
X.shape

(574018, 63)

In [4]:
# The first 13 columns of X are the ones passed to the scaler
# (popularity through valence in the preview output below).
columns_for_scaling = X.columns[:13]

In [5]:
# columns_genre_hashed = X.columns[13:]

In [6]:
# Preview the columns selected for scaling.
X[columns_for_scaling]

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0,0.001300,0.487,0.678,0.055100,9.0,0.0846,-7.780,1.0,0.0495,149.940,4.0,0.8700
1,15,0.000045,0.662,0.823,0.952000,4.0,0.3430,-1.711,0.0,0.0662,177.745,4.0,0.6210
2,17,0.002760,0.859,0.741,0.000000,11.0,0.3250,-12.329,0.0,0.2710,98.082,4.0,0.5290
3,21,0.348000,0.361,0.483,0.000003,7.0,0.1770,-6.875,1.0,0.0287,94.538,4.0,0.6820
4,1,0.340000,0.533,0.302,0.179000,10.0,0.1110,-10.308,1.0,0.0307,134.959,3.0,0.2940
...,...,...,...,...,...,...,...,...,...,...,...,...,...
574013,33,0.798000,0.320,0.104,0.000750,0.0,0.1060,-13.162,1.0,0.0342,124.677,3.0,0.0566
574014,50,0.820000,0.418,0.204,0.936000,8.0,0.0866,-13.522,1.0,0.0370,110.022,4.0,0.0375
574015,0,0.070400,0.614,0.952,0.000000,6.0,0.3620,-2.782,1.0,0.0718,148.079,4.0,0.8200
574016,18,0.064000,0.343,0.487,0.000000,2.0,0.1070,-9.462,1.0,0.0473,171.217,4.0,0.3650


In [7]:
# Take the selected feature columns, then reorder them alphabetically by name.
# NOTE(review): the scaling cell further down reads X[columns_for_scaling]
# directly, so X_new is not consumed by any later cell visible here.
X_new = X[columns_for_scaling]
X_new = X_new.reindex(columns=sorted(X_new.columns))

In [8]:
# Preview the feature frame with its columns in alphabetical order.
X_new

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,time_signature,valence
0,0.001300,0.487,0.678,0.055100,9.0,0.0846,-7.780,1.0,0,0.0495,149.940,4.0,0.8700
1,0.000045,0.662,0.823,0.952000,4.0,0.3430,-1.711,0.0,15,0.0662,177.745,4.0,0.6210
2,0.002760,0.859,0.741,0.000000,11.0,0.3250,-12.329,0.0,17,0.2710,98.082,4.0,0.5290
3,0.348000,0.361,0.483,0.000003,7.0,0.1770,-6.875,1.0,21,0.0287,94.538,4.0,0.6820
4,0.340000,0.533,0.302,0.179000,10.0,0.1110,-10.308,1.0,1,0.0307,134.959,3.0,0.2940
...,...,...,...,...,...,...,...,...,...,...,...,...,...
574013,0.798000,0.320,0.104,0.000750,0.0,0.1060,-13.162,1.0,33,0.0342,124.677,3.0,0.0566
574014,0.820000,0.418,0.204,0.936000,8.0,0.0866,-13.522,1.0,50,0.0370,110.022,4.0,0.0375
574015,0.070400,0.614,0.952,0.000000,6.0,0.3620,-2.782,1.0,0,0.0718,148.079,4.0,0.8200
574016,0.064000,0.343,0.487,0.000000,2.0,0.1070,-9.462,1.0,18,0.0473,171.217,4.0,0.3650


In [9]:
# Min-max scaler with scikit-learn's default settings.
scaler = MinMaxScaler()

# Fit and transform in one step on the selected columns in their ORIGINAL order.
# NOTE(review): an earlier draft fitted on the sorted frame instead
# (data_fitted = scaler.fit(X_new)); confirm that whatever builds query
# vectors for this model uses the same unsorted column order.
data_scaled = scaler.fit_transform(X.loc[:, columns_for_scaling])

# First scaled row, shown as a quick sanity check.
data_scaled[0]

array([0.        , 0.00130522, 0.4884654 , 0.678     , 0.0551    ,
       0.81818182, 0.0846    , 0.78720453, 1.        , 0.05118925,
       0.60009846, 0.8       , 0.87      ])

In [13]:
# Scaling must keep the same number of rows and the 13 selected columns.
data_scaled.shape

(574018, 13)

# Normalization - Necessary Part of the Production App

In [15]:
# Inspect the scaled array that is fed to the normalizer.
data_scaled

array([[0.00000000e+00, 1.30522088e-03, 4.88465396e-01, ...,
        6.00098456e-01, 8.00000000e-01, 8.70000000e-01],
       [1.54639175e-01, 4.55823293e-05, 6.63991976e-01, ...,
        7.11381219e-01, 8.00000000e-01, 6.21000000e-01],
       [1.75257732e-01, 2.77108434e-03, 8.61584754e-01, ...,
        3.92549398e-01, 8.00000000e-01, 5.29000000e-01],
       ...,
       [0.00000000e+00, 7.06827309e-02, 6.15847543e-01, ...,
        5.92650255e-01, 8.00000000e-01, 8.20000000e-01],
       [1.85567010e-01, 6.42570281e-02, 3.44032096e-01, ...,
        6.85254484e-01, 8.00000000e-01, 3.65000000e-01],
       [4.84536082e-01, 2.93172691e-02, 3.81143430e-01, ...,
        5.03483965e-01, 6.00000000e-01, 1.05000000e-01]])

In [16]:
# Alias for the same column Index that was used for scaling.
# NOTE(review): in the cells visible here this name is only used for a length
# check; the Normalizer below is applied to the whole of data_scaled.
columns_for_normalization = columns_for_scaling

In [18]:
# Number of columns that go through normalization.
len(columns_for_normalization)

13

In [19]:
# Normalizer with default settings, applied to the already scaled rows.
# It rescales each sample (row) independently rather than each column.
normalizer = Normalizer().fit(data_scaled)
data_normalized = normalizer.transform(data_scaled)

# Nearest Neighbors Model w/ Data Scaled and Normalized

In [20]:
# Build the neighbor index in one expression: a KD-tree search returning 20
# neighbors per query, using all available cores (n_jobs=-1), fitted on the
# scaled-and-normalized feature matrix. fit() returns the estimator itself.
knn1 = NearestNeighbors(
    algorithm='kd_tree',
    n_neighbors=20,
    n_jobs=-1,
).fit(data_normalized)

# Dump Model If n_neighbors param needs to be adjusted

In [21]:
# Persist the fitted neighbor model with compression enabled; the call
# returns the list of files written, which the notebook displays.
dump(value=knn1, filename='./data/slider_model6.joblib', compress=True)

['./data/slider_model6.joblib']

# Song_ID Array for Predict Flask Function

In [255]:
# # converting song id column from df to an array

# song_id_array3 = np.array(df['songid'])

# filename5 = './data/song_id_array3.pkl'
# pickle.dump(song_id_array3, open(filename5, 'wb'))

# Loading Joblibbed Model

In [22]:
# Read the persisted model back from disk, replacing the in-memory knn1, so
# the cells below exercise exactly what was saved.
knn1 = load(filename='./data/slider_model6.joblib')

In [23]:
# Smoke-test the reloaded model: query with row 2 of the training matrix
# (sliced so it stays 2-D) and keep only the neighbor indices, not distances.
knn_results1 = knn1.kneighbors(data_normalized[2:3], return_distance=False)

In [25]:
# One query row in, one row of 20 neighbor indices out.
knn_results1.shape

(1, 20)

In [28]:
# Row indices of the nearest neighbors for the single query; the query row
# itself (index 2) comes back first in the recorded output.
knn_results1[0]

array([     2,  39565, 265926, 532028, 407709, 438331, 473470, 259888,
       420313, 213790, 352801, 303257, 551492, 106525, 452963,  64608,
       124604,  46919, 433612, 572160])