In [1]:
# Importing necessary libraries

# Nearest neighbors will be the v1 model for sound drip
# will give logistic regression and ... (TODO: this note was left unfinished)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import pandas as pd

# textblob will be used to perform sentiment analysis down the road
from textblob import TextBlob

# Data is large and will need to be unzipped
import zipfile

# Extract every member of the zipped dataset into the current working directory
with zipfile.ZipFile(file="./data_collection/final_data/DF_v1.pkl.zip", mode="r") as archive:
    archive.extractall(path=".")

# Load the freshly extracted pickle into a DataFrame
song_list = pd.read_pickle("./DF_v1.pkl")

In [2]:
# ALERT - THIS COULD POTENTIALLY BE USED TO PERFORM SENTIMENT ANALYSIS ON LYRICS DATA IN v2. FOR v1, DISREGARD

# # creating polarity and subjectivity lists to be populated

# lyrics_polarity = []
# lyrics_subjectivity = []

# # using a for loop to populate polarity and subjectivity lists
# for title in song_list_df["lyrics"]:
#     try:
#         blob_test = TextBlob(title)
#         blob_sentiment = blob_test.sentiment
#         # appending results to lists above
#         lyrics_polarity.append(blob_sentiment[0])
#         lyrics_subjectivity.append(blob_sentiment[1])
#     except:
#         lyrics_polarity.append(0)
#         lyrics_subjectivity.append(0)

# # adding polarity and subjectivity to df
# song_list['lyrics_polarity'] = lyrics_polarity
# song_list['lyrics_subjectivity'] = lyrics_subjectivity

# Cleaning/Scaling Data

In [3]:
# checking the dimensions and the column names of the loaded data
song_list.shape, song_list.columns

((728156, 16),
 Index(['acousticness', 'artist', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'songid',
        'speechiness', 'tempo', 'time_signature', 'track', 'valence'],
       dtype='object'))

In [4]:
# Build the feature matrix by leaving out songid, artist, track and key
X = song_list.drop(columns=["songid", "artist", "track", "key"])

# sanity check: the column count should have fallen by four
X.shape

(728156, 12)

In [5]:
# Min-max scaler instance
scaler = MinMaxScaler()

# learn the scaling from X, then apply it to the same data
scaler.fit(X)
data_scaled = scaler.transform(X)

# first scaled row, as a spot check that scaling took place
data_scaled[0]

array([0.00130522, 0.4884654 , 0.02678623, 0.678     , 0.0551    ,
       0.0846    , 0.78720453, 1.        , 0.05113636, 0.59980079,
       0.8       , 0.87      ])

Data is scaled and ready to be fed to the model

# Nearest Neighbors Model (unsupervised)

In [6]:
# Set up the nearest-neighbors search: 20 neighbors, KD-tree algorithm
knn = NearestNeighbors(algorithm='kd_tree', n_neighbors=20)

# fit on the scaled data; the fitted result is kept under the name `model`
model = knn.fit(data_scaled)

# query with row 1000 of the scaled data and keep element 1 of the result
# (the array of neighbor row indices)
neighbor_output = knn.kneighbors([data_scaled[1000]])
knn_results = neighbor_output[1]

In [7]:
# displaying the neighbor row indices returned for row 1000
knn_results

array([[  1000, 512827, 406608, 122767, 450355,  92437,  82189, 508280,
        713041, 709631, 647630, 411195,  48174, 304823, 363062, 714190,
        214905, 412399, 240739, 134590]])

# Pickling model

In [8]:
import os
import pickle

filename = './models/knn_model_v1.pkl'

# make sure the target folder exists so the dump also works on a fresh checkout
os.makedirs(os.path.dirname(filename), exist_ok=True)

# the context manager flushes and closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [9]:
# pulling in the (now) pickled model; `with` closes the file handle once loading is done
# NOTE: pickle.load can execute arbitrary code, so only load files this notebook produced
with open(filename, 'rb') as model_file:
    model_pickle = pickle.load(model_file)

In [10]:
# Query the unpickled model with the in-memory scaled row 1000 (the data itself is not reloaded here)
reloaded_output = model_pickle.kneighbors([data_scaled[1000]])
pickle_results = reloaded_output[1]

# show the neighbor indices
pickle_results

array([[  1000, 512827, 406608, 122767, 450355,  92437,  82189, 508280,
        713041, 709631, 647630, 411195,  48174, 304823, 363062, 714190,
        214905, 412399, 240739, 134590]])

# Pickling Scaled Data

In [11]:
filename2 = './data_scaled.pkl'

# the context manager guarantees the handle is flushed and closed after writing
with open(filename2, 'wb') as data_file:
    pickle.dump(data_scaled, data_file)

# Testing both pickles for dataset, model

In [12]:
# importing libraries (even though they are visible above)
# to illustrate which packages are needed to create the function below

import pandas as pd 
from sklearn.neighbors import NearestNeighbors
import pickle


# this function predicts on a song id

def predict(id, data_path='./data_scaled.pkl', model_path='./models/knn_model_v1.pkl'):
    """Return the nearest-neighbor indices for one row of the pickled scaled data.

    Parameters
    ----------
    id : int
        Position of the query row inside the pickled scaled dataset
        (it is used as ``X[id]``, i.e. a row index, not a string song id).
    data_path : str, optional
        Path of the pickled scaled data. Defaults to the file written by
        the "Pickling Scaled Data" step.
    model_path : str, optional
        Path of the pickled, fitted nearest-neighbors model.

    Returns
    -------
    Element 1 of ``model.kneighbors([row])``, i.e. the neighbor row indices.

    Raises
    ------
    FileNotFoundError
        If either pickle file does not exist (paths are resolved against
        the current working directory).

    Notes
    -----
    Both pickles are re-read on every call.
    """
    # NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
    # Context managers close each file handle as soon as loading finishes.
    with open(data_path, 'rb') as data_file:
        scaled_rows = pickle.load(data_file)
    with open(model_path, 'rb') as model_file:
        loaded_pickle = pickle.load(model_file)

    # kneighbors expects a 2-D input, hence the single row wrapped in a list;
    # element 1 of its result holds the neighbor indices
    results = loaded_pickle.kneighbors([scaled_rows[id]])[1]
    return results

In [14]:
# NOTE(review): this call reads './data_scaled.pkl' and './models/knn_model_v1.pkl'
# relative to the current working directory, so both files must already exist there
predict(0)

FileNotFoundError: [Errno 2] No such file or directory: './data_scaled.pkl'

In [16]:

# NOTE(review): looks like a sample audio-features payload for a single track — confirm.
# Its keys include "key" and omit "duration_ms", whereas the feature matrix X built above
# drops "key" and keeps "duration_ms"; verify the payload matches what the model expects.
{
    "audio_features": {
        "acousticness": 0.934,
        "danceability": 0.186,
        "energy": 0.107,
        "instrumentalness": 0,
        "key": 5,
        "liveness": 0.297,
        "loudness": -14.802,
        "mode": 1,
        "speechiness": 0.0347,
        "tempo": 107.095,
        "time_signature": 4,
        "valence": 0.149
    }
}


{'audio_features': {'acousticness': 0.934,
  'danceability': 0.186,
  'energy': 0.107,
  'instrumentalness': 0,
  'key': 5,
  'liveness': 0.297,
  'loudness': -14.802,
  'mode': 1,
  'speechiness': 0.0347,
  'tempo': 107.095,
  'time_signature': 4,
  'valence': 0.149}}

In [17]:
# NOTE(review): appears to be a sample response listing similar songs — confirm.
# No cell in this notebook computes the "similarity" scores shown here (only the
# neighbor indices from kneighbors are used above); presumably they come from elsewhere.
{
  "songs": [
    {
      "similarity": [
        0.9999733801267939
      ],
      "values": "6rMRZ9DtxJhH1Ycbk6VeDi"
    },
    {
      "similarity": [
        0.9999691841226913
      ],
      "values": "6cgoS3EosBd9MZOK8Z6KOV"
    }
  ]
}

{'songs': [{'similarity': [0.9999733801267939],
   'values': '6rMRZ9DtxJhH1Ycbk6VeDi'},
  {'similarity': [0.9999691841226913], 'values': '6cgoS3EosBd9MZOK8Z6KOV'}]}