In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import datetime
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from sklearn.neighbors import NearestNeighbors
# from ipynb.fs.full.Preprocessing import clean_data
%load_ext tensorboard

In [2]:
# Track metadata (ids, titles, names, etc.), used later to look songs up by
# name and to print the results
tracks_csv = '../../data/tracks.csv'
tracks = pd.read_csv(tracks_csv)
print(tracks.shape)
# tracks.head()

(586672, 20)


In [3]:
# Cleaned song features: every feature is stored as one-hot categorical
# columns. The CSV's 'Unnamed: 0' column is used as the frame's index.
cleaned_songs_csv = '../../data/new_songs_cleaned.csv'
df = pd.read_csv(cleaned_songs_csv, index_col='Unnamed: 0')
print(df.shape)
df.head()

(586344, 74)


Unnamed: 0,popularity_1,popularity_2,popularity_3,popularity_4,popularity_5,duration_ms_1,duration_ms_2,duration_ms_3,duration_ms_4,duration_ms_5,...,valence_5,tempo_1,tempo_2,tempo_3,tempo_4,tempo_5,time_signature_1,time_signature_2,time_signature_3,time_signature_4
0,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Smaller dataframe for rapid testing (disabled; uncomment to use)

# small_new_df = df.iloc[:150000]
# print(small_new_df.shape)
# small_new_df.head()

In [4]:
# Autoencoder using new_songs_cleaned (all categorical) VAL_LOSS = 0.0015
data = df

encoded_dim = 16
n_features = data.shape[1]

# Encoder: n_features -> 32 -> encoded_dim
input_song = Input(shape=(n_features,))
h1 = Dense(units=32, activation='relu')(input_song)
encoded = Dense(units=encoded_dim, activation='relu')(h1)

# Decoder: encoded_dim -> 32 -> n_features
dh1 = Dense(units=32, activation='sigmoid')(encoded)
decoded = Dense(units=n_features, activation='sigmoid')(dh1)

# Full reconstruction model, plus an encoder-only model built on the same
# layers, so training the autoencoder also trains the encoder.
autoencoder = Model(inputs=input_song, outputs=decoded)
encoder = Model(inputs=input_song, outputs=encoded)

opt = tf.keras.optimizers.Nadam()
autoencoder.compile(
    optimizer=opt,
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Stop when val_loss has not improved by at least min_delta for `patience`
# epochs, then roll back to the best weights seen.
# NOTE(review): min_delta=0.001 is large next to the VAL_LOSS of 0.0015 quoted
# above, so small late improvements do not count - confirm this is intended.
stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=3,
    restore_best_weights=True,
)

autoencoder.fit(
    x=data,  # songs fed to the encoder
    y=data,  # target is the input itself, so the model learns to reconstruct each song
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[stop],
)

Train on 469075 samples, validate on 117269 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
# Run every song through the trained models
data = df
decoded_songs = autoencoder.predict(data)  # reconstructions from the full autoencoder
encoded_songs = encoder.predict(data)      # bottleneck encodings from the encoder

# Wrap the encodings in a frame that keeps data's index, so each row can be
# traced back to its song
embeddings_df = pd.DataFrame(data=encoded_songs, index=data.index)
print(embeddings_df.shape)
embeddings_df.head(n=3)

In [None]:

def find_neighbors(song):
    '''
    Find the nearest neighbors of a song in the autoencoder embedding space
    1. Checks for song (exact match on tracks.name)
    2. Translates the track's index label into a row position of the embeddings
    3. Trains a nearest neighbors model on the embeddings
    4. Finds the 10 nearest neighbors of the given song

    Uses the notebook globals `tracks` and `embeddings_df` (the encoder output
    carrying the index of the cleaned feature frame), so the cells that build
    them must have been run first.

    ARGUMENTS: song in string form
    RETURNS: index of test song, list of prediction indices. Both are index
             labels of `tracks`; they are also valid for tracks.iloc as long
             as `tracks` keeps the default integer index read_csv gave it.
    RAISES: ValueError if no track has that name, or if none of the matching
            tracks has an embedding.
    '''
    # 1. Check if song exists
    matches = tracks.index[tracks.name == song]
    if len(matches) < 1:
        # Raise instead of returning the message: callers unpack two values,
        # and unpacking a string fails with an unrelated "too many values to
        # unpack" ValueError.
        raise ValueError('ERROR: Not a valid song name')

    # 2. Map the track label to its row position in the embeddings.
    # The cleaned frame has fewer rows than `tracks` (586344 vs 586672 in the
    # printed shapes), so a tracks label is not a row position in the
    # embeddings array.
    # NOTE(review): assumes the cleaned frame's index holds the original
    # `tracks` labels - confirm against the preprocessing step. If it is a
    # plain 0..n-1 index this mapping is a no-op.
    labels = embeddings_df.index
    with_embedding = [label for label in matches if label in labels]
    if not with_embedding:
        raise ValueError('ERROR: Song has no embedding (not in the cleaned data)')
    # Use the first matching track that actually has an embedding
    song_index = with_embedding[0]
    song_position = int(np.flatnonzero(labels == song_index)[0])

    # 3. Train nearest neighbors model on encodings
    # (the embeddings could equally be loaded from a saved CSV instead of the
    # in-memory frame)
    vectors = embeddings_df.to_numpy()
    # Ask for one extra neighbor because the query song is part of the data
    nn = NearestNeighbors(n_neighbors=min(11, len(vectors)), algorithm='ball_tree')
    nn.fit(vectors)

    # 4. Get neighbors of song
    test_encoding = vectors[song_position].reshape(1, -1)
    _, n_positions = nn.kneighbors(test_encoding)
    # Songs with identical encodings tie at distance 0, so the query is not
    # guaranteed to be the first hit: drop it by position, then keep 10.
    neighbor_positions = [
        position for position in n_positions[0].tolist()
        if position != song_position
    ][:10]
    # Convert row positions back to track labels for the caller
    n_indices = labels[neighbor_positions].tolist()

    return song_index, n_indices

In [None]:
# Song to search by
song = 'Yellow Submarine'

# Find neighbors
result = find_neighbors(song)

if isinstance(result, str):
    # find_neighbors may hand back an error message string instead of a
    # (song_index, neighbors) pair; show it rather than failing on the unpack.
    print(result)
else:
    test_song, neighbors = result

    # Display test song and nearest neighbors
    # First display song from search query
    query_row = tracks.iloc[test_song]
    print(f"Test song: {query_row['name']} by {query_row['artists']}")
    print(f"https://open.spotify.com/track/{query_row['id']}")
    # Next display predictions
    print('\nPredictions:')
    # Iterate over whatever was returned instead of assuming exactly 10 results
    for ind in neighbors:
        neighbor_row = tracks.iloc[ind]
        print(f"{neighbor_row['name']} by {neighbor_row['artists']}")
        print(f"   https://open.spotify.com/track/{neighbor_row['id']}")