In [91]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import joblib

from typing import List, Tuple
from numpy import shape

from keras.models import Sequential
from keras.layers import LSTM, Dense

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

from scipy.spatial.distance import cosine


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
# Load the cleaned track dataset from the zipped CSV.
# NOTE(review): the preview output further down shows 'Unnamed: 0.1' / 'Unnamed: 0' columns, which
# look like index columns written out by earlier to_csv calls — confirm whether they are needed.
df = pd.read_csv('../datasets/cleaned_RNN_dataset.zip')


In [93]:
# Draw a 10-song stand-in for a user's playlist. The seed is pinned so that
# Restart & Run All reproduces the same playlist (and therefore the same
# recommendations) instead of a fresh random draw on every run.
random_playlist = df.sample(n=10, random_state=42)


In [94]:
# Preview the first five tracks of the sampled playlist.
random_playlist.head()

Unnamed: 0.1,Unnamed: 0,track_id,track_name,track_artist,lyrics,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
13447,16080,6u7jPi22kF8CTQ3rb9DHE7,Old Town Road (feat. Billy Ray Cyrus) - Remix,Lil Nas X,yeah i gonna take horse old town road i gonna ...,38kpkGLuPr1nNfD3iEyOlJ,Old Town Road,indie poptimism,6MQeEHHDs6HQkN5Dlj3oKo,pop,...,-5.56,1,0.102,0.0533,0.0,0.113,0.639,136.041,157067,en
14871,17815,7lQWRAjyhTpCWFC0jmclT4,Gangsta's Paradise,Coolio,na walk valley shadow death take look life rea...,0fYctMs4EvoEqzDh8Kmg5g,Gangsta's Paradise,dr q prescription playlist,6jAPdgY9XmxC9cgkXAVmVv,pop,...,-10.05,1,0.0593,0.0655,0.0,0.398,0.387,79.974,240693,en
13032,15573,6Nle9hKrkL1wQpwNfEkxjh,Chanel,Frank Ocean,guy pretty like girl got fight stories tell se...,6OGzmhzHcjf0uN9j7dYvZH,Chanel,electropop,2Z5cPJ6Z4EVZAfF08amjvL,pop,...,-5.732,0,0.237,0.874,0.0,0.112,0.473,110.134,210285,en
9385,11222,4RX1akxesJEFSvStbMQnop,Sweet Nothing - Tiësto Remix,Calvin Harris,took heart held mouth word love came rushing e...,0Wk1PrzniZxw4dhv9VUgzD,Sweet Nothing,house,53Ga3Xt9fumdJfhNNSBvzU,edm,...,-7.419,0,0.299,0.0102,0.147,0.0576,0.393,127.977,308533,en
4709,5617,2GB8OypbvrvCee61FKx5dp,A Hazy Shade of Winter,Simon & Garfunkel,time time time see what become looked around p...,3bzgbgiytguTDnwzflAZr2,Bookends,didnt know perm stood permanent wave,3e6gYPyrTbaB8BWgSHCt5j,rock,...,-11.203,0,0.0296,0.139,7e-06,0.378,0.969,142.027,137480,en


In [95]:
# Names of the 12 audio-feature columns pulled out of each track, in the
# column order used for every feature array built in this notebook.
audio_cols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

In [96]:
def extract_audio_features(playlist, columns=None):
    """
    Given a user's playlist, extracts just the audio features from the playlist.

    Args:
        - playlist: 2D pandas DataFrame holding the playlist; it must contain
          every requested feature column.
        - columns: optional list of column names to extract, in output order.
          Defaults to the notebook-level ``audio_cols`` list.

    Returns:
        - 2D numpy array of shape (number of rows, number of columns) holding
          the selected feature values. This is an ndarray, not a DataFrame, so
          DataFrame methods such as ``.head()`` are not available on the result.

    Raises:
        - KeyError: if a requested column is missing from ``playlist``.
    """
    if columns is None:
        # Fall back to the notebook-wide feature list when no override is given.
        columns = audio_cols
    # list(...) so that a tuple of names is treated as several columns, not one key.
    return playlist[list(columns)].to_numpy()

# extract_audio_features returns a plain numpy array, which has no .head();
# wrap it in a DataFrame (restoring the column names) before previewing.
df_audio_features = pd.DataFrame(extract_audio_features(df), columns=audio_cols)
df_audio_features.head(10)

In [97]:
# Keep two representations of the full dataset's audio features:
# the raw numpy array (used for the similarity computations below) and
# a DataFrame with named columns (convenient for inspection).
dataset_audio_features = extract_audio_features(df)
df_audio_features = pd.DataFrame(dataset_audio_features, columns=audio_cols)

In [98]:
# Sanity check: (number of tracks, number of audio features).
df_audio_features.shape

(15405, 12)

In [99]:
# The numpy array should have the same shape as the DataFrame built from it.
dataset_audio_features.shape

(15405, 12)

In [100]:
# Confirm the container types: numpy array vs. pandas DataFrame.
print(type(dataset_audio_features))
print(type(df_audio_features))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [101]:
# Pairwise cosine similarity between every pair of tracks in the dataset.
# NOTE(review): with 15405 rows this produces a 15405 x 15405 matrix (roughly 1.9 GB if float64),
# and no later cell visible here uses it beyond printing its type — consider removing it.
cos_sim = cosine_similarity(dataset_audio_features)

In [102]:
# Check the container type of the pairwise similarity result.
print(type(cos_sim))

<class 'numpy.ndarray'>


In [103]:
def rehape_to_3d(two_dim_playlist):
    """
    Add a leading batch axis to a 2D playlist so it matches the 3D input shape the RNN model requires.

    Args:
        - two_dim_playlist: 2D numpy array of shape (N, 12): N songs, each
          described by the 12 audio features the model analyzes.

    Returns:
        - three_dim_playlist: the same values reshaped to (1, N, 12), i.e. a
          batch that contains this single playlist.
    """
    song_count = two_dim_playlist.shape[0]
    feature_count = two_dim_playlist.shape[1]
    batched_shape = (1, song_count, feature_count)
    return np.reshape(two_dim_playlist, batched_shape)

In [104]:
def pad_and_reshape(arr, target_len=35):
    """
    Pad a batched playlist along the song axis up to a fixed sequence length.

    The rows after the real songs are filled with the per-feature mean of that
    playlist's own songs.

    Args:
        - arr: 3D array of shape (batch, num_songs, num_features), e.g. the
          (1, N, 12) output of rehape_to_3d.
        - target_len: number of rows each playlist is padded to. Defaults to 35,
          the length this function previously hard-coded (presumably the
          sequence length the model expects — TODO confirm).

    Returns:
        - float numpy array of shape (batch, target_len, num_features): the
          original songs first, followed by the mean-filled padding rows.

    Raises:
        - ValueError: if ``arr`` is not 3D, contains no songs (there is nothing
          to average), or has more than ``target_len`` songs.
    """
    arr = np.asarray(arr, dtype=float)
    if arr.ndim != 3:
        raise ValueError(
            f"expected a 3D array (batch, songs, features), got shape {arr.shape}"
        )

    batch_size, num_rows, num_cols = arr.shape
    if num_rows == 0:
        # A mean over zero songs would be NaN and silently poison the model input.
        raise ValueError("cannot pad an empty playlist: there are no songs to average")
    if num_rows > target_len:
        raise ValueError(
            f"playlist has {num_rows} songs but the padded length is only {target_len}"
        )

    # Per-playlist mean of each feature, kept as (batch, 1, features) so that it
    # broadcasts across all padding rows in a single assignment.
    col_avgs = arr.mean(axis=1, keepdims=True)

    padded_arr = np.empty((batch_size, target_len, num_cols))
    padded_arr[:, :num_rows, :] = arr       # real songs first
    padded_arr[:, num_rows:, :] = col_avgs  # then mean-filled padding
    return padded_arr

In [105]:
# Build the model input for the sampled playlist: first the 2D feature matrix,
# then the same data with a leading batch axis.
# Note that `af` is rebound from the 2D array to its 3D version.
af = extract_audio_features(random_playlist)
af = rehape_to_3d(af)


In [106]:
# Pad the playlist to the fixed sequence length and confirm the resulting shape.
padded_playlist = pad_and_reshape(af)
print(padded_playlist.shape)

(1, 35, 12)


In [107]:
# Display the padded array: the real songs come first, followed by the padding rows.
padded_playlist

array([[[ 8.7800000e-01,  6.1900000e-01,  6.0000000e+00, -5.5600000e+00,
          1.0000000e+00,  1.0200000e-01,  5.3300000e-02,  0.0000000e+00,
          1.1300000e-01,  6.3900000e-01,  1.3604100e+02,  1.5706700e+05],
        [ 6.4700000e-01,  5.1400000e-01,  8.0000000e+00, -1.0050000e+01,
          1.0000000e+00,  5.9300000e-02,  6.5500000e-02,  0.0000000e+00,
          3.9800000e-01,  3.8700000e-01,  7.9974000e+01,  2.4069300e+05],
        [ 7.7600000e-01,  5.0300000e-01,  0.0000000e+00, -5.7320000e+00,
          0.0000000e+00,  2.3700000e-01,  8.7400000e-01,  0.0000000e+00,
          1.1200000e-01,  4.7300000e-01,  1.1013400e+02,  2.1028500e+05],
        [ 6.6200000e-01,  6.2000000e-01,  8.0000000e+00, -7.4190000e+00,
          0.0000000e+00,  2.9900000e-01,  1.0200000e-02,  1.4700000e-01,
          5.7600000e-02,  3.9300000e-01,  1.2797700e+02,  3.0853300e+05],
        [ 4.9700000e-01,  5.9000000e-01,  2.0000000e+00, -1.1203000e+01,
          0.0000000e+00,  2.9600000e-02,  1.390

In [108]:
# The unpadded batch is left untouched by pad_and_reshape and keeps its original song count.
af.shape

(1, 10, 12)

In [109]:
# Load the serialized model from disk (an RNN, judging by the file name — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load 'simpleRNN.pkl' if it comes from a trusted source.
model = joblib.load('simpleRNN.pkl')

In [110]:
# Run the model on the padded playlist; the result is a single row of 12 predicted
# feature values (shape (1, 12)), printed together with its shape.
predictions = model.predict(padded_playlist)
print(predictions)
print(predictions.shape)


[[ 0.4136654   0.45533007  3.13637    -1.2570764   0.5161253  -0.41632524
   0.07576507 -0.14258227  0.09709774  0.31569985 -0.5050068   9.858376  ]]
(1, 12)


In [111]:
# calculate the cosine similarity between the predicted feature vector and each row of the dataset.
# sklearn's pairwise cosine_similarity takes the (1, 12) prediction as-is and scores all rows in one
# vectorized call; scipy's `cosine` expects 1-D vectors and was previously invoked once per row.
# The (1, N) result is flattened to the same 1D (N,) array of scores as before.
# NOTE(review): the printed predictions look standardized while dataset_audio_features holds raw
# values (e.g. duration_ms in the 1e5 range), so the ranking may be dominated by a few large-scale
# columns — confirm both sides use the same scaling.
cosine_similarities = cosine_similarity(predictions, dataset_audio_features).ravel()

In [138]:
# Sort row positions by ascending similarity, reverse to get best-first, and keep the top 5.
# NOTE(review): this cell's execution count (138) is out of sequence with the cells around it
# (111 before, 136 after) — re-run the notebook top to bottom to confirm the outputs still match.
most_similar_index = np.argsort(cosine_similarities)[::-1][:5]
print(f"Most similar row index(s): {most_similar_index}")

Most similar row index(s): [6011 5132 6679  493 1500]


In [136]:
def get_song_details(df: pd.DataFrame, songs) -> pd.DataFrame:
    """
    Look up the full dataset rows for a set of positional row indices.

    Args:
        - df: DataFrame of tracks to select from.
        - songs: list or 1D integer array of positional (``iloc``) row indices,
          in the order the rows should be returned.

    Returns:
        - DataFrame containing the selected rows (all columns), in the order
          given by ``songs``.

    Raises:
        - IndexError: if any position is out of bounds for ``df``.
    """
    return df.iloc[songs]

# Fetch the rows of the most similar tracks and show only their title and artist.
recommended_songs = get_song_details(df, songs=most_similar_index)
recommended_songs.loc[:, ['track_name', 'track_artist']]

Unnamed: 0,track_name,track_artist
6011,Sweet Caroline,Neil Diamond
5132,Small,chloe moriondo
6679,Still Crazy After All These Years,Paul Simon
493,Monsters in Your Bedroom,Tertia May
1500,Doors Closing,Moonchild


In [139]:
def get_song_artist_titles(df: pd.DataFrame, songs) -> None:
    """
    Print the title and artist of the recommended songs under a short banner.

    Args:
        - df: DataFrame of tracks; must contain the 'track_name' and
          'track_artist' columns.
        - songs: list or 1D integer array of positional (``iloc``) row indices
          of the songs to show. The banner text always says "top five",
          whatever the number of indices passed.

    Returns:
        - None. The output is written to stdout only.
    """
    song_rows = df.iloc[songs]
    print("Serving up your top five song recommendations!")
    print("-"*55)
    print(song_rows.loc[:, ['track_name', 'track_artist']])

    

# Print the title/artist listing for the most similar tracks found above.
get_song_artist_titles(df=df, songs=most_similar_index)

Serving up your top five song recommendations!
-------------------------------------------------------
                             track_name    track_artist
6011                     Sweet Caroline    Neil Diamond
5132                              Small  chloe moriondo
6679  Still Crazy After All These Years      Paul Simon
493            Monsters in Your Bedroom      Tertia May
1500                      Doors Closing       Moonchild
