In [121]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import joblib

from typing import List, Tuple
from numpy import shape

from keras.models import Sequential
from keras.layers import LSTM, Dense

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

from scipy.spatial.distance import cosine


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [122]:
df = pd.read_csv('../datasets/cleaned_RNN_dataset.zip')


In [123]:
random_playlist = df.sample(n=10)


In [124]:
random_playlist.head()

Unnamed: 0.1,Unnamed: 0,track_id,track_name,track_artist,lyrics,track_album_id,track_album_name,playlist_name,playlist_id,playlist_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
10987,13148,5My5YI9oqE9MZcHIrm38BJ,The Sweetest Thing (feat. Lauryn Hill) - From ...,Refugee Camp All-Stars,unexpected multi dimensional unexpected sound ...,7dt0RR2Kbvu3ZuWloudfjI,Love Jones The Music,neo soul soulful r b,4F3xAxHxeBwXhK6k6GPnrW,r&b,...,-10.084,0,0.1,0.0159,2.1e-05,0.0749,0.333,83.822,292000,en
8748,10444,4Iam3vZMJCMltFkK9mNruw,Bad Blood,Nao,holiday glass ocean slipping throat landing ho...,2BmceJHiy9RTyvaB1IU0P6,For All We Know,urban contemporary,1ZlL3IQS8eB0s0RMxz02yD,r&b,...,-5.687,0,0.133,0.639,4e-06,0.0631,0.378,113.108,240627,en
1260,1525,0pCyoA5o2uxUwIovECCziF,Hold My Hand,Michael Jackson,akon mj oh yeah yeah life last forever hold ha...,0yYWeLqonFk11pmb0RiMbP,Michael,pop,3gV6KQYZKIuoWml0094vx6,latin,...,-5.842,1,0.0374,0.191,0.0,0.108,0.381,90.017,212227,en
9331,11161,4r3MJuJIArZTQfOinh1HFa,Livin' It Up,Ja Rule,yea yea yea my uhh yea c mon c mon uhh ladies ...,4Xc3wBfUZ9yiszOrttoCXV,Pain Is Love (International Version),ultimate throwbacks collection,1dsaMvnC1hXPCNGC4aVtjj,r&b,...,-4.088,1,0.321,0.0575,0.0,0.0498,0.638,106.024,256960,en
14405,17240,7eBqSVxrzQZtK2mmgRG6lC,Murder On My Mind,YNW Melly,get roll no ayy i studio bro oh nah yeah call ...,7naY6j4wcgUxfHB98G79CW,I AM YOU,gangsta rap,3DUkl93JjPdOyX83SJr5ms,rap,...,-7.985,0,0.0516,0.145,3e-06,0.11,0.74,115.007,268434,en


In [125]:
audio_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

In [126]:
def extract_audio_features(playlist):
    """
    Given a users playlist, extracts just the audio features from the playlist.
    
    Args: 
        - 2D Pandas Dataframe consisting of the playlist from a user
    
    Returns:
        - 2D Pandas Dataframe consisting of the audio features from users playlist.
    """
    audio_features = playlist[audio_cols].values
    return audio_features

df_audio_features = extract_audio_features(df)
df_audio_features.head(10)

In [127]:
dataset_audio_features = extract_audio_features(df)
df_audio_features = pd.DataFrame(dataset_audio_features, columns=audio_cols)

In [128]:
df_audio_features.shape

(15405, 12)

In [129]:
dataset_audio_features.shape

(15405, 12)

In [130]:
print(type(dataset_audio_features))
print(type(df_audio_features))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [131]:
cos_sim = cosine_similarity(dataset_audio_features)

In [132]:
print(type(cos_sim))

<class 'numpy.ndarray'>


In [133]:
def rehape_to_3d(two_dim_playlist):
    """
    Given a two-dimensional playlist, creates a three-dimensional playlist required for innput into the RNN model.

    Args:
        - two_dim_playlist: A 2D numpy array of shape (N, 12), where N is the number of songs in the playlist and 12 is the number of audio
        features the model will analyze.

    Returns:
        - three_dim_playlist: A three dimensional representation of the playlist. Just adding a third dimension to suffice the input shape.
        The final shape is (1, N, 12)
    """
    s1 = two_dim_playlist.shape[0]
    s2 = two_dim_playlist.shape[1]
    three_dim_playlist = np.reshape(two_dim_playlist, (1, s1, s2))
    return three_dim_playlist

In [134]:
def pad_and_reshape(arr):
    num_rows, num_cols = arr.shape[1], arr.shape[2]
    col_avgs = np.mean(arr, axis=1).reshape(-1, num_cols) # calculate average of each column
    padded_arr = np.empty((1, 35, 12))
    padded_arr.fill(np.nan) # fill with NaN values initially
    
    # copy original array to padded array
    padded_arr[:, :num_rows, :num_cols] = arr
    
    # pad remaining rows with average of each column
    for i in range(num_rows, 35):
        padded_arr[:, i] = col_avgs
    
    return padded_arr

In [135]:
af = extract_audio_features(random_playlist)
af = rehape_to_3d(af)


In [136]:
padded_playlist = pad_and_reshape(af)
print(padded_playlist.shape)

(1, 35, 12)


In [137]:
padded_playlist

array([[[ 6.090000e-01,  5.980000e-01,  7.000000e+00, -1.008400e+01,
          0.000000e+00,  1.000000e-01,  1.590000e-02,  2.070000e-05,
          7.490000e-02,  3.330000e-01,  8.382200e+01,  2.920000e+05],
        [ 4.940000e-01,  4.120000e-01,  5.000000e+00, -5.687000e+00,
          0.000000e+00,  1.330000e-01,  6.390000e-01,  4.460000e-06,
          6.310000e-02,  3.780000e-01,  1.131080e+02,  2.406270e+05],
        [ 6.060000e-01,  7.200000e-01,  1.000000e+00, -5.842000e+00,
          1.000000e+00,  3.740000e-02,  1.910000e-01,  0.000000e+00,
          1.080000e-01,  3.810000e-01,  9.001700e+01,  2.122270e+05],
        [ 8.730000e-01,  7.650000e-01,  1.000000e+00, -4.088000e+00,
          1.000000e+00,  3.210000e-01,  5.750000e-02,  0.000000e+00,
          4.980000e-02,  6.380000e-01,  1.060240e+02,  2.569600e+05],
        [ 7.590000e-01,  7.300000e-01,  0.000000e+00, -7.985000e+00,
          0.000000e+00,  5.160000e-02,  1.450000e-01,  3.060000e-06,
          1.100000e-01,  7.400

In [138]:
af.shape

(1, 10, 12)

In [139]:
model = joblib.load('simpleRNN.pkl')

In [140]:
predictions = model.predict(padded_playlist)
print(predictions)
print(predictions.shape)


[[ 4.0899327e-01 -9.1485977e-03  3.3621628e+00 -1.3685110e+00
  -3.4327966e-01  3.7615336e-02  1.2920648e-01 -2.8875064e-02
   1.4405602e-01 -1.0360333e+00  2.0095821e+01  2.1025404e+01]]
(1, 12)


In [141]:
# calculate the cosine similarity between the input array and each row of the larger array
cosine_similarities = np.apply_along_axis(lambda x: 1 - cosine(predictions, x), 1, dataset_audio_features)

In [142]:
most_similar_index = np.argsort(cosine_similarities)[::-1][:5]
print(f"Most similar row index(es): {most_similar_index}")

Most similar row index(es): [ 3510 14421  8043   125   846]


In [143]:
def get_song_details(df, songs : list) -> list:
    song_rows = df.iloc[songs]
    return song_rows

recommended_songs = get_song_details(df, songs=most_similar_index)
recommended_songs.loc[:, ['track_name', 'track_artist']]

Unnamed: 0,track_name,track_artist
3510,raindrops (an angel cried),Ariana Grande
14421,No Guarantee - Remix Version,Chico DeBarge
8043,Episode VIII,Madlib
125,Rearranged,Killstation
846,Attitude,Guns N' Roses


In [144]:
def get_song_artist_titles(df, songs : list) -> list:
    song_rows = df.iloc[songs]
    print("Serving up your top five song recommendations!")
    print("-"*55)
    print(song_rows.loc[:, ['track_name', 'track_artist']])

    

get_song_artist_titles(df=df, songs=most_similar_index)

Serving up your top five song recommendations!
-------------------------------------------------------
                         track_name   track_artist
3510     raindrops (an angel cried)  Ariana Grande
14421  No Guarantee - Remix Version  Chico DeBarge
8043                   Episode VIII         Madlib
125                      Rearranged    Killstation
846                        Attitude  Guns N' Roses
