# CSE 144 Group 3
## Music Recommendation System (MRS)

In this notebook, we write the predictive model for our music recommendation system. Our work leverages modern tools including recurrent neural networks (RNN) and BERT sentence transformers...

<br>

Our work leverages this RNN model:

https://github.com/taylorhawks/RNN-music-recommender/blob/master/cloud/model.ipynb


In [58]:
# import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format="retina"
import numpy as np
import random
import torch
import os
import plotly.graph_objects as go
import numpy as np

from skimage.util.shape import view_as_windows as viewW
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.decomposition import PCA

from keras.models import Sequential, load_model
from keras.layers import Dense, SimpleRNN, Input
from keras.losses import *


### Load the data

In [59]:
song_features_data = pd.read_csv('misc/processed_music_info_extended.csv')
user_listening_data = pd.read_csv('misc/processed_user_listening_hist.csv')

#For running on Google Colab

# from google.colab import drive
# drive.mount('/content/drive')
# song_features_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/music_info.csv')
# user_listening_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/user_listening_hist.csv')

### Set Random Seed

In [60]:
torch.manual_seed(24)

<torch._C.Generator at 0x20a36218c50>

### Read and Display Data

In [61]:
print('# of rows of Song Data: ' + str(len(song_features_data)))
print('# of unique songs: ' + str(len(song_features_data['track_id'].unique())))
song_features_data.head()

# of rows of Song Data: 50683
# of unique songs: 50683


Unnamed: 0,track_id,name,artist,spotify_id,tags,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",2004,222200,0.355,0.918,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",2006,258613,0.409,0.892,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",1991,218920,0.508,0.826,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",2004,237026,0.279,0.664,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",2008,238640,0.515,0.43,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [62]:
print('# of rows of User Listening Data: ' + str(len(user_listening_data)))
print('# of unique users: ' + str(len(user_listening_data['user_id'].unique())))
user_listening_data.head()

# of rows of User Listening Data: 806745
# of unique users: 25343


Unnamed: 0,track_id,user_id,playcount
0,TRLATHU128F92FC275,5a905f000fc1ff3df7ca807d57edb608863db05d,11
1,TRMKFPN128F42858C3,5a905f000fc1ff3df7ca807d57edb608863db05d,2
2,TRGAOLV128E0789D40,5a905f000fc1ff3df7ca807d57edb608863db05d,2
3,TREAQSX128E07818CA,5a905f000fc1ff3df7ca807d57edb608863db05d,2
4,TRUMDRI128F424FEFC,5a905f000fc1ff3df7ca807d57edb608863db05d,3


### Data Preprocessing


In [63]:
# Drop unnecessary columns
song_features_data = song_features_data.drop(columns=['year', 'time_signature', 'key'])

In [64]:
# Convert song duration from milliseconds to minutes
song_features_data["duration_mins"] = song_features_data["duration_ms"] / 60000
song_features_data.drop("duration_ms", axis=1, inplace=True)


song_features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50683 entries, 0 to 50682
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          50683 non-null  object 
 1   name              50683 non-null  object 
 2   artist            50683 non-null  object 
 3   spotify_id        50683 non-null  object 
 4   tags              49556 non-null  object 
 5   danceability      50683 non-null  float64
 6   energy            50683 non-null  float64
 7   loudness          50683 non-null  float64
 8   mode              50683 non-null  int64  
 9   speechiness       50683 non-null  float64
 10  acousticness      50683 non-null  float64
 11  instrumentalness  50683 non-null  float64
 12  liveness          50683 non-null  float64
 13  valence           50683 non-null  float64
 14  tempo             50683 non-null  float64
 15  duration_mins     50683 non-null  float64
dtypes: float64(10), int64(1), object(5)
memo

In [65]:
data = pd.merge(song_features_data, user_listening_data, on='track_id')
data.head()

Unnamed: 0,track_id,name,artist,spotify_id,tags,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_mins,user_id,playcount
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",0.355,0.918,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,3.703333,fe31db6d197a667d265ff5a35d80d60f3660f729,2
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",0.409,0.892,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4.310217,67874d1a189c83326c529e554be6f7acf55effae,12
2,TRRIVDJ128F429B0E8,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",0.409,0.892,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4.310217,e3ee8846c9a5a0916700a9e7abfc1c5b2fcb8e36,5
3,TRRIVDJ128F429B0E8,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",0.409,0.892,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4.310217,cbb6b8dccf0af0d221dfd4684072c04bb0346f30,2
4,TRRIVDJ128F429B0E8,Wonderwall,Oasis,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",0.409,0.892,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4.310217,2cdf67cd70a64964cb914835af0043fcc28a8f48,12


### Obtain total number of listens per song

In [66]:
play_counts = data.groupby('name')['playcount'].sum().reset_index()
play_counts

Unnamed: 0,name,playcount
0,#1 Zero,13
1,#16,110
2,#17,7
3,#24,5
4,$20 for Boban,43
...,...,...
23579,慟哭と去りぬ,134
23580,我、闇とて･･･,7
23581,朔-saku-,51
23582,蜷局,368


### Create playlists for input to RNN

In [67]:
data = data.sort_values(['user_id'])
data

Unnamed: 0,track_id,name,artist,spotify_id,tags,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_mins,user_id,playcount
306346,TRTVXIH128F426625A,Come Round Soon,Sara Bareilles,0jkVXytWSisMUtrBEej9mi,"pop, female_vocalists, singer_songwriter, soul...",0.338,0.819,-4.495,0,0.0776,0.077700,0.000000,0.1590,0.545,74.751,3.552000,0000f88f8d76a238c251450913b0d070e4a77d19,2
417455,TRWUFEW128F14782F3,Forever My Friend,Ray LaMontagne,0Ev7atdl0qS2n39OO7051O,"folk, singer_songwriter, soul, blues, acoustic...",0.493,0.524,-13.553,1,0.0423,0.334000,0.014100,0.3570,0.379,176.233,5.788883,0000f88f8d76a238c251450913b0d070e4a77d19,2
32466,TRNXEPE128F9339E47,My Name Is Jonas,Weezer,0YU04WSkTVomRgeDOWlEzX,"rock, alternative, indie, alternative_rock, in...",0.261,0.947,-3.031,1,0.0488,0.000197,0.003320,0.3100,0.550,185.942,3.435333,0000f88f8d76a238c251450913b0d070e4a77d19,2
698954,TRMKCCV128F92EB22E,Light On,David Cook,1BnoZbPDh9dbYqabvM6qZg,"rock, alternative_rock, male_vocalists",0.448,0.830,-4.156,0,0.0332,0.067300,0.000000,0.1130,0.362,131.991,3.816883,0000f88f8d76a238c251450913b0d070e4a77d19,3
227171,TRJGJTH128F4291A81,"Oh My God, Whatever, Etc.",Ryan Adams,0sUzPqm1gdsabzX5htMvf7,"rock, indie, folk, singer_songwriter, acoustic...",0.572,0.395,-10.630,1,0.0304,0.700000,0.000250,0.1260,0.483,79.552,2.532667,0000f88f8d76a238c251450913b0d070e4a77d19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802077,TRSEFCM128F429354D,Set It Up,Xavier Rudd,1sF9FiOQivhgedQnS1j3fK,"acoustic, 00s",0.469,0.385,-11.300,0,0.0270,0.503000,0.001390,0.1150,0.116,130.767,4.141550,fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39,2
417463,TRWUFEW128F14782F3,Forever My Friend,Ray LaMontagne,0Ev7atdl0qS2n39OO7051O,"folk, singer_songwriter, soul, blues, acoustic...",0.493,0.524,-13.553,1,0.0423,0.334000,0.014100,0.3570,0.379,176.233,5.788883,fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39,8
649627,TRDVGIH128F429353C,Come Let Go,Xavier Rudd,258CEuV9zzGk2PraoCH2yx,"reggae, male_vocalists",0.547,0.546,-8.634,1,0.0470,0.114000,0.037200,0.3810,0.280,140.477,6.870433,fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39,28
553208,TROXFVJ128F1465265,Bottom Of the Barrel,Amos Lee,1VWGfrhpY8IiNmqMHavRXS,"folk, soul, acoustic, guitar",0.609,0.346,-12.703,1,0.1460,0.761000,0.000000,0.1100,0.550,178.137,2.006433,fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39,4


In [68]:
# Changed name to track_id
playlists = data.groupby('user_id')['track_id'].apply(lambda x: list(x.head(20)))
playlist_dict = playlists.to_dict()
print(playlists)

user_id
0000f88f8d76a238c251450913b0d070e4a77d19    [TRTVXIH128F426625A, TRWUFEW128F14782F3, TRNXE...
0005eb11fd1dad47e6e6719a4db30340073a9e38    [TRGOJNK128F92F2A03, TRQPSHM128F92F29ED, TRTUW...
000d80cd9b58a8f77b33aa613dcfc5cbf1daf5e8    [TRDYYKS128F4275626, TRBHLYP12903D0D107, TRABF...
000e9296161b73a1821aaed3d7f50d95e8665bf6    [TROPEIV128F428F5A8, TRIAZQY128F934D58D, TRMKA...
00100482b3f3074549c751e718c57ed211b35991    [TRSNCIW128F14557BC, TRJKPFL12903CCE490, TRWJN...
                                                                  ...                        
fff7352d8ca192c451ce4fa00d18e33e261ecad3    [TRDRVJA128F4267831, TRCKWGF12903CD2DCD, TRXUW...
fff759a45a3a68de552740e8285a97d5f65d4e58    [TRDJZFF128F92D2627, TRULONW128F9302209, TRBNY...
fff9bd021bf6e07936883b9bb045207fcf372a2c    [TROHXCJ128F935A6AC, TRUMJNK12903CF465A, TRXYM...
fffb0b218640d86e5cb99d41cd3ecad977142da5    [TRZGGHL12903CDBF1F, TRCAUIX128F4277AD0, TRYIK...
fffbab4b8416fc41d05fcbdcf0e6735c4f37cb39    [TRGPCUN

In [69]:
# Changed track_id to name
data_dict = data.drop(['artist', 'tags', 'playcount'], axis=1)
# Changed name to track_id
data_dict = data_dict.set_index(['user_id', 'track_id']).to_dict('index')

In [70]:
songs_done = 0
updated_playlist_dict = {}
for user_id, songs in playlist_dict.items():
    updated_songs = []
    for song in songs:
        key = (user_id, song)
        if key in data_dict:
            the_features = list(data_dict[key].values())
            updated_songs.append([song] + the_features)
            songs_done += 1
            if songs_done % 10000 == 0:
                print(songs_done)
    updated_playlist_dict[user_id] = updated_songs

playlist_dict = updated_playlist_dict

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000


In [71]:
arr = []
for user_id, playlist in playlist_dict.items():
    arr2 = []
    for song in playlist:
        arr2.append(np.concatenate((song[0:6], song[7:12])))
    arr.append(arr2)

arr_np = np.array(arr)

In [72]:
playlists = pd.DataFrame.from_dict(playlist_dict, orient='index')
playlists.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0000f88f8d76a238c251450913b0d070e4a77d19,"[TRTVXIH128F426625A, Come Round Soon, 0jkVXytW...","[TRWUFEW128F14782F3, Forever My Friend, 0Ev7at...","[TRNXEPE128F9339E47, My Name Is Jonas, 0YU04WS...","[TRMKCCV128F92EB22E, Light On, 1BnoZbPDh9dbYqa...","[TRJGJTH128F4291A81, Oh My God, Whatever, Etc....","[TRFQYFT128F14840BC, Nobody Girl, 2YqjAMK5eeSk...","[TRKSXHR128F1455E4D, Dear Chicago, 2J8P81JjKem...","[TRGJTIY128F4296A0E, All You Need Is Love, 0BW...","[TRZLJOC128F14840BE, Enemy Fire, 13gnIRFWtBQN1...","[TRSMEUG128F14856D2, Within You, 0VRC5T7fDBY1S...","[TRZYESA128F148D67F, Please Do Not Let Me Go, ...","[TRRUNEV128F148D719, Burning Photographs, 2Mco...","[TRDKRLP128F4291A80, Halloweenhead, 04N3X9vfSz...","[TRFTUIW128E0784B9F, Bubble Toes, 1CFwwYZ58s34...","[TRKGCIA128F92C315D, Joe's Head, 0A2BgEzGWU9HB...","[TRFUCYR128F92DC67F, California Waiting, 0txCP...","[TROZZNY128F14782F7, All the Wild Horses, 0FFm...","[TRTWOCA128F14840B8, La Cienega Just Smiled, 0...","[TRQSEMJ128F4294F24, Pearls On A String, 02WVv...","[TRUNKTP12903CD1EFB, Blue Sky, 08SPbOlgCODbnWE..."
0005eb11fd1dad47e6e6719a4db30340073a9e38,"[TRGOJNK128F92F2A03, The Technicolor Phase, 27...","[TRQPSHM128F92F29ED, The Airway, 3Cy5wM1kAWdQ3...","[TRTUWMO128F92F2A09, Dear Vienna, 2LBdBoz94BqE...","[TRRNWAK128F92F29FB, Super Honeymoon, 0aMWS9ld...","[TRYEGSH12903CD2DCE, Overboard, 0cfsbkanGUO3yz...","[TRCKWGF12903CD2DCD, Never Let You Go, 7mP4fGw...","[TRNFVQI128F931BAEA, The Saltwater Room, 1eX8F...","[TRTKLFX12903CD2DC2, First Dance, 0OQuXXwwYt2j...","[TRPGPDK12903CCC651, Bring Me To Life, 0rJ8HF2...","[TRJDMHS128F92F2A0C, I'll Meet You There, 2XGM...","[TRLVQME128F931BAF3, Vanilla Twilight, 0hXBVbr...","[TRUGOGT128F92F29E9, Captains and Cruise Ships...","[TRCXWLU128F92F2A0D, This Is The Future, 17jG9...","[TRRVJCK12903CD2DCB, U Smile, 0KDJBhhe2OYnnoJt...","[TRCJAHJ128E07815B6, Stacy's Mom, 0b5Z4MPCgSFm...","[TRPWIGO128F931BAEB, Dental Care, 1IyackM7hvB1...","[TRNEITZ128F92F29EA, Designer Skyline, 30KmLL3...","[TRMIHFS128F92F2A01, Early Birdie, 1TvtrJ6uyfQ...","[TRRLGDR128F933A7C9, Injection, 0it4CBT8IGSbXv...","[TRLNFKN128F931BAF2, The Tip Of The Iceberg, 1..."
000d80cd9b58a8f77b33aa613dcfc5cbf1daf5e8,"[TRDYYKS128F4275626, Music Is Happiness, 5eWkK...","[TRBHLYP12903D0D107, 4X4, 21SudxOkg2z2LMBrghl7...","[TRABFDT12903CADD73, Up Up & Away, 0InFAWpnO2z...","[TRLNVSC12903CADD67, Simple As..., 04nE0pNbhPQ...","[TRKOCXI128F9316B54, Harmony One, 1BtLEUri7ROn...","[TRSEFCM128F429354D, Set It Up, 1sF9FiOQivhged...","[TRUWANM128F1485EE2, LDN, 016gjTKLZX8Sgaos4DRq...","[TRXKEMH128F423381D, Superfresh, 0mCoxFFYs0TRZ...","[TREMDON128F427C701, Crimewave (Crystal Castle...","[TRHPKWO128F92E01D5, The Lightning Strike, 1rE...","[TRPONOG128F4275608, The Adjustor, 6sC8fTO6Ja6...","[TRJGDTG128F421CE22, Lights & Music, 0FezhHZVm...","[TROTYPC128E07940AB, Door Peep, 0ceGoYvdbcsRll...","[TRPXIWX128F429831F, One Minute to Midnight, 0...","[TROINZB128F932F740, Crazy in Love, 0klMKiGV38...","[TROUAEG128F429354A, Message Stick, 0jFN4WAx76...","[TRQEBRP12903CADD6C, Sky Might Fall, 2Pq2jkcG8...","[TROTWMO128F42B9238, Iconography, 04gW4W5ziYM3...","[TRJYECB128F4230F29, Second Chances, 1WPZR8Kf1...","[TRJLGXB128F93043EA, Colourful, 21rILkLpA1vsYZ..."
000e9296161b73a1821aaed3d7f50d95e8665bf6,"[TROPEIV128F428F5A8, Fatal, 5HeBXKvt8Kc9wY7rrk...","[TRIAZQY128F934D58D, El Pueblo Unido, 6M3ONz42...","[TRMKAZB128F92F2F3E, Can't Keep, 08SE6CEP3gjL9...","[TRPHDFT128F92C5A75, So Com Voce, 1f0V4eqYAmy1...","[TRNXBBR128F425ECE3, We Came Along This Road, ...","[TRKPWGR128E078EE06, Where Did You Sleep Last ...","[TRLPOFY128F425ECE8, Darker With the Day, 1PKj...","[TRCHYZB128F425ECE1, The Sorrowful Wife, 3DFrC...","[TRXEAZB128E078EDCE, Something In The Way, 7hh...","[TRFVSOZ128F4281933, I'm Sleeping in a Submari...","[TRDMUWU128E078EDDB, Dumb, 13noTim30TG19L0rg9f...","[TRDRFVY128F4281937, Headlights Look Like Diam...","[TRIPLBA128F427200F, My Moon My Man, 0Bl1KVabX...","[TRJSAID128F934D596, Beautiful Drug, 50t5tH0xK...","[TRMYAYJ128F934D0AF, Until the Morning, 20F3Fc...","[TRWGIOT128F425ECDE, Sweetheart Come, 0pcV8SPE...","[TRLRCIA128F425ECD7, Fifteen Feet of Pure Whit...","[TRIAGDA128F4296176, Recycled Air, 0k0UEpGDB2x...","[TRIDPWO128F423DBC6, Faust Arp, 5SdmtFbNOD7Qej...","[TRPFLRB128F14A895D, No Cars Go, 0nev4XL4Y6hrD..."
00100482b3f3074549c751e718c57ed211b35991,"[TRSNCIW128F14557BC, Col, 4XZ9hQzKr4hUf2IRzwqx...","[TRJKPFL12903CCE490, A Well Deserved Break, 2t...","[TRWJNEC128E079654F, Part of the Process, 06yd...","[TRACWHF128F14557BB, Enjoy The Wait, 03n4bUnpU...","[TRAZCMI128F14557B9, Howling, 2PFP9QAfVE4cmPuS...","[TRUEXGL128F14557BD, Who Can You Trust?, 3e2UE...","[TREECSZ128F14557BE, Almost Done, 13ccsvjo5S9Q...","[TRUAJOJ128F14557B6, Post Houmous, 0BrSfR3QBDY...","[TRASVEM128E0796553, Trigger Hippie, 0oWC3y01a...","[TROXRVT128E079650A, Aqualung, 0NQEKTasUwXVu03...","[TRZJHGG128E079655A, Never An Easy Way, 1c1KOD...","[TRIXKKQ12903CCE495, Coming Down Gently, 1THrj...","[TRORPWW12903CCE48E, Love Is Rare, 07ZOef7Bqy9...","[TRYIASQ128E079650E, Undress Me Now, 2cZu9PrRr...","[TRDNHAW128F429DB9A, The Ballad of Michael Val...","[TRXYEKR128E079654C, Otherwise, 0NTSwjegwCGXjm...","[TRHZMPR128F42A52CB, Challengers, 33ZcFxD1Ohwj...","[TRXZMLY128E0796512, Public Displays of Affect...","[TRJSQQT128F149F9B4, Street Justice, 0lJRL3H6x...","[TRXCZNS128F428A15E, Next To You, 0rUmVbfsJQzW..."


### Train and Test Split

In [73]:
# Train and test splits for playlist

X = arr_np[:,:-1,:]
Y = arr_np[:,1:,:]
x_train, x_val, y_train, y_val = train_test_split(X,Y,train_size=0.75,random_state=3000)
x_train, x_test, y_train, y_test = train_test_split(x_train,y_train,train_size=0.92,random_state=3000)

In [74]:
x_test = x_test.tolist()
y_test = y_test.tolist()


for i in range(len(x_test)):
    x_value = y_test[i][-1]
    y_value = x_test[i][0]
    x_test[i].append(x_value)
    y_test[i].insert(0, y_value)

x_test = np.array(x_test)
x_test_file = x_test
y_test = np.array(y_test)
y_test_file = y_test

In [81]:
np.save("misc/x_test.npy", x_test_file)
np.save("misc/y_test.npy", y_test_file)

In [75]:
# Original Playlists
ops_x_train, ops_y_train, ops_x_val, ops_y_val, ops_x_test, ops_y_test = [], [], [], [], [], []

# This only works based on size if val and test sets switch in size switch them in these loops
for user in range(np.ma.size(x_train, axis=0)):
    names_x_train, names_y_train, names_x_val, names_y_val, names_x_test, names_y_test = [], [], [], [], [], []
    for song in range(np.ma.size(x_train, axis=1)):
        names_x_train.append(x_train[user, song, 0:3])
        names_y_train.append(y_train[user, song, 0:3])
        try:
            names_x_val.append(x_val[user, song, 0:3])
            names_y_val.append(y_val[user, song, 0:3])
        except IndexError:
            continue
        try:
            names_x_test.append(x_test[user, song, 0:3])
            names_y_test.append(y_test[user, song, 0:3])
        except IndexError:
            continue

    ops_x_train.append(names_x_train)
    ops_y_train.append(names_y_train)
    if not names_x_val:
        continue
    ops_x_val.append(names_x_val)
    ops_y_val.append(names_y_val)
    if not names_x_test:
        continue
    ops_x_test.append(names_x_test)
    ops_y_test.append(names_y_test)
x_train = x_train[:, :, 3:].astype(np.float64)
y_train = y_train[:, :, 3:].astype(np.float64)
x_val = x_val[:, :, 3:].astype(np.float64)
y_val = y_val[:, :, 3:].astype(np.float64)
x_test = x_test[:, :, 3:].astype(np.float64)
y_test = y_test[:, :, 3:].astype(np.float64)

### Define the Model

In [76]:
if os.path.exists('misc/mae_optimized_model.keras'):
    print("using saved model")
    model = load_model('misc/mae_optimized_model.keras')
else:
    print("training model")
    model = Sequential()
    model.add(Input(shape=(None,8)))
    model.add(SimpleRNN(
        16,
        activation='linear',
        return_sequences=True,
        kernel_initializer='random_uniform',
    ))
    model.add(SimpleRNN(
        16,
        activation='linear',
        return_sequences=True,
        kernel_initializer='random_uniform',
    ))
    model.add(Dense(8, activation='linear', kernel_initializer='random_uniform',))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.get_device_name(0))

    
    model.compile(loss='mae', optimizer='adam')
    model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val))
    model.save('misc/mae_optimized_model.keras')

using saved model


In [77]:
mae_optimized_model_adam = model

In [80]:
def predict_sample(sample,model):
    return (model.predict(np.array([sample]))[0,-1])

### Run RNN

In [35]:
print('Selecting a random index in our test dataset: ')
random_index = random.randint(0,len(x_test)-1)
print(random_index)

print('Input: ')
print(x_test[random_index])

print('\n','Output: ')
predicted = predict_sample(x_test[random_index], mae_optimized_model_adam)
print(predicted)

Selecting a random index in our test dataset: 
699
Input: 
[[ 4.9500e-01  8.9700e-01 -3.2600e+00  3.2100e-02  1.7400e-05  6.4700e-04
   7.1600e-02  2.3200e-01]
 [ 2.9000e-01  9.4400e-01 -6.3770e+00  7.8900e-02  4.1700e-05  4.9500e-02
   1.1900e-01  3.6100e-01]
 [ 8.2700e-01  6.6500e-01 -9.7080e+00  6.4800e-02  4.3800e-02  7.6200e-01
   8.2300e-02  6.0000e-01]
 [ 3.7500e-01  4.0300e-01 -1.2776e+01  2.8600e-02  7.4500e-01  5.4600e-01
   1.8100e-01  2.0700e-01]
 [ 4.0900e-01  9.5600e-01 -5.2920e+00  1.2700e-01  5.0600e-05  1.1400e-05
   1.2100e-01  2.1400e-01]
 [ 1.9300e-01  6.2600e-01 -8.5760e+00  3.2200e-02  6.2000e-04  4.8800e-05
   1.6200e-01  4.6400e-02]
 [ 2.5400e-01  9.8300e-01 -5.0570e+00  1.7300e-01  1.1300e-06  2.2400e-02
   8.3400e-02  1.7500e-01]
 [ 5.0700e-01  9.9400e-01 -3.4550e+00  8.2600e-02  5.3600e-05  7.5700e-01
   3.8200e-01  1.4200e-01]
 [ 5.4100e-01  9.2100e-01 -5.8070e+00  5.9900e-02  3.0600e-04  1.2000e-03
   6.8400e-02  6.4700e-01]
 [ 3.0300e-01  7.2100e-01 -1.217

In [36]:
distance_frame = song_features_data.drop(['artist','tags','tempo','duration_mins','mode'], axis=1)
distance_frame.head()

Unnamed: 0,track_id,name,spotify_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
0,TRIOREW128F424EAF0,Mr. Brightside,09ZQ5TmUG8TSL56n0knqrj,0.355,0.918,-4.36,0.0746,0.00119,0.0,0.0971,0.24
1,TRRIVDJ128F429B0E8,Wonderwall,06UfBBDISthj1ZJAtX4xjj,0.409,0.892,-4.373,0.0336,0.000807,0.0,0.207,0.651
2,TROUVHL128F426C441,Come as You Are,0keNu0t0tqsWtExGM3nT1D,0.508,0.826,-5.783,0.04,0.000175,0.000459,0.0878,0.543
3,TRUEIND128F93038C4,Take Me Out,0ancVQ9wEcHVd0RrGICTE4,0.279,0.664,-8.851,0.0371,0.000389,0.000655,0.133,0.49
4,TRLNZBD128F935E4D8,Creep,01QoK9DA7VTeTSE3MNzp4I,0.515,0.43,-9.935,0.0369,0.0102,0.000141,0.129,0.104


In [37]:
distance_frame.drop_duplicates(subset='track_id', keep='first', inplace=True)
distance_frame.track_id.nunique()

50683

In [38]:
distance_frame.to_csv("misc/distance_frame.csv")

In [39]:
distance_frame.head()

Unnamed: 0,track_id,name,spotify_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
0,TRIOREW128F424EAF0,Mr. Brightside,09ZQ5TmUG8TSL56n0knqrj,0.355,0.918,-4.36,0.0746,0.00119,0.0,0.0971,0.24
1,TRRIVDJ128F429B0E8,Wonderwall,06UfBBDISthj1ZJAtX4xjj,0.409,0.892,-4.373,0.0336,0.000807,0.0,0.207,0.651
2,TROUVHL128F426C441,Come as You Are,0keNu0t0tqsWtExGM3nT1D,0.508,0.826,-5.783,0.04,0.000175,0.000459,0.0878,0.543
3,TRUEIND128F93038C4,Take Me Out,0ancVQ9wEcHVd0RrGICTE4,0.279,0.664,-8.851,0.0371,0.000389,0.000655,0.133,0.49
4,TRLNZBD128F935E4D8,Creep,01QoK9DA7VTeTSE3MNzp4I,0.515,0.43,-9.935,0.0369,0.0102,0.000141,0.129,0.104


In [40]:
def get_distances(data, p_vector):
    names = data['name']
    data = data.drop(['name', 'spotify_id'], axis=1)
    distance_dict = data.set_index(['track_id']).to_dict('index')
    for key in distance_dict:
        distance_dict[key] = list(distance_dict[key].values())
    distance_dict = distance_calc(distance_dict, p_vector, names)
    return pd.DataFrame.from_dict(distance_dict, orient='index', columns=['id', 'distance'])

def distance_calc(dict, v1, name_list):
    distances = {}
    i = 0
    name_list = name_list.to_list()
    for id in dict.keys():
        v2 = dict[id]
        value = 0.0
        for n in range(len(v1)):
            value += np.linalg.norm(v1[n] - v2[n])
        distances[name_list[i]] = (id, value)
        i += 1
    return distances

distance_frame2 = get_distances(distance_frame, predicted)


In [41]:
POTENTIAL_N = 50 #defines size
def potential_songs(frame, n):
    temp = frame.nsmallest(n, columns='distance', keep='all')
    return temp

potential_songs_data = potential_songs(distance_frame2, POTENTIAL_N)
potential_songs_data

Unnamed: 0,id,distance
Going Blind,TRDCZEF128F42AFFC7,0.168926
Rollercoaster,TRCRRZW128F426FAD9,0.214474
The Soviet,TRNIPEZ128F14ADCA4,0.225636
Halo Of Ashes,TRSHHYM128F4277387,0.232395
What I Have To Do,TRHJEYR128F145E9C0,0.238936
You're Gone,TRZFEDU128E0791895,0.24209
Shiver,TRUGQEP12903CD3CA2,0.249044
Grain,TRJSQAD128F42963DB,0.259701
Coffee Shop Soundtrack,TRISCVQ12903CFFCAB,0.26947
California,TRFNKTW128F93116B8,0.277023


In [42]:
lyrics_embeddings_csv = pd.read_csv('misc/lyrics_embeddings.csv')
lyrics_embeddings_3d_csv = pd.read_csv('misc/lyrics_embeddings_3d.csv')

In [43]:
def get_embeddings(frame, frame3D):
    lyrics_embeddings = dict()
    lyrics_embeddings_3d  = dict()

    for i in range(len(frame)):
        lyrics_embeddings[frame.iloc[i, 0]] = frame.iloc[i, 1:].to_numpy()

    for i in range(len(frame3D)):
        lyrics_embeddings_3d[frame3D.iloc[i, 0]] = frame3D.iloc[i, 1:].to_numpy()

    return lyrics_embeddings, lyrics_embeddings_3d

lyrics_embeddings, lyrics_embeddings_3d = get_embeddings(lyrics_embeddings_csv, lyrics_embeddings_3d_csv) 


In [44]:
def get_candidates(original_playlist, index, p_songs):
    candidates = dict()
    rnn_track_ids = []
    for track in original_playlist[index]:
        rnn_track_ids.append(track[0])
        candidates[track[0]] = lyrics_embeddings_3d[track[0]]

    cutoff = len(candidates)

    for _, row in p_songs.head(100).iterrows():
        candidates[row['id']] = lyrics_embeddings_3d[row['id']]

    print(len(candidates))

    return candidates, rnn_track_ids, cutoff

candidates, rnn_track_ids, cutoff = get_candidates(ops_x_test, random_index, potential_songs_data)

69


In [45]:
# For reducing dimensions of the embeddings
def reduce_dims(lyrics_embeddings):
    raw_embeddings = np.concatenate(list(lyrics_embeddings.values())).reshape(len(lyrics_embeddings), 768)
    track_ids = list(lyrics_embeddings.keys())
    dim_model = PCA(n_components=150, random_state=42)
    dim_model.fit(raw_embeddings)
    reduced_embeddings = dim_model.transform(raw_embeddings)
    reduced_embeddings_dict = {track_ids[i]: reduced_embeddings[i] for i in range(len(track_ids))}

    og_embeddings = np.array([reduced_embeddings_dict[track_id] for track_id in rnn_track_ids])

    return reduced_embeddings_dict, og_embeddings

reduced_embeddings_dict, og_embeddings = reduce_dims(lyrics_embeddings)

At this stage, we must compare the embeddings in the predicted list against those in the original input list and find the best candidates
### Cosine Similarity

In [46]:
def calc_cosine(reduced_embeddings_dict, potential_songs_data):
    similarities = list()

    for track_id in potential_songs_data['id']:

        candidate_embedding = reduced_embeddings_dict[track_id].reshape(1, -1)
        similarity = cosine_similarity(candidate_embedding, og_embeddings)
        similarities.append(np.mean(similarity))

    similarities = np.array(similarities)
    most_similar_indices = np.argsort(similarities)[::-1]
    selected_songs_cs = potential_songs_data.iloc[most_similar_indices[:10]]
    return selected_songs_cs

selected_songs_cs = calc_cosine(reduced_embeddings_dict, potential_songs_data)

### Pairwise Distances

In [47]:
def calc_pairwise(reduced_embeddings_dict, selected_songs_cs):
    candidate_embeddings = np.array([reduced_embeddings_dict[track_id] for track_id in selected_songs_cs['id']])

    distances = pairwise_distances(candidate_embeddings, og_embeddings, metric='euclidean')
    mean_distances = np.mean(distances, axis=1)
    closest_candidates_indices = np.argsort(mean_distances)[:10]
    selected_songs_pd = selected_songs_cs.iloc[closest_candidates_indices]

    print(closest_candidates_indices)
    return selected_songs_pd, closest_candidates_indices

selected_songs_pd, closest_candidates_indices = calc_pairwise(reduced_embeddings_dict, selected_songs_cs)
selected_songs_pd

[5 4 3 7 9 6 8 1 2 0]


Unnamed: 0,id,distance
2113,TREOZBH128F92E605A,0.30773
Passenger,TRJBDBD12903CA83A3,0.321754
The Walls Of Babylon,TRDUGMI128F930C2F5,0.366795
Here & Now,TRAXEFD128F428A47D,0.356783
The Soviet,TRNIPEZ128F14ADCA4,0.225636
"Too Bright To See, Too Loud To Hear",TRSZWAD128F92C3A11,0.350149
Warriors,TRBUOWV12903CA88E1,0.321549
Popular,TRLLTXT128F145FE1C,0.335827
C,TRIMWMM128F428CB63,0.309269
Masters Of The Universe,TRHLEBE12903D07532,0.31747


In [52]:
#Predicted Data
def get_recs(song_features_data, selected_songs_pd):
    return song_features_data[song_features_data['track_id'].isin(selected_songs_pd['id'])]

rec_songs = get_recs(song_features_data, selected_songs_pd)
rec_songs.head(10)

Unnamed: 0,track_id,name,artist,spotify_id,tags,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_mins
1654,TRJBDBD12903CA83A3,Passenger,Deftones,1dKpS2vuQANrx47UDQTPG9,"rock, alternative, metal, alternative_rock, ha...",0.345,0.797,-6.152,1,0.0551,0.00188,0.0668,0.145,0.223,117.987,6.144433
5678,TRBUOWV12903CA88E1,Warriors,Imagine Dragons,02SOn7MNKnGZYxm8Yx1ViX,"rock, indie, alternative_rock, indie_rock",0.355,0.829,-6.192,0,0.072,0.0686,0.00184,0.241,0.341,78.017,2.834667
14035,TRLLTXT128F145FE1C,Popular,Nada Surf,0k1ocls4SpeGkUkUoWIKk3,"rock, alternative, indie, alternative_rock, in...",0.491,0.881,-6.241,0,0.0879,0.0136,2.8e-05,0.249,0.324,95.102,3.648667
18105,TRDUGMI128F930C2F5,The Walls Of Babylon,Symphony X,0E8yqqulk60MIn5L5i4Ldk,"progressive_rock, progressive_metal, power_met...",0.439,0.951,-6.165,1,0.0862,0.000207,0.00255,0.0766,0.192,93.983,8.270433
18334,TREOZBH128F92E605A,2113,Coheed and Cambria,0MFEW9QSrRgtdrqINJavJB,progressive_rock,0.396,0.798,-6.232,0,0.0586,0.00646,0.000412,0.0982,0.372,74.767,9.776
22411,TRHLEBE12903D07532,Masters Of The Universe,Juno Reactor,2SQlT0uBjW07KuP1TNRd2j,"electronic, industrial, psychedelic, trance, t...",0.42,0.867,-6.315,1,0.037,0.00107,8.6e-05,0.0782,0.3,170.05,4.093333
25667,TRIMWMM128F428CB63,C,DIR EN GREY,0YFwTfVGslNeOWMeZ8qyhP,japanese,0.423,0.946,-6.253,1,0.0797,3e-06,6.4e-05,0.104,0.324,93.043,3.5011
35122,TRNIPEZ128F14ADCA4,The Soviet,mewithoutYou,0D6m2y5QwQ3kUWrVT2wixO,"indie, post_hardcore",0.423,0.912,-6.141,0,0.0458,0.000259,0.0318,0.119,0.367,88.162,3.050667
41304,TRSZWAD128F92C3A11,"Too Bright To See, Too Loud To Hear",Underoath,22LwhIWG9emahUJCGPWLT0,"metalcore, post_hardcore, screamo",0.451,0.814,-6.199,0,0.0705,0.0179,0.115,0.0687,0.257,120.136,4.514217
46216,TRAXEFD128F428A47D,Here & Now,The Ernies,7pIZ4XDMrAYzSPCdvYXrKe,"rock, punk, ska",0.418,0.959,-6.12,1,0.153,5.8e-05,0.00544,0.168,0.361,92.901,3.5151


In [49]:
#Original Playlist
def get_ogp(song_features_data, rnn_track_ids):
    return song_features_data[song_features_data['track_id'].isin(rnn_track_ids)]

og_songs = get_ogp(song_features_data, rnn_track_ids)
og_songs.head(19)

Unnamed: 0,track_id,name,artist,spotify_id,tags,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_mins
1038,TRUQERG128F147CBEA,Welcome Home (Sanitarium),Metallica,0qGkuH9Ad7ttDIjgr3atv5,"rock, metal, hard_rock, 80s, heavy_metal, thra...",0.303,0.721,-12.17,1,0.0361,0.000602,0.0979,0.114,0.397,97.089,6.450667
1299,TRKGGMK128F42286FE,Screenager,Muse,033IZMP0kKR0RC1h4Z2NVy,"rock, alternative, indie, alternative_rock, in...",0.375,0.403,-12.776,0,0.0286,0.745,0.546,0.181,0.207,80.661,4.333333
4659,TRAQJBB128F4239815,Drone,Panda Bear,0cYrSvxiZIJ4tHs9xvsE5F,"electronic, indie, ambient, psychedelic",0.173,0.423,-8.247,1,0.0379,0.874,0.0022,0.0843,0.0372,119.151,4.014433
7450,TRDRAHQ128F427FE50,The Quiet Place,In Flames,0vBcI3etbwx17NzQrCM7MM,"metal, death_metal, melodic_death_metal",0.555,0.91,-8.237,0,0.069,0.000362,0.836,0.107,0.277,120.004,4.60955
7916,TREUNFY128F4293B50,Touch of Red,In Flames,0TnVICTki2jZZGL8UKyc1I,"metal, death_metal, melodic_death_metal",0.518,0.969,-5.002,0,0.0833,0.000201,0.856,0.192,0.0391,99.997,4.2151
8105,TRBCLDH128F4293B4F,Dead Alone,In Flames,0ZvvmcySNrb01wAWAqSJ7S,"metal, death_metal, melodic_death_metal",0.507,0.994,-3.455,0,0.0826,5.4e-05,0.757,0.382,0.142,108.842,3.712667
8167,TROXVFQ128F930BD41,When the Lights Are Down,Kamelot,2kHhZ63nmJYUL7pYoTpBP7,"metal, progressive_metal, power_metal, symphon...",0.29,0.944,-6.377,0,0.0789,4.2e-05,0.0495,0.119,0.361,169.959,3.6911
8465,TRFDMMO128F424D545,6:00,Dream Theater,1UQn05L6LCftnI9VoNy4Sp,"metal, progressive_rock, progressive_metal",0.541,0.921,-5.807,0,0.0599,0.000306,0.0012,0.0684,0.647,99.901,5.524
18000,TRAQTDW12903CF249C,Release,Anathema,2rWHM4O3SejQuNIDwc5uub,"progressive_rock, progressive_metal, doom_metal",0.193,0.626,-8.576,1,0.0322,0.00062,4.9e-05,0.162,0.0464,103.97,5.78755
18633,TRFARTQ128F935A659,Clad in Shadows,In Flames,5L9A4H2btUiXmcwcoodVSn,"death_metal, melodic_death_metal, swedish",0.342,0.933,-7.323,0,0.115,0.0019,3.2e-05,0.121,0.384,82.643,2.8931


In [53]:
def trim_recs(rec_songs):
    rec_songs = rec_songs.loc[:, ['name', 'artist', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                                    'liveness', 'valence']]
    return rec_songs

rec_songs = trim_recs(rec_songs)
rec_songs.head(10)

Unnamed: 0,name,artist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
1654,Passenger,Deftones,0.345,0.797,-6.152,0.0551,0.00188,0.0668,0.145,0.223
5678,Warriors,Imagine Dragons,0.355,0.829,-6.192,0.072,0.0686,0.00184,0.241,0.341
14035,Popular,Nada Surf,0.491,0.881,-6.241,0.0879,0.0136,2.8e-05,0.249,0.324
18105,The Walls Of Babylon,Symphony X,0.439,0.951,-6.165,0.0862,0.000207,0.00255,0.0766,0.192
18334,2113,Coheed and Cambria,0.396,0.798,-6.232,0.0586,0.00646,0.000412,0.0982,0.372
22411,Masters Of The Universe,Juno Reactor,0.42,0.867,-6.315,0.037,0.00107,8.6e-05,0.0782,0.3
25667,C,DIR EN GREY,0.423,0.946,-6.253,0.0797,3e-06,6.4e-05,0.104,0.324
35122,The Soviet,mewithoutYou,0.423,0.912,-6.141,0.0458,0.000259,0.0318,0.119,0.367
41304,"Too Bright To See, Too Loud To Hear",Underoath,0.451,0.814,-6.199,0.0705,0.0179,0.115,0.0687,0.257
46216,Here & Now,The Ernies,0.418,0.959,-6.12,0.153,5.8e-05,0.00544,0.168,0.361


In [54]:
def pipeline_helper(data, p_vector, n, lyrics_e, song_data):
    frame = get_distances(data, p_vector)
    p_s_d = potential_songs(frame, n)
    r_e_d, _ = reduce_dims(lyrics_e)
    s_s_cs = calc_cosine(r_e_d, p_s_d)
    s_s_pd, _ = calc_pairwise(r_e_d, s_s_cs)
    rec = get_recs(song_data, s_s_pd)
    return trim_recs(rec) 

In [82]:
def calculate_euclidean_distance(v1, v2):
    return np.linalg.norm(v1 - v2)

def calculate_score(pd1, pd2):
    if pd1.shape[1] != pd2.shape[1]:
        raise ValueError("Dataframes must have the same number of features.")
    scores = []
    for _, x in pd1.iterrows():
        fx = np.array(x[1:].values)
        dist = list()
        for _, y in pd2.iterrows():
            fy = np.array(y[1:].values)
            dist.append(calculate_euclidean_distance(fx, fy))
        scores.append(np.mean(dist))
    return scores

In [55]:
def generate_fig(candidates, cutoff, closest_candidates_indices):
    fig = go.Figure()

    text_data = list(candidates.keys())
    embeddings_3d = np.concatenate(list(candidates.values())).reshape(len(candidates), 3)

    color_data = ['blue' if i < cutoff else 'red' for i in range(len(candidates))]
    for i in closest_candidates_indices:
        color_data[i] = 'green'
    color_data[closest_candidates_indices[0]] = 'purple'

    fig.add_trace(go.Scatter3d(
        x=embeddings_3d[:, 0],
        y=embeddings_3d[:, 1],
        z=embeddings_3d[:, 2],
        text=text_data,
        mode='markers',
        marker=dict(
            size=5,
            color=color_data,
            colorscale='Viridis',
            opacity=1
        )
    ))


    fig.update_layout(
        scene=dict(
            xaxis=dict(title='x'),
            yaxis=dict(title='y'),
            zaxis=dict(title='z')
        ),
        width=1000,
        height=800
    )
    fig.update_layout(legend_title_text = "Songs")

    fig.show()

generate_fig(candidates, cutoff, closest_candidates_indices)
