In [5]:
# Import necessary modules
import copy
import json
import os
import time
from itertools import cycle
from operator import itemgetter
from random import randrange
from statistics import mean

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import psycopg2
import requests
from sklearn.cluster import MeanShift
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

In [6]:
# connecting to database

def connect(config_path="/content/config.json"):
  """Open a PostgreSQL connection using settings read from a JSON file.

  Args:
    config_path: Path to a JSON file containing the keys ``db_host``,
      ``db_name``, ``db_port``, ``db_user`` and ``db_pw``. Defaults to the
      location that used to be hard-coded, so existing ``connect()`` calls
      keep working.

  Returns:
    Whatever ``psycopg2.connect`` returns. The caller is responsible for
    closing it.

  Raises:
    OSError: If the config file cannot be opened.
    KeyError: If one of the required keys is missing from the config.
  """
  with open(config_path) as f:
    config = json.load(f)

  return psycopg2.connect(
      host=config["db_host"],
      dbname=config["db_name"],
      port=config["db_port"],
      user=config["db_user"],
      password=config["db_pw"])

In [7]:
# get data from db

def db_data(conn, table):
  """Fetch every row of ``table`` through an open DB-API connection.

  Args:
    conn: An open connection exposing ``cursor()``.
    table: Table name, optionally schema-qualified (``schema.table``). The
      name is spliced into the SQL text (identifiers cannot be bound as query
      parameters), so it is validated first.

  Returns:
    The result of ``cursor.fetchall()`` for ``SELECT * FROM <table>``.

  Raises:
    ValueError: If ``table`` is not a plain (optionally dotted) identifier.
  """
  # Reject anything that is not a bare identifier so that a crafted table name
  # cannot inject SQL into the statement below.
  if not isinstance(table, str) or not all(part.isidentifier() for part in table.split('.')):
    raise ValueError(f'Invalid table name: {table!r}')

  cur = conn.cursor()
  try:
    cur.execute(f'SELECT * FROM {table}')
    return cur.fetchall()
  finally:
    # Always release the cursor, even when the query fails.
    cur.close()

In [8]:
# preprocessing data with user input
INPUT_URI = '1oJ8EDr2pv94IvSd6ZmDgo'

# Load every row of the 'spotify' table and always release the connection
# afterwards (previously the connection was opened inline and never closed).
# To experiment on less data, slice `data` right after loading.
conn = connect()
try:
  data = db_data(conn, 'spotify')
finally:
  conn.close()

# Find the row whose first column (the URI) matches the user's input.
user_song_pos = next((i for i, row in enumerate(data) if row[0] == INPUT_URI), None)
if user_song_pos is None:
  # Fail with an explicit message instead of a bare IndexError.
  raise ValueError(f'Song {INPUT_URI!r} was not found in the database')

# Move the user's song to the end so later cells can address it as data[-1].
user_song = data.pop(user_song_pos)
data.append(user_song)

# Drop column 0 (the URI) from every row: it is a string, not a numeric feature.
indices = list(range(1, len(data[0])))
# Build the column picker once and reuse it for every row.
pick_feature_columns = itemgetter(*indices)
data_no_uri = [pick_feature_columns(row) for row in data]

# encode genre: every genre name is mapped to its position in this tuple
# (Movie -> 0, R&B -> 1, ..., World -> 25)
_GENRE_ORDER = (
    "Movie", "R&B", "A Capella", "Alternative", "Country", "Dance",
    "Electronic", "Anime", "Folk", "Blues", "Opera", "Hip-Hop",
    "Children Music", "Indie", "Pop", "Rap", "Classical", "Reggae",
    "Reggaeton", "Jazz", "Rock", "Ska", "Comedy", "Soul", "Soundtrack",
    "World",
)
int_encoding_scheme = {genre: code for code, genre in enumerate(_GENRE_ORDER)}

# Replace the trailing genre string of each row with its integer code.
encoded_genre_data = []
for music in data_no_uri:
  features, genre = list(music[:-1]), music[-1]
  # Any genre label containing 'Children' is filed under 'Children Music'.
  genre_key = 'Children Music' if 'Children' in genre else genre
  encoded_song = features + [int_encoding_scheme[genre_key]]
  encoded_genre_data.append(encoded_song)

# normalize everything except genre
# NOTE(review): normalize() is called with its defaults on rows that still
# contain the genre code; per the scikit-learn docs the default rescales each
# row (sample), so the genre code takes part in the scaling factor even though
# it is swapped back below - TODO confirm this is the intended normalisation.
norm_data = normalize(encoded_genre_data).tolist()

# Keep the normalised feature values, but restore the raw integer genre code
# as the last element of every row.
pre_proc_data = [
    scaled_row[:-1] + [encoded_row[-1]]
    for scaled_row, encoded_row in zip(norm_data, encoded_genre_data)
]

# Per-column weights. Keep the last weight at 1: the last column is the genre
# code, and scaling it would turn it into a different genre value.
# danceability | energy | loudness | speechiness | acousticness | instrumentalness | liveness | valence | popularity | genre
weights = [1,1,1,1,1,1,1,1,0.3,1]

# Multiply every value by the weight of its column; rows become tuples.
weighted_norm_data = [
    tuple(value * weights[col] for col, value in enumerate(row))
    for row in pre_proc_data
]

# Nearest-neighbour search is unsupervised, so no held-out test set is needed:
# all of the data is used for fitting (train 100%, test 0%).
#train, test = train_test_split(clean_data, train_size=0.99, random_state=None, shuffle=True)

In [9]:
def get_recomendation_KNN(data, n_neighbors=11):
  """Return the tightest nearest-neighbour group that contains the user's song.

  A ``NearestNeighbors`` model is fitted on ``data`` and queried with the same
  points. Among all points whose neighbour list contains the user's song (the
  LAST row of ``data``), the neighbour list with the smallest sum of distances
  is selected. Note that this list is centred on whichever point produced it,
  which is not necessarily the user's song.

  Args:
    data: Sequence of equally long numeric feature rows; the user's song must
      be the last row.
    n_neighbors: Size of each neighbour list, query point included. Defaults
      to 11 (the previously hard-coded value) and is capped at ``len(data)``
      so that small datasets do not make the model raise.

  Returns:
    ``(songs, distances)``: the rows of ``data`` in the selected neighbour
    list and, as a list of floats, their distances to that list's centre.

  Raises:
    ValueError: If ``data`` is empty or no neighbour list contains the
      user's song.
  """
  if len(data) == 0:
    raise ValueError('data must contain at least the user song')

  k = min(n_neighbors, len(data))
  nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(data)
  distances, indices = nbrs.kneighbors(data)

  # user song was appended last, so its index is len(data) - 1
  user_song_i = len(data) - 1

  # Rows whose neighbour list mentions the user song. Found with one vectorised
  # comparison instead of turning the full N x k result into Python lists.
  candidate_rows = np.flatnonzero((indices == user_song_i).any(axis=1))
  if candidate_rows.size == 0:
    raise ValueError('the user song does not appear in any neighbour list')

  # select the best group -> smallest sum of distances (first one wins ties)
  sums = [sum(distances[row].tolist()) for row in candidate_rows]
  best_row = int(candidate_rows[sums.index(min(sums))])

  best_5_d = distances[best_row].tolist()
  best_5_i = indices[best_row].tolist()
  best_5_songs = [data[i] for i in best_5_i]
  return best_5_songs, best_5_d

In [11]:
# Spotify Web API credentials.
# The bearer token is read from the SPOTIFY_TOKEN environment variable so that
# no secret is stored in the notebook.
# renew token here https://developer.spotify.com/console/get-track/?id=11dFghVXANMlKmJXsNCbNl&market=
spotify_token = os.environ.get('SPOTIFY_TOKEN')
if not spotify_token:
  raise RuntimeError('Set the SPOTIFY_TOKEN environment variable to a valid Spotify API token')
headers = {
  'Authorization': f'Bearer {spotify_token}'
}
user_song_uri = data[-1][0] # user's song is last

# get data about user's song (name and artist)
url = f'https://api.spotify.com/v1/tracks/{user_song_uri}'
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an expired/invalid token instead of a KeyError on the lookups below.
response.raise_for_status()
res = response.json()
artist = res['artists'][0]['name']
song = res['name']
print(f'User song: {song} by {artist}')
print(f'Searching thru {len(data)} songs only for you :)')

# recommend
rec_5, distance = get_recomendation_KNN(weighted_norm_data)
# Map each recommended feature tuple back to its full row (URI, features, genre).
# NOTE(review): list.index returns the FIRST equal tuple, so two songs with
# identical weighted features resolve to the same row - confirm that is acceptable.
output = [list(data[weighted_norm_data.index(rec)]) for rec in rec_5]
print(output)
index_of_user_song = [i for i, song in enumerate(output) if song[0] == user_song_uri][0]
# Express every distance relative to the user's song.
distance = [abs(distance[index_of_user_song] - dist) for dist in distance]
# [URI, genre, popularity, distance] per recommended song
recomendations_uri = [[song[0], song[-1], song[-2],round(d,4)] for d, song in zip(distance,output)]

User song: MAMA by 6ix9ine
Searching thru 176774 songs only for you :)
[['1oJ8EDr2pv94IvSd6ZmDgo', 0.761, 0.672, -5.887, 0.212, 0.0893, 9.77e-06, 0.14, 0.437, 81, 'Rap'], ['7GX5flRQZVHRAGd6B4TmDO', 0.732, 0.75, -6.366, 0.231, 0.00264, 0.0, 0.109, 0.401, 86, 'Rap'], ['1lWK70HfYRFZtWJUwUrY48', 0.688, 0.642, -5.567, 0.321, 0.221, 0.0, 0.2, 0.341, 77, 'Rap'], ['75lcPkAkUsKWHGBlGM476R', 0.825, 0.572, -6.067, 0.0864, 0.0079, 0.0, 0.113, 0.568, 83, 'Rap'], ['67fNrBUMc7iWMofdCaoJV9', 0.845, 0.76, -6.007, 0.357, 0.221, 0.0, 0.107, 0.558, 83, 'Rap'], ['2tPcTFiQF9MbVUyjZ3zDhA', 0.856, 0.652, -5.324, 0.143, 0.177, 0.0, 0.193, 0.471, 74, 'Rap'], ['6wJrqTE6c4NtfDqAcSRcBf', 0.779, 0.767, -5.462, 0.258, 0.00214, 0.0, 0.102, 0.523, 78, 'Rap'], ['6vnfObZ4Ys70SBAtti1xZ9', 0.81, 0.582, -5.098, 0.125, 0.0862, 0.00681, 0.207, 0.503, 72, 'Rap'], ['0PG9fbaaHFHfre2gUVo7AN', 0.747, 0.57, -6.711, 0.081, 0.0642, 0.0, 0.0832, 0.65, 91, 'Rap'], ['2GGMabyHXnJmjY6CXhhB2e', 0.95, 0.59, -6.508, 0.29, 0.00534, 0.0, 0.11

Unnamed: 0_level_0,Artist,Song,Genre,Popularity,Distance
URI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1oJ8EDr2pv94IvSd6ZmDgo,6ix9ine,MAMA,Rap,81,0.0
7GX5flRQZVHRAGd6B4TmDO,Lil Uzi Vert,XO Tour Llif3,Rap,86,0.0023
1lWK70HfYRFZtWJUwUrY48,Roddy Ricch,Die Young,Rap,77,0.0028
75lcPkAkUsKWHGBlGM476R,Trippie Redd,1400 / 999 Freestyle,Rap,83,0.0028
67fNrBUMc7iWMofdCaoJV9,Blueface,Bleed It,Rap,83,0.0029
2tPcTFiQF9MbVUyjZ3zDhA,SOB X RBE,Paramedic!,Rap,74,0.0032
6wJrqTE6c4NtfDqAcSRcBf,6ix9ine,TATI,Rap,78,0.0036
6vnfObZ4Ys70SBAtti1xZ9,Travis Scott,COFFEE BEAN,Rap,72,0.0037
0PG9fbaaHFHfre2gUVo7AN,Cardi B,Please Me,Rap,91,0.0038
2GGMabyHXnJmjY6CXhhB2e,Cardi B,Money,Rap,89,0.0039


In [45]:
# Evaluate the results:
# compare the average feature value of the user's song with the average feature value of each recommended song
#avg_dist_res = round(mean(distance),2)
#avg_dist_res
# avg feautre value
def evaluate(rec_data, user_uri=None):
  """Score how close each song's average feature value is to the user's song.

  For every row the mean of ``row[1:-1]`` (everything between the URI and the
  genre) is computed and rounded to 4 decimals. A song's score is
  ``1 - |avg - user_avg| / |user_avg|``, and exactly ``1`` when the two
  averages are equal.

  Args:
    rec_data: Rows shaped ``[uri, feature..., genre]``; the user's song must
      be among them.
    user_uri: URI of the user's song. Defaults to the notebook-level
      ``INPUT_URI``.

  Returns:
    Dict mapping each row's URI to its score (empty for empty ``rec_data``).

  Raises:
    ValueError: If no row of a non-empty ``rec_data`` carries ``user_uri``.
    ZeroDivisionError: If the user's average is 0 while another song's is not.
  """
  if user_uri is None:
    user_uri = INPUT_URI
  if not rec_data:
    return {}

  avg_user = None
  recommender_avg = []
  for row in rec_data:
    row_avg = round(mean(row[1:-1]), 4) # exclude URI and genre
    if row[0] == user_uri:
      avg_user = row_avg
    recommender_avg.append(row_avg)

  if avg_user is None:
    # Previously the baseline silently stayed at 0, which produced a division
    # by zero or meaningless scores.
    raise ValueError(f'user song {user_uri!r} is not part of rec_data')

  # Relative deviation from the user's average. Dividing by abs(avg_user)
  # keeps the score <= 1 even when the baseline is negative.
  scores = []
  for rec_avg in recommender_avg:
    if rec_avg == avg_user:
      scores.append(1)
    else:
      scores.append(1 - round(abs(rec_avg - avg_user) / abs(avg_user), 4))

  return {row[0]: score for row, score in zip(rec_data, scores)}
   


In [50]:
perc = evaluate(output)
print('Recomended Songs:')
# Average the scores of the recommendations only. The user's own song is
# excluded by URI rather than by position, because it is not guaranteed to be
# the first entry of `output`.
rec_scores = [score for uri, score in perc.items() if uri != INPUT_URI]
if rec_scores:
  print(f'Overall accuracy is: {round(mean(rec_scores),2)}')

# Look up artist and title of every recommended song.
uri_artists = []
for recomendated_data in recomendations_uri:
  # recomendated_data looks like ['1DoGY3bWXQEWqYc1jZ9Zbe', 'Movie', 0, 0.34]
  # i.e. [URI, genre, popularity, distance]
  url = f'https://api.spotify.com/v1/tracks/{recomendated_data[0]}'
  response = requests.get(url, headers=headers, timeout=10)
  # Surface an expired token / unknown track as an HTTP error, not a KeyError.
  response.raise_for_status()
  res = response.json()
  artist = res['artists'][0]['name']
  song = res['name']
  uri_artists.append([recomendated_data[0], artist, song, recomendated_data[1],
                      recomendated_data[2], recomendated_data[-1], perc[recomendated_data[0]]])

# 'Similarity in %' holds the score from evaluate(): how similar the songs are
# according to the audio features.
df = pd.DataFrame(
    uri_artists,
    columns = ['URI', 'Artist','Song', 'Genre', 'Popularity', 'Distance', 'Similarity in %'],
).set_index('URI')
df.sort_values(by=['Distance'])

Recomended Songs:
Overall accuracy is: 0.94


Unnamed: 0_level_0,Artist,Song,Genre,Popularity,Distance,Similarity in %
URI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1oJ8EDr2pv94IvSd6ZmDgo,6ix9ine,MAMA,Rap,81,0.0,1.0
7GX5flRQZVHRAGd6B4TmDO,Lil Uzi Vert,XO Tour Llif3,Rap,86,0.0023,0.9427
1lWK70HfYRFZtWJUwUrY48,Roddy Ricch,Die Young,Rap,77,0.0028,0.9538
75lcPkAkUsKWHGBlGM476R,Trippie Redd,1400 / 999 Freestyle,Rap,83,0.0028,0.9783
67fNrBUMc7iWMofdCaoJV9,Blueface,Bleed It,Rap,83,0.0029,0.9688
2tPcTFiQF9MbVUyjZ3zDhA,SOB X RBE,Paramedic!,Rap,74,0.0032,0.9192
6wJrqTE6c4NtfDqAcSRcBf,6ix9ine,TATI,Rap,78,0.0036,0.9683
6vnfObZ4Ys70SBAtti1xZ9,Travis Scott,COFFEE BEAN,Rap,72,0.0037,0.8941
0PG9fbaaHFHfre2gUVo7AN,Cardi B,Please Me,Rap,91,0.0038,0.883
2GGMabyHXnJmjY6CXhhB2e,Cardi B,Money,Rap,89,0.0039,0.9066
