In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import NMF
import sys
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [2]:
data=pd.read_csv("spotify.csv", index_col=[0])

In [3]:
#NMF - non-negative matrix factorisation is unsuperised alogorithm used fr decomosing matrix into two small matrices 
#Bigger matrix is also called as sparse matrix

In [4]:
#read about perplexity -- used to calculate the effect of unsupervised algorithm

In [5]:
def get_component_matrices(data):
    x = data.values
    nmf = NMF(n_components=100, max_iter=100, random_state=100)
    nmf.fit(x)
    #user matrix  - compoents of row matix
    user_matix = nmf.transform(x)
    #song matrix - compoents of column matix
    song_matrix = nmf.components_
    #transpose the song matix
    song_matrix =song_matrix.T
    return user_matix,song_matrix

In [6]:
user_matix,song_matrix = get_component_matrices(data)



In [7]:
#steps in building the recommedation system
#step 1 - consider any user whom you want to recommend song
#step 2 - find the euclidian distance between the every  user and the user selected in step number one
#step 3 - find the song that frequently listned by nearest five users to user in step 1 
#step 4 - recommend songs to user based on step 3

In [8]:
def euclidean_distance(v1,v2):
    return pow(sum([pow(v1[x]-v2[x],2) for x in range(len(v1))]),0.5)

In [9]:
def Recommended_Song (user):
    dist = []
    user_matix,song_matrix = get_component_matrices(data)
    for i in range(len(user_matix)):
        if user !=i:
            dist.append(euclidean_distance(user_matix[user],user_matix[i] )) 
        else:
            dist.append(sys.float_info.max)
    
    dist_user_index = np.argsort(dist)[:5]
    dist_user_dataframe = pd.DataFrame(data.iloc[dist_user_index,:])
    
    #find the top five songs with the highest frequencies among the songs listen by top 5
    
    highest_freq_songs = dist_user_dataframe.sum(axis=0)
    return pd.DataFrame(highest_freq_songs).sort_values(ascending=False , by=0).head(5).index

In [10]:
Recommended_Song(1)



Index(['song_4366', 'song_2001', 'song_2998', 'song_2542', 'song_4355'], dtype='object')

In [11]:
Recommended_Song(100)



Index(['song_386', 'song_2711', 'song_4173', 'song_1192', 'song_2421'], dtype='object')

In [12]:
#Clustering

In [13]:
def cluster_id(data):
    user_matix,song_matrix = get_component_matrices(data)
    all_songs=data.columns
    Km = KMeans(n_clusters=12, max_iter=1000).fit(song_matrix)
    all_songs_cluster_id= Km.predict(song_matrix)
    return all_songs_cluster_id

In [14]:
cluster_id(data)



array([11,  0,  9, ...,  8,  2, 10])

In [15]:
#This function return the songs names and songs id's present in the clsuter 
#the fucntion take clsuter id of songs being listened , song matix and all song names.

In [16]:
def song_from_cluster(cluster_id,song_matrix,all_songs):
    collect_song_names=[]
    collect_song_index=[]
    for song_index in range(len(song_matrix)):
        if Km.predict([song_matrix[song_index]])[0]==cluster_id:
            collect_song_names.append(all_songs[song_index])
            collect_song_index.append(song_index)
    return collect_song_names,collect_song_index         

In [17]:
def New_Song_recommendation(song_name,data,N_songs = 5):
    user_matix,song_matrix = get_component_matrices(data)
    km = KMeans(n_clusters=12, max_iter=1000).fit(song_matrix)
    Index_of_song = list(data.columns).index(song_name)
    song_data = song_matrix[Index_of_song]
    #print(km.predict([song_data]))
    all_song_cluster_id = list(km.predict(song_matrix))
    # collect all song belonging to selected cluster
    songs_index_in_selected_cluster_id = [x for x in range(len(all_song_cluster_id)) if all_song_cluster_id[x] == km.predict([song_data])]
    song_cluster_data = song_matrix[songs_index_in_selected_cluster_id]
    knn = NearestNeighbors(n_neighbors=N_songs)
    knn.fit(song_cluster_data)
    song_ids = knn.kneighbors([song_matrix[Index_of_song]])[1]
    
    return data.columns[song_ids]    

In [18]:
New_Song_recommendation("song_23",data)

  return data.columns[song_ids]


array([['song_5', 'song_372', 'song_313', 'song_259', 'song_240']],
      dtype=object)

In [19]:
#lets assume that user is currently listening to song number 42
current_song="song_42"

In [20]:
#Extract the songs index for the songs currently being listened 
current_song_index= 41
#Extarct the songs data from song matrix from current song index
current_song_data=song_matrix[current_song_index]

In [24]:
Km = KMeans(n_clusters=12, max_iter=1000).fit(song_matrix)
all_songs = data.columns

In [25]:
#Make call to get the songs names and song indexs from cluster of songs where the cuurent songs belongs 
current_song_names, current_song_index=song_from_cluster(Km.predict([song_matrix[current_song_index]])[0],song_matrix,all_songs)

In [26]:
#Get the data for all songs present in the cluster 
song_data_in_cluster = song_matrix[current_song_index]

In [27]:
len(song_data_in_cluster)

413

In [28]:
knn= NearestNeighbors(n_neighbors=5)
knn.fit(song_data_in_cluster)

NearestNeighbors()

In [29]:
recommended_song = knn.kneighbors(song_matrix[current_song_index])[1]

In [30]:
recommended_song

array([[  0, 138, 236,   9, 313],
       [  1, 313, 387,  81, 375],
       [  2, 324, 216, 259, 140],
       ...,
       [410,  64, 108, 341, 209],
       [411,  19, 333, 236, 217],
       [412, 351, 153, 107, 325]], dtype=int64)

In [31]:
recommended_song[0]

array([  0, 138, 236,   9, 313], dtype=int64)

In [32]:
data.columns[recommended_song[0]]

Index(['song_1', 'song_139', 'song_237', 'song_10', 'song_314'], dtype='object')

In [None]:
#what are alternative to NMF
#what are the alternativenative to euclidean distance,how to validate which parameter is good
#how to choose number of feature for NMF
#how to choose optimal no of cluster for kmeans
#what is mean by perplexity