# Recommender System: Last.FM Dataset
## Recommender based on user activity

Reference: 2nd Workshop on Information Heterogeneity and Fusion in Recommender Systems (HetRec 2011). I. Cantod, P Brusilovsky, T. Kuflik. Proceedings of the 5th ACM conference on Recommender systems.<br>
https://grouplens.org/datasets/hetrec-2011/<br>

In [93]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

import matrix_factorization_utilities

In [2]:
# opening artist data as pandas dataframe
artists = pd.read_csv('artists.dat',
                      delimiter='\t', low_memory=False)

In [3]:
artists.head(3)

Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...


In [53]:
# changing column title from id to artistID to match user_artists dataframe
artists = artists.rename(columns = {'id':'artistID'})
# setting artistID as index 
artists = artists.set_index('artistID')
artists.head(3)

Unnamed: 0_level_0,name,url,pictureURL
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...


In [5]:
print('Total number of artistID = {}'.format(len(artists)))

Total number of artistID = 17632


In [6]:
# opening user_artist data as pandas dataframe
user_artists = pd.read_csv('user_artists.dat',
                      delimiter='\t', low_memory=False)

In [7]:
# weight column corresponds to listening count
user_artists.head(5)

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [8]:
# number of artists per user
artperuser = user_artists.groupby(['userID', 'artistID']).size().groupby(['userID']).size()
print('Total number of userID = {}'.format(len(artperuser)))

Total number of userID = 1892


In [9]:
# filtering users listening to more than 5 artists
users_withenough_artists = artperuser[artperuser >= 5].reset_index()[['userID']]
print('Total number of userID listening to at least 5 artists = {}'.format(len(users_withenough_artists)))

Total number of userID listening to at least 5 artists = 1877


In [10]:
# filtering user_artists dataframe to include only users listening to more than 5 artists
sel_user_art = pd.merge(users_withenough_artists, user_artists,
                       how='inner', left_on='userID', right_on='userID')

In [11]:
# normalizing weight(= listening count) column
w = sel_user_art[['weight']].values.astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
w_scaled = min_max_scaler.fit_transform(w)
w_norm = pd.DataFrame(w_scaled)
w_norm.columns = ['norm_weight']
w_norm.head(5)

Unnamed: 0,norm_weight
0,0.03936
1,0.033142
2,0.032181
3,0.029201
4,0.025467


In [12]:
# replacing weight column with normalized weight in sel_user_art dataframe
sel_user_art_norm = pd.merge(sel_user_art, w_norm,
                       how='inner', left_index=True, right_index=True)
sel_user_art_norm = sel_user_art_norm.drop(['weight'], axis=1)
sel_user_art_norm.head(5)

Unnamed: 0,userID,artistID,norm_weight
0,2,51,0.03936
1,2,52,0.033142
2,2,53,0.032181
3,2,54,0.029201
4,2,55,0.025467


In [13]:
n_users = sel_user_art_norm.userID.unique().shape[0]
n_artists = sel_user_art_norm.artistID.unique().shape[0]
print('Number of users = {} and number of artists = {}'.format(n_users, n_artists))

Number of users = 1877 and number of artists = 17617


In [14]:
# splitting sel_user_art_norm into training and testing dataframes
sel_train, sel_test = train_test_split(sel_user_art_norm,
                                       stratify=sel_user_art_norm['userID'],
                                       test_size=0.3, random_state=42)


In [15]:
print('Shape of train dataframe is ' + str(sel_train.shape))
print('Shape of test dataframe is ' + str(sel_test.shape))

Shape of train dataframe is (64960, 3)
Shape of test dataframe is (27841, 3)


In [16]:
# reshaping sel_user_art dataframe
p_sel_user_art = pd.pivot_table(sel_user_art_norm, index='userID',
                                columns='artistID', aggfunc=np.max)
p_sel_user_art.head(5)

Unnamed: 0_level_0,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight
artistID,1,2,3,4,5,6,7,8,9,10,...,18736,18737,18738,18739,18740,18741,18742,18743,18744,18745
userID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# reshaping training and testing dataframes
p_sel_train = pd.pivot_table(sel_train, index='userID',
                             columns='artistID', aggfunc=np.max)
p_sel_test = pd.pivot_table(sel_test, index='userID',
                            columns='artistID', aggfunc=np.max)

In [18]:
print('Shape of reshaped train dataframe is ' + str(p_sel_train.shape))
print('Shape of reshaped test dataframe is ' + str(p_sel_test.shape))

Shape of reshaped train dataframe is (1877, 14199)
Shape of reshaped test dataframe is (1877, 8258)


In [19]:
# replacing NaN values with 0 
p_sel_train0 = p_sel_train.fillna(0)
p_sel_test0 = p_sel_test.fillna(0)

In [20]:
# finding mean norm_weight per user
mean_p_train = p_sel_train0.mean(axis=1)
mean_p_train.head(5)

userID
2    2.572552e-05
3    3.546764e-06
4    3.974884e-06
5    1.772583e-06
6    1.234039e-07
dtype: float64

### Collaborative Filtering Approach using Cosine Similarity

In [61]:
# calculating cosine similarity for users and artists
user_sim = pairwise_distances(p_sel_train0, metric='cosine')
art_sim = pairwise_distances(p_sel_train0.T, metric='cosine')

In [70]:
print('Shape of user_sim is ' + str(user_sim.shape))
print('Shape of art_sim is ' + str(art_sim.shape))

Shape of user_sim is (1877, 1877)
Shape of art_sim is (14199, 14199)


In [107]:
# defining function for predictions using cosine similarity
def pred(weight, similarity, type='user'):
    if type == 'user':
        mean_user_weight = weight.mean(axis=1)
        weight_diff = weight.sub(mean_user_weight, axis=0)
        pred = pd.DataFrame(similarity.dot(weight_diff) / 
                            np.array([np.abs(similarity).sum(axis=1)]).T).add(mean_user_weight,
                                                                              axis=0)
    elif type == 'artist':
        pred = pd.DataFrame(weight.dot(similarity)) / pd.DataFrame(np.array([np.abs(similarity).sum(axis=1)]))
    return pred.as_matrix()

In [108]:
# calculating prediction matrix based on user similarity
user_prediction = pred(p_sel_train0, user_sim, type='user')

In [109]:
# calculating prediction matrix based on artist similarity
artist_prediction = pred(p_sel_train0, art_sim, type='artist')

In [111]:
# defining function for RMSE
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    mse = mean_squared_error(np.nan_to_num(prediction),
                             np.nan_to_num(ground_truth))
    return sqrt(mse)

In [113]:
# converting p_sel_test0 dataframe to matrix
m_sel_test = p_sel_test0.as_matrix()

print('User-based Collaborative Filtering RMSE: ' + str(rmse(user_prediction, m_sel_test)))
print('Artist-based Collaborative Filtering RMSE: ' + str(rmse(artist_prediction, m_sel_test)))

User-based Collaborative Filtering RMSE: 0.014135628133673339
Artist-based Collaborative Filtering RMSE: 0.014139934958276132


### Model-Based Collaborative Filtering Approach using Matrix Factorization

In [114]:
# due to time it takes to run, output was saved to csv, and loaded for subsequent analysis
# using entire dataset for sample recommendation list to chosen userID

'''
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(p_sel_user_art.as_matrix(),
                                                                   num_features=5,
                                                                   regularization_amount=0.1)
pred_weight = np.matmul(U, M)
'''

'\nU, M = matrix_factorization_utilities.low_rank_matrix_factorization(p_sel_user_art.as_matrix(),\n                                                                   num_features=5,\n                                                                   regularization_amount=0.1)\npred_weight = np.matmul(U, M)\n'

In [115]:
'''
pred_weight_results = pd.DataFrame(index=p_sel_user_art.index,
                             columns=p_sel_user_art.columns,
                             data=pred_weight)
pred_weight_results.to_csv('pred_weight_results.csv')
pred_weight_results.head(5)
'''

"\npred_weight_results = pd.DataFrame(index=p_sel_user_art.index,\n                             columns=p_sel_user_art.columns,\n                             data=pred_weight)\npred_weight_results.to_csv('pred_weight_results.csv')\npred_weight_results.head(5)\n"

In [21]:
# loading csv file containing matrix factorization results
pred_weight_df = pd.read_csv('pred_weight_results.csv',
                             delimiter=',', header=[0,1],
                             index_col=0, low_memory=False)

pred_weight_df.head(5)

Unnamed: 0_level_0,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight,norm_weight
artistID,1,2,3,4,5,6,7,8,9,10,...,18736,18737,18738,18739,18740,18741,18742,18743,18744,18745
userID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,1.304602e-05,6.39131e-05,1.243568e-06,3.679395e-06,1.425435e-05,0.0001019868,0.001303,0.000334,0.0001564882,9.339306e-05,...,6.875998e-06,4.377771e-06,3.187393e-06,2.960281e-06,2.498227e-06,2.34943e-06,2.29461e-06,2.23979e-06,2.231958e-06,1.610477e-05
3,1.482237e-07,9.258858e-07,1.919372e-08,5.001578e-08,1.79908e-07,1.234436e-06,3.3e-05,4e-06,2.236097e-06,9.321376e-07,...,9.295518e-08,5.918217e-08,4.30897e-08,4.001943e-08,3.377301e-08,3.176145e-08,3.102035e-08,3.027925e-08,3.017338e-08,1.864148e-07
4,4.846747e-06,1.193999e-05,4.260899e-07,5.279614e-07,2.080239e-06,1.551336e-05,0.000122,5.2e-05,6.013529e-05,1.484893e-05,...,1.710689e-06,1.089152e-06,7.929958e-07,7.364924e-07,6.215373e-07,5.845178e-07,5.708791e-07,5.572403e-07,5.552919e-07,2.016107e-06
5,1.638916e-06,8.456203e-06,1.955225e-07,4.92914e-07,1.616318e-06,1.22016e-05,0.000267,3.6e-05,2.740684e-05,1.015651e-05,...,8.813397e-07,5.611263e-07,4.085481e-07,3.794378e-07,3.202134e-07,3.011411e-07,2.941145e-07,2.870879e-07,2.860841e-07,1.757022e-06
6,4.261186e-08,2.746639e-07,5.54462e-09,1.486421e-08,5.581256e-08,4.05324e-07,1e-05,1e-06,5.969183e-07,3.233597e-07,...,2.694114e-08,1.715273e-08,1.248866e-08,1.159881e-08,9.78841e-09,9.205401e-09,8.990608e-09,8.775816e-09,8.745131e-09,6.104573e-08


In [22]:
# asking for userID input to look for recommendations
print("Enter a user_id to get recommendations:")
user_id_to_search = int(input())

Enter a user_id to get recommendations:
3


In [81]:
print("Sample artists listened to by user_id {}:".format(user_id_to_search))

art_userID = sel_user_art_norm[sel_user_art_norm['userID'] == user_id_to_search]
art_userID = art_userID.join(artists, on='artistID')
art_userID = art_userID.sort_values(by=['norm_weight'], ascending=False)

art_userID.head(5)

Sample artists listened to by user_id 3:


Unnamed: 0,userID,artistID,norm_weight,name,url,pictureURL
50,3,101,0.037355,Pleq,http://www.last.fm/music/Pleq,http://userserve-ak.last.fm/serve/252/58414481...
51,3,102,0.001874,Segue,http://www.last.fm/music/Segue,http://userserve-ak.last.fm/serve/252/38568681...
52,3,103,0.001395,Max Richter,http://www.last.fm/music/Max+Richter,http://userserve-ak.last.fm/serve/252/51974589...
53,3,104,0.001219,Celer,http://www.last.fm/music/Celer,http://userserve-ak.last.fm/serve/252/43677713...
54,3,105,0.00114,Pjusk,http://www.last.fm/music/Pjusk,http://userserve-ak.last.fm/serve/252/43518367...


In [76]:
# filtering user preferences into dataframe
user_pref = pred_weight_df.loc[pred_weight_df.index == user_id_to_search].T
user_pref = user_pref.reset_index(level=[0])
user_pref = user_pref.drop(['level_0'], axis=1)
user_pref.columns = ['pred_norm_weight']
user_pref = user_pref.reset_index()
user_pref[['artistID']] = user_pref[['artistID']].astype(int)
user_pref_df = user_pref.set_index(['artistID'])
user_pref_df.head(5)

Unnamed: 0_level_0,pred_norm_weight
artistID,Unnamed: 1_level_1
1,1.482237e-07
2,9.258858e-07
3,1.919372e-08
4,5.001578e-08
5,1.79908e-07


In [83]:
# matching predicted weight to artists
user_pref_art = pd.merge(artists, user_pref_df, how='inner', 
                         left_index=True, right_index=True)
user_pref_art.head(5)

Unnamed: 0_level_0,name,url,pictureURL,pred_norm_weight
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg,1.482237e-07
2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg,9.258858e-07
3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...,1.919372e-08
4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...,5.001578e-08
5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...,1.79908e-07


In [84]:
# filtering artists out artists user is already listening to
# and then recommend top predictions

print("Artists to recommend:")
already_listening = sel_user_art_norm.loc[sel_user_art_norm['userID'] == user_id_to_search,
                                          ['artistID']]
recommend_art = user_pref_art[user_pref_art.index.isin(already_listening) == False]
recommend_art = recommend_art.sort_values(by=['pred_norm_weight'], ascending=False)

recommend_art.head(5)

Artists to recommend:


Unnamed: 0_level_0,name,url,pictureURL,pred_norm_weight
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89,Lady Gaga,http://www.last.fm/music/Lady+Gaga,http://userserve-ak.last.fm/serve/252/47390093...,0.00029
707,Metallica,http://www.last.fm/music/Metallica,http://userserve-ak.last.fm/serve/252/7560709.jpg,0.000224
289,Britney Spears,http://www.last.fm/music/Britney+Spears,http://userserve-ak.last.fm/serve/252/60126439...,0.000213
792,Thalía,http://www.last.fm/music/Thal%C3%ADa,http://userserve-ak.last.fm/serve/252/40337541...,0.000196
378,Evanescence,http://www.last.fm/music/Evanescence,http://userserve-ak.last.fm/serve/252/8403975.jpg,0.000181


In [94]:
mf_rmse = np.sqrt(np.nanmean(np.square(p_sel_user_art.as_matrix(),
                                       pred_weight_df.as_matrix())))
print('User-based MF RMSE: {}'.format(mf_rmse))

User-based MF RMSE: 0.01084039336298635
