# Last.fm music dataset recommender
## Preprocessing the dataset

In [1]:
import numpy as np 
import pandas as pd 
import sys
import gc

In [2]:
# Load the tab-separated play log, keeping only columns 0, 2 and 3 of the
# file and naming them user / artist / plays.
data = pd.read_csv(
    "../lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv",
    sep="\t",
    usecols=[0, 2, 3],
    names=['user', 'artist', 'plays'],
)

In [3]:
# Turn the identifier columns into categoricals so that every distinct user
# and artist is backed by an integer code (exposed through `.cat.codes`).
for column in ('user', 'artist'):
    data[column] = data[column].astype("category")

In [5]:
# Show the dtype of every column to confirm the categorical conversion.
data.dtypes

user      category
artist    category
plays        int64
dtype: object

In [6]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [38]:
# Display the rows whose play count is strictly below 20.
# NOTE(review): the filter cell keeps `plays > 20`, so rows with exactly 20
# plays are not covered by this `< 20` check — confirm the intended threshold.
# NOTE(review): the execution count is out of sequence and the recorded output
# is empty, so this was presumably run after the filter — confirm.
data[data.plays <20]

Unnamed: 0,user,artist,plays


In [8]:
# Keep only the rows with strictly more than 20 plays
# (rows with exactly 20 plays are discarded as well).
data = data.loc[data['plays'] > 20]

In [9]:
# Summarise the index range, column dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14678583 entries, 0 to 17535629
Data columns (total 3 columns):
user      category
artist    category
plays     int64
dtypes: category(2), int64(1)
memory usage: 360.9 MB


In [10]:
from scipy.sparse import coo_matrix, csr_matrix

In [64]:
# Assemble the (user, artist, plays) triples into a sparse COO matrix:
# rows are user category codes, columns are artist category codes, and the
# stored values are the play counts cast to float.
plays = coo_matrix(
    (
        data.plays.astype(float),
        (data.user.cat.codes, data.artist.cat.codes),
    )
)

In [65]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
plays

<358868x292363 sparse matrix of type '<class 'numpy.float64'>'
	with 14678583 stored elements in COOrdinate format>

# LightFM

In [16]:
from lightfm import LightFM
import sys
sys.path.append('../')
import helpers

In [13]:
import gc

In [66]:
# Force a full garbage-collection pass; the value shown is the number of
# unreachable objects the collector found.
gc.collect()

3873

In [15]:
import scipy.sparse as sp
# Convert the play-count matrix to Compressed Sparse Row format.
plays_csr = plays.tocsr()

In [73]:
# Split the user x artist matrix into train and test parts with the project
# helper; it returns both matrices plus a third value bound to `uid_test`.
# NOTE(review): presumably `split_count` interactions are held out for a
# `fraction` of users — confirm against helpers.train_test_split.
train, test, uid_test = helpers.train_test_split(
    plays_csr,
    split_count=1,
    fraction=0.2,
)

In [74]:
# Report the dimensions of both splits, one per line.
print(
    'train shape:{}'.format(train.shape),
    'test shape:{}'.format(test.shape),
    sep='\n',
)

train shape:(358868, 292363)
test shape:(358868, 292363)


In [67]:
# Rebuild the CSR view of the COO play-count matrix.
plays_csr = csr_matrix(plays)

In [75]:
# Select row 4 of the CSR matrix; rows are indexed by user category code
# in the COO construction, so this is a single user's row.
plays_csr[4,]

<1x292363 sparse matrix of type '<class 'numpy.float64'>'
	with 50 stored elements in Compressed Sparse Row format>

In [76]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

In [None]:
# Fit a WARP-loss LightFM model on the training interactions
# (rows = users, columns = artists) and score it at k=10.
# random_state pins the model's random initialisation and sampling so that
# reruns of this cell give comparable numbers.
model = LightFM(learning_rate=0.01, loss='warp', random_state=42)
model.fit(train, epochs=10)

# Training-set metrics: how well the model ranks interactions it was fit on.
train_precision = precision_at_k(model, train, k=10).mean()
train_recall = recall_at_k(model, train, k=10).mean()

# Test-set metrics: pass the training matrix as `train_interactions` so that
# artists already known from training are left out of each user's ranking.
# Without this, known positives compete for the top-k slots and depress the
# held-out scores.
# NOTE(review): assumes helpers.train_test_split returns disjoint train/test
# matrices; LightFM raises a ValueError if they overlap — confirm.
test_precision = precision_at_k(model, test, train_interactions=train, k=10).mean()
test_recall = recall_at_k(model, test, train_interactions=train, k=10).mean()

In [None]:
print('train recall:{} @k=10'.format(train_recall))