In [19]:
import os

In [23]:
def clean_dataset(filename):
    """Strip malformed rows from a tab-separated dataset, replacing it in place.

    Reading the raw file with pandas *should* be a one-liner, but a handful of
    malformed lines trip pandas up. This walks the file one line at a time and
    keeps a row only when all of the following hold:

    * it splits into exactly four tab-separated fields,
    * the fourth field (the play count) passes ``str.isdigit()``,
    * the third field (the artist) is not the literal string '""'.

    Every dropped row is reported on stdout. Carriage returns are removed from
    the rows that are kept.

    The cleaned rows are written to ``filename + ".cleaned"``. Afterwards the
    original file is renamed to ``filename + ".messy"`` and the cleaned copy
    is renamed to ``filename``, so the same path can then be loaded directly.

    Args:
        filename: path of the TSV file to clean.

    Raises:
        OSError: if either file cannot be opened or a rename fails.
    """
    cleaned_name = filename + ".cleaned"

    # Open the input first so a missing file does not leave an empty
    # ".cleaned" file behind, and manage both handles with `with` so the
    # input is closed before it is renamed below.
    with open(filename) as source, open(cleaned_name, "w") as output:
        for i, line in enumerate(source):
            tokens = line.strip().split("\t")
            if len(tokens) != 4:
                print("wrong # of tokens", i)
                continue

            if not tokens[3].isdigit():
                print("non integer play count", i)
                continue

            if tokens[2] == '""':
                print("invalid artist id", tokens[2])
                continue

            # some lines contain carriage returns (without newlines), which
            # randomly messes pandas up
            line = line.replace('\r', '')

            output.write(line)

    # Keep the raw data around as "<filename>.messy" and let the cleaned
    # copy take over the original path.
    os.rename(filename, filename + ".messy")
    os.rename(cleaned_name, filename)

In [24]:
# Clean the raw Last.fm TSV in place: the original file is kept next to it
# with a ".messy" suffix and the cleaned rows take over this path.
clean_dataset('./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv')

invalid artist id ""
invalid artist id ""
non integer play count 16890410


In [25]:
import pandas as pd

# Load the cleaned, tab-separated file. Only columns 0, 2 and 3 are read
# (column 1 is skipped) and they are named explicitly.
data = pd.read_csv(
    "./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv",
    sep="\t",
    usecols=[0, 2, 3],
    names=["user", "artist", "plays"],
)

In [26]:
# Peek at the first five rows to confirm the columns were parsed as expected.
data.head(5)

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [27]:
# Turn the user and artist columns into pandas categoricals, which gives every
# distinct value an integer code (exposed through `.cat.codes`).
for column in ("user", "artist"):
    data[column] = data[column].astype("category")

In [9]:
# Check the column dtypes of the frame.
data.dtypes

user      category
artist    category
plays        int64
dtype: object

In [10]:
from scipy.sparse import coo_matrix

In [28]:
# Build a sparse artist x user matrix of play counts: row indices are the
# artist category codes, column indices are the user category codes, and the
# stored values are the play counts cast to float.
plays = coo_matrix(
    (
        data["plays"].astype(float),
        (data["artist"].cat.codes, data["user"].cat.codes),
    )
)

In [44]:
# Columns are indexed by user code, so this is the number of stored entries
# (artists with a play count) for the user whose code is 1.
plays.getcol(1).nnz

51

In [45]:
# Sum of all play counts stored in the column of the user whose code is 1.
plays.getcol(1).sum()

119921.0

In [46]:
from implicit.nearest_neighbours import bm25_weight
import scipy

In [49]:
from scipy.sparse.linalg import svds, eigs

In [54]:
# Apply bm25_weight with its default parameters; the result is only displayed
# here, not assigned to a name.
bm25_weight(plays)

<292363x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 17535577 stored elements in COOrdinate format>

In [50]:
# Rank-50 truncated SVD of the BM25-weighted artist x user matrix. svds
# returns (u, s, vt): u holds one 50-dimensional row per artist, vt holds one
# 50-dimensional column per user, and the singular values s are discarded.
artist_factors, _, user_factors = svds(bm25_weight(plays), k=50)

In [57]:
# Weight the play matrix again, this time passing explicit K1 and B values
# rather than relying on the defaults, and keep the result for later use.
plays_bm25 = bm25_weight(plays, K1=100, B=0.8)

<292363x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 17535577 stored elements in COOrdinate format>