In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import scipy.sparse as sparse

from implicit.als import AlternatingLeastSquares

In [2]:
raw_data = pd.read_table('usersha1-artmbid-artname-plays.tsv')

In [3]:
raw_data = raw_data.drop(raw_data.columns[1], axis=1)
raw_data.columns = ['user', 'artist', 'plays']

In [4]:
raw_data.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691


In [5]:
data = raw_data.dropna()

In [6]:
data['user_id'] = data['user'].astype("category").cat.codes
data['artist_id'] = data['artist'].astype("category").cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [7]:
data.head()

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,90933
1,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,0,185367
2,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,0,106704
3,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,0,155241
4,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,0,220128


In [8]:
item_lookup = data[['artist_id', 'artist']].drop_duplicates()
item_lookup['artist_id'] = item_lookup.artist_id.astype(str)

In [9]:
item_lookup.head()

Unnamed: 0,artist_id,artist
0,90933,die Ärzte
1,185367,melissa etheridge
2,106704,elvenking
3,155241,juliette & the licks
4,220128,red hot chili peppers


In [10]:
artist_id_name = {}

for index, row in tqdm_notebook(item_lookup.iterrows()):
    artist_id_name[row.artist_id] = row.artist




In [33]:
list(artist_id_name.keys())[0]

'90933'

In [11]:
data = data.drop(['user', 'artist'], axis=1)

In [12]:
data.head()

Unnamed: 0,plays,user_id,artist_id
0,1099,0,90933
1,897,0,185367
2,717,0,106704
3,706,0,155241
4,691,0,220128


In [13]:
data = data.loc[data.plays != 0]

In [14]:
data.head()

Unnamed: 0,plays,user_id,artist_id
0,1099,0,90933
1,897,0,185367
2,717,0,106704
3,706,0,155241
4,691,0,220128


In [15]:
users = list(np.sort(data.user_id.unique()))
artists = list(np.sort(data.artist_id.unique()))
plays = list(data.plays)

In [16]:
users[:5]

[0, 1, 2, 3, 4]

In [17]:
artists[:5]

[0, 1, 2, 3, 4]

In [18]:
plays[:5]

[1099, 897, 717, 706, 691]

In [40]:
len(users)

358868

In [41]:
len(artists)

292363

In [20]:
rows = data.user_id.astype(int)
cols = data.artist_id.astype(int)

In [43]:
data_sparse = sparse.csr_matrix((plays, (cols, rows)), shape=(len(artists), len(users)))

In [44]:
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

100%|██████████| 15.0/15 [01:36<00:00,  6.50s/it]


In [45]:
userid = 0

user_items = data_sparse.T.tocsr()
recommendations = model.recommend(userid, user_items)

In [46]:
recommendations

[(161833, 1.3764983),
 (255208, 1.3743109),
 (107103, 1.3316061),
 (255123, 1.2555544),
 (115417, 1.2264864),
 (12523, 1.2152299),
 (165921, 1.2119308),
 (108566, 1.2072706),
 (116811, 1.192959),
 (199270, 1.1789632)]

In [None]:
for r in recommendations:
    print(artist_id_name[str(r[])])

In [25]:
itemid = 107209
related = model.similar_items(itemid)

In [26]:
related

[(107209, 0.054393966),
 (86783, 0.04514241),
 (353295, 0.043349113),
 (113415, 0.043286446),
 (320319, 0.043155722),
 (325431, 0.042306796),
 (354240, 0.041874744),
 (234786, 0.04185298),
 (293190, 0.04178462),
 (160573, 0.041594908)]

In [34]:
for a in related:
    print(artist_id_name[str(a[0])])

eminem
deathound


KeyError: '353295'

In [37]:
artist_id_name['234786']

'sick on the bus'