# Play around music datasets 
- [blog](http://www.benfrederickson.com/distance-metrics/)
- lastfm datasets can be found [here](http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html)

In [20]:
import os 
import numpy as np 

In [2]:
def clean_dataset(filename):
    """ so - i lied a little in the post about it being a one line operation
    to read in the dataset with pandas.

    it *should* be a one line operation, but there are a bunch of malformed
    lines in the dataset that trips up pandas. So lets read in the thing one
    line at a time, and strip out the bad data. After this runs it will be a
    one-liner to read in. honest this time """


    with open(filename + ".cleaned", "w",encoding='utf8') as output:
        for i, line in enumerate(open(filename,'r',encoding='utf8')):
            tokens = line.strip().split("\t")
            if len(tokens) != 4:
                print("wrong # of tokens", i)
                continue


            if not tokens[3].isdigit():
                print("non integer play count", i)
                continue


            if tokens[2] == '""':
                print("invalid artist id", tokens[2])
                continue


            # some lines contain carriage returns (without newlines), which
            # randomly messes pandas up
            line = line.replace('\r', '')
#             print(line)


            output.write(line)


    os.rename(filename, filename + ".messy")
    os.rename(filename + ".cleaned", filename)

In [3]:
clean_dataset('./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv')

FileExistsError: [WinError 183] 當檔案已存在時，無法建立該檔案。: './lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv' -> './lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv.messy'

In [1]:
import pandas as pd 
data = pd.read_table("./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'])

In [125]:
data.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [134]:
temp = data[data['artist'].str.contains('bon jovi')]
# temp['artist'].cat.codes ## 52978

In [4]:
data.head(5)

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [2]:
# map each artist and user to a unique numeric value
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")

In [3]:
data.dtypes

user      category
artist    category
plays        int64
dtype: object

In [4]:
from scipy.sparse import coo_matrix

In [5]:
# create a sparse matrix of all the artist/user/play triples
plays = coo_matrix((data['plays'].astype(float), 
                   (data['artist'].cat.codes, 
                    data['user'].cat.codes)))

In [6]:
plays_T = plays.T

In [11]:
plays.getcol(1).sum()

119921.0

_____

# Distance Metric
## KNN 

In [10]:
## user - artist matrix
plays_T

<358868x292363 sparse matrix of type '<class 'numpy.float64'>'
	with 17535652 stored elements in COOrdinate format>

In [11]:
from scipy.sparse import csr_matrix 
from sklearn.preprocessing import normalize

In [13]:
plays_t_csr = csr_matrix(plays_T)

In [14]:
from KNNmodel import *

In [16]:
model_i = KNNmodel(plays_t_csr,kind='ibcf')
%time model_i.jaccard_sim()
# model_i.fit(topK=100,remove=True)

similarity (jaccard) matrix built (ibcf), 
sparsity of similarity: 0.22 %
Wall time: 4min 9s


In [18]:
model_i.fit() ## 30 min at public(at least), 15GB memory cost..... not a good way 

  0%|                               | 909/292363 [06:16<33:30:43,  2.42items/s]

KeyboardInterrupt: 

上面方法極大缺點
- 記憶體開銷太大
- 效能太差

In [19]:
def jaccard(plays):
    plays = csr_matrix(plays)
    plays.data = np.ones(len(plays.data))

    rows_sum = plays.getnnz(axis=1).astype('int16')  #
    ab = plays.dot(plays.T).astype('float16') # mat x t(mat)        
    # for rows
    aa = np.repeat(rows_sum, ab.getnnz(axis=1))
    # for columns
    bb = rows_sum[ab.indices]

    similarities = ab.tocoo(copy=True)
    similarities.data /= (aa + bb - ab.data)
    return similarities

In [20]:
%time sim_jac = jaccard(plays)



Wall time: 1min 9s


In [15]:
import gc
# del model_i
gc.collect()

113

## ANN

In [14]:
import annoy 

### 暴力法 

如果把每個用戶聽過的歌手紀錄視為用戶向量
- user_vec
- 測試樂手之間的最近相似度

In [32]:
#  artist x users
plays

<292363x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 17535577 stored elements in COOrdinate format>

In [60]:
from tqdm import tqdm

In [62]:
treecount = 20
plays_csr = csr_matrix(plays)
f = plays.shape[1] # length of artist vector
index = annoy.AnnoyIndex(f)
for i in tqdm(range(plays.shape[0])):
    v = plays_csr[i].A.ravel()
    index.add_item(i,v)
index.build(treecount)

# app_top_related.get_related(10)

  3%|▉                                 | 8466/292363 [08:45<4:53:53, 16.10it/s]

KeyboardInterrupt: 

不可思議的緩慢= =

In [63]:
del index

In [65]:
gc.collect()

0

___

# 降維

- SVD     
- ALS
- learning to rank

### SVD

In [7]:
from implicit.nearest_neighbours import bm25_weight
from scipy.sparse.linalg import svds, eigs
import scipy

對原始資料直接svd下去

In [8]:
%time artist_factors, _, user_factors = svds(plays, 50)

Wall time: 43.5 s


In [30]:
user_factors = user_factors.T

最近鄰樂手

In [10]:
artist_id_to_name = {}
artist_name_to_id = {}
temp = data['artist'][:100]

In [37]:
user_id_to_name = {}
user_name_to_id = {}

In [11]:
for name,idx in zip(data['artist'],data['artist'].cat.codes):
    artist_id_to_name[idx] = name
    artist_name_to_id[name] = idx

In [38]:
for name,idx in zip(data['user'], data['user'].cat.codes):
    user_id_to_name[idx] = name
    user_name_to_id[name] = idx

In [12]:
class ApproximateTopRelated(object):
    def __init__(self, artist_factors, treecount=20):
        index = annoy.AnnoyIndex(artist_factors.shape[1], 'angular')
        for i, row in enumerate(artist_factors):
            index.add_item(i, row)
        index.build(treecount)
        self.index = index

    def get_related(self, artistid, N=10):
        neighbours = self.index.get_nns_by_item(artistid, N)
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                      for other in neighbours), key=lambda x: -x[1])

In [16]:
%time music_related_approx = ApproximateTopRelated(artist_factors)

Wall time: 48.3 s


In [17]:
music_related_approx.get_related(10)

[(10, 1.0),
 (247053, 0.6462267637252808),
 (236128, 0.439098596572876),
 (240000, 0.42191368341445923),
 (152707, 0.41935133934020996),
 (173284, 0.4040932059288025),
 (159423, 0.4029509425163269),
 (287434, 0.40109145641326904),
 (286940, 0.40109091997146606),
 (92895, 0.39022666215896606)]

In [18]:
def print_related_artist(nameid,related_approx):
    print('樂手:{}'.format(artist_id_to_name[nameid]))
    print('===='*10)
    for idx, score in related_approx:
        print('相似樂手:{},\n\t分數:{:.2f}'.format(artist_id_to_name[idx],score))

In [19]:
artistid = artist_name_to_id['bon jovi'] ## bon jovi
print_related_artist(artistid,music_related_approx.get_related(artistid))

樂手:bon jovi
相似樂手:bon jovi,
	分數:1.00
相似樂手:bon jovi/jennifer nettles,
	分數:0.62
相似樂手:the republic,
	分數:0.61
相似樂手:hiroshi miyauchi,
	分數:0.60
相似樂手:cc productions,
	分數:0.56
相似樂手:hans harden,
	分數:0.54
相似樂手:trøste og bære,
	分數:0.54
相似樂手:dænsebændet,
	分數:0.54
相似樂手:torgeir og kjendisene,
	分數:0.53
相似樂手:kay rush,
	分數:0.53


要看用戶的推薦清單，可以使用微軟研究出的trick

* [xbox推薦的trick](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf)

<p>Basically we add a nomalizing factor to each item vector - making their distances equal with each other. Then when we query with a user vector, we add a 0 to the end, and the result is proportional to the inner producct of the user and item vectors. This is a sneaky way to do an aproximate maximum inner product search.

In [None]:
class Approx_Trick_related:
    def __init__(self, ):
        pass
    def get_related(self, artistid, N=10):
        neighbours = self.index.get_nns_by_item(artistid, N)
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                      for other in neighbours), key=lambda x: -x[1])

In [22]:
norms = np.linalg.norm(artist_factors, axis=1)
max_norm = norms.max()
extra_dimension = np.sqrt(max_norm ** 2 - norms ** 2)
norm_data = np.append(
    artist_factors, extra_dimension.reshape(norms.shape[0], 1), axis=1)

In [23]:
f_member = norm_data.shape[1]
t_member = annoy.AnnoyIndex(f_member,'angular')  # Length of item vector that will be indexed

for i in range(norm_data.shape[0]):
    v = norm_data[i]
    t_member.add_item(i, v)

t_member.build(20)

0 complete
1000 complete
2000 complete
3000 complete
4000 complete
5000 complete
6000 complete
7000 complete
8000 complete
9000 complete
10000 complete
11000 complete
12000 complete
13000 complete
14000 complete
15000 complete
16000 complete
17000 complete
18000 complete
19000 complete
20000 complete
21000 complete
22000 complete
23000 complete
24000 complete
25000 complete
26000 complete
27000 complete
28000 complete
29000 complete
30000 complete
31000 complete
32000 complete
33000 complete
34000 complete
35000 complete
36000 complete
37000 complete
38000 complete
39000 complete
40000 complete
41000 complete
42000 complete
43000 complete
44000 complete
45000 complete
46000 complete
47000 complete
48000 complete
49000 complete
50000 complete
51000 complete
52000 complete
53000 complete
54000 complete
55000 complete
56000 complete
57000 complete
58000 complete
59000 complete
60000 complete
61000 complete
62000 complete
63000 complete
64000 complete
65000 complete
66000 complete
67000 co

True

In [44]:
data[data['user'] == user_id_to_name[user_id]].head(100)

Unnamed: 0,user,artist,plays
506,000163263d2a41a3966a3746855b8b75b7d7aa83,david & the citizens,2373
507,000163263d2a41a3966a3746855b8b75b7d7aa83,kent,1724
508,000163263d2a41a3966a3746855b8b75b7d7aa83,säkert!,798
509,000163263d2a41a3966a3746855b8b75b7d7aa83,detektivbyrån,753
510,000163263d2a41a3966a3746855b8b75b7d7aa83,lars winnerbäck,750
511,000163263d2a41a3966a3746855b8b75b7d7aa83,raymond & maria,737
512,000163263d2a41a3966a3746855b8b75b7d7aa83,coldplay,639
513,000163263d2a41a3966a3746855b8b75b7d7aa83,placebo,626
514,000163263d2a41a3966a3746855b8b75b7d7aa83,snow patrol,611
515,000163263d2a41a3966a3746855b8b75b7d7aa83,tegan and sara,606


In [43]:
user_id = 10
topn = 100 
rec_item_for_uid = t_member.get_nns_by_vector(np.append(user_factors[user_id],0),topn)

print('\n'.join([artist_id_to_name[artistid] for artistid in rec_item_for_uid]))

coldplay
death cab for cutie
depeche mode
oasis
muse
arctic monkeys
red hot chili peppers
the killers
placebo
bright eyes
the cure
bloc party
metallica
sufjan stevens
bruce springsteen & the e street band
interpol
elliott smith
tegan and sara
modest mouse
fall out boy
the strokes
kent
sigur rós
snow patrol
radiohead
brand new
jack johnson
the kooks
kings of leon
madonna
the shins
franz ferdinand
paramore
jimmy eat world
bruce springsteen
my chemical romance
the libertines
foo fighters
iron & wine
arcade fire
the postal service
daft punk
the decemberists
moby
keane
björk
garbage
incubus
blink-182
the white stripes
kanye west
rilo kiley
stars
damien rice
beck
john mayer
dashboard confessional
air
metric
feist
the used
of montreal
travis
johnny cash
jason mraz
mando diao
david bowie
joy division
regina spektor
manic street preachers
blur
kaiser chiefs
röyksopp
broken social scene
babyshambles
the national
ryan adams
editors
minus the bear
the mountain goats
explosions in the sky
maxïmo pa

### ALS

### lightfm

* bm25
* tfidf

# Distance Metric

## set base 
- jaccard

In [75]:
# create a dictionary of artist name to the set of their users
artist_sets = dict((artist, set(users)) for artist, users in data.groupby('artist')['user'])

In [76]:
def overlap(a, b):
    return len(a.intersection(b))

In [77]:
def jaccard(a, b):
    intersection = float(len(a.intersection(b)))
    return intersection / (len(a) + len(b) - intersection)

In [36]:
fan_jayz = artist_sets.get('jay-z')
fan_coldplay = artist_sets.get('coldplay')
fan_kanyeWest = artist_sets.get('kanye west')

In [38]:
overlap(fan_jayz,fan_coldplay) # overlapy fans of jayz and coldplay

2297

In [39]:
overlap(fan_coldplay,fan_kanyeWest)

8061

In [46]:
print('similarity (kanyeWest, coldplay):{:.3f}'.format(jaccard(fan_kanyeWest,fan_coldplay)))
print('similarity (jayz, coldplay):{:.3f}'.format(jaccard(fan_jayz,fan_coldplay)))

similarity (kanyeWest, coldplay):0.094
similarity (jayz, coldplay):0.030


In [72]:
fan_lennon = artist_sets.get('john lennon')
fan_beatles = artist_sets.get('beatles')
print('similarity (John Lennon, Beatles) :{:.3f}'.format(jaccard(fan_lennon,fan_beatles)))

NameError: name 'jaccard' is not defined

## count metters
- cosine 

In [10]:
from collections import defaultdict
from scipy.sparse import csr_matrix

Computing this angle is pretty easy. The first step is to represent each artist as a sparse vector of the play counts for each user:

In [11]:
import numpy as np 


In [17]:
# map each username to a unique numeric value
userids = defaultdict(lambda: len(userids))
data['userid'] = data['user'].map(userids.__getitem__)

# map each artist to a sparse vector of their users
artists = dict((artist, csr_matrix(
                (group['plays'], (np.zeros(len(group)), group['userid'])),
                shape=[1, len(userids)]))
        for artist, group in data.groupby('artist'))

In [50]:
def cosine(a, b):
    return np.dot(a, b.T)[0, 0] / (norm2(a) * norm2(b))

def norm2(v):
    return np.sqrt((v.data ** 2).sum())

In [120]:
print(repr(artists['radiohead']))
print(repr(artists['thom yorke']))

<1x358868 sparse matrix of type '<class 'numpy.int64'>'
	with 77254 stored elements in Compressed Sparse Row format>
<1x358868 sparse matrix of type '<class 'numpy.int64'>'
	with 5617 stored elements in Compressed Sparse Row format>


In [127]:
print('similarity between radiohead, thom yorke:{:.2f}'.format(cosine(artists['radiohead'],artists['thom yorke'])))
print('similarity between radiohead, samarah:{:.2f}'.format(cosine(artists['radiohead'],artists['samarah'])))

similarity between radiohead, thom yorke:0.45
similarity between radiohead, samarah:0.40


In [130]:
SMOOTHING = 20

def smoothed_cosine(a, b):
    # calculate set intersection by converting to binary and taking the dot product
    overlap = np.dot(binarize(a), binarize(b).T)[0, 0]

    # smooth cosine by discounting by set intersection
    return (overlap / (SMOOTHING + overlap)) * cosine(a, b)

def binarize(artist):
    ret = csr_matrix(artist)
    ret.data  = np.ones(len(artist.data))
    return ret

In [131]:
print('smooth-similarity between radiohead, thom yorke:{:.2f}'.format(smoothed_cosine(artists['radiohead'],artists['thom yorke'])))
print('smooth-similarity between radiohead, samarah:{:.2f}'.format(smoothed_cosine(artists['radiohead'],artists['samarah'])))

smooth-similarity between radiohead, thom yorke:0.45
smooth-similarity between radiohead, samarah:0.02


## Method for Information retrival
- tfidf 
- bm25

In [18]:
# calculate IDF for each user
N = len(artists)
idf = [1. + np.log(N / (1. + p)) for p in data.groupby('userid').size()]

In [51]:
# weights a sparse vector by tfidf
def tfidf_weight(artist, idf):
    ret = csr_matrix(artist)
    ret.data = np.array([np.sqrt(plays) * idf[userid] for plays, userid in zip(artist.data, artist.indices)])
    return ret

# tfidf distance is just the cosine between tfidf weighted vectors
def tfidf(a, b, idf):
    return cosine(tfidf_weight(a, idf), tfidf_weight(b, idf))

In [28]:
tfidf_weight(artists['coldplay'],idf)

<1x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [54]:
print('tfidf-cos similarity between radiohead, thom yorke:{:.2f}'.format(tfidf(artists['radiohead'],artists['thom yorke'],idf)))
print('tfidf-cos similarity between radiohead, samarah:{:.2f}'.format(tfidf(artists['radiohead'],artists['samarah'],idf)))

# tfidf(artists['radiohead'],artists['samarah'],idf)

tfidf-cos similarity between radiohead, thom yorke:0.35
tfidf-cos similarity between radiohead, samarah:0.06


* bm25

In [55]:
def bm25_tf_weight(plays):
    return plays * (K1 + 1.0) / (K1 + plays)

In [68]:
K1 = 1.2; B=10
def bm25(a, b, idf, average_plays):
    return np.dot(bm25_weight(a, idf, average_plays),
               bm25_weight(b, idf, average_plays).T)[0, 0]


def bm25_weight(artist, idf, average_plays):
    ret = csr_matrix(artist)
    length_norm = (1.0 - B) + B * artist.sum() / average_plays
    ret.data = np.array([(plays * (K1 + 1.0) / (K1 * length_norm + plays)) * idf[userid]
                      for plays, userid in zip(artist.data, artist.indices)])
    return ret 

In [69]:
bm25(artists['coldplay'],artists['thom yorke'],idf=idf, average_plays=10)

0.0047395542942066599

In [82]:
from sklearn.preprocessing import normalize

In [84]:
normalized = normalize(plays)
cos_sim = normalized.dot(normalized.T)