# Play around music datasets 
- [blog](http://www.benfrederickson.com/distance-metrics/)
- lastfm datasets can be found [here](http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html)

In [1]:
import os

In [11]:
def clean_dataset(filename):
    """ so - i lied a little in the post about it being a one line operation
    to read in the dataset with pandas.

    it *should* be a one line operation, but there are a bunch of malformed
    lines in the dataset that trips up pandas. So lets read in the thing one
    line at a time, and strip out the bad data. After this runs it will be a
    one-liner to read in. honest this time """


    with open(filename + ".cleaned", "w",encoding='utf8') as output:
        for i, line in enumerate(open(filename,'r',encoding='utf8')):
            tokens = line.strip().split("\t")
            if len(tokens) != 4:
                print("wrong # of tokens", i)
                continue


            if not tokens[3].isdigit():
                print("non integer play count", i)
                continue


            if tokens[2] == '""':
                print("invalid artist id", tokens[2])
                continue


            # some lines contain carriage returns (without newlines), which
            # randomly messes pandas up
            line = line.replace('\r', '')
#             print(line)


            output.write(line)


    os.rename(filename, filename + ".messy")
    os.rename(filename + ".cleaned", filename)

In [12]:
clean_dataset('./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv')

invalid artist id ""
invalid artist id ""
non integer play count 16890410


In [1]:
import pandas as pd 
data = pd.read_table("./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'])

In [2]:
data.head(5)

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [2]:
# map each artist and user to a unique numeric value
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")

In [4]:
data.dtypes

user      category
artist    category
plays        int64
dtype: object

In [4]:
from scipy.sparse import coo_matrix

In [5]:
# create a sparse matrix of all the artist/user/play triples
plays = coo_matrix((data['plays'].astype(float), 
                   (data['artist'].cat.codes, 
                    data['user'].cat.codes)))

In [6]:
plays

<292363x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 17535652 stored elements in COOrdinate format>

In [20]:
plays.getcol(1).sum()

119921.0

In [7]:
from implicit.nearest_neighbours import bm25_weight
import scipy

In [8]:
from scipy.sparse.linalg import svds, eigs

In [50]:
artist_factors, _, user_factors = svds(bm25_weight(plays), 50)

In [15]:
plays_bm25 = bm25_weight(plays)

In [16]:
print(plays_bm25.getcol(1).sum())
print(plays_bm25.getcol(1).nnz)

16293.7177021
51


# Distance Metrric
## KNN 

In [7]:
from scipy.sparse import csr_matrix 
from sklearn.preprocessing import normalize
plays_csr = csr_matrix(plays)
normalized = normalize(plays)
sim_cos = normalized.dot(normalized.T)

In [14]:
import numpy as np

In [21]:
plays.getnnz(axis=1).shape

(292363,)

In [22]:
def jaccard(plays):
    plays = csr_matrix(plays)
    plays.data = np.ones(len(plays.data))

    rows_sum = plays.getnnz(axis=1).astype('int16')  #
    ab = plays.dot(plays.T).astype('float16') # mat x t(mat)        
    # for rows
    aa = np.repeat(rows_sum, ab.getnnz(axis=1))
    # for columns
    bb = rows_sum[ab.indices]

    similarities = ab.tocoo(copy=True)
    similarities.data /= (aa + bb - ab.data)
    return similarities

In [23]:
%time jaccard(plays)

Wall time: 1min 4s




<292363x292363 sparse matrix of type '<class 'numpy.float16'>'
	with 191494261 stored elements in COOrdinate format>

In [25]:
from KNNmodel import *

In [26]:
from scipy.sparse import csr_matrix

In [28]:
model_i = KNNmodel(csr_matrix(plays.T),kind='ibcf')
%time model_i.jaccard_sim()
# model_i.fit(topK=100,remove=True)

  similarities.data /= (aa + bb - ab.data)


similarity (jaccard) matrix built (ibcf), 
sparsity of similarity: 0.22 %
Wall time: 2min 50s


In [35]:
sim = model_i.sim

In [None]:
from sklearn.preprocessing import 

# Distance Metric

## set base 
- jaccard

In [75]:
# create a dictionary of artist name to the set of their users
artist_sets = dict((artist, set(users)) for artist, users in data.groupby('artist')['user'])

In [76]:
def overlap(a, b):
    return len(a.intersection(b))

In [77]:
def jaccard(a, b):
    intersection = float(len(a.intersection(b)))
    return intersection / (len(a) + len(b) - intersection)

In [36]:
fan_jayz = artist_sets.get('jay-z')
fan_coldplay = artist_sets.get('coldplay')
fan_kanyeWest = artist_sets.get('kanye west')

In [38]:
overlap(fan_jayz,fan_coldplay) # overlapy fans of jayz and coldplay

2297

In [39]:
overlap(fan_coldplay,fan_kanyeWest)

8061

In [46]:
print('similarity (kanyeWest, coldplay):{:.3f}'.format(jaccard(fan_kanyeWest,fan_coldplay)))
print('similarity (jayz, coldplay):{:.3f}'.format(jaccard(fan_jayz,fan_coldplay)))

similarity (kanyeWest, coldplay):0.094
similarity (jayz, coldplay):0.030


In [72]:
fan_lennon = artist_sets.get('john lennon')
fan_beatles = artist_sets.get('beatles')
print('similarity (John Lennon, Beatles) :{:.3f}'.format(jaccard(fan_lennon,fan_beatles)))

NameError: name 'jaccard' is not defined

## count metters
- cosine 

In [10]:
from collections import defaultdict
from scipy.sparse import csr_matrix

Computing this angle is pretty easy. The first step is to represent each artist as a sparse vector of the play counts for each user:

In [11]:
import numpy as np 


In [17]:
# map each username to a unique numeric value
userids = defaultdict(lambda: len(userids))
data['userid'] = data['user'].map(userids.__getitem__)

# map each artist to a sparse vector of their users
artists = dict((artist, csr_matrix(
                (group['plays'], (np.zeros(len(group)), group['userid'])),
                shape=[1, len(userids)]))
        for artist, group in data.groupby('artist'))

In [50]:
def cosine(a, b):
    return np.dot(a, b.T)[0, 0] / (norm2(a) * norm2(b))

def norm2(v):
    return np.sqrt((v.data ** 2).sum())

In [120]:
print(repr(artists['radiohead']))
print(repr(artists['thom yorke']))

<1x358868 sparse matrix of type '<class 'numpy.int64'>'
	with 77254 stored elements in Compressed Sparse Row format>
<1x358868 sparse matrix of type '<class 'numpy.int64'>'
	with 5617 stored elements in Compressed Sparse Row format>


In [127]:
print('similarity between radiohead, thom yorke:{:.2f}'.format(cosine(artists['radiohead'],artists['thom yorke'])))
print('similarity between radiohead, samarah:{:.2f}'.format(cosine(artists['radiohead'],artists['samarah'])))

similarity between radiohead, thom yorke:0.45
similarity between radiohead, samarah:0.40


In [130]:
SMOOTHING = 20

def smoothed_cosine(a, b):
    # calculate set intersection by converting to binary and taking the dot product
    overlap = np.dot(binarize(a), binarize(b).T)[0, 0]

    # smooth cosine by discounting by set intersection
    return (overlap / (SMOOTHING + overlap)) * cosine(a, b)

def binarize(artist):
    ret = csr_matrix(artist)
    ret.data  = np.ones(len(artist.data))
    return ret

In [131]:
print('smooth-similarity between radiohead, thom yorke:{:.2f}'.format(smoothed_cosine(artists['radiohead'],artists['thom yorke'])))
print('smooth-similarity between radiohead, samarah:{:.2f}'.format(smoothed_cosine(artists['radiohead'],artists['samarah'])))

smooth-similarity between radiohead, thom yorke:0.45
smooth-similarity between radiohead, samarah:0.02


## Method for Information retrival
- tfidf 
- bm25

In [18]:
# calculate IDF for each user
N = len(artists)
idf = [1. + np.log(N / (1. + p)) for p in data.groupby('userid').size()]

In [51]:
# weights a sparse vector by tfidf
def tfidf_weight(artist, idf):
    ret = csr_matrix(artist)
    ret.data = np.array([np.sqrt(plays) * idf[userid] for plays, userid in zip(artist.data, artist.indices)])
    return ret

# tfidf distance is just the cosine between tfidf weighted vectors
def tfidf(a, b, idf):
    return cosine(tfidf_weight(a, idf), tfidf_weight(b, idf))

In [28]:
tfidf_weight(artists['coldplay'],idf)

<1x358868 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [54]:
print('tfidf-cos similarity between radiohead, thom yorke:{:.2f}'.format(tfidf(artists['radiohead'],artists['thom yorke'],idf)))
print('tfidf-cos similarity between radiohead, samarah:{:.2f}'.format(tfidf(artists['radiohead'],artists['samarah'],idf)))

# tfidf(artists['radiohead'],artists['samarah'],idf)

tfidf-cos similarity between radiohead, thom yorke:0.35
tfidf-cos similarity between radiohead, samarah:0.06


* bm25

In [55]:
def bm25_tf_weight(plays):
    return plays * (K1 + 1.0) / (K1 + plays)

In [68]:
K1 = 1.2; B=10
def bm25(a, b, idf, average_plays):
    return np.dot(bm25_weight(a, idf, average_plays),
               bm25_weight(b, idf, average_plays).T)[0, 0]


def bm25_weight(artist, idf, average_plays):
    ret = csr_matrix(artist)
    length_norm = (1.0 - B) + B * artist.sum() / average_plays
    ret.data = np.array([(plays * (K1 + 1.0) / (K1 * length_norm + plays)) * idf[userid]
                      for plays, userid in zip(artist.data, artist.indices)])
    return ret 

In [69]:
bm25(artists['coldplay'],artists['thom yorke'],idf=idf, average_plays=10)

0.0047395542942066599

In [82]:
from sklearn.preprocessing import normalize

In [84]:
normalized = normalize(plays)
cos_sim = normalized.dot(normalized.T)