In [1]:
from utils import *
import pickle
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import itertools

In [2]:
with open( "train_test_data.pkl", "rb") as f:
    X_train, y_train, X_test, y_test = pickle.load(f)

In [3]:
X_train = pd.DataFrame(X_train.values(),index = X_train.keys()) # convert from dict to pd DataFrame
X_train['tracks'] = X_train.values.tolist() #stack all columns into one
X_train = X_train[['tracks']]

In [4]:
# create one hot encoding for all the tracks
X_train_onehot = pd.get_dummies(pd.DataFrame(X_train['tracks'].values.tolist()), prefix='', prefix_sep='').groupby(axis=1, level=0).max()
X_train_onehot = X_train_onehot.set_index(X_train.index)
X_train_onehot.shape #7473 101217

(7473, 101217)

In [37]:
def alternating_least_squares_cg(Cui, factors, regularization=0.01, iterations=15):
    users, items = Cui.shape

    # initialize factors randomly
    X = np.random.rand(users, factors) * 0.01
    Y = np.random.rand(items, factors) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()

    for iteration in range(iterations):
        least_squares_cg(Cui, X, Y, regularization)
        least_squares_cg(Ciu, Y, X, regularization)

    return X, Y


def least_squares_cg(Cui, X, Y, regularization, cg_steps=3):
    users, factors = X.shape
    YtY = Y.T.dot(Y) + regularization * np.eye(factors)

    for u in range(users):
        # start from previous iteration
        x = X[u]

        # calculate residual r = (YtCuPu - (YtCuY.dot(Xu), without computing YtCuY
        r = -YtY.dot(x)
        for i, confidence in nonzeros(Cui, u):
            r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]

        p = r.copy()
        rsold = r.dot(r)

        for it in range(cg_steps):
            # calculate Ap = YtCuYp - without actually calculating YtCuY
            Ap = YtY.dot(p)
            for i, confidence in nonzeros(Cui, u):
                Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

            # standard CG update
            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap
            rsnew = r.dot(r)
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

        

def recommend(X,Y,userid, user_items, N=10):
        """ Returns the top N ranked items for a single user """
        scores = Y.dot(X[userid])

        # calcualte the top N items, removing the users own liked items from the results
        liked = set(user_items[userid].indices)
        ids = np.argpartition(scores, -(N + len(liked)))[-(N + len(liked)):]
        best = sorted(zip(ids, scores[ids]), key=lambda x: -x[1])
        return list(itertools.islice((rec for rec in best if rec[0] not in liked), N))


In [7]:
tracks = list(X_train_onehot.columns) #all the tracks appear in the X_train

In [8]:
track_playlist_data = csr_matrix(X_train_onehot) #prepare the data matrix for the implicit package

In [38]:
def nonzeros(m, row):
    """ returns the non zeroes of a row in csr_matrix """
    for index in range(m.indptr[row], m.indptr[row+1]):
        yield m.indices[index], m.data[index]

In [39]:
X,Y = alternating_least_squares_cg(track_playlist_data, 50, regularization=0.01, iterations=15)

KeyboardInterrupt: 

In [40]:
def intersect_rate(lst1, lst2): 
    return len(list(set(lst1) & set(lst2))) 

In [41]:
import itertools
acc = 0
acc_ind = []
for ind in range(7473):
    playlistid = ind
    # recommend 10(default) tracks for a playlist
    playlist_track = track_playlist_data.tocsr()
    recommendations = recommend(X,Y,playlistid, playlist_track)
    playlist_pred = [tracks[i] for i in list(zip(*recommendations))[0] ]
    playlist_true = [j for j in X_train.tracks[0] if j is not None]
    acc_ind.append(intersect_rate(playlist_true,playlist_pred))

In [42]:
print('Overall accuracy: ', np.mean(acc_ind))

Overall accuracy:  0.09955841027699719
