In [1]:
import logging

import numpy
import pandas
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

In [2]:
# Registry mapping the short model names accepted by get_model() to the
# recommender class that implements each of them.
MODELS = {
    "als": AlternatingLeastSquares,
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bm25": BM25Recommender,
}

In [3]:
def get_model(model_name):
    """ Builds the recommender registered in MODELS under `model_name`.

    ALS-derived classes are constructed with 50 factors and float32 dtype,
    the "bm25" entry with K1=100 and B=0.5, and every other class with its
    own defaults. Raises ValueError when `model_name` is not in MODELS.
    """
    factory = MODELS.get(model_name)
    if not factory:
        raise ValueError("Unknown Model '%s'" % model_name)

    # each branch returns directly with that model family's default params
    if issubclass(factory, AlternatingLeastSquares):
        return factory(factors=50, dtype=numpy.float32)
    if model_name == "bm25":
        return factory(K1=100, B=0.5)
    return factory()


In [28]:
def read_data(filename):
    """ Reads user/artist pairs and returns a tuple of a pandas dataframe
    and a sparse artist-by-user matrix.

    The file is parsed with pandas.read_table (so tab-separated unless pandas
    is configured otherwise); only its first two columns are used and they are
    named 'user' and 'artist'. No play count column is read: every row of the
    input contributes a 1.0 to the matrix.

    Parameters:
        filename: path (or file-like object) handed to pandas.read_table.

    Returns:
        (data, plays) where `data` is the dataframe with both columns
        converted to categoricals, and `plays` is a scipy coo_matrix whose
        rows are artist category codes and whose columns are user category
        codes.
    """
    data = pandas.read_table(filename,
                             usecols=[0, 1],
                             names=['user', 'artist'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix with one entry per input row; the value array is
    # sized from the data so the function works for any file length
    plays = coo_matrix((numpy.ones(len(data)),
                        (data['artist'].cat.codes.values,
                         data['user'].cat.codes.values)))

    return data, plays

In [34]:
def calculate_similar_artists(input_filename, output_filename, model_name="als"):
    """ Generates a list of similar artists by utilizing the 'similar_items'
    api of the models.

    Loads the interactions with read_data(), fits the model selected by
    `model_name` (see get_model()), and writes a TSV of
    artist, other artist, score to `output_filename`. Artists are processed
    in descending order of how many input rows mention them, and 5 similar
    items are requested for each.
    """
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight the input by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_recommend = False

    model.fit(plays)

    # Order artists by popularity. The counts are indexed by category *code*
    # (the same ids the model uses), not by artist label: looking a code up in
    # a label-indexed groupby result returns the wrong row, or raises, when
    # the artist ids are themselves integers.
    artists = dict(enumerate(df['artist'].cat.categories))
    artist_codes = df['artist'].cat.codes.values
    # codes of -1 mark missing values and have no matrix row; skip them
    user_count = numpy.bincount(artist_codes[artist_codes >= 0],
                                minlength=len(artists))
    to_generate = sorted(artists, key=lambda artistid: -user_count[artistid])

    # write out as a TSV of artistid, otherartistid, score
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            # NOTE(review): iterating (id, score) pairs matches the older
            # implicit API; confirm against the installed version
            for other, score in model.similar_items(artistid, 5):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))


In [72]:
def calculate_recommendations(input_filename, output_filename, model_name="als",
                              target_tracks_filename="target_tracks.csv",
                              target_playlists_filename="target_playlists.csv"):
    """ Generates recommendations for the target users and writes them to a file.

    Loads the interactions with read_data(), fits the model selected by
    `model_name` (see get_model()), then for every user listed in
    `target_playlists_filename` asks the model for 5 recommendations and keeps
    only those listed in `target_tracks_filename`.

    The output starts with the header "playlist_id,track_ids"; each following
    line is "<user>,<item> <item> ... " (every item is followed by a space).
    Because the target filter is applied after the top 5 are selected, a line
    may contain fewer than 5 items, or none.

    Parameters:
        input_filename: interactions file passed to read_data().
        output_filename: path of the file to write.
        model_name: key into MODELS.
        target_tracks_filename: file of item ids allowed in the output
            (first line skipped as header).
        target_playlists_filename: file of user ids to generate output for
            (first line skipped as header).
    """
    # train the model based off input params
    df, plays = read_data(input_filename)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight the input by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # only recommend() is used below, so switch off the model's
        # approximate_similar_items flag
        model.approximate_similar_items = False

    model.fit(plays)

    artists = dict(enumerate(df['artist'].cat.categories))
    users = df['user'].cat.categories
    targetT = numpy.genfromtxt(target_tracks_filename, skip_header=1)
    targetP = numpy.genfromtxt(target_playlists_filename, skip_header=1)

    # Membership masks indexed by category code, computed once up front
    # instead of scanning the target arrays for every user / recommendation.
    is_target_user = numpy.isin(users, targetP)
    is_target_track = numpy.isin(df['artist'].cat.categories, targetT)

    # generate recommendations for each target user and write out to a file
    user_plays = plays.T.tocsr()
    with open(output_filename, "w") as o:
        o.write("playlist_id,track_ids\n")
        for userid, username in enumerate(users):
            if not is_target_user[userid]:
                continue
            o.write(str(username) + ",")
            # NOTE(review): iterating (id, score) pairs matches the older
            # implicit API; confirm against the installed version
            for artistid, score in model.recommend(userid, user_plays, N=5):
                if is_target_track[artistid]:
                    o.write(str(artists[artistid]) + " ")
            o.write("\n")

In [73]:
# Fit the cosine model on the training interactions and write the
# recommendations to the file "provalibreria".
calculate_recommendations(
    "train_final_2.csv",
    "provalibreria",
    model_name="cosine",
)