In [32]:
#The basic code was borrowed by this kaggle notebook: https://www.kaggle.com/code/pegahpooya/spotify-playlists-recommender-system
#please give a like to the creator!!!!!

In [1]:
#Importing libraries

import numpy as np 
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import sparse
import random
import lightfm
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.metrics.pairwise import cosine_similarity
import pickle



In [2]:
p = 0.5 # 50% of data due to high ammount

df_playlist = pd.read_csv('data\spotify_dataset.csv', error_bad_lines=False, warn_bad_lines=False, skiprows=lambda i: i>0 and random.random() > p)
df_playlist.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,"Blackbird - Live at CitiField, NYC - Digital A...",HARD ROCK 2010


In [3]:
df_playlist.shape

(6444626, 4)

In [4]:
df_playlist.columns = df_playlist.columns.str.replace('"', '')
df_playlist.columns = df_playlist.columns.str.replace('name', '')
df_playlist.columns = df_playlist.columns.str.replace(' ', '')
df_playlist.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

In [5]:
#For recommender system, I'm only keeping the artists with frequency higher than 50
df_playlist = df_playlist.groupby('artist').filter(lambda x : len(x)>=50)
#And keeping the users with at least 10 unique artists in their playlists to lessen the impact of cold start problem
df_playlist = df_playlist[df_playlist.groupby('user_id').artist.transform('nunique') >= 10]

In [6]:
#group by to get the frequnecy count for each user and artist (# of times that an artist has appeared in playlists created by a user)
size = lambda x: len(x)
df_freq = df_playlist.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['user_id', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_freq.head()


Unnamed: 0,user_id,artist,freq
1391501,defced0ece4ce946160b0d2698142eac,Vitamin String Quartet,1709
246584,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,1636
411810,414050deadb38aafd8d4ad22ca634055,Vitamin String Quartet,1286
10982,014e695cc6df96011b90a5beb3206012,Ilaiyaraaja,1119
1356993,d993ff8f2de226e2c6803e47a22e9d7e,Lata Mangeshkar,1107


In [7]:
#create a DF for artists and add artist id
df_artist = pd.DataFrame(df_freq["artist"].unique())
df_artist = df_artist.reset_index()
df_artist = df_artist.rename(columns={'index':'artist_id', 0:'artist'})
df_artist.head()

Unnamed: 0,artist_id,artist
0,0,Vitamin String Quartet
1,1,Ilaiyaraaja
2,2,Lata Mangeshkar
3,3,Peggy Lee
4,4,Grateful Dead


In [8]:
#merge
df_freq  = pd.merge(df_freq , df_artist, how='inner', on='artist')

In [9]:
df_freq.head()

Unnamed: 0,user_id,artist,freq,artist_id
0,defced0ece4ce946160b0d2698142eac,Vitamin String Quartet,1709,0
1,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,1636,0
2,414050deadb38aafd8d4ad22ca634055,Vitamin String Quartet,1286,0
3,13af1238161af701c039ef3c36009d4c,Vitamin String Quartet,143,0
4,072f97536340ac57a8becca9e035e36a,Vitamin String Quartet,140,0


In [10]:
#Helpers functions are from the repo below:
#https://github.com/aayushmnit/cookbook/blob/master/recsys.py

In [25]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict


# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run 
        - n_jobs = number of cores used for execution 
    Expected Output  -
        Model - Trained model
    '''
    
    #uncommented for train test split
#     x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input - 
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output - 
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
								 .sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1
    return return_score_list
    
#https://github.com/Lab41/hermes/blob/master/src/algorithms/performance_metrics.py
def calculate_population_category_diversity(y_predicted, content_array):
    """
    The higher the category diversity the better.
    Function determines the total sum of the categories for all people (rating_array).
    So for a random group of users resulting in 330 predictions in MovieLens this could look like:
        [71, 34, 11, 22, 126, 128, 0, 165, 21, 0, 35, 0, 62, 100, 5, 131, 3, 0]
    The average of each component (by total number of predictions) is then taken
        [0.21, 0.1, 0.03....0]
    The component averages are summed
        2.79
    Finally a scaling factor is utilized to take into consideration the number of categories and the average categories for an item
        0.31
    This final step is to help normalize across datasets where some may have many more/less categories and/or more/less dense item categorization
    Args:
        y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. Should be the n predicted ratings
        content_array: content feature array of the items which should be in the format of (item [content_feature vector])
    Returns:
        cat_diversity:
    """
    ave_coverage = content_array.map(lambda id, array: sum(array)).mean()
    rating_array_raw = y_predicted.keyBy(lambda row: row[1]).join(content_array).map(lambda (id, (rating, array)): array).collect()
    rating_array = map(sum,zip(*np.array(rating_array_raw)))
    cat_diversity = sum([r/float(len(rating_array_raw)) for r in rating_array])*ave_coverage/float(len(rating_array))

    return cat_diversity

SyntaxError: invalid syntax (332369841.py, line 140)

In [12]:
#prep data

interactions = create_interaction_matrix(df = df_freq, user_col = "user_id", item_col = 'artist_id', rating_col = 'freq', norm= False, threshold = None)
interactions.head()
#interactions.shape

artist_id,0,1,2,3,4,5,6,7,8,9,...,14252,14253,14254,14255,14256,14257,14258,14259,14260,14261
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00055176fea33f6e027cd3302289378b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007f3dd09c91198371454c608d47f22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000b0f32b5739f052b9d40fcc5c41079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000c11a16c89aa4b14b328080f5954ee,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00123e0f544dee3ab006aa7f1e5725a7,0.0,0.0,0.0,0.0,17.0,1.0,0.0,0.0,78.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#create user dict

user_dict = create_user_dict(interactions=interactions)
user_dict

{'00055176fea33f6e027cd3302289378b': 0,
 '0007f3dd09c91198371454c608d47f22': 1,
 '000b0f32b5739f052b9d40fcc5c41079': 2,
 '000c11a16c89aa4b14b328080f5954ee': 3,
 '00123e0f544dee3ab006aa7f1e5725a7': 4,
 '00139e9cb50fb309549e1561b476226d': 5,
 '00154ec9dd1acd4ebfb521629dcb3948': 6,
 '0019363a0d57e94d39988c31eeb8d015': 7,
 '001c4e5b73eca68ee9756bb0c7d2f855': 8,
 '0025022960e5f0d7d01af5d840014594': 9,
 '002544ea04896444d79d3e4a7e073422': 10,
 '0028736f2b131fc698d959ca9adddc4b': 11,
 '00287ecf491882a40ff34b0fd75a5b16': 12,
 '002e678084db2e201d2721bf5af4e54c': 13,
 '0042854accad3f11774312547a872cc9': 14,
 '005241c63412ee4be1955b05faf301dc': 15,
 '00526df8fabfb8c8605f180ed1880754': 16,
 '0059ac6b3ba3c3f415f09059a3bd703a': 17,
 '0065e0de8006b1a94fd9362d16ce021a': 18,
 '006ac48c011ce60c0817e4d311acf1df': 19,
 '00713b834248f13729f9dc55cbe3d33e': 20,
 '0075df30386c1755d95008ae244eec57': 21,
 '007b9bb5853488474eed02078342e65d': 22,
 '007cab633671b7475e317587ebb31d8c': 23,
 '007eb8147a9f4db33827c480

In [14]:
artists_dict = create_item_dict(df = df_artist, id_col = 'artist_id', name_col = 'artist')

In [15]:
#train-test split

x = sparse.csr_matrix(interactions.values)
train, test = lightfm.cross_validation.random_train_test_split(x, test_percentage=0.2, random_state=None)

In [20]:
#train model (Matrix Factorization)

%time
model = runMF(interactions = train,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

CPU times: total: 0 ns
Wall time: 0 ns


In [17]:
# save the model to disk
filename = 'rec_artists_model.sav'
#pickle.dump(model, open(filename, 'wb'))

In [18]:
# load the model from disk
model = pickle.load(open(filename, 'rb'))

In [21]:
train_auc = auc_score(model, train, num_threads=4).mean()
print('Train AUC: %s' % train_auc)


test_auc = auc_score(model, test, train_interactions=train, num_threads=4).mean()
print('Test AUC: %s' % test_auc)

Train AUC: 0.9666557
Test AUC: 0.96762854


In [29]:
#example

rec_list = sample_recommendation_user(model = model, 
                                      interactions = interactions, 
                                      user_id = '0059ac6b3ba3c3f415f09059a3bd703a', 
                                      user_dict = user_dict,
                                      item_dict = artists_dict, 
                                      threshold = 0,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- The Protomen
2- Dead Man's Bones
3- Salem
4- Birdy
5- The Glitch Mob
6- Astronautalis
7- Ben Howard
8- Burial
9- James Blake
10- Caribou
11- Mew
12- Beyoncé

 Recommended Items:
1- M83
2- The xx
3- Röyksopp
4- Daft Punk
5- Lykke Li
6- Bon Iver
7- The Knife
8- Bonobo
9- Sigur Rós
10- Crystal Castles
