In [1]:
import pandas as pd
import numpy as np
import json
import re 
import sys
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from joblib import dump, load



# For now filter the songs data to only the ones in playlists to save on memory

In [30]:
playlists_tracks = pd.read_csv("LastFM-Playlist-Tracks.csv")
tracks = pd.read_csv("LastFM-Tracks.csv")
songs_filtered = pd.read_csv("songs-filtered-machine-learning.csv")

In [31]:
songs_filtered.drop(columns='Unnamed: 0', inplace=True)

In [37]:
songs_filtered['consolidates_genre_lists'] = songs_filtered['consolidates_genre_lists'] .apply(lambda x: [re.sub(' ','_',i) for i in re.findall(r"'([^']*)'", x)])

In [44]:
# tfidf can't handle nulls so fill any null values with an empty list
songs_filtered['consolidates_genre_lists'] = songs_filtered['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [46]:
float_cols = songs_filtered.dtypes[(songs_filtered.dtypes == 'float64') | (songs_filtered.dtypes == 'int64')].index.values

In [47]:
playlist_songs_with_ids = playlists_tracks.merge(tracks, how='left', left_on='track', right_on='trackid')
playlist_songs_with_ids

Unnamed: 0,playlist,track,trackid,spotifyid,title,artist,album,imgurl,imgfile,popularity,duration
0,1,1,1,02JnoHSIDbpVW40uipjKcL,Roxanne,1,1,,,0.00,192.948
1,1,2,2,01HNAQL86oZsKECUfJiAwk,22 Acacia Avenue,2,2,,,0.00,395.720
2,1,3,3,0eCxvvcJUgMmFOEv0tphgh,Lovely Ladies,3,3,,,0.00,231.946
3,1,4,4,1pT9RHD2v3aHqENfVaFPw4,Sweet Painted Lady,4,4,,,0.32,234.733
4,1,5,5,68PVZq98OxgeWBWbskYQLt,"Build God, Then We'll Talk",5,5,,,0.36,220.720
...,...,...,...,...,...,...,...,...,...,...,...
1024706,10697,416541,416541,2dkw6BVxhDSBr91gT9hOGd,Stammgast,14577,187057,,,0.16,338.314
1024707,10697,410904,410904,0BkKep2Tf9JlPr2g1zbJiw,Treibsand,31649,174485,,,0.08,416.106
1024708,10697,352239,352239,7i5ZRYvZjg0YHFI5P1qfsb,Elevator,11893,175485,,,0.27,297.465
1024709,10697,410905,410905,0mYrkZjdEbCWfTSLAHU4fO,Well Done,44551,199938,,,0.00,501.453


In [48]:
#function to build entire feature set
def create_feature_set(df, float_cols):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        float_cols (list(str)): List of float columns that will be scaled 
        
    Returns: 
        final: final set of features 
    """
    
    #tfidf genre lists
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + i for i in tfidf.get_feature_names()]
    genre_df.reset_index(drop = True, inplace=True)

    # explicity_ohe = ohe_prep(df, 'Explicit','exp')    
   
    # popularity_ohe = ohe_prep(df, 'popularity_red','pop') * 0.15

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns)

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled], axis = 1)
     
    #add song id
    final['TrackID']=df['TrackID'].values
    
    return final

In [49]:
complete_feature_set = create_feature_set(songs_filtered, float_cols=float_cols)



In [50]:
complete_feature_set

Unnamed: 0,genre|_hip_hop,genre|a_cappella,genre|abstract,genre|abstract_hip_hop,genre|accordion,genre|acid_house,genre|acid_jazz,genre|acid_rock,genre|acid_trance,genre|acousmatic,...,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,TimeSignature,popularity_red,TrackID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.071320,0.022892,0.000265,0.241206,0.460685,0.543605,0.6,0.3750,7s8v7hQ80FZyL1N9FHZW5W
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.041017,0.968876,0.000000,0.221106,0.576613,0.358670,1.0,0.3750,38Mxx28rpORDuDAGuZe1F4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.125541,0.037249,0.000006,0.242211,0.857863,0.636619,0.8,0.4375,6mGP1LAOLKTUcu1TpAF8vy
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.030087,0.012450,0.000000,0.113568,0.620968,0.549622,0.8,0.3125,4G1GhiYTkfgsmMv1xlL6jO
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.035065,0.023695,0.000000,0.369849,0.562500,0.423024,0.8,0.3125,10hWSYRQZ4B1IjPafMhiKr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.038528,0.347390,0.000000,0.050854,0.969758,0.573624,0.8,0.1875,0tNoPlTA4WasOiJNyRbRvh
10043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.053788,0.348394,0.000009,0.050955,0.842742,0.515078,0.8,0.4375,022XE9RZVU3a9nULENXfnm
10044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.074892,0.117470,0.000000,0.070452,0.919355,0.670481,0.8,0.2500,073V7jY2AD5JXEnej4JUd1
10045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.067424,0.416667,0.000251,0.050352,0.952621,0.756854,0.8,0.5000,0VVMYrDOTj2MQSXID2WMTI


In [51]:
#filter to only playlist 
playlist_1065 = playlist_songs_with_ids[playlist_songs_with_ids['playlist']==1065]
playlist_1065

Unnamed: 0,playlist,track,trackid,spotifyid,title,artist,album,imgurl,imgfile,popularity,duration
65215,1065,21856,21856,5sxIzTn9gJlsIWoMNkKyag,2112,3991,15774,,,0.0,1233.586
65216,1065,48092,48092,20bJTQOaoHqeCHBhqIgcTl,The Ultra-Violence,11126,33038,,,0.42,633.6
65217,1065,42969,42969,0b47En2lJ0yP6ZKhyOG9Kd,Closer To The Heart,3991,29755,,,0.2,171.866
65218,1065,48093,48093,0wu0hJZ61dT5uSzL0tuPnQ,Canada,14346,33039,,,0.12,331.808
65219,1065,48094,48094,0vqSdwyJMafqz53TxsqXOD,Thicker Than Blood,11126,24283,,,0.07,158.333
65220,1065,42957,42957,0vwEcik5vpKs1tgwzKltBw,La Villa Strangiato,3991,7166,,,0.18,575.066
65221,1065,12726,12726,20ww15KemlpsXO1KdiOigX,Mordecai,5024,9125,,,0.34,347.76
65222,1065,48095,48095,0ff6FUCAt7YsWgS7wih7Pp,End Of The Beginning,14347,33040,,,0.34,706.4
65223,1065,48096,48096,1LGklBuwUCv4pTeD7CIq0b,Hellion,4957,33041,,,0.0,236.893
65224,1065,20942,20942,1E0ONfqq74b4gYhdlyhMSB,YYZ,3991,15334,,,0.48,266.066


In [52]:
def generate_playlist_feature(complete_feature_set, playlist_df):
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['TrackID'].isin(playlist_df['spotifyid'].values)]
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['spotifyid']], how = 'inner', left_on = 'TrackID', right_on='spotifyid')
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['TrackID'].isin(playlist_df['spotifyid'].values)]
    complete_feature_set_playlist.drop(columns=['TrackID','spotifyid'],inplace=True)

    return complete_feature_set_playlist.sum(axis = 0), complete_feature_set_nonplaylist

In [53]:
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_playlist_feature(complete_feature_set, playlist_1065)

In [54]:
complete_feature_set_nonplaylist.shape

(10045, 2041)

# Generate Playlist Recommendations

In [55]:
def generate_playlist_recos(df, features, nonplaylist_features):

    non_playlist_df = df[df['TrackID'].isin(nonplaylist_features['TrackID'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('TrackID', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(10)
    return non_playlist_df_top_40


In [56]:
top_40_user_10697 = generate_playlist_recos(songs_filtered, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('TrackID', axis = 1).values, features.values.reshape(1, -1))[:,0]


# Here are the recommendations

In [57]:
top_40_user_10697

Unnamed: 0,TrackID,TrackName,Popularity,Duration,Explicit,Artists,ArtistsID,ReleaseYear,Danceability,Energy,...,Valence,Tempo,TimeSignature,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red,sim
4086,2R6UrJ8uWbSIiHWmvRQvN8,Whiskey In The Jar,73,304693,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1998,0.511,0.97,...,0.566,132.986,4,['Metallica'],[],['Metallica'],Metallica+Whiskey In The Jar,"[old_school_thrash, metal, rock, hard_rock, th...",14,0.955792
4098,0zZPCkSMAECtxBQHWJ0enW,Blitzkrieg,41,216960,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1998,0.493,0.97,...,0.645,98.736,4,['Metallica'],[],['Metallica'],Metallica+Blitzkrieg,"[old_school_thrash, metal, rock, hard_rock, th...",8,0.953566
4090,1R29PfqelwVMRPhg5fn0lD,Slither,42,313200,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1997,0.509,0.851,...,0.546,110.364,4,['Metallica'],[],['Metallica'],Metallica+Slither,"[old_school_thrash, metal, rock, hard_rock, th...",8,0.952824
4096,0rDPQzWWmLTv9deHn4ulG7,Hero Of The Day,54,261907,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1996,0.35,0.902,...,0.78,115.909,4,['Metallica'],[],['Metallica'],Metallica+Hero Of The Day,"[old_school_thrash, metal, rock, hard_rock, th...",10,0.948256
4094,0R9GrbRcpBRdwDlVPj2wEQ,My Friend Of Misery,55,407773,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1991,0.582,0.819,...,0.394,119.513,4,['Metallica'],[],['Metallica'],Metallica+My Friend Of Misery,"[old_school_thrash, metal, rock, hard_rock, th...",11,0.947081
4100,1Bpa3yoBAqpcc8mRC9jacn,Astronomy,47,397840,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1998,0.345,0.778,...,0.148,86.221,4,['Metallica'],[],['Metallica'],Metallica+Astronomy,"[old_school_thrash, metal, rock, hard_rock, th...",9,0.946627
4099,0cKWKhciVCYNyxFVnV1Y4R,Bad Seed,42,245333,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1997,0.532,0.952,...,0.727,126.917,4,['Metallica'],[],['Metallica'],Metallica+Bad Seed,"[old_school_thrash, metal, rock, hard_rock, th...",8,0.945376
7919,7ag4qTNensFZLQP4raCIs6,Voracious Souls,32,339000,0,['Death Angel'],['6KVc8Llznru8n9LVCYe9dz'],1987,0.192,0.988,...,0.181,162.068,4,['Death Angel'],[],['Death Angel'],Death Angel+Voracious Souls,"[old_school_thrash, metal, hard_rock, thrash_m...",6,0.943264
4088,1bEAFB39TyDfY3KtDtK7aI,The More I See,40,288667,0,['Metallica'],['2ye2Wgw4gimLv2eAKyk1NB'],1998,0.304,0.957,...,0.0455,94.84,4,['Metallica'],[],['Metallica'],Metallica+The More I See,"[old_school_thrash, metal, rock, hard_rock, th...",8,0.934355
2064,0Auq7v8eMbFXMi6ZaMCb09,Kill Again,42,296040,0,['Slayer'],['1IQ2e1buppatiN1bxUVkrk'],1985,0.301,0.93,...,0.282,99.363,4,['Slayer'],[],['Slayer'],Slayer+Kill Again,"[speed_metal, old_school_thrash, metal, hard_r...",8,0.926429
