In [2]:
import sys
import json
import os
import pandas as pd
import requests
import spotipy
from collections import defaultdict

# From Sarat
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
import ipywidgets
from ipywidgets import FloatProgress

In [3]:
def extract_users(df, age, age_range):
    '''
    Extracts users from last.fm given a specific age and a +- age bound
    PARAMS:
        - df: The df that will be filtered
        - age: The center of the specified age range
        - age_range: The number of years (above or below) the center of the specified age range
    RETURNS:
        - Filtered dataframe that has ages only within the specified age range
    '''
    start = age - age_range
    end = age + age_range
    users_selected = df[(df['age'] >= start) & (df['age'] <= end)].reset_index(drop=True)
    return users_selected

def extract_histories(df, users):
    '''
    Takes a dataframe and filters it down to a specific set of users
    PARAMS:
        - df: Dataframe being filtered
        - users: Users being selected from the dataframe
    RETURNS:
        - Filtered dataframe only containing specific users
    '''
    extracted_history = df[df['user_id'].isin(users['user_id'])]
    return extracted_history

# Not used?   
def get_genres(row):
    artist = row['artist_name']
    uri = sp.search(artist)['tracks']['items'][0]['album']['artists'][0]['uri']
    artist_info = sp.artist(uri)
    genres = artist_info['genres']
    row['genres'] = genres
    return 

# Needs to have sp as an argument
def get_related_artist(uri):
    related = sp.artist_related_artists(uri)
    related_lst = []
    for artist in related['artists'][:5]:
        related_lst.append(artist['name'])
    return related_lst

# Needs to have sp as an argument
def get_top_tracks(uri):
    top_tracks = sp.artist_top_tracks(uri)
    top_lst = []
    for track in top_tracks['tracks'][:5]:
        top_lst.append(track['name'])
    return top_lst

# Needs to have sp as an argument
# Keyword argument for apply?
def extract_features(row):
    artist = row['artist_name']
    uri = sp.search(artist)['tracks']['items'][0]['album']['artists'][0]['uri']
    related_artists_extracted = get_related_artist(uri)
    top_tracks_extracted = get_top_tracks(uri)
    artist_info = sp.artist(uri)
    genres = artist_info['genres']
    popularity = artist_info['popularity']
    followers = artist_info['followers']['total']
    row['uri'] = uri
    row['genres'] = genres
    row['related_artists'] = related_artists_extracted
    row['top_tracks'] = top_tracks_extracted
    row['popularity'] = popularity
    row['followers'] = followers
    return row

# Needs to have sp as an argument
# Keyword argument for apply?
def extract_track_features(row):
    uri = sp.search(row)['tracks']['items'][0]['uri']
    features = sp.audio_features(uri)[0]
    dance = features['danceability']
    energy = features['energy']
    key = features['key']
    loudness = features['loudness']
    mode = features['mode']
    speech = features['speechiness']
    acoustic = features['acousticness']
    instrument = features['instrumentalness']
    live = features['liveness']
    valence = features['valence']
    tempo = features['tempo']
    return uri, dance, energy, key, loudness, mode, speech, acoustic, instrument, live, valence, tempo


def parse_playlist_ids(response):
    playlist_ids = []
    for item in response['items']:
        pid = item['id']

        playlist_ids.append(pid)
    return playlist_ids

def parse_track_info(response):
    track_names = []
    artist_names = []
    album_names = []
    
    for item in response['items']:
                
        # Gets the name of the track
        track = item['track']['name']
        # Gets the name of the album
        album = item['track']['album']['name']
        # Gets the name of the first artist listed under album artists
        artist = item['track']['album']['artists'][0]['name']
            
        track_names.append(track)
        album_names.append(album)
        artist_names.append(artist) 
    return track_names, album_names, artist_names

# Needs grouped df
def recommend(user_id, sparse_user_artist, user_vecs, artist_vecs, grouped_df, num_contents=10):
    user_interactions = sparse_user_artist[user_id,:].toarray()
    user_interactions = user_interactions.reshape(-1) + 1
    user_interactions[user_interactions > 1] = 0
    rec_vector = user_vecs[user_id,:].dot(artist_vecs.T)
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_interactions * rec_vector_scaled
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    artists = []
    scores = []
    for idx in content_idx:
        artists.append(grouped_df.artist_name.loc[grouped_df.artist_id == idx].iloc[0])
        scores.append(recommend_vector[idx])
    recommendations = pd.DataFrame({'artist_name': artists, 'score': scores})
    return recommendations

def get_top_recommended_tracks(artist_list, sp):
    top_list = []
    for artist in artist_list:
        uri = sp.search(artist)['tracks']['items'][0]['album']['artists'][0]['uri']
        top_tracks = sp.artist_top_tracks(uri)
        for track in top_tracks['tracks'][:5]:
            top_list.append(track['name'])
    return top_list

# Simulating User Input

In [8]:
global_vars = {
    'PARENT_AGE': 55
}

# Simulating Authentication

In [37]:
# Application information
client_id = 'f78a4f4cfe9c40ea8fe346b0576e98ea'
client_secret = 'c26db2d4c1fb42d79dc99945b2360ab4'

# Temporary placeholder until we actually get a website going
redirect_uri = 'https://google.com/'

# The permissions that our application will ask for
scope = " ".join(['playlist-modify-public',"user-top-read","user-read-recently-played","playlist-read-private"])

# Oauth object    
sp_oauth = spotipy.oauth2.SpotifyOAuth(client_id, client_secret, redirect_uri, scope=scope)

# Force auth every time
authUrl = sp_oauth.get_authorize_url()

try:
    sp = spotipy.Spotify(auth_manager=sp_oauth)
except:
    os.remove(f'.cache-{username}')
    sp = spotipy.Spotify(auth_manager=sp_oauth)

# Load Data

In [6]:
print("Loading Last.fm")
lastfm_profile = pd.read_csv('user_profile.tsv',
                            sep='\t', 
                            names=['user_id', 'gender', 'age', 'country', 'registered'])

lastfm_usersong = pd.read_csv('user_artist.tsv', 
                            sep='\t', 
                            names=['user_id', 'artist_id', 'artist_name', 'plays'])

Loading Last.fm


# Clean Data

In [23]:
print("CLEANING USER DATA")
# Cleaning user data and filtering out all non US users
cleaned_users = lastfm_profile[['user_id', 'age', 'country']].dropna().reset_index(drop=True)
cleaned_users_us = cleaned_users[cleaned_users['country'] == 'United States']
cleaned_users = cleaned_users_us[cleaned_users_us['age'] > 0]
# Choose users based on the user's specified age
chosen_users = extract_users(cleaned_users, global_vars['PARENT_AGE'], 5)

CLEANING USER DATA


In [24]:
print("CLEANING HISTORY DATA")
cleaned_history = lastfm_usersong[['user_id', 'artist_id', 'artist_name', 'plays']].dropna().reset_index(drop=True)
# Filters down the cleaned history dataframe to only include users with propper profile values
cleaned_history = extract_histories(cleaned_history, cleaned_users)
# Filters down the dataframe again to only include users that were chosen based on age
chosen_history = extract_histories(cleaned_history, chosen_users)
ap = chosen_history

CLEANING HISTORY DATA


# Build Features and Transform

In [25]:
print("CREATING NEW ARTIST FEATURES")
# Create a DataFrame of artist statistics
# For each artist finds: totalUniqueUsers, totalArtistPlays, avgUserPlays
artist_rank = ap.groupby(['artist_name']) \
.agg({'user_id' : 'count', 'plays' : 'sum'}) \
.rename(columns={"user_id" : 'totalUniqueUsers', "plays" : "totalArtistPlays"}) \
.sort_values(['totalArtistPlays'], ascending=False)
artist_rank['avgUserPlays'] = artist_rank['totalArtistPlays'] / artist_rank['totalUniqueUsers']

In [26]:
print("COMBINING NEW FEATURES WITH EXISTING DATA")
# Joins new artist information with user-artist listening history
ap = ap.join(artist_rank, on="artist_name", how="inner") \
.sort_values(['plays'], ascending=False)

In [27]:
print("MIN MAX SCALING ARTIST 'play' DATA")
# Min max scales play count
pc = ap.plays
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)


In [29]:
print("ENSURING ALL DATASET ENTRIES ARE UNIQUE")
# Drop duplicates just in case?
ap = ap.drop_duplicates()
# Everything is already a unique pairing, but in case groupby the identifying information
grouped_df = ap.groupby(['user_id', 'artist_id', 'artist_name']).sum().reset_index()

In [34]:
# Convert columns to category codes for implicit
grouped_df['artist_name'] = grouped_df['artist_name'].astype("category")
grouped_df['user_id'] = grouped_df['user_id'].astype("category")
grouped_df['artist_id'] = grouped_df['artist_id'].astype("category")
grouped_df['user_id'] = grouped_df['user_id'].cat.codes
grouped_df['artist_id'] = grouped_df['artist_id'].cat.codes

# Pull User Information

In [38]:
print("GETTING USER PLAYLISTS")
r = sp.current_user_playlists()

playlist_ids = parse_playlist_ids(r)


Couldn't read cache at: .cache


GETTING USER PLAYLISTS
Enter the URL you were redirected to: https://www.google.com/?code=AQCPgW35QY-rEIS36WZidwzHAKNR_UC0R7-CQX13NybEWL29nubyT1OYsCdryx4wJkSqU0s9dNfvFaCbBQoH2TMN3dkxJ-_fgp0v8pz3WvHYxJmJXPDArV6Rkt-E5sxb5DTuxtN0L5Y4pKxuD-7YXVOpMsSOiABqjvz56wZm4gvwuc5g_9Ixb1ntrGteNgwNDIz-j9ygcmqj1li-9gcnJyxUhqwCwb01ceDnBbYRAau1YTVfWUOagcAoEiGqJwCkXZ6yHTjmzcQrDCoLI7vFhQ


In [39]:
print("EXTRACTING ARTISTS FROM USER PLAYLISTS")
# Pull all the tracks from a playlist
tracks = []
albums = []
artists = []

# Loop through each playlist one by one
for pid in playlist_ids:
    # Request all track information
    r = sp.playlist_items(pid)

    tracks, albums, artists = parse_track_info(r)
    break

EXTRACTING ARTISTS FROM USER PLAYLISTS


In [40]:
# Condense into a series of normalized artist counts
playlist_artists = pd.Series(artists)
playlist_grouped = playlist_artists.value_counts(normalize=True)


In [42]:
# Essentially create a fake user-ID for our listening history
no_artist = playlist_grouped.shape[0]
curr_user = grouped_df.iloc[-1]['user_id'] + 1
curr_user_id = [curr_user] * no_artist

16
875
[875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875, 875]


In [45]:
# Creates a df of all artists in a users listening history, their normalized playcounts, 
# and the user_id of the last user in the last.fm dataset?
playlist_df = pd.DataFrame(playlist_grouped, columns=['playCountScaled']) 
playlist_df.reset_index(level=0, inplace=True)
playlist_df.columns = ['artist_name', 'playCountScaled']
playlist_df['user_id'] = pd.Series(curr_user_id)

playlist_df.head()

Unnamed: 0,artist_name,playCountScaled,user_id
0,Daryl Hall & John Oates,0.1,875
1,Bryan Adams,0.1,875
2,Mariah Carey,0.1,875
3,John Lennon,0.1,875
4,The J. Geils Band,0.05,875


In [46]:
# Reorganize df columns so that user_id comes first
cols = playlist_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
playlist_df = playlist_df[cols]
playlist_df.head()

['artist_name', 'playCountScaled', 'user_id']
['user_id', 'artist_name', 'playCountScaled']


Unnamed: 0,user_id,artist_name,playCountScaled
0,875,Daryl Hall & John Oates,0.1
1,875,Bryan Adams,0.1
2,875,Mariah Carey,0.1
3,875,John Lennon,0.1
4,875,The J. Geils Band,0.05


In [49]:
# Clean artist_name strings
playlist_df['artist_name'] = playlist_df['artist_name'].str.lower()


In [50]:
# Create a dictionary that maps artist_names from last.fm with their artist_ids
artist_pairing = dict(zip(grouped_df.artist_name, grouped_df.artist_id))


In [54]:
# In the playlist df give artists their corresponding last.fm ID
# Also drop any artists (NA values) that are not in the last.fm dataset
playlist_df['artist_id'] = playlist_df['artist_name'].map(artist_pairing)
playlist_df = playlist_df.dropna().reset_index(drop=True)
playlist_df['artist_id'] = playlist_df['artist_id'].astype(int)

In [55]:
# Append new user's listening history to the dataframe of last.fm user histories
updated_df = grouped_df.append(playlist_df)
updated_df

Unnamed: 0,user_id,artist_id,artist_name,plays,totalUniqueUsers,totalArtistPlays,avgUserPlays,playCountScaled
0,0,119,sexteto mayor,29.0,1.0,29.0,29.0,0.000542
1,0,485,libertad lamarque,7.0,2.0,187.0,93.5,0.000116
2,0,715,daniel santos,9.0,5.0,80.0,16.0,0.000155
3,0,1393,edmundo rivero,9.0,1.0,9.0,9.0,0.000155
4,0,1995,leo marini,10.0,2.0,85.0,42.5,0.000174
...,...,...,...,...,...,...,...,...
10,875,6102,styx,,,,,0.050000
11,875,7405,the police,,,,,0.050000
12,875,1011,foreigner,,,,,0.050000
13,875,6237,nirvana,,,,,0.050000


In [57]:
# Create new user_ids and artist_ids to ensure consistiency before creating CF matricies
updated_df['artist_name'] = updated_df['artist_name'].astype("category")
updated_df['user_id'] = updated_df['user_id'].astype("category")
updated_df['artist_id'] = updated_df['artist_id'].astype("category")
updated_df['user_id'] = updated_df['user_id'].cat.codes
updated_df['artist_id'] = updated_df['artist_id'].cat.codes

In [58]:
print("CREATING ARTIST-USER AND USER-ARTIST MATRICIES")
# Take user-artist pairings and their associated normalized playcounts and properly scale them
sparse_artist_user = sparse.csr_matrix((updated_df['playCountScaled'].astype(float), (updated_df['artist_id'], updated_df['user_id'])))
sparse_user_artist = sparse.csr_matrix((updated_df['playCountScaled'].astype(float), (updated_df['user_id'], updated_df['artist_id'])))
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)



CREATING ARTIST-USER AND USER-ARTIST MATRICIES


# Fit ALS Model

In [59]:
alpha = 15
data = (sparse_artist_user * alpha).astype('double')

print("FITTING ALS MODELS")
model.fit(data)

FITTING ALS MODELS


  0%|          | 0/50 [00:00<?, ?it/s]

In [60]:
user_vecs = model.user_factors
artist_vecs = model.item_factors


# Create recommendations for current user
user_id = curr_user

print("GENERATING RECOMMMENDATIONS LIST")
recommendations = recommend(user_id, sparse_user_artist, user_vecs, artist_vecs, updated_df)

updated_df.loc[updated_df['user_id'] == curr_user].sort_values(by=['playCountScaled'], ascending=False)[['artist_name', 'user_id', 'playCountScaled']].head(10)

artist_list = recommendations['artist_name'].to_list()


recommended_tracks = pd.DataFrame(get_top_recommended_tracks(artist_list, sp), columns=['track_name'])

GENERATING RECOMMMENDATIONS LIST


In [61]:
recommended_tracks

Unnamed: 0,track_name
0,Piano Cocktail
1,Love Is Here To Stay
2,In the Days of Our Love
3,There Will Never Be Another You - Live
4,Moonlight In Vermont
5,Octopath Traveler -Main Theme-
6,Decisive Battle II
7,"Cyrus, the Scholar"
8,"Primrose, the Dancer"
9,"Ophilia, the Cleric"
