## Libraries

In [6]:
import os
import json
import gensim
import sqlite3
import spotipy
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from spotipy.client import SpotifyException

## Authenticate with spotipy

In [None]:
# load variables from a .env file into the process environment (python-dotenv)
load_dotenv()

# redirect target handed to the token prompt below
uri = 'http://127.0.0.1:8000/callback'

def authenticate():
    '''Prompt for a user token and return a spotipy client that uses it.

    Reads USER_ID, CLIENT_ID and CLIENT_SECRET from the environment and the
    module-level `uri` as redirect address.
    '''
    credentials = {
        'username': os.getenv('USER_ID'),
        'scope': '',
        'client_id': os.getenv('CLIENT_ID'),
        'client_secret': os.getenv('CLIENT_SECRET'),
        'redirect_uri': uri,
    }
    access_token = spotipy.util.prompt_for_user_token(**credentials)
    return spotipy.Spotify(auth = access_token)

# client shared by the fetching cells below
session = authenticate()

## Fetch playlists including specific search terms

In [None]:
# playlist id -> {'owner', 'name', 'id'}; keyed by id so repeated hits collapse
playlists = {}
# search terms; each term is listed once, because a repeated term would only
# re-issue the same 20 requests for playlists that are already stored
query_words = [
    'in', 'for', 'me', 'my', 'soundtrack', 'on', 'all', 'it', 'dance', 'mix',
    'the', 'hits', 'indie', 'songs', 'music', 'this', 'to', 'love', 'party',
    'your', 'classics', 'pop', 'one', 'forever', 'best', 'rock', 'ultimate'
]

# page size of one search request and the offsets covering 1 000 results
num_per_req = 50
offsets = range(0, 1_000, num_per_req)

# for each query term
for word in tqdm(query_words):
    # fetch up to 1 000 playlists in chunks of 50
    for i_start in offsets:
        response = session.search(word, limit = num_per_req, offset = i_start, type = 'playlist')
        # for each playlist in each response
        for pl in response['playlists']['items']:
            # skip empty (None) entries and playlists that were already added
            if not pl or pl['id'] in playlists:
                continue
            # add new ones as a dict, but only if they have a name and an owner
            if pl['name'] and pl['owner']:
                playlists[pl['id']] = {
                    'owner': pl['owner']['id'],
                    'name': pl['name'],
                    'id': pl['id'],
                }

In [4]:
# report how many distinct playlists (unique ids) were collected over all search terms
print(f'Found {len(playlists)} playlists')

Found 22197 playlists


## Fetch lists of songs for playlists

In [8]:
def fetch_songs(playlist, all_pages = False):
    '''Fetch the tracks of one playlist through the global spotipy `session`.

    Parameters:
        playlist: dict with the keys 'owner' and 'id'
        all_pages: when False (the default) only the first page of the
            playlist is returned, so long playlists are truncated to that
            page; when True the `next` links are followed and the items of
            every further page are appended to the result

    Returns:
        the response dict of the first request; `response['items']` is a list
        of {'track': ...} entries limited to the requested fields
    '''
    response = session.user_playlist_tracks(
        playlist['owner'],
        playlist['id'],
        fields = 'items(track(id, name, artists(name, id), duration_ms)),next'
    )
    if all_pages:
        page = response
        # `next` is part of the requested fields, so paging can continue from it
        while page and page.get('next'):
            page = session.next(page)
            if page:
                response['items'].extend(page['items'])
        # every page has been merged, nothing is left to follow
        response['next'] = None
    return response

In [None]:
# remove previous db file
# make sure the target directory exists, then remove any previous db file
os.makedirs('./data', exist_ok = True)
if os.path.isfile('./data/songs.db'):
    os.remove('./data/songs.db')

# create connection and cursor to sql db
conn = sqlite3.connect('./data/songs.db')
cursor = conn.cursor()

# keep track of ids in db efficiently
tracks_in_db = set()
# this is for word2vec later, a list of id lists
all_ids = []

# try/finally so the connection is closed even when a request fails midway
try:
    # initialize db
    cursor.execute('CREATE TABLE songs (id text primary key, name text, artist text)')
    cursor.execute('CREATE INDEX name_idx on songs(name)')

    # for each playlist dict
    for pl in tqdm(playlists.values()):
        # fetch tracks in playlist with spotipy
        try:
            response = fetch_songs(pl)
        # re-authenticate and retry once if the request fails (e.g. the token
        # expired); `Exception` rather than a bare except, so interrupting the
        # kernel is not mistaken for an expired token
        except Exception:
            session = authenticate()
            response = fetch_songs(pl)
        # keep a list of ids in current playlist
        pl_ids = []
        # for each track
        for item in response['items']:
            track = item['track']
            # skip missing tracks and tracks without an id
            if not track or not track['id']:
                continue
            track_id = track['id']
            # append current id to list (duplicates within a playlist are kept)
            pl_ids.append(track_id)
            # if new, add to db
            if track_id not in tracks_in_db:
                # only the first listed artist is stored; tolerate an empty list
                artists = track['artists']
                artist_name = artists[0]['name'] if artists else None
                cursor.execute(
                    'INSERT INTO songs VALUES (?, ?, ?)',
                    (track_id, track['name'], artist_name)
                )
                # update lookup set
                tracks_in_db.add(track_id)
        # commit db changes once per playlist instead of once per track
        conn.commit()
        # update list of id lists with current playlist ids
        all_ids.append(pl_ids)
finally:
    # close db connection
    conn.close()

In [36]:
# total = summed playlist lengths (every occurrence of a song counts); unique = distinct ids stored in the db
print(f'The playlists contain {np.sum([len(pl) for pl in all_ids])} songs in total of which {len(tracks_in_db)} are unique')

The playlists contain 1481283 songs in total of which 655739 are unique


## Train word2vec model on song ids present in playlists

In [None]:
class SongIdCorpus:
    '''Re-iterable corpus for word2vec: each pass yields one list of song ids per playlist'''

    def __init__(self, pl_list):
        # a reference to the list of id lists is kept, the data is not copied
        self.pl_list = pl_list

    def __iter__(self):
        # hand out the playlists (lists of ids) one after another, in order
        yield from self.pl_list

# wrap the list of id lists so it can be iterated once per training pass
id_corpus = SongIdCorpus(all_ids)

# fail early: make sure the output directory exists before the long training run
os.makedirs('./models', exist_ok = True)

# song ids occurring fewer than 3 times in the corpus are left out of the vocabulary
w2v_model = gensim.models.Word2Vec(min_count = 3)

# the corpus is passed positionally because its keyword name differs between
# gensim 3.x (`sentences`) and gensim 4.x (`corpus_iterable`)
w2v_model.build_vocab(id_corpus)

w2v_model.train(id_corpus, total_examples = w2v_model.corpus_count, epochs = 10)

w2v_model.save('./models/songs.word2vec')

In [72]:
# number of song ids that made it into the model vocabulary (the model was created with min_count = 3)
print(f'The vocabulary containts {len(w2v_model.wv.vocab)} songs')

The vocabulary containts 99445 songs


## Evaluate embeddings

In [136]:
def search_song(connection, model, song_name, limit):
    '''Returns the best matches according to the query term

    Looks up the songs whose name contains `song_name`, keeps those that are
    part of the model vocabulary and ranks them by their vocabulary count.

    Parameters:
        connection: open sqlite3 connection to a db with a `songs` table
        model: word2vec model; `model.wv.vocab` maps a song id to an entry
            with a `.count` attribute
        song_name: text to look for anywhere in the song name (`%` and `_`
            keep their LIKE wildcard meaning)
        limit: maximum number of matches to return

    Returns:
        list of `songs` rows with the count appended as last element,
        highest count first
    '''
    cursor = connection.cursor()
    # finds any values that have `song_name` in any position; the term is
    # bound as a parameter, so names containing quotes work and cannot alter
    # the statement
    cursor.execute("SELECT * FROM songs WHERE UPPER(name) LIKE ?", (f'%{song_name}%', ))
    vocab = model.wv.vocab
    # each row is a tuple to which the count (from vocab) is appended
    res_counted = [
        row + (vocab[row[0]].count, )
        for row in cursor.fetchall()
        if row[0] in vocab
    ]
    # sort by count, most frequent first
    res_counted.sort(key = lambda item: item[-1], reverse = True)
    # return items up to a count specified by the limit
    return res_counted[:limit]

def suggest_songs(connection, model, song_id, topn = 10):
    '''Returns the most similar songs

    Parameters:
        connection: open sqlite3 connection to a db with a `songs` table
        model: word2vec model whose `model.wv.most_similar` yields
            (song id, similarity) pairs
        song_id: id of the song to find neighbours for
        topn: number of neighbours requested from the model (default 10)

    Returns:
        list of `songs` rows with the similarity appended as last element,
        most similar first; neighbours that are not in the db are left out
    '''
    cursor = connection.cursor()
    # get the `topn` most similar song ids and similarities for the query id
    similar = model.wv.most_similar(positive = [song_id], topn = topn)
    if not similar:
        return []
    similar_dict = dict(similar)
    # query the database for the songs with those ids; one bound placeholder
    # per id instead of formatting the ids into the statement
    placeholders = ', '.join('?' for _ in similar)
    neighbour_ids = [neighbour_id for neighbour_id, _ in similar]
    cursor.execute(f"SELECT * FROM songs WHERE id in ({placeholders})", neighbour_ids)
    # each row is a tuple to which the similarity is appended
    res_sims = [row + (similar_dict[row[0]], ) for row in cursor.fetchall()]
    # sort by similarity, most similar first
    res_sims.sort(key = lambda item: item[-1], reverse = True)
    return res_sims

In [138]:
# create connection and cursor to sql db
# open a connection to the songs db (the helper functions create their own cursors)
conn = sqlite3.connect('./data/songs.db')

# show up to 10 vocabulary songs whose name contains 'air', most frequent first
for song in search_song(conn, w2v_model, 'air', 10):
    print(*song)

57RA3JGafJm5zRtKJiKPIm Are You Bored Yet? (feat. Clairo) Wallows 166
6lV2MSQmRIkycDScNtrBXO Airplanes (feat. Hayley Williams) B.o.B 141
5CQ30WqJwcep0pYcV4AMNc Stairway to Heaven - Remaster Led Zeppelin 126
7sO5G9EABYOXQKNPNiE9NR Ric Flair Drip (& Metro Boomin) Offset 88
18AXbzPzBS8Y3AkgSxzJPb In The Air Tonight - 2015 Remastered Phil Collins 86
2qPUnoasNe4Ep43emVXEig Billionaire (feat. Bruno Mars) Travie McCoy 79
51pQ7vY7WXzxskwloaeqyj Stairway to Heaven - 1990 Remaster Led Zeppelin 75
2M9ULmQwTaTGmAdXaXpfz5 Billionaire (feat. Bruno Mars) Travie McCoy 73
7Dbg5O9nNWu6SWxDjJ9qoq In The Air Tonight Phil Collins 57
4kfeRwpq5KUaqTkgi4TbDF Solitaires (feat. Travis Scott) Future 56


In [137]:
# Elton John - Tiny Dancer
# each printed row is a `songs` row followed by the similarity to the query song
for t in suggest_songs(conn, w2v_model, '2TVxnKdb3tqe1nhQWwwZCO'):
    print(*t)

3FCto7hnn1shUyZL42YgfO Piano Man Billy Joel 0.9999197721481323
3v8PlUFGQQDBIk1J86waCo Should I Stay or Should I Go - Remastered The Clash 0.9998630285263062
0qRR9d89hIS0MHRkQ0ejxX Rich Girl Daryl Hall & John Oates 0.9998554587364197
6dGnYIeXmHdcikdzNNDMm2 Here Comes The Sun - Remastered 2009 The Beatles 0.9998552799224854
2DyHhPyCZgZzNXn1IrtsTu Any Way You Want It Journey 0.999847412109375
7f0jXNMu2xjQUtmKMuWhGA What's Up? 4 Non Blondes 0.9998024702072144
74EV0g12ihUoOUXMprFpZB Losing My Religion R.E.M. 0.9997856616973877
0LN0ASTtcGIbNTnjSHG6eO Pour Some Sugar On Me (2012) Def Leppard 0.9997833371162415
5dRQUolXAVX3BbCiIxmSsf Your Love The Outfield 0.9997764229774475
0aym2LBJBk9DAYuHHutrIl Hey Jude - Remastered 2015 The Beatles 0.9997748136520386


In [83]:
# Led Zeppelin - Stairway to Heaven
# `suggest_songs` takes (connection, model, song_id); the former extra `10`
# argument landed in the song_id slot and made the call fail with a TypeError
for t in suggest_songs(conn, w2v_model, '51pQ7vY7WXzxskwloaeqyj'):
    print(*t)

7MRyJPksH3G2cXHN8UKYzP American Girl Tom Petty and the Heartbreakers 0.9999043941497803
6NxsCnLeLd8Ai1TrgGxzIx Bad Moon Rising Creedence Clearwater Revival 0.9998791217803955
2vX5WL7s6UdeQyweZEx7PP Walk This Way Aerosmith 0.9998680353164673
37Tmv4NnfQeb0ZgUC4fOJj Sultans Of Swing Dire Straits 0.9998409748077393
3vV3cr2TpPqFk07zxYUbla Cum on Feel the Noize Quiet Riot 0.9997990131378174
0vOkmmJEtjuFZDzrQSFzEE Hit Me With Your Best Shot Pat Benatar 0.9997983574867249
0GTK6TesV108Jj5D3MHsYb Owner of a Lonely Heart Yes 0.9997979998588562
0LN0ASTtcGIbNTnjSHG6eO Pour Some Sugar On Me (2012) Def Leppard 0.9997971057891846
0FeCO85RKW8fDRytwXof2x Go Your Own Way Fleetwood Mac 0.9997912645339966
3qT4bUD1MaWpGrTwcvguhb Black Dog - Remaster Led Zeppelin 0.9997897744178772


In [139]:
# release the db connection used by the evaluation cells above
conn.close()