In [84]:
import json
import os
import time
from collections import Counter

import gensim
import spotipy
from dotenv import load_dotenv
from gensim.utils import tokenize
from spotipy import util
from spotipy.client import SpotifyException
from tqdm import tqdm

In [4]:
# Load variables from a .env file into the process environment so that the
# Spotify credentials never have to be written into the notebook itself.
load_dotenv()

# Each of these is None when the corresponding variable is not set.
CLIENT_ID = os.environ.get('CLIENT_ID')
CLIENT_SECRET = os.environ.get('CLIENT_SECRET')
USER_ID = os.environ.get('USER_ID')

In [96]:
# Redirect target of the OAuth flow.
# NOTE(review): presumably this has to match the redirect URI registered for
# the Spotify app -- confirm in the developer dashboard.
uri = 'http://127.0.0.1:8000/callback'

# An empty scope is passed: no additional user permissions are requested.
token = util.prompt_for_user_token(
    username=USER_ID,
    scope='',
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=uri,
)

# Authenticated client used by every API call in the cells below.
session = spotipy.Spotify(auth=token)

In [112]:
def find_playlists(session, w, max_count=5000):
    """Yield playlists from Spotify's search results for the query ``w``.

    Results are fetched one page at a time with ``session.next``; iteration
    ends when there is no further page, when ``max_count`` playlists have been
    yielded, or when the API answers with a 404.

    Args:
        session: client object exposing ``search`` and ``next``
            (a ``spotipy.Spotify`` instance in this notebook).
        w: the search query.
        max_count: stop after this many playlists have been yielded.

    Yields:
        The playlist dicts found under ``res['playlists']['items']``.

    Raises:
        SpotifyException: if the search fails, or fetching a page fails three
            times in a row, with an HTTP status other than 404.
    """
    try:
        # Original author's note: spotipy does not reliably apply these
        # parameters, which shows up as 404s -- those are treated as
        # "no more results" in the handler at the bottom.
        res = session.search(w, limit=10, type='playlist')
        while res:
            for playlist in res['playlists']['items']:
                # Defensive: skip empty entries so callers can index the
                # yielded dict without checking for None first.
                if playlist is None:
                    continue
                yield playlist
                max_count -= 1
                if max_count == 0:
                    # A plain return ends a generator. Raising StopIteration
                    # here is turned into RuntimeError by PEP 479 (3.7+).
                    return
            # Fetch the following page, retrying transient API failures.
            tries = 3
            while tries > 0:
                try:
                    res = session.next(res['playlists'])
                    tries = 0
                except SpotifyException:
                    tries -= 1
                    if tries == 0:
                        raise
                    time.sleep(0.2)
    except SpotifyException as e:
        if e.http_status == 404:
            return
        raise

In [113]:
# Smoke test: take only the first search hit for 'summer' and stop iterating,
# then display that playlist dict as the cell output.
for pl in find_playlists(session, 'summer'):
    break

pl

{'collaborative': False,
 'description': 'Time for Your Summer Rewind! We’ve made you a new playlist featuring your old summer favorites.',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/37i9dQZF1CAoNRs6uvbZB5'},
 'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1CAoNRs6uvbZB5',
 'id': '37i9dQZF1CAoNRs6uvbZB5',
 'images': [{'height': None,
   'url': 'https://lineup-images.scdn.co/summer-rewind-2020_LARGE-en.jpg',
   'width': None}],
 'name': 'Your Summer Rewind',
 'owner': {'display_name': 'Spotify',
  'external_urls': {'spotify': 'https://open.spotify.com/user/spotify'},
  'href': 'https://api.spotify.com/v1/users/spotify',
  'id': 'spotify',
  'type': 'user',
  'uri': 'spotify:user:spotify'},
 'primary_color': None,
 'public': None,
 'snapshot_id': 'MjY1NDgyMTksMDAwMDAwMDA5ODMxYWRhNjEyNjc5YzRiNzkxZDMzMzc3YzczN2Y5OQ==',
 'tracks': {'href': 'https://api.spotify.com/v1/playlists/37i9dQZF1CAoNRs6uvbZB5/tracks',
  'total': 50},
 'type': 'playlist',
 'uri': 'spotify:

In [114]:
# State shared with the crawl loop in the next cell.
# word_counts: how often each word has appeared in playlist names so far,
#              seeded with 'a' so the crawl has a first query to run.
# words_seen: words that have already been used as a search query.
# playlists_found: playlist id -> {'owner', 'name', 'id'} record.
word_counts = Counter(a=1)
words_seen = set()
playlists_found = dict()

In [None]:
# Snowball crawl: repeatedly search for the most frequent word that has not
# been used as a query yet, record every new playlist found, and feed the
# words of the playlist names back into word_counts as future queries.
while len(playlists_found) < 1_000:
    for word, _ in word_counts.most_common():
        if word in words_seen:
            continue
        words_seen.add(word)
        print('word>', word)
        for playlist in find_playlists(session, word):
            # Skip empty entries and playlists that were already collected.
            if not playlist or playlist['id'] in playlists_found:
                continue
            if playlist['name'] and playlist['owner']:
                playlists_found[playlist['id']] = {
                    'owner': playlist['owner']['id'],
                    'name': playlist['name'],
                    'id': playlist['id'],
                }
                # Named name_token so the OAuth `token` global from the
                # authentication cell is not overwritten.
                for name_token in tokenize(playlist['name'], lowercase=True):
                    word_counts[name_token] += 1
        # One query per pass, so the ranking is recomputed with the new counts.
        break
    else:
        # Every known word has been searched already; without this exit the
        # outer loop would spin forever below the 1,000 playlist target.
        break

In [100]:
def track_yielder(session, playlist):
    """Yield the tracks of ``playlist``, following pagination.

    Args:
        session: client object exposing ``user_playlist_tracks`` and ``next``
            (a ``spotipy.Spotify`` instance in this notebook).
        playlist: dict with the keys ``'owner'`` and ``'id'``, as stored in
            ``playlists_found``.

    Yields:
        The ``'track'`` dict of each item; items whose track is empty are
        skipped. The requested fields are id, name, artists(name, id) and
        duration_ms.

    Raises:
        SpotifyException: if fetching a page fails three times in a row with
            a status outside the 4xx range.
    """
    res = session.user_playlist_tracks(
        playlist['owner'],
        playlist['id'],
        fields='items(track(id, name, artists(name, id), duration_ms)),next'
    )
    while res:
        for item in res['items']:
            if item['track']:
                yield item['track']
        # Fetch the following page, retrying transient API failures.
        tries = 3
        while tries > 0:
            try:
                res = session.next(res)
                if not res or not res.get('items'):
                    # End of the playlist. A plain return is required here:
                    # raising StopIteration inside a generator becomes a
                    # RuntimeError under PEP 479 (Python 3.7+).
                    return
                tries = 0
            except SpotifyException as e:
                # Client errors end the iteration instead of being retried.
                # NOTE(review): this range includes 429 (rate limiting), so a
                # throttled playlist is silently truncated -- confirm intent.
                if 400 <= e.http_status <= 499:
                    return
                tries -= 1
                if tries == 0:
                    raise
                time.sleep(1)

In [101]:
import sqlite3

In [None]:
# Rebuild the song database and the word2vec training corpus from scratch.
# Outputs:
#   data/songs.db          sqlite table `songs` (id, name, artist)
#   data/playlists.ndjson  one JSON playlist record per line
#   data/songs_ids.txt     one line per playlist: its track ids, space-separated
os.makedirs('data', exist_ok=True)
if os.path.isfile('data/songs.db'):
    os.remove('data/songs.db')

conn = sqlite3.connect('data/songs.db')
cursor = conn.cursor()
cursor.execute('CREATE TABLE songs (id text primary key, name text, artist text)')
cursor.execute('CREATE INDEX name_idx on songs(name)')

tracks_seen = set()
with open('data/playlists.ndjson', 'w') as fout_playlists, \
        open('data/songs_ids.txt', 'w') as fout_song_ids:
    for playlist in tqdm(playlists_found.values()):
        fout_playlists.write(json.dumps(playlist) + '\n')
        track_ids = []
        for track in track_yielder(session, playlist):
            track_id = track['id']
            if not track_id:
                continue
            if track_id not in tracks_seen:
                # Remember the id: a track that appears in several playlists
                # must be inserted only once, otherwise the primary key on
                # `id` raises sqlite3.IntegrityError.
                tracks_seen.add(track_id)
                artists = track.get('artists') or []
                # Only the first listed artist is stored; None if there is none.
                artist_name = artists[0]['name'] if artists else None
                cursor.execute("INSERT INTO songs VALUES (?, ?, ?)",
                               (track_id, track['name'], artist_name))
            track_ids.append(track_id)
        fout_song_ids.write(' '.join(track_ids) + '\n')
        # Commit per playlist so an interrupted crawl keeps what it has so far.
        conn.commit()
conn.commit()

## Create embeddings with word2vec

In [None]:
from operator import itemgetter

In [None]:
class WordSplitter:
    """Re-iterable view of a text file as lists of whitespace-separated tokens.

    Every call to ``iter()`` reopens the file, so the same instance can be
    iterated more than once.
    """

    def __init__(self, filename):
        # Path of the text file; it is only opened when iteration starts.
        self.filename = filename

    def __iter__(self):
        # One list of tokens per line; a blank line yields an empty list.
        with open(self.filename) as handle:
            yield from map(str.split, handle)

# Training corpus: each line of data/songs_ids.txt is one playlist written as
# space-separated track ids, so a playlist plays the role of a "sentence" and
# a track id the role of a "word". WordSplitter reopens the file on every
# pass, which lets the model iterate over the corpus more than once.
model_input = WordSplitter('data/songs_ids.txt')
model = gensim.models.Word2Vec(model_input, min_count=4)

In [None]:
# Persist the trained model. The file is opened in a `with` block so the
# handle is flushed and closed deterministically, and the target directory is
# created first so a fresh checkout does not fail here.
os.makedirs('zoo/15', exist_ok=True)
with open('zoo/15/songs.word2vec', 'wb') as fout_model:
    model.save(fout_model)


In [None]:
# Open a connection to the song database for the lookup helpers below.
# This rebinds `conn`, replacing the connection created in the build cell.
conn = sqlite3.connect('data/songs.db')
def find_song(song_name, limit=10):
    """Look up songs whose name contains ``song_name``.

    Only songs whose id is present in ``model.wv.vocab`` are returned, ordered
    by the ``count`` stored for that id in the vocabulary, highest first.

    Args:
        song_name: substring to search for in the ``name`` column. ``%`` and
            ``_`` keep their SQL ``LIKE`` wildcard meaning.
        limit: maximum number of rows to return.

    Returns:
        A list of ``(id, name, artist, count)`` tuples.
    """
    c = conn.cursor()
    # Bind the pattern as a parameter: a title containing a quote character
    # (e.g. "Don't Stop") would otherwise break the statement, and arbitrary
    # input could inject SQL.
    c.execute("SELECT * FROM songs WHERE UPPER(name) LIKE ?",
              ('%' + song_name + '%',))
    res = sorted((x + (model.wv.vocab[x[0]].count,)
                  for x in c.fetchall() if x[0] in model.wv.vocab),
                 key=itemgetter(-1), reverse=True)
    return res[:limit]

# Example lookup: print every match as "id name artist count".
for t in find_song('the eye of the tiger'):
    print(*t)

In [None]:
def suggest_songs(song_id):
    """Return the songs the model considers most similar to ``song_id``.

    Args:
        song_id: id of the seed song.
            NOTE(review): presumably the id has to be in the model's
            vocabulary, otherwise gensim raises -- confirm.

    Returns:
        A list of ``(id, name, artist, similarity)`` tuples sorted by
        similarity, highest first. Similar ids that have no row in the
        ``songs`` table are left out.
    """
    c = conn.cursor()
    # Query through model.wv, the same accessor find_song uses; the
    # model-level most_similar is deprecated in gensim.
    similar = dict(model.wv.most_similar([song_id]))
    if not similar:
        return []
    # One bound parameter per id instead of pasting the ids into the SQL text.
    placeholders = ', '.join('?' * len(similar))
    c.execute("SELECT * FROM songs WHERE id in (%s)" % placeholders,
              list(similar))
    res = sorted((rec + (similar[rec[0]],) for rec in c.fetchall()),
                 key=itemgetter(-1),
                 reverse=True)
    return res


# Example: print the suggestions for one seed track id as
# "id name artist similarity".
for t in suggest_songs('4rr0ol3zvLiEBmep7HaHtx'):
    print(*t)