# Import Packages

In [11]:
import sys
import spotipy
import spotipy.util as util
import sqlite3
from spotipy.client import SpotifyException

import numpy as np

from collections import Counter
from gensim.utils import tokenize

import time
import json
import ndjson
import tqdm

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ReadTimeout

from creds import *

## Get Data

Getting the initial 1 million playlists using a list of musical terms is useful to get started collecting songs. However, given that collected playlists can be deleted and new playlists are constantly being created, it is useful to continue collecting more playlists.

The Crawler will ensure that the retrieved playlists are not duplicates that have already been collected, and will add that playlist to the collection.

In [2]:
# Re-run music terms to find new playlists
music_terms =   ["a",
                 "summer",
                 "discography",
                 "collection",
                 "soundtrack",
                 "rock",
                 "hiphop",
                 "jazz",
                 "metal", 
                 "classical",
                 "pop",
                 "country",
                 "blues",
                 "rap",
                 "hits",
                 "top",
                 "workout",
                 "my",
                 "playlist",
                 "mood",
                 "hype",
                 "spotify",
                 "vibe",
                 "musical",
                 "recent",
                 "night",
                 "day",
                 "positive",
                 "all",
                 "dance",
                 "latin",
                 "southern",
                 "electropop",
                 "tropical",
                 "contemporary",
                 "alternative",
                 "gangster",
                 "soul",
                 "house",
                 "progressive",
                 "modern"
                ]

## Define Get Playlists

In [12]:
def find_playlists(session, w, max_count=750):
    try:
        res = session.search(w, limit=50, type='playlist')
        while res:
            for playlist in res['playlists']['items']:
                yield playlist
                max_count -= 1
                if max_count == 0:
                    return
            tries = 3
            while tries > 0:
                try:
                    res = session.next(res['playlists'])
                    tries = 0
                except SpotifyException as e:
                    tries -= 1
                    time.sleep(90)
                    if tries == 0:
                        raise
                
                except ReadTimeout:
                    print("Spotify timed out... Trying again...")
                    time.sleep(90)
                    
    except SpotifyException as e:
        status = e.http_status
        if status == 404:
            time.sleep(90)
            raise StopIteration
        raise

## Define Get Tracks

In [13]:
def track_yielder(session, playlist):
    res = session.user_playlist_tracks(playlist['owner'], playlist['id'],
              fields = 'items(track(id, name, artists(name, id), duration_ms)),next')
    while res:
        for track in res['items']:
            if track['track']:
                yield track['track']
        tries = 3
        while tries > 0:
            try:
                res = session.next(res)
                if not res or not res.get('items'):
                    return
                tries = 0
            except SpotifyException as e:
                if 400 <= e.http_status <= 499:
                    raise StopIteration
                tries -= 1
                time.sleep(90)
                if tries == 0:
                    raise e

## Get New Playlists from Search Terms

In [6]:
# Load Playlist object with current playlists
playlists = json.load(open("data/playlists.json"))

# Create set of seen terms and counters
words_seen = set()
new_playlists = {}
count = 0
dupes = 0

In [7]:
for term in music_terms:
    # Set the counter for the search term
    count_term = 0
    
    # Set the Counter for popular terms and only proceed with unique search terms
    if not term in words_seen:
        word_counts = Counter({term : 1})
    
        # Get 10,000 playlists for each unique term
        while count_term < 10000:
            for word, _ in word_counts.most_common():
                if not word in words_seen:
                    words_seen.add(word)
                    print(f"Total playlist count is {count}...")
                    print(f"Term playlist count is {count_term}...")
                    print('word>', word)
                    print("")
                    for playlist in find_playlists(session, word):
                        if playlist['id'] in playlists: 
                            dupes += 1
                        elif playlist['name'] and playlist['owner']: 
                            new_playlists[playlist['id']] = {
                                  'owner': playlist['owner']['id'],
                                  'name': playlist['name'],
                                  'id': playlist['id'],
                            }
                            count += 1
                            count_term += 1
                            for token in tokenize(playlist['name'], lowercase=True): 
                                word_counts[token] += 1
                    break
    else:
        print("Repeated Term")
        print("")

Total playlist count is 0...
Term playlist count is 0...
word> a

Total playlist count is 436...
Term playlist count is 436...
word> y

Total playlist count is 809...
Term playlist count is 809...
word> en

Total playlist count is 1225...
Term playlist count is 1225...
word> e

Total playlist count is 1569...
Term playlist count is 1569...
word> and

Total playlist count is 1917...
Term playlist count is 1917...
word> s

Total playlist count is 2467...
Term playlist count is 2467...
word> rodrigo

Total playlist count is 3109...
Term playlist count is 3109...
word> olivia

Total playlist count is 3798...
Term playlist count is 3798...
word> songs

Total playlist count is 4283...
Term playlist count is 4283...
word> the

Total playlist count is 4656...
Term playlist count is 4656...
word> of

Total playlist count is 4996...
Term playlist count is 4996...
word> best

Total playlist count is 5351...
Term playlist count is 5351...
word> hits

Total playlist count is 5662...
Term playlist c

Total playlist count is 51307...
Term playlist count is 9965...
word> as

Repeated Term

Total playlist count is 51685...
Term playlist count is 0...
word> metal

Total playlist count is 52012...
Term playlist count is 327...
word> metallica

Total playlist count is 52614...
Term playlist count is 929...
word> heavy

Total playlist count is 52961...
Term playlist count is 1276...
word> bass

Total playlist count is 53381...
Term playlist count is 1696...
word> boosted

Total playlist count is 54012...
Term playlist count is 2327...
word> car

Total playlist count is 54475...
Term playlist count is 2790...
word> house

Total playlist count is 54828...
Term playlist count is 3143...
word> deep

Total playlist count is 55215...
Term playlist count is 3530...
word> workout

Total playlist count is 55608...
Term playlist count is 3923...
word> gym

Total playlist count is 56041...
Term playlist count is 4356...
word> motivation

Total playlist count is 56440...
Term playlist count is 4755..

Total playlist count is 107961...
Term playlist count is 4648...
word> kidcore

Total playlist count is 108564...
Term playlist count is 5251...
word> internetcore

Total playlist count is 108736...
Term playlist count is 5423...
word> oddcore

Total playlist count is 108891...
Term playlist count is 5578...
word> nostalgiacore

Total playlist count is 109019...
Term playlist count is 5706...
word> o

Total playlist count is 109422...
Term playlist count is 6109...
word> w

Total playlist count is 109862...
Term playlist count is 6549...
word> go

Total playlist count is 110271...
Term playlist count is 6958...
word> indie

Total playlist count is 110668...
Term playlist count is 7355...
word> d

Total playlist count is 111095...
Term playlist count is 7782...
word> dream

Total playlist count is 111624...
Term playlist count is 8311...
word> dreams

Total playlist count is 112195...
Term playlist count is 8882...
word> sweet

Total playlist count is 112781...
Term playlist count is 94

ReadTimeout: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)

## Get Tracks for New Playlists

In [8]:
# Get collected tracks
conn = sqlite3.connect('data/songs.db')
c = conn.cursor()
c.execute("SELECT id FROM songs")

sql_tracks = c.fetchall()

In [9]:
# Clean tuple format from SQL response
tracks_seen = set()
for i in range(len(sql_tracks)):
    tracks_seen.add(sql_tracks[i][0])

In [14]:
len(new_playlists.keys())

95726

In [15]:
# Retrieve new tracks and save them
count = 0
with open('data/playlists.ndjson', 'a') as fout_playlists:
    with open('data/songs_ids.txt', 'a') as fout_song_ids:
        for playlist in tqdm.tqdm_notebook(new_playlists.values()):
            try:
                fout_playlists.write(json.dumps(playlist) + '\n')
                track_ids = []
                for track in track_yielder(session, playlist):
                    track_id = track['id']
                    if not track_id:
                        continue
                    if not track_id in tracks_seen:
                        c.execute("INSERT INTO songs VALUES (?, ?, ?)", 
                                  (track['id'], track['name'], track['artists'][0]['name']))
                    track_ids.append(track_id)
                    tracks_seen.add(track_id)
                fout_song_ids.write(' '.join(track_ids) + '\n')
                conn.commit()
            except:
                session = spotipy.Spotify(auth_manager = spotipy.SpotifyOAuth(
                    client_id = client_id,
                    client_secret = client_secret,
                    scope = scope,
                    username = '9qwpnpaxd9zbx1q1jrdh9bxz1',
                    redirect_uri = redirect_uri
                ))
                count += 1
                print(f"Playlist Failures: {count}")
                print(playlist)
conn.commit()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for playlist in tqdm.tqdm_notebook(new_playlists.values()):


HBox(children=(FloatProgress(value=0.0, max=95726.0), HTML(value='')))

Playlist Failures: 1
{'owner': 'zeeicktempeer', 'name': 'POLOBOYS', 'id': '0kUdxXsgYG2aUOD0AS0bfX'}


HTTP Error for GET to https://api.spotify.com/v1/playlists/6WsFVqMU2qSped5Bpfcjvc/tracks returned 404 due to Not found.


Playlist Failures: 2
{'owner': 'u2u66mhrb4qvg6v1noh2xzj8m', 'name': 'carti’s gf', 'id': '6WsFVqMU2qSped5Bpfcjvc'}


HTTP Error for GET to https://api.spotify.com/v1/playlists/7GPloZvmAgzpyjHfpvNv1d/tracks returned 404 due to Not found.


Playlist Failures: 3
{'owner': 'y3n9g32vu16eek7v2s3n5cp88', 'name': 'dreams playlist (h word)', 'id': '7GPloZvmAgzpyjHfpvNv1d'}
Playlist Failures: 4
{'owner': 'joseph1384', 'name': 'joeys country ', 'id': '5BR968UOijm253AqNpnIXO'}


HTTP Error for GET to https://api.spotify.com/v1/playlists/2xzRZDfhhKaVGKKPAumHp9/tracks returned 404 due to Not found.


Playlist Failures: 5
{'owner': '0tiuwqx8xxeoqqv0t1y0qfymx', 'name': 'Ceiling ', 'id': '2xzRZDfhhKaVGKKPAumHp9'}
Playlist Failures: 6
{'owner': '1111831730', 'name': 'Rat holder🚜', 'id': '5c4DQX6zpZx7s5FervWxe9'}
Playlist Failures: 7
{'owner': 'hamelinck', 'name': 'Drill USA: NY Edition (Brooklyn Drill)', 'id': '1xxUda8w0R1fPsP2ugT3aY'}
Playlist Failures: 8
{'owner': 'willoughbyseago', 'name': 'MUSICALS GALORE', 'id': '4dv2aVt0WSzzU8pBwlcAnJ'}

