# Import Packages

In [1]:
import sys
import spotipy
import spotipy.util as util
import sqlite3
from spotipy.client import SpotifyException

import numpy as np

from collections import Counter
from gensim.utils import tokenize

import time
import json
import ndjson
import tqdm

import requests
from bs4 import BeautifulSoup

from creds import *

## Get Data

Getting the initial 1 million playlists using a list of musical terms is useful to get started collecting songs. However, given that collected playlists can be deleted and new playlists are constantly being created, it is useful to continue collecting more playlists.

The Crawler will ensure that the retrieved playlists are not duplicates that have already been collected, and will add that playlist to the collection.

In [2]:
# Re-run music terms to find new playlists
music_terms =   ["a",
                 "discography",
                 "collection",
                 "soundtrack",
                 "rock",
                 "hiphop",
                 "jazz",
                 "metal", 
                 "classical",
                 "pop",
                 "country",
                 "blues",
                 "rap",
                 "hits",
                 "top",
                 "workout",
                 "my",
                 "playlist",
                 "mood",
                 "hype",
                 "spotify",
                 "vibe",
                 "musical",
                 "recent",
                 "night",
                 "day",
                 "positive",
                 "all",
                 "dance",
                 "latin",
                 "southern",
                 "electropop",
                 "tropical",
                 "contemporary",
                 "alternative",
                 "gangster",
                 "soul",
                 "house",
                 "progressive",
                 "modern"
                ]

## Define Get Playlists

In [3]:
def find_playlists(session, w, max_count=1950):
    try:
        res = session.search(w, limit=50, type='playlist')
        while res:
            for playlist in res['playlists']['items']:
                yield playlist
                max_count -= 1
                if max_count == 0:
                    return
            tries = 3
            while tries > 0:
                try:
                    res = session.next(res['playlists'])
                    tries = 0
                except SpotifyException as e:
                    tries -= 1
                    time.sleep(0.2)
                    if tries == 0:
                        raise
    except SpotifyException as e:
        status = e.http_status
        if status == 404:
            raise StopIteration
        raise

## Define Get Tracks

In [4]:
def track_yielder(session, playlist):
    res = session.user_playlist_tracks(playlist['owner'], playlist['id'],
              fields = 'items(track(id, name, artists(name, id), duration_ms)),next')
    while res:
        for track in res['items']:
            if track['track']:
                yield track['track']
        tries = 3
        while tries > 0:
            try:
                res = session.next(res)
                if not res or not res.get('items'):
                    return
                tries = 0
            except SpotifyException as e:
                if 400 <= e.http_status <= 499:
                    raise StopIteration
                tries -= 1
                time.sleep(1)
                if tries == 0:
                    raise e

## Get New Playlists from Search Terms

In [5]:
# Load Playlist object with current playlists
playlists = json.load(open("data/playlists.json"))

# Create set of seen terms and counters
words_seen = set()
new_playlists = {}
count = 0
dupes = 0

In [6]:
for term in music_terms:
    # Set the counter for the search term
    count_term = 0
    
    # Set the Counter for popular terms and only proceed with unique search terms
    if not term in words_seen:
        word_counts = Counter({term : 1})
    
        # Get 10,000 playlists for each unique term
        while count_term < 10000:
            for word, _ in word_counts.most_common():
                if not word in words_seen:
                    words_seen.add(word)
                    print(f"Total playlist count is {count}...")
                    print(f"Term playlist count is {count_term}...")
                    print('word>', word)
                    print("")
                    for playlist in find_playlists(session, word):
                        if playlist['id'] in playlists: 
                            dupes += 1
                        elif playlist['name'] and playlist['owner']: 
                            new_playlists[playlist['id']] = {
                                  'owner': playlist['owner']['id'],
                                  'name': playlist['name'],
                                  'id': playlist['id'],
                            }
                            count += 1
                            count_term += 1
                            for token in tokenize(playlist['name'], lowercase=True): 
                                word_counts[token] += 1
                    break
    else:
        print("Repeated Term")
        print("")

Total playlist count is 0...
Term playlist count is 0...
word> a

Total playlist count is 897...
Term playlist count is 897...
word> songs

Total playlist count is 1819...
Term playlist count is 1819...
word> all

Total playlist count is 2780...
Term playlist count is 2780...
word> alle

Total playlist count is 4226...
Term playlist count is 4226...
word> hörspiele

Total playlist count is 6086...
Term playlist count is 6086...
word> die

Total playlist count is 7432...
Term playlist count is 7432...
word> folgen

Total playlist count is 8444...
Term playlist count is 8444...
word> drei

Total playlist count is 9689...
Term playlist count is 9689...
word> und

Total playlist count is 10410...
Term playlist count is 0...
word> discography

Total playlist count is 11124...
Term playlist count is 714...
word> complete

Total playlist count is 11603...
Term playlist count is 1193...
word> the

Total playlist count is 12485...
Term playlist count is 2075...
word> collection

Total playlist 

Total playlist count is 102816...
Term playlist count is 9272...
word> that

Total playlist count is 103893...
Term playlist count is 0...
word> hype

Total playlist count is 105160...
Term playlist count is 1267...
word> up

Total playlist count is 106007...
Term playlist count is 2114...
word> upbeat

Total playlist count is 107394...
Term playlist count is 3501...
word> happy

Total playlist count is 108626...
Term playlist count is 4733...
word> updated

Total playlist count is 109474...
Term playlist count is 5581...
word> weekly

Total playlist count is 110408...
Term playlist count is 6515...
word> dance

Total playlist count is 111126...
Term playlist count is 7233...
word> party

Total playlist count is 111891...
Term playlist count is 7998...
word> edm

Total playlist count is 112724...
Term playlist count is 8831...
word> summer

Total playlist count is 113961...
Term playlist count is 0...
word> spotify

Total playlist count is 114562...
Term playlist count is 601...
word> 

HTTP Error for GET to https://api.spotify.com/v1/search?query=canciones&type=playlist&offset=2000&limit=50 returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search?query=canciones&type=playlist&offset=2000&limit=50 returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search?query=canciones&type=playlist&offset=2000&limit=50 returned 404 due to Not found.


RuntimeError: generator raised StopIteration

## Get Tracks for New Playlists

In [7]:
# Get collected tracks
conn = sqlite3.connect('data/songs.db')
c = conn.cursor()
c.execute("SELECT id FROM songs")

sql_tracks = c.fetchall()

In [8]:
# Clean tuple format from SQL response
tracks_seen = set()
for i in range(len(sql_tracks)):
    tracks_seen.add(sql_tracks[i][0])

In [13]:
len(new_playlists.keys())

167021

In [15]:
# Retrieve new tracks and save them
count = 0
with open('data/playlists.ndjson', 'a') as fout_playlists:
    with open('data/songs_ids.txt', 'a') as fout_song_ids:
        for playlist in tqdm.tqdm_notebook(new_playlists.values()):
            try:
                fout_playlists.write(json.dumps(playlist) + '\n')
                track_ids = []
                for track in track_yielder(session, playlist):
                    track_id = track['id']
                    if not track_id:
                        continue
                    if not track_id in tracks_seen:
                        c.execute("INSERT INTO songs VALUES (?, ?, ?)", 
                                  (track['id'], track['name'], track['artists'][0]['name']))
                    track_ids.append(track_id)
                    tracks_seen.add(track_id)
                fout_song_ids.write(' '.join(track_ids) + '\n')
                conn.commit()
            except:
                session = spotipy.Spotify(auth_manager = spotipy.SpotifyOAuth(
                    client_id = client_id,
                    client_secret = client_secret,
                    scope = scope,
                    username = user_id,
                    redirect_uri = redirect_uri
                ))
                count += 1
                print(f"Playlist Failures: {count}")
                print(playlist)
conn.commit()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for playlist in tqdm.tqdm_notebook(new_playlists.values()):


HBox(children=(FloatProgress(value=0.0, max=167021.0), HTML(value='')))

Playlist Failures: 1
{'owner': 'tsn2h1b8vlso76s57r18ypbgy', 'name': 'Bad Day Songs', 'id': '39dsya7LXa138jJQ3SxCXT'}
Playlist Failures: 2
{'owner': 'ibrahimazalan', 'name': 'MALAY JIWANG SONGSSS', 'id': '52xB8CslxfwjgxKuoqkZUD'}
Playlist Failures: 3
{'owner': 'phillcrap', 'name': 'UFC entrance songs', 'id': '3ZBsmwzFkOiLEKHmp5GUIr'}
Playlist Failures: 4
{'owner': '31f6abr4mxtewvs5tcwqle53fbx4', 'name': 'Mizo Songs', 'id': '4Cgr2TjeSxWgv51kU5xciC'}
Playlist Failures: 5
{'owner': 'manelijamal', 'name': 'Reading Music (Focus, Concentration, Study)', 'id': '6tpnEvjXNhjYm1nXDVfCnB'}
Playlist Failures: 6
{'owner': '11181340712', 'name': 'Piano & Violin & Cello', 'id': '3XAqOmvG9TI0FEu59wgbkX'}
Playlist Failures: 7
{'owner': 'ninaxlice', 'name': 'ROCK/ METAL', 'id': '6dGI96SVg4FkI6txjIUOgT'}


Max Retries reached


Playlist Failures: 8
{'owner': 'kevin.engevik', 'name': 'Vibezzz', 'id': '5OE2k55TLljq6Ao4HZcr39'}
Playlist Failures: 9
{'owner': 'andybiersackismyhusband', 'name': 'underrated bops', 'id': '5pevnYh1W40Kl3enmVdx8p'}
Playlist Failures: 10
{'owner': 'annamahrt', 'name': 'well played', 'id': '52kTlgLYj9GOKFF90GbbBW'}
Playlist Failures: 11
{'owner': 'jlaroa69', 'name': 'Nightshift', 'id': '7khg1KoQh2axzxeGB4VPte'}
Playlist Failures: 12
{'owner': 'ewanney', 'name': 'nights like these', 'id': '286al9xj4atDSimvIZnfPU'}
Playlist Failures: 13
{'owner': '22rcgzjaspkgzhkpive7twziq', 'name': 'Naija 2019 latest', 'id': '4lP9dg7CTZEDubRObKimB8'}
Playlist Failures: 14
{'owner': '763fj5pu4ep7lqnjxgl9px1pt', 'name': 'Trap Town Records', 'id': '4nag0wtET3KIKRrj7wJQ2y'}
Playlist Failures: 15
{'owner': 'chingoncito', 'name': 'Latest Salsa Jams', 'id': '1XrWlxzw1uRX9qa2nWkQtm'}
Playlist Failures: 16
{'owner': '11766014', 'name': 'PTV, SWS, ATL, P!ATD, MCR, ADTR, BMTH, FIR, PVRIS, PARAMORE, TA, YMAS, TWL, A

Max Retries reached


Playlist Failures: 70
{'owner': '11145391561', 'name': 'Positive vipes', 'id': '0T196BOScdv6aBRSdt3HU1'}


Max Retries reached


Playlist Failures: 71
{'owner': 'broncosfan102134', 'name': 'Car- Positive rap', 'id': '2nQff1vRJ0u1SPD36k8Coy'}
Playlist Failures: 72
{'owner': 'tpf7li1ny1efmytmylm5kru85', 'name': 'Bakery Bitch Bangaz 🤘🏼', 'id': '0G2kpChKzAq1iFBz2zBLWm'}
Playlist Failures: 73
{'owner': 'keiraxquinn', 'name': 'sad bitch', 'id': '6Ie1y999UKgXd7IzV1WKcW'}
Playlist Failures: 74
{'owner': 'ruby_grace__', 'name': 'QUEEN SHIT ', 'id': '3UVanJ6YSWModhQ5pNmGIc'}
Playlist Failures: 75
{'owner': 'muk4jru1hy90ayehbqdn1t6vg', 'name': 'Stupid Like Bitch Fuck', 'id': '0tY2ZKWrcCkpzrMOH2N537'}
Playlist Failures: 76
{'owner': 'i3uvq32d09zi1xernmxyj45d9', 'name': 'i’m your bitch, you’re my bitch✨', 'id': '38sdMXo8imv98tOAML1mpv'}
Playlist Failures: 77
{'owner': '21c6b7szotlqr6u6p5zua45ni', 'name': 'Bad Bitches Playlist', 'id': '2UZnvJLPgWoOjlB2h3btVr'}
Playlist Failures: 78
{'owner': 'p9yk1yafz8c1ygqxmmkcn9obm', 'name': 'BAD BITCH (en español) ', 'id': '1wq9Boc3m7KymqCZJoGaLP'}
Playlist Failures: 79
{'owner': 'roosama