# Query Spotify data on daily/weekly chart playlists

2022 June 05

In [1]:
from __future__ import division, print_function

#import json
import requests

In [2]:
import coding_club

In [3]:
import os

# OAuth access token passed to every Spotify Web API query below.
# Set the SPOTIFY_TOKEN environment variable to supply a fresh token without
# editing (and committing) a credential; when it is unset or empty, the
# hard-coded token below is used as before.

# for testing with expired tokens
#token = "BQDqHhMHQ8Foyth_kehVIYYmM_GR1_d8aBg3H3xQmxNF77FkOLBtnbBsGZ41JhXYIDcHQAP4qP9Qio7rDdY"
# good until ~1640 on Sunday June 05, 2022
token = (
    os.environ.get("SPOTIFY_TOKEN")
    or "BQC5M9AKsYreUsSMz7C_IhnpczsRU2orTUt1lYOFQSt7RT3OVL9XRWuBWsT0Q6zcGnkhen7aXBZo8z9UjDg"
)

# Generic Spotify API query methods

In [4]:
def spquery(url, token, **kwargs):
    """
    HTTPS GET query to Spotify Web API using client credentials,
    returns JSON-formatted output

    Input:
        url: full URL of the endpoint to query
        token: OAuth access token, sent as a Bearer token
        **kwargs: forwarded as URL query parameters
    Returns: the decoded JSON response body
    Raises: Exception if the decoded response carries an 'error' entry
    """
    x = requests.get(
        url,
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": "Bearer {:s}".format(token),
        },
        params = kwargs,
        # without a timeout, requests waits forever on a stalled connection
        timeout = 30,
    )
    j = x.json()
    if isinstance(j, dict) and 'error' in j:
        err = j['error']
        if isinstance(err, dict):
            # regular error object: {"error": {"status": ..., "message": ...}}
            status = err.get('status', x.status_code)
            message = err.get('message')
        else:
            # authentication-style error: 'error' is a plain string, optionally
            # accompanied by a top-level 'error_description'
            status = x.status_code
            description = j.get('error_description')
            if description is None:
                message = err
            else:
                message = "{}: {}".format(err, description)
        raise Exception("Error {}, {}".format(status, message))
    return j

In [5]:
def get_playlist_tracks(spid, token):
    """
    Obtain list of ordered tracks in a playlist,
    discarding playlist metadata (only care about individual tracks)
    Input: Spotify ID of playlist, OAuth token
    Returns: list of Spotify tracks, across all result pages
    """
    url = "https://api.spotify.com/v1/playlists/{:s}/tracks".format(spid)
    tracks = []
    # The endpoint returns one page of items at a time; each page's 'next'
    # field holds the URL of the following page and is null/absent on the last
    # page.  Reading only the first page silently truncates long playlists.
    while url:
        j = spquery(url, token=token)
        tracks.extend(entry['track'] for entry in j['items'])
        url = j.get('next')
    return tracks


def get_track_audio_features(spid, token):
    """
    Input: Spotify ID of a single track, OAuth token
    Returns: dict of track audio features
    """
    return spquery("https://api.spotify.com/v1/audio-features/{:s}".format(spid), token=token)


def get_multiple_track_audio_features(spids, token):
    """
    Input: iterable of Spotify track IDs, OAuth token
    Returns: list of dicts with track audio features, in the order of spids;
             empty list (and no query) if spids is empty
    """
    spids = list(spids)
    # The endpoint accepts a limited number of IDs per request (100 per the
    # Spotify Web API reference), so query in batches and concatenate.
    batch_size = 100
    features = []
    for start in range(0, len(spids), batch_size):
        batch = spids[start:start + batch_size]
        j = spquery("https://api.spotify.com/v1/audio-features", token=token, ids=",".join(batch))
        features.extend(j['audio_features'])
    return features

# Construct the data product we want: a list of tracks, with one dict per track holding only the most relevant/interesting audio features

In [6]:
def build_dataset_from_playlist(spid, token):
    """
    Input: spotify ID of playlist, OAuth token
    Output: our custom-munged dataset: the playlist's list of track dicts,
            each extended with selected audio features, with 'album' reduced
            to the album name, 'artists' replaced by a single 'artist' name,
            and bulky/unneeded keys removed
    Raises: ValueError if the audio features do not line up with the tracks,
            or if an audio feature name collides with an existing track key
    """
    tracks = get_playlist_tracks(spid, token=token)
    ids = [track['id'] for track in tracks]
    features = get_multiple_track_audio_features(ids, token=token)

    # attach interesting audio features
    # https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features
    wanted_features = (
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'key',
        'liveness',
        'loudness',
        'mode',
        'speechiness',
        'tempo',
        'time_signature',
        'valence',
    )

    # zip() stops at the shorter input, so a length mismatch would otherwise
    # leave trailing tracks without audio features and go unnoticed
    if len(features) != len(tracks):
        raise ValueError(
            "got {:d} audio feature entries for {:d} tracks".format(len(features), len(tracks))
        )

    for f, t in zip(features, tracks):
        # really basic sanity check; raised explicitly because assert
        # statements are stripped when Python runs with -O
        if f['id'] != t['id']:
            raise ValueError(
                "audio features for {!r} do not match track {!r}".format(f['id'], t['id'])
            )
        for k in wanted_features:
            if k in t:
                raise ValueError("track already has a {!r} entry".format(k))
            t[k] = f[k]

    # pare down unnecessarily long features
    for t in tracks:

        # keep only album name
        t['album'] = t['album']['name']

        # keep only the very first artist (rather than a list of all artist names)
        artist = t['artists'][0]['name']
        del t['artists']
        t['artist'] = artist

    # remove unneeded features
    # note, "id" is useful for uniquely identifying song on spotify, and for further data queries,
    # but suppress this b/c the envisioned users will not do any querying.
    delete_keys = (
        'available_markets',
        'disc_number',
        'episode',
        'external_ids',
        'external_urls',
        'href',
        'id',
        'is_local',
        'preview_url',
        'track',
        'track_number',
        'type',
        'uri',
    )
    for t in tracks:
        for k in delete_keys:
            # pop with a default: tolerate tracks that lack one of these keys
            t.pop(k, None)

    return tracks

# Save playlist data to JSON files (one file per playlist)

In [7]:
# Reference list of chart playlist identifiers, kept as plain strings.
# NOTE(review): presumably Spotify playlist IDs usable as the `spid` argument
# of build_dataset_from_playlist -- none of these names is referenced
# elsewhere in the visible notebook; confirm before relying on them.
top_50_weekly_global_id = "37i9dQZEVXbNG2KDcFcKOF"

# per-country weekly charts
top_50_weekly_australia_id = "37i9dQZEVXbK4fwx2r07XW"
top_50_weekly_brazil_id = "37i9dQZEVXbKzoK95AbRy9"
top_50_weekly_dominican_republic_id = "37i9dQZEVXbMPoK06pe7d6"
top_50_weekly_nigeria_id = "37i9dQZEVXbLw80jjcctV1"
top_50_weekly_norway_id = "37i9dQZEVXbLWYFZ5CkSvr"
top_50_weekly_south_korea_id = "37i9dQZEVXbJZGli0rRP3r"

# yearly playlists (note: these two names lack the `_id` suffix used above)
top_tracks_2021_usa = "37i9dQZF1DXbJMiQ53rTyJ"
top_tracks_2020_usa = "37i9dQZF1DXaqCgtv7ZR3L"

In [8]:
# Build the dataset for each (output filename, playlist ID) pair and write it
# out, printing a confirmation line per file.
# NOTE(review): the loop body calls `coding_club_playlist.write_dataset`, but
# the only import visible in this notebook is `import coding_club` -- confirm
# which module name is correct, otherwise a fresh run raises NameError.
for fname, spid in [
    
    ("viral_50_daily_usa.json", "37i9dQZEVXbKuaTI1Z1Afx"),
    ("top_50_daily_usa.json", "37i9dQZEVXbLRQDuF5jeBp"),
    ("top_50_weekly_usa.json", "37i9dQZEVXbLp5XoPON0wI"),
    
    ("top_hits_2021.json", "37i9dQZF1DX18jTM2l2fJY"),
    ("top_hits_2020.json", "2fmTTbBkXi8pewbUvG3CeZ"),
    ("top_hits_2019.json", "37i9dQZF1DWVRSukIED0e9"),
    ("top_hits_2018.json", "37i9dQZF1DXe2bobNYDtW8"),
    ("top_hits_2017.json", "37i9dQZF1DWTE7dVUebpUW"),
    
    # I suspect spotify curates these playlists
    # so that immediately adjacent years probably do not have repeats,
    # even if by chart-sampling they should...
    # (2011 entry left disabled for that reason)
    #("top_hits_2011.json", "37i9dQZF1DXcagnSNtrGuJ"),
    
    ("top_hits_2010.json", "37i9dQZF1DXc6IFF23C9jj"),
    ("top_hits_2000.json", "37i9dQZF1DWUZv12GM5cFk"),
    ("top_hits_1990.json", "37i9dQZF1DX4joPVMjBCAo"),
    ("top_hits_1980.json", "37i9dQZF1DWXbLOeOIhbc5"),
    ("top_hits_1970.json", "37i9dQZF1DWXQyLTHGuTIz"),
]:
    # one API round trip per playlist (tracks + audio features), then write to fname
    tracks = build_dataset_from_playlist(spid, token=token)
    coding_club_playlist.write_dataset(fname, tracks)
    print("wrote", fname)

wrote viral_50_daily_usa.json
wrote top_50_daily_usa.json
wrote top_50_weekly_usa.json
wrote top_hits_2021.json
wrote top_hits_2020.json
wrote top_hits_2019.json
wrote top_hits_2018.json
wrote top_hits_2017.json
wrote top_hits_2010.json
wrote top_hits_2000.json
wrote top_hits_1990.json
wrote top_hits_1980.json
wrote top_hits_1970.json
