In [1]:
#Imports
import json
import time
from pprint import pprint
from urllib.parse import quote

import requests

from apikeys import CLIENT_ID, CLIENT_SECRET

## Restructure Billboard Output to a more useful format

In [5]:
#Read in Data
# Load the Billboard Hot 100 JSON file and keep only the value stored under
# its top-level 'data' key (iterated period by period in the next step).
json_path = 'Outputs/billboard-hot-100.json'
# Explicit UTF-8 so decoding does not depend on the platform's default locale
# encoding; the handle gets its own name instead of being shadowed by the payload.
with open(json_path, encoding='utf-8') as billboard_file:
    data = json.load(billboard_file)['data']

In [6]:
#Extract just the date and song titles (Formatted for use in Spotify API)
# Each entry of 'top_songs' is a query-string fragment that is later appended
# to 'search?q='. Title and artist are percent-encoded so that reserved
# characters inside them ('&', '#', '+', ...) cannot split or truncate the
# query; the trailing '&type=track&limit=1' stays literal on purpose.
cleaned_list = []
for period in data:
    chart = period['songs']
    song_list = [
        f"track:{quote(str(song['title']), safe='')} "
        f"artist:{quote(str(song['artist']), safe='')}&type=track&limit=1"
        for song in chart['songs']
    ]
    # One record per chart week: the week label plus its list of search strings.
    cleaned_list.append({'date': chart['week'],
                         'top_songs': song_list})

In [7]:
#Save JSON
# Persist the cleaned weekly lists, wrapped under a top-level 'data' key.
cleaned_payload = {'data': cleaned_list}
with open('./Outputs/cleaned_billboard.json', mode='w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    json.dump(cleaned_payload, out_file, indent=4, ensure_ascii=False)

## Get Track Data from Spotify API

In [6]:
#Get Spotify AUTH Token
def get_auth():
    """Request a Spotify access token and return headers that carry it.

    Posts a client-credentials grant (CLIENT_ID / CLIENT_SECRET) to the
    Spotify accounts token endpoint. Prints 'Re-Auth' every time it is called.

    Returns:
        dict: ``{'Authorization': 'Bearer <access_token>'}``, ready to pass as
        ``headers=`` to ``requests``.

    Raises:
        requests.HTTPError: the token endpoint answered with a 4xx/5xx status.
        requests.Timeout: the endpoint did not respond within 30 seconds.
        KeyError: the response body contains no 'access_token' field.
    """
    print('Re-Auth')
    auth_url = 'https://accounts.spotify.com/api/token'

    auth_response = requests.post(
        auth_url,
        data={
            'grant_type': 'client_credentials',
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
        },
        # Without a timeout a stalled connection would block the run forever.
        timeout=30,
    )
    # Surface rejected credentials as an HTTP error rather than an opaque
    # KeyError on the missing 'access_token' below.
    auth_response.raise_for_status()

    access_token = auth_response.json()['access_token']
    return {'Authorization': f'Bearer {access_token}'}

In [None]:
#Get Spotify information for each track in the Billboard list
#Note: This section takes ~3 hours to run with appropriate API Usage times, be patient
# For every period in cleaned_list, each track is looked up through the search
# endpoint; the first hit's first artist is then queried for genres and the
# track itself for audio features. Results accumulate in spotify_data as one
# {'date', 'top_tracks'} dict per period. 'total' counts lookups attempted and
# 'failed' counts lookups whose processing raised.
head = get_auth()
base_url = 'https://api.spotify.com/v1/'
total = 0
failed = 0
spotify_data = []
#Loop through each period
for period in cleaned_list:
    track_list = []
    #Loop through each track
    for track in period['top_songs']:
        #Sleep as to avoid Spotify API Rate Limits
        time.sleep(1)
        #Get a new Access Token every 750 tracks so it doesn't time out
        #(skipped at total == 0: a token was fetched just before the loop)
        if total > 0 and total % 750 == 0:
            head = get_auth()
        total += 1
        response = requests.get(base_url + 'search?q=' + track, headers=head).json()
        #Check if track exists
        try:
            #Get TrackID, ArtistID, Title, and Artist from the first search hit
            item = response['tracks']['items'][0]
            first_artist = item['artists'][0]
            track_id = item['id']
            track_title = item['name']
            track_artist = first_artist['name']
            artist_id = first_artist['id']
            #Get artist genres (requested from the artists endpoint)
            genres = requests.get(base_url + 'artists/' + artist_id, headers=head).json()
            #Get track features
            features = requests.get(base_url + 'audio-features/' + track_id, headers=head).json()
            #Add ID, Artist, Title, and Features dict to Track List
            track_list.append({'track_id': track_id,
                               'track_title': track_title,
                               'track_artist': track_artist,
                               'features': features,
                               #Keep at most the first three genres
                               'genres': genres['genres'][:3]})
        except Exception as err:
            # 'Exception' rather than a bare except: a kernel interrupt / Ctrl-C
            # (KeyboardInterrupt) must still be able to stop this long run.
            failed += 1
            print(f'{track} not found. Failed {failed}/{total} tracks. {(1-(failed/total))*100}% Success Rate')
            # Show what actually went wrong (missing key, empty result, network error, ...).
            print(f'{type(err).__name__}: {err}')
            print(response)

    print('----------------------------Appending Period----------------------------')

    spotify_data.append({'date': period['date'],
                         'top_tracks': track_list})

In [11]:
#Save JSON
# Write the collected per-period Spotify results to disk as a JSON array.
features_path = './Outputs/features.json'
with open(features_path, mode='w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    json.dump(spotify_data, out_file, indent=4, ensure_ascii=False)