In [2]:
import csv
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from collections import defaultdict
import os
from dotenv import load_dotenv

In [3]:
# Fetch environment variables
load_dotenv()
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')

# Set up Spotify API credentials
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
mil_tracks = pd.read_csv('spotify_data.csv')
known_tracks = pd.read_csv('intermediate.csv')
id = "1lzr43nnXAijIGYnCT8M8H"
found1 = mil_tracks[mil_tracks['track_id'] == id]
id = "spotify:track:" + id
found2 = known_tracks.loc[known_tracks['track_id'] == id]
if not found1.empty:
    print(found1)
if not found2.empty:
    print(found2)

In [11]:
import time
# ---------- YOUR FILEPATHS HERE ----------
FILENAME = "mpd.slice.9000-9999.json"
FOLDER = r"spotify_million_playlist_dataset/data"
PATH = FOLDER + "/" + FILENAME
# ---------- YOUR FILEPATHS HERE ----------

# fields = ['','artist_name','track_name','track_id','popularity','year','genre','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
# no popularity,year,genre fields, not queryable in spotipy api
features = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
track_list = []
# request_ids = []
print(f"Loading {PATH}...")
with open(PATH, "r") as playlist_file:
    batch = json.load(playlist_file)
    print(f"Loading complete! Beginning parsing...")
    
    # This is our running count, serves as the index for the song
    i = 0

    for playlist_index, playlist in enumerate(batch["playlists"]):

        print(f"\nScanned playlist {playlist_index} ({len(playlist['tracks'])} songs)")
        # List of IDs we will request for via spotipy
        request_track_ids = []

        # Intermediate list we use to store known data before we make API call
        track_info_100 = []
        for current_index, track in enumerate(playlist["tracks"]):
            current_id = track["track_uri"][14:]

            track_info = {
                '':i,
                'artist_name': track['artist_name'],
                'track_name': track['track_name'],
                'track_id': current_id,
                # ... we will add more attributes later
            }
            
            # Check if its in the millions, skip if so
            temp_mil = mil_tracks[mil_tracks['track_id'] == id]
            if not temp_mil.empty:
                continue
            
            i += 1
            # If song is undiscovered, we add its ID to the list.
            request_track_ids.append(current_id)
            track_info_100.append(track_info)

            # If we have 100 songs in our request_track_ids, then we make the call for all 100 and reset the list
            if len(request_track_ids) == 100 or current_index == len(playlist["tracks"]) - 1:
                success = False
                retry_after = 1
                while not success:
                    try:
                        response = sp.audio_features(request_track_ids) 
                        print(f"Successful response with {len(response)} entries")
                        for response_index, audio_features in enumerate(response):
                            if audio_features:  # Check if audio_features is not None
                                for feature in features:
                                    track_info_100[response_index][feature] = audio_features[feature]
                                track_list.append(track_info_100[response_index])
                        print(f"Total songs stored so far: {len(track_list) = }")
                        retry_after = 0
                        success = True
                        track_info_100 = []

                    except spotipy.exceptions.SpotifyException as e:
                        if e.http_status == 429:
                            # retry_after = int(e.headers.get('Retry-After', 1))
                            if retry_after < 32: retry_after *= 2 
                            print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                        else:
                            print(f"Error: HTTP Code {e.http_status}, {e}")
                        
                    except Exception as e:
                        print(f"Ran into other error: {e}")
                        print(f"Retrying...")
                    
                    finally:
                        time.sleep(retry_after)
                
                request_track_ids = []

Loading spotify_million_playlist_dataset/data/mpd.slice.9000-9999.json...
Loading complete! Beginning parsing...

Scanned playlist 0 (87 songs)
Successful response with 87 entries
Total songs stored so far: len(track_list) = 87

Scanned playlist 1 (76 songs)


Max Retries reached


Rate limit exceeded. Retrying after 2 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 4 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 8 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 16 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 32 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 32 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 32 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 32 seconds...


Max Retries reached


Rate limit exceeded. Retrying after 32 seconds...


KeyboardInterrupt: 

In [9]:
print(len(track_list))

for i in track_list[0:100]:
    print(i)

39270
{'': 0, 'artist_name': 'Ed Sheeran', 'track_name': 'Thinking Out Loud - Alex Adair Remix', 'track_id': '1cWepLwVVDgyqxso6G4TQU', 'danceability': 0.828, 'energy': 0.745, 'key': 2, 'loudness': -6.717, 'mode': 1, 'speechiness': 0.152, 'acousticness': 0.21, 'instrumentalness': 0.00113, 'liveness': 0.087, 'valence': 0.517, 'tempo': 117.008, 'duration_ms': 182185, 'time_signature': 4}
{'': 1, 'artist_name': 'filous', 'track_name': 'How Hard I Try', 'track_id': '7uzudGwu8EG5deuFD7EEJm', 'danceability': 0.742, 'energy': 0.564, 'key': 5, 'loudness': -8.761, 'mode': 1, 'speechiness': 0.0451, 'acousticness': 0.299, 'instrumentalness': 0.000742, 'liveness': 0.109, 'valence': 0.505, 'tempo': 120.012, 'duration_ms': 198000, 'time_signature': 4}
{'': 2, 'artist_name': 'Life of Dillon', 'track_name': 'Overload', 'track_id': '74OPbAeDYrpKJL4IlBiHxm', 'danceability': 0.662, 'energy': 0.733, 'key': 8, 'loudness': -5.83, 'mode': 1, 'speechiness': 0.0286, 'acousticness': 0.0128, 'instrumentalness': 2

In [10]:
# Write back data into CSV file, similar style as the million track dataset
fields = ['','artist_name','track_name','track_id']
fields = fields + features
OUTFILE = FILENAME[:-5] + ".csv" # csv version of input json file
with open(OUTFILE, "w", encoding="utf-8") as output:
    writer = csv.DictWriter(output, fieldnames=fields)
    writer.writeheader()
    writer.writerows(track_list)