In [48]:
import csv
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from collections import defaultdict
import os
from dotenv import load_dotenv

In [49]:
# Fetch environment variables
load_dotenv()
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')

# Set up Spotify API credentials
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [50]:
mil_tracks = pd.read_csv('spotify_data.csv')
known_tracks = pd.read_csv('intermediate.csv')
id = "1lzr43nnXAijIGYnCT8M8H"
found1 = mil_tracks[mil_tracks['track_id'] == id]
id = "spotify:track:" + id
found2 = known_tracks.loc[known_tracks['track_id'] == id]
if not found1.empty:
    print(found1)
if not found2.empty:
    print(found2)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
import time
# ---------- YOUR FILEPATHS HERE ----------
FILENAME = "mpd.slice.0-999.json"
FOLDER = r"spotify_million_playlist_dataset/data"
PATH = FOLDER + "/" + FILENAME
# ---------- YOUR FILEPATHS HERE ----------

# fields = ['','artist_name','track_name','track_id','popularity','year','genre','danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
# no popularity,year,genre fields, not queryable in spotipy api
features = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
track_list = []
# request_ids = []
print(f"Loading {PATH}...")
with open(PATH, "r") as playlist_file:
    batch = json.load(playlist_file)
    print(f"Loading complete! Beginning parsing...")
    
    # This is our running count, serves as the index for the song
    i = 0

    for playlist_index, playlist in enumerate(batch["playlists"]):

        print(f"\nScanned playlist {playlist_index} ({len(playlist['tracks'])} songs)")
        # List of IDs we will request for via spotipy
        request_track_ids = []

        # Intermediate list we use to store known data before we make API call
        track_info_100 = []
        for current_index, track in enumerate(playlist["tracks"]):
            current_id = track["track_uri"][14:]

            track_info = {
                '':i,
                'artist_name': track['artist_name'],
                'track_name': track['track_name'],
                'track_id': current_id,
                # ... we will add more attributes later
            }
            
            # Check if its in the millions, skip if so
            temp_mil = mil_tracks[mil_tracks['track_id'] == id]
            if not temp_mil.empty:
                continue
            
            i += 1
            # If song is undiscovered, we add its ID to the list.
            request_track_ids.append(current_id)
            track_info_100.append(track_info)

            # If we have 100 songs in our request_track_ids, then we make the call for all 100 and reset the list
            if len(request_track_ids) == 100 or current_index == len(playlist["tracks"]) - 1:
                success = False
                retry_after = 1
                while not success:
                    try:
                        response = sp.audio_features(request_track_ids) 
                        print(f"Successful response with {len(response)} entries")
                        for response_index, audio_features in enumerate(response):
                            if audio_features:  # Check if audio_features is not None
                                for feature in features:
                                    track_info_100[response_index][feature] = audio_features[feature]
                                track_list.append(track_info_100[response_index])
                        print(f"Total songs stored so far: {len(track_list) = }")
                        retry_after = 0
                        success = True
                        track_info_100 = []

                    except spotipy.exceptions.SpotifyException as e:
                        if e.http_status == 429:
                            # retry_after = int(e.headers.get('Retry-After', 1))
                            if retry_after < 32: retry_after *= 2 
                            print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                        else:
                            print(f"Error: HTTP Code {e.http_status}, {e}")
                        
                    except Exception as e:
                        print(f"Ran into other error: {e}")
                        print(f"Retrying...")
                    
                    finally:
                        time.sleep(retry_after)
                
                request_track_ids = []

Loading spotify_million_playlist_dataset/data/mpd.slice.0-999.json...
Loading complete! Beginning parsing...

Scanned playlist 0 (52 songs)
Ran into other error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Retrying...
Successful response with 52 entries
Total songs stored so far: len(track_list) = 52

Scanned playlist 1 (39 songs)
Successful response with 39 entries
Total songs stored so far: len(track_list) = 91

Scanned playlist 2 (64 songs)
Successful response with 64 entries
Total songs stored so far: len(track_list) = 155


In [None]:
print(len(track_list))

for i in track_list[0:100]:
    print(i)

155
{'': 0, 'artist_name': 'Missy Elliott', 'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)', 'track_id': '0UaMYEvWZi0ZqiDOoHU3YI', 'danceability': 0.904, 'energy': 0.813, 'key': 4, 'loudness': -7.105, 'mode': 0, 'speechiness': 0.121, 'acousticness': 0.0311, 'instrumentalness': 0.00697, 'liveness': 0.0471, 'valence': 0.81, 'tempo': 125.461, 'duration_ms': 226864, 'time_signature': 4}
{'': 1, 'artist_name': 'Britney Spears', 'track_name': 'Toxic', 'track_id': '6I9VzXrHxO9rA9A5euc8Ak', 'danceability': 0.774, 'energy': 0.838, 'key': 5, 'loudness': -3.914, 'mode': 0, 'speechiness': 0.114, 'acousticness': 0.0249, 'instrumentalness': 0.025, 'liveness': 0.242, 'valence': 0.924, 'tempo': 143.04, 'duration_ms': 198800, 'time_signature': 4}
{'': 2, 'artist_name': 'Beyoncé', 'track_name': 'Crazy In Love', 'track_id': '0WqIKmW4BTrj3eJFmnCKMv', 'danceability': 0.664, 'energy': 0.759, 'key': 2, 'loudness': -6.583, 'mode': 0, 'speechiness': 0.209, 'acousticness': 0.00238, 'instrumentalness'

In [None]:
# Write back data into CSV file, similar style as the million track dataset
fields = ['','artist_name','track_name','track_id']
fields = fields + features
OUTFILE = FILENAME[:-5] + ".csv" # csv version of input json file
with open(OUTFILE, "w", encoding="utf-8") as output:
    writer = csv.DictWriter(output, fieldnames=fields)
    writer.writeheader()
    writer.writerows(track_list)