# Spotify API Scrape

In [1]:
# run this once
# !pip install spotipy --upgrade

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import os

In [3]:
# Load the Spotify API credentials from a JSON file kept one directory above
# the notebook. The 'client_id' and 'client_secret' keys are read from it in
# the following cell.
with open("../api_cred.json") as json_file:
    creds = json.load(json_file)

In [4]:
# Build the spotipy client from the loaded credentials. `sp` is the
# module-level client that the fetch functions below call directly
# (category_playlists, playlist, audio_features).
client_id = creds['client_id']
client_secret = creds['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [5]:
def load_cached_requests(fp):
    """Read a JSON cache file and return its contents.

    Args:
        fp: path of the JSON file to read.

    Returns:
        The decoded JSON content when ``fp`` exists, otherwise an empty dict.
        A status line is printed in both cases.
    """
    # Missing cache is not an error: start from an empty one.
    if not os.path.exists(fp):
        print("Failed to load cached data at:",fp)
        return {}

    with open(fp) as cache_file:
        cached = json.load(cache_file)
    print("Loaded cached data at:",fp)
    return cached

def save_cached_requests(save_dic, save_dir, fname):
    """Write a cache dict to ``<save_dir>/<fname>`` as JSON.

    Args:
        save_dic: JSON-serializable object to store.
        save_dir: directory to write into; created (including parents) when
            missing. A trailing path separator is optional.
        fname: file name inside ``save_dir``.

    Prints the path that was written.
    """
    # exist_ok avoids the check-then-create race; an empty save_dir means the
    # current directory, which needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # Join instead of concatenating so a save_dir without a trailing separator
    # still writes inside the directory rather than to "<save_dir><fname>".
    out_path = os.path.join(save_dir, fname)
    with open(out_path, 'w') as outfile:
        json.dump(save_dic, outfile)
    print("Requests Cached Successfully at:", out_path)

def save_csvs(data, save_dir, fname):
    """Write one CSV per genre.

    Args:
        data: mapping of genre name -> pandas DataFrame.
        save_dir: directory template; ``{}`` is replaced with the genre name.
        fname: file-name template; ``{}`` is replaced with the genre name.

    Each frame is written without its index to ``<save_dir>/<fname>`` (after
    formatting), and the written path is printed.
    """
    for genre in data:
        genre_save_dir = save_dir.format(genre)
        genre_path = os.path.join(genre_save_dir, fname.format(genre))
        # makedirs (not mkdir) so missing parent directories such as
        # ".../<genre>/" are created too; exist_ok makes re-runs safe.
        if genre_save_dir:
            os.makedirs(genre_save_dir, exist_ok=True)
        data[genre].to_csv(genre_path, index=False)
        print("CSV Saved at:", genre_path)

In [7]:
def get_playlists_data_from_category(cat_id,master_playlist_dic):
    """Return the playlist payloads listed under a Spotify browse category.

    The category's playlists are looked up for country "US" through the
    module-level client ``sp``. A playlist whose id is already a key of
    ``master_playlist_dic`` is served from that dict; otherwise it is fetched
    with ``sp.playlist`` and stored there, so the dict is updated in place.
    Cached payloads are returned as stored and can be outdated if the
    playlist changed after it was cached.

    Returns:
        list of playlist payloads, in the order the category listing gave them.
    """
    # api call
    listing = sp.category_playlists(category_id=cat_id,country="US")
    listed_ids = [entry['id'] for entry in listing['playlists']['items']]

    collected = []
    for pid in listed_ids:
        if pid not in master_playlist_dic:
            # api call; keep the payload so later runs can skip the request
            master_playlist_dic[pid] = sp.playlist(pid)
        collected.append(master_playlist_dic[pid])
    return collected

In [8]:
def get_track_data_from_playlists(playlists_data, master_track_dic, master_audio_features_dic):
    """Build a DataFrame of audio features for the tracks of the given playlists.

    Both ``master_*`` dicts act as caches and are updated in place. Only
    tracks whose audio features are not cached yet are requested through the
    module-level spotipy client ``sp``.

    Args:
        playlists_data: iterable of playlist payloads; the playlist entries
            are read from ``payload['tracks']['items']``.
        master_track_dic: cache mapping track id -> playlist entry (the item
            wrapping the track); filled for ids not present yet.
        master_audio_features_dic: cache mapping track id -> audio-features
            dict, or ``None`` when the API returned no features for that id.

    Returns:
        pandas.DataFrame with one row per playlist entry that has audio
        features, in playlist order, indexed 0..n-1. A track that occurs in
        several playlists yields one row per occurrence. Empty input yields
        an empty DataFrame.
    """
    # Spotify's audio-features endpoint accepts at most 100 ids per request.
    batch_size = 100

    all_track_feats = []
    for cur_playlist_data in playlists_data:

        # collect the usable track ids of this playlist, caching track metadata
        cur_playlist_track_ids = []
        for item in cur_playlist_data['tracks']['items']:
            # entries can be empty or carry a null 'track'
            track_info = item['track'] if item else None
            if not track_info:
                continue
            cur_track_id = track_info['id']
            # entries without an id cannot be looked up, skip them
            if cur_track_id is None:
                continue
            if cur_track_id not in master_track_dic:
                master_track_dic[cur_track_id] = item
            cur_playlist_track_ids.append(cur_track_id)

        # ids whose audio features still have to be fetched
        not_cached_ids = [
            track_id for track_id in cur_playlist_track_ids
            if track_id not in master_audio_features_dic
        ]

        if not_cached_ids:
            print("Making",len(not_cached_ids),"API Calls")
            for start in range(0, len(not_cached_ids), batch_size):
                batch_ids = not_cached_ids[start:start + batch_size]
                # api call
                batch_feats = sp.audio_features(batch_ids) or []
                # Cache under the id that was *requested*: the lookup below
                # uses the requested id, so keying by the id echoed in the
                # response would raise KeyError whenever the two differ.
                for requested_id, track_feats in zip(batch_ids, batch_feats):
                    master_audio_features_dic[requested_id] = track_feats if track_feats else None

        # read every track of the playlist back from the cache, skipping
        # tracks that have no audio features
        for track_id in cur_playlist_track_ids:
            cur_feats = master_audio_features_dic.get(track_id)
            if cur_feats:
                all_track_feats.append(cur_feats)

    # one DataFrame from all records instead of growing a frame with
    # pd.concat on every playlist
    return pd.DataFrame(all_track_feats)

In [6]:
# Location and file names of the on-disk JSON caches of earlier API responses.
cache_dir = "../data/local/cached_requests/"
master_track_fname = "track_data.json"
master_playlist_fname = "playlist_data.json"
master_audio_features_fname = "track_audio_features.json"
# Load each cache; load_cached_requests returns an empty dict when the file
# does not exist yet.
master_track_dic = load_cached_requests(cache_dir + master_track_fname)
master_playlist_dic = load_cached_requests(cache_dir + master_playlist_fname)
master_audio_features_dic = load_cached_requests(cache_dir + master_audio_features_fname)

Loaded cached data at: ../data/local/cached_requests/track_data.json
Loaded cached data at: ../data/local/cached_requests/playlist_data.json
Loaded cached data at: ../data/local/cached_requests/track_audio_features.json


In [20]:
# Category ids passed to get_playlists_data_from_category, one per genre.
target_genres = ["kpop", "pop", "rock"]

# genre -> DataFrame returned by get_track_data_from_playlists for that genre.
genres_data = {}
for genre in target_genres:
    # Playlist payloads of the category (taken from master_playlist_dic when cached).
    genre_data = get_playlists_data_from_category(genre, master_playlist_dic)
    # Audio-feature rows for the playlists' tracks; both track caches are updated in place.
    genre_df = get_track_data_from_playlists(genre_data, master_track_dic, master_audio_features_dic)
    genres_data[genre] = genre_df

In [21]:
# Write the three in-memory caches (updated by the fetch loop above) back to
# the JSON files they were loaded from.
save_cached_requests(master_track_dic,cache_dir,master_track_fname)
save_cached_requests(master_playlist_dic,cache_dir,master_playlist_fname)
save_cached_requests(master_audio_features_dic,cache_dir,master_audio_features_fname)

Requests Cached Successfully at: ../data/local/cached_requests/track_data.json
Requests Cached Successfully at: ../data/local/cached_requests/playlist_data.json
Requests Cached Successfully at: ../data/local/cached_requests/track_audio_features.json


In [22]:
# Directory and file-name templates; save_csvs fills each "{}" with the genre name.
csv_save_dir = "../data/test/{}/metadata/"
csv_fname = "{}_metadata.csv"
save_csvs(genres_data, csv_save_dir, csv_fname)

CSV Saved at: ../data/test/kpop/metadata/kpop_metadata.csv
CSV Saved at: ../data/test/pop/metadata/pop_metadata.csv
CSV Saved at: ../data/test/rock/metadata/rock_metadata.csv


In [23]:
# Read the kpop CSV written by the export cell back in for inspection.
analysis_df = pd.read_csv("../data/test/kpop/metadata/kpop_metadata.csv")

In [26]:
# Display the reloaded frame (the rendered output elides the middle rows).
analysis_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.692,0.886,1,-2.519,1,0.0519,0.2840,0.000000,0.3410,0.879,100.030,audio_features,4jdiqVbH1eUwc4MzDV7GMu,spotify:track:4jdiqVbH1eUwc4MzDV7GMu,https://api.spotify.com/v1/tracks/4jdiqVbH1eUw...,https://api.spotify.com/v1/audio-analysis/4jdi...,211013,4
1,0.618,0.760,6,-4.312,1,0.0596,0.0066,0.000000,0.1270,0.541,150.000,audio_features,6lM6yIDZ7kxFsgQCCfaLE2,spotify:track:6lM6yIDZ7kxFsgQCCfaLE2,https://api.spotify.com/v1/tracks/6lM6yIDZ7kxF...,https://api.spotify.com/v1/audio-analysis/6lM6...,221341,4
2,0.689,0.678,5,-4.287,0,0.1190,0.0702,0.000000,0.0889,0.335,145.883,audio_features,0yB4jrSwN0bFtFRDR5vyMj,spotify:track:0yB4jrSwN0bFtFRDR5vyMj,https://api.spotify.com/v1/tracks/0yB4jrSwN0bF...,https://api.spotify.com/v1/audio-analysis/0yB4...,200306,4
3,0.704,0.729,7,-3.851,1,0.0624,0.0024,0.000000,0.2180,0.710,129.992,audio_features,7fK0csBoqbcgUuWGV0cpoD,spotify:track:7fK0csBoqbcgUuWGV0cpoD,https://api.spotify.com/v1/tracks/7fK0csBoqbcg...,https://api.spotify.com/v1/audio-analysis/7fK0...,203520,4
4,0.681,0.814,11,-2.830,0,0.0603,0.1450,0.000000,0.5560,0.876,89.990,audio_features,6xy9JYFKcpb9L62PRNnYW5,spotify:track:6xy9JYFKcpb9L62PRNnYW5,https://api.spotify.com/v1/tracks/6xy9JYFKcpb9...,https://api.spotify.com/v1/audio-analysis/6xy9...,207053,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,0.765,0.769,2,-3.160,1,0.0595,0.1670,0.000000,0.0854,0.581,103.006,audio_features,02UNF1uo5f0UNgc1qx5XGt,spotify:track:02UNF1uo5f0UNgc1qx5XGt,https://api.spotify.com/v1/tracks/02UNF1uo5f0U...,https://api.spotify.com/v1/audio-analysis/02UN...,203057,4
1163,0.719,0.652,2,-4.222,0,0.0409,0.0322,0.000005,0.2700,0.823,112.005,audio_features,4N5jdijK8DSBbBd4A30DUe,spotify:track:4N5jdijK8DSBbBd4A30DUe,https://api.spotify.com/v1/tracks/4N5jdijK8DSB...,https://api.spotify.com/v1/audio-analysis/4N5j...,185760,4
1164,0.685,0.777,1,-4.654,0,0.0630,0.3530,0.000000,0.3290,0.417,95.049,audio_features,21vcyfOwNirS7n968tunZl,spotify:track:21vcyfOwNirS7n968tunZl,https://api.spotify.com/v1/tracks/21vcyfOwNirS...,https://api.spotify.com/v1/audio-analysis/21vc...,218897,4
1165,0.409,0.743,1,-4.536,1,0.0496,0.2250,0.000000,0.2760,0.441,167.864,audio_features,3zM2yi75oB3v00sZvWUzIn,spotify:track:3zM2yi75oB3v00sZvWUzIn,https://api.spotify.com/v1/tracks/3zM2yi75oB3v...,https://api.spotify.com/v1/audio-analysis/3zM2...,239572,4
