# Spotify API Scrape

In [None]:
# run this once
#!pip install spotipy --upgrade

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import os

In [2]:
# Read the Spotify API credentials from a JSON file; the path is relative to
# the current working directory. The next cell reads the 'client_id' and
# 'client_secret' keys from the loaded dict.
with open("../api_cred.json") as json_file:
    creds = json.load(json_file)

In [3]:
# Build the spotipy client from the loaded credentials. `sp` is the
# module-level client that the scraping functions below use for API calls.
client_id = creds['client_id']
client_secret = creds['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [156]:
def load_cached_requests(fp):
    """Load a JSON cache file, falling back to an empty dict.

    Args:
        fp: path of the JSON file to read.

    Returns:
        The decoded JSON content when ``fp`` exists, otherwise ``{}``.
        A status line is printed in both cases.
    """
    # Guard clause: nothing cached yet, start from an empty cache.
    if not os.path.exists(fp):
        print("Failed to load cached data at:", fp)
        return {}

    with open(fp) as cache_file:
        cached = json.load(cache_file)
        print("Loaded cached data at:", fp)
    return cached

def save_cached_requests(save_dic, save_dir, fname):
    """Write ``save_dic`` as JSON to the file ``fname`` inside ``save_dir``.

    Args:
        save_dic: JSON-serialisable object to store.
        save_dir: target directory; created (including parents) when it is
            missing. May be given with or without a trailing separator. An
            empty string means the current working directory.
        fname: name of the file to write inside ``save_dir``.

    Any existing file at the target path is overwritten. A confirmation line
    with the written path is printed.
    """
    # os.makedirs("") raises, so only create a directory when one was named.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # os.path.join adds a separator only when save_dir does not already end
    # with one, so "dir/" keeps producing the same path as before while "dir"
    # no longer ends up as the sibling path "dir<fname>".
    out_path = os.path.join(save_dir, fname)
    with open(out_path, 'w') as outfile:
        json.dump(save_dic, outfile)
    print("Requests Cached Successfully at:", out_path)

def save_csvs(df_list, dir_list, fname_list):
    """Write each DataFrame to ``<dir>/<fname>`` as CSV without the index.

    The three lists are matched up position by position.

    Args:
        df_list: DataFrames to write.
        dir_list: target directory for each DataFrame; created (including
            parents) when missing. Entries may be given with or without a
            trailing separator; an empty string means the current directory.
        fname_list: file name for each DataFrame.

    When the list lengths differ, nothing is written and a failure message is
    printed instead of raising.
    """
    if not (len(df_list) == len(dir_list) == len(fname_list)):
        print("Save Failed: List lengths must be the same")
        return

    for cur_df, cur_dir, cur_fname in zip(df_list, dir_list, fname_list):
        # os.makedirs("") raises, so only create a directory when one was named.
        if cur_dir:
            os.makedirs(cur_dir, exist_ok=True)
        # os.path.join only inserts a separator when cur_dir lacks one, so the
        # file always lands inside the directory that was just created.
        csv_path = os.path.join(cur_dir, cur_fname)
        cur_df.to_csv(csv_path, index=False)
        print("CSV Saved at:", csv_path)

In [36]:
# Directory and file names of the on-disk JSON caches of earlier API responses.
cache_dir = "../data/local/cached_requests/"
master_track_fname = "track_data.json"
master_playlist_fname = "playlist_data.json"
master_audio_features_fname = "track_audio_features.json"
# load_cached_requests returns an empty dict when a cache file does not exist,
# so each of these starts out empty until its cache file has been written.
master_track_dic = load_cached_requests(cache_dir + master_track_fname)
master_playlist_dic = load_cached_requests(cache_dir + master_playlist_fname)
master_audio_features_dic = load_cached_requests(cache_dir + master_audio_features_fname)

Loaded cached data at: ../data/local/cached_requests/track_data.json
Loaded cached data at: ../data/local/cached_requests/playlist_data.json
Loaded cached data at: ../data/local/cached_requests/track_audio_features.json


In [106]:
def get_playlists_data_from_category(cat_id,master_playlist_dic):
    """Return the playlist data for every playlist listed under a category.

    Args:
        cat_id: category id passed to ``sp.category_playlists`` (country "US").
        master_playlist_dic: cache mapping playlist id -> playlist data. It is
            updated in place with every playlist that had to be fetched.

    Returns:
        List of playlist data dicts, in the order the category listing
        returned the playlists.
    """
    # api call
    category_listing = sp.category_playlists(category_id=cat_id, country="US")
    listed_ids = [entry['id'] for entry in category_listing['playlists']['items']]

    collected = []
    for pid in listed_ids:
        # A cached playlist is reused without an API call, so it may be
        # outdated if the playlist changed after it was cached.
        if pid not in master_playlist_dic:
            # api call
            master_playlist_dic[pid] = sp.playlist(pid)
        collected.append(master_playlist_dic[pid])
    return collected

In [146]:
def get_track_data_from_playlists(playlists_data, master_track_dic, master_audio_features_dic):
    """Build a DataFrame of audio features for the tracks of the given playlists.

    Both cache dicts are updated in place, and only tracks whose audio
    features are not cached yet are requested through ``sp.audio_features``.

    Args:
        playlists_data: iterable of playlist dicts; each one is read as
            ``playlist['tracks']['items']``, a list of entries with a
            ``'track'`` key (entries with an empty track or a ``None`` track
            id are skipped).
        master_track_dic: cache mapping track id -> playlist track entry;
            ids seen for the first time are added.
        master_audio_features_dic: cache mapping track id -> audio features
            dict, or ``None`` when the API returned no features for the id.

    Returns:
        DataFrame with one row per playlist track that has audio features
        (a track appearing several times yields several rows), indexed
        0..n-1. An empty DataFrame when no features were found.
    """
    playlist_frames = []
    for cur_playlist_data in playlists_data:
        # Collect the usable track ids of this playlist and cache their
        # metadata entries on first sight.
        cur_playlist_track_ids = []
        for item in cur_playlist_data['tracks']['items']:
            track = item['track']
            if not track:
                continue
            cur_track_id = track['id']
            if cur_track_id is None:
                continue
            master_track_dic.setdefault(cur_track_id, item)
            cur_playlist_track_ids.append(cur_track_id)

        # Only ids without cached audio features need an API request.
        not_cached_ids = [
            track_id for track_id in cur_playlist_track_ids
            if track_id not in master_audio_features_dic
        ]
        if not_cached_ids:
            print("Making",len(not_cached_ids),"API Calls")
            # api call
            # NOTE(review): spotipy documents a maximum of 100 ids per
            # audio_features call - confirm playlists never exceed that here.
            not_cached_data = sp.audio_features(not_cached_ids)
            # Assumes one result per requested id, in request order. Caching
            # under the *requested* id (rather than the id inside the result)
            # guarantees that the lookup below finds every requested track.
            for requested_id, track_feats in zip(not_cached_ids, not_cached_data):
                master_audio_features_dic[requested_id] = track_feats if track_feats else None

        # Tracks cached as None have no audio features and are left out.
        playlist_track_feats = []
        for track_id in cur_playlist_track_ids:
            cur_feats = master_audio_features_dic[track_id]
            if cur_feats:
                playlist_track_feats.append(cur_feats)
        if playlist_track_feats:
            playlist_frames.append(pd.DataFrame(playlist_track_feats))

    # Concatenate once at the end: avoids re-copying the accumulated frame for
    # every playlist and never feeds empty frames into pd.concat.
    ret_df = pd.concat(playlist_frames) if playlist_frames else pd.DataFrame()
    return ret_df.reset_index(drop=True)

In [163]:
# For each of the three categories: fetch the category's playlists, then build
# a DataFrame of audio features for their tracks. The three cache dicts are
# shared across the calls and are updated in place by them.
kpop_playlists_data = get_playlists_data_from_category("kpop",master_playlist_dic)
kpop_df = get_track_data_from_playlists(kpop_playlists_data, master_track_dic,master_audio_features_dic)

pop_playlists_data = get_playlists_data_from_category("pop",master_playlist_dic)
pop_df = get_track_data_from_playlists(pop_playlists_data, master_track_dic,master_audio_features_dic)

rock_playlists_data = get_playlists_data_from_category("rock",master_playlist_dic)
rock_df = get_track_data_from_playlists(rock_playlists_data, master_track_dic,master_audio_features_dic)

In [160]:
# Write the three cache dicts back to the JSON files they were loaded from, so
# later runs can reuse the responses collected so far.
save_cached_requests(master_track_dic,cache_dir,master_track_fname)
save_cached_requests(master_playlist_dic,cache_dir,master_playlist_fname)
save_cached_requests(master_audio_features_dic,cache_dir,master_audio_features_fname)

Requests Cached Successfully at: ../data/local/cached_requests/track_data.json
Requests Cached Successfully at: ../data/local/cached_requests/playlist_data.json
Requests Cached Successfully at: ../data/local/cached_requests/track_audio_features.json


In [162]:
# Output directory and file name of the CSV for each category.
kpop_csv_dir = "../data/test/kpop/metadata/"
kpop_fname = "kpop_metadata.csv"

pop_csv_dir = "../data/test/pop/metadata/"
pop_fname = "pop_metadata.csv"

rock_csv_dir = "../data/test/rock/metadata/"
rock_fname = "rock_metadata.csv"

# The three lists are parallel: entry i of each list belongs to the same
# category, which is how save_csvs pairs them up.
dfs = [kpop_df,pop_df,rock_df]
csv_dirs = [kpop_csv_dir,pop_csv_dir,rock_csv_dir]
csv_fnames = [kpop_fname,pop_fname,rock_fname]

save_csvs(dfs,csv_dirs,csv_fnames)

CSV Saved at: ../data/test/kpop/metadata/kpop_metadata.csv
CSV Saved at: ../data/test/pop/metadata/pop_metadata.csv
CSV Saved at: ../data/test/rock/metadata/rock_metadata.csv


In [79]:
# Read the k-pop CSV back into a DataFrame; this is the same path the CSV
# export cell above writes (kpop_csv_dir + kpop_fname).
analysis_df = pd.read_csv("../data/test/kpop/metadata/kpop_metadata.csv")