# Spotify API Scrape

## Setup

In [None]:
# run this once
#!pip install spotipy --upgrade

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import os
import time

# Library Code

In [None]:
def get_credentials(api_cred_fp):
    """Load the API credentials stored as JSON at ``api_cred_fp``.

    Args:
        api_cred_fp: Path of the JSON credentials file.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``api_cred_fp`` does not exist. The previous
            version evaluated a bare string here and silently returned
            ``None``, which only failed later when the caller indexed it.
    """
    if not os.path.exists(api_cred_fp):
        raise FileNotFoundError("Credentials File Not Found: " + str(api_cred_fp))
    with open(api_cred_fp) as json_file:
        return json.load(json_file)

def load_cached_requests(fp,is_master):
    """Read a JSON cache file, or build an empty cache when it is missing.

    Args:
        fp: Path of the JSON cache file.
        is_master: When truthy and the file is missing, the returned dict is
            pre-populated with the four empty sections 'master_tracks',
            'master_playlists', 'master_audio_features' and 'playlist_tracks'.

    Returns:
        The parsed JSON content when ``fp`` exists, otherwise a new dict
        (empty, or holding the four empty sections when ``is_master``).
    """
    if not os.path.exists(fp):
        print("Failed to load cached data at:",fp)
        if not is_master:
            return {}
        section_names = ('master_tracks', 'master_playlists',
                         'master_audio_features', 'playlist_tracks')
        # each section gets its own fresh dict
        return {section: {} for section in section_names}

    with open(fp) as cache_file:
        cached = json.load(cache_file)
        print("Loaded cached data at:",fp)
    return cached

def save_cached_requests(save_dic, save_dir,fname):
    """Serialize ``save_dic`` as JSON to the path ``save_dir + fname``.

    Args:
        save_dic: JSON-serializable object to write.
        save_dir: Target directory; created when it does not exist. It is
            concatenated directly with ``fname``, so it has to end with a
            path separator for the file to land inside it.
        fname: File name appended to ``save_dir``.
    """
    dir_missing = not os.path.exists(save_dir)
    if dir_missing:
        os.makedirs(save_dir)
    out_path = save_dir+fname
    with open(out_path,'w+') as cache_file:
        cache_file.write(json.dumps(save_dic))

def save_csvs(df_list,dir_list,fname_list):
    """Write several DataFrames to CSV, one (directory, file name) pair each.

    Args:
        df_list: DataFrames to write.
        dir_list: Target directory for each DataFrame; created when missing.
        fname_list: File name for each DataFrame.

    The three lists must have the same length; otherwise nothing is written
    and a failure message is printed. Files are written without the index.
    """
    if not (len(df_list) == len(dir_list) == len(fname_list)):
        print("Save Failed: List lengths must be the same")
        return

    for cur_df, cur_dir, cur_fname in zip(df_list, dir_list, fname_list):
        # an empty directory means "current directory": nothing to create
        if cur_dir:
            os.makedirs(cur_dir, exist_ok=True)
        # os.path.join keeps the file inside cur_dir even when the directory
        # string has no trailing separator (plain concatenation did not)
        out_path = os.path.join(cur_dir, cur_fname)
        cur_df.to_csv(out_path,index=False)
        print("CSV Saved at:",out_path)

def save_csv(df, save_dir, fname):
    """Write ``df`` to ``fname`` inside ``save_dir`` as CSV (index omitted).

    Args:
        df: DataFrame to write.
        save_dir: Target directory; created (with parents) when missing.
        fname: Name of the CSV file.
    """
    # an empty directory means "current directory": nothing to create
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # os.path.join keeps the file inside save_dir even when the directory
    # string has no trailing separator (plain concatenation did not)
    out_path = os.path.join(save_dir, fname)
    df.to_csv(out_path,index=False)
    print("CSV Saved at:",out_path)
    
def get_playlists_data_from_category(cat_id,master_playlist_dic,sp,playlist_tracks_dic,master_track_dic):
    """Collect the track ids of the playlists listed for one category.

    Args:
        cat_id: Category id passed to ``sp.category_playlists``.
        master_playlist_dic: Cache of playlist data keyed by playlist id;
            playlists fetched here are added to it.
        sp: Client object providing ``category_playlists`` and ``playlist``.
        playlist_tracks_dic: Cache passed through to
            ``get_full_playlist_tracks``.
        master_track_dic: Cache passed through to
            ``get_full_playlist_tracks``.

    Returns:
        One list with the track ids of every playlist, in playlist order.
        The same id can appear more than once across playlists.
    """
    # api call
    category_page = sp.category_playlists(category_id=cat_id,limit=50,country="US")
    playlist_ids = [entry['id'] for entry in category_page['playlists']['items']]

    # first pass: make sure every playlist is in the master playlist cache;
    # a cached entry is reused as-is, so it may be outdated if the playlist
    # changed recently
    for pid in playlist_ids:
        if pid not in master_playlist_dic:
            # api call
            master_playlist_dic[pid] = sp.playlist(pid)

    # second pass: gather the track ids of each playlist
    category_track_ids = []
    for pid in playlist_ids:
        category_track_ids += get_full_playlist_tracks(
            playlist_tracks_dic, pid, master_playlist_dic, master_track_dic, sp)
    return category_track_ids

def _fetch_audio_features_with_retry(sp, batch_ids,
                                     retry_delays=(10, 10, 20, 30, 40, 50, 60)):
    """Return ``sp.audio_features(batch_ids)`` as a list, retrying on errors.

    After each failed attempt the function sleeps for the next value in
    ``retry_delays`` (seconds) and tries again. When all delays are used up,
    one last attempt is made and its exception is re-raised.
    """
    for delay in retry_delays:
        try:
            # api call; list() also turns a non-iterable reply into an error
            return list(sp.audio_features(batch_ids))
        except Exception:
            print("Error getting audio features, retrying...")
            time.sleep(delay)
    try:
        # api call (last attempt)
        return list(sp.audio_features(batch_ids))
    except Exception:
        print("Too many errors")
        raise


def get_track_data_from_playlists(genre_tracks,master_audio_features_dic,
                                  sp,master_playlist_dic):
    """Build a DataFrame of audio features for the given track ids.

    Args:
        genre_tracks: Iterable of track ids; ``None`` entries are skipped.
        master_audio_features_dic: Cache of audio features keyed by track id.
            Ids missing from it are requested through ``sp`` and the cache is
            updated in place (``None`` is stored when no features came back).
        sp: Client object providing ``audio_features``; only used when at
            least one id is not cached.
        master_playlist_dic: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame with one row per entry of ``genre_tracks`` whose cached
        features are not empty (repeated ids produce repeated rows).

    Raises:
        Exception: Whatever ``sp.audio_features`` raised once all retries for
            a batch have failed.
    """
    valid_genre_track_ids = [track_id for track_id in genre_tracks
                             if track_id is not None]

    # ids that still need a request; dict.fromkeys drops repeated ids while
    # keeping their order so that no id is requested twice
    not_cached_ids = list(dict.fromkeys(
        track_id for track_id in valid_genre_track_ids
        if track_id not in master_audio_features_dic))

    if not_cached_ids:
        print("Making",len(not_cached_ids),"API Calls")
        for start in range(0, len(not_cached_ids), 80):
            batch_ids = not_cached_ids[start:start + 80]
            batch_feats = _fetch_audio_features_with_retry(sp, batch_ids)
            # cache under the id that was requested, so the lookup below
            # always finds an entry for it
            for requested_id, track_feats in zip(batch_ids, batch_feats):
                master_audio_features_dic[requested_id] = track_feats if track_feats else None
            time.sleep(2)

    # iterating the requested tracks and accessing the cached audio features
    playlist_track_feats = []
    for track_id in valid_genre_track_ids:
        cur_feats = master_audio_features_dic[track_id]
        if cur_feats:
            playlist_track_feats.append(cur_feats)

    # building return df
    return pd.DataFrame(playlist_track_feats)

def main_scrape(cat_ids,csv_save_dir,csv_fname,cred_fp,cache_dir,all_requests_fname):
    """Scrape audio features for every category and write them to one CSV.

    Args:
        cat_ids: Category ids to scrape; each id is also stored in the
            'genre(s)' column of the rows it produced.
        csv_save_dir: Directory the resulting CSV is written to.
        csv_fname: File name of the resulting CSV.
        cred_fp: Path of the JSON credentials file; it must provide the keys
            'client_id' and 'client_secret'.
        cache_dir: Directory of the request cache (concatenated directly with
            ``all_requests_fname``).
        all_requests_fname: File name of the request cache.

    Returns:
        The cleaned DataFrame that was written to disk. When no track data
        was collected, an empty DataFrame is returned and no CSV is written.

    Raises:
        FileNotFoundError: If no credentials could be loaded from ``cred_fp``.
    """
    # initializing vars
    creds = get_credentials(cred_fp)
    if creds is None:
        # fail here with a clear message instead of a TypeError on indexing
        raise FileNotFoundError("Credentials File Not Found: " + str(cred_fp))
    client_id = creds['client_id']
    client_secret = creds['client_secret']
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    all_requests_dic = load_cached_requests(cache_dir + all_requests_fname,is_master=True)
    master_track_dic = all_requests_dic['master_tracks']
    master_playlist_dic = all_requests_dic['master_playlists']
    master_audio_features_dic = all_requests_dic['master_audio_features']
    playlist_tracks_dic = all_requests_dic['playlist_tracks']

    # collect the per-category frames and concatenate once at the end
    # (concatenating inside the loop re-copies all previous rows every time)
    cat_dfs = []
    for cat_id in cat_ids:
        cat_tracks = get_playlists_data_from_category(cat_id,master_playlist_dic,sp,
                                                      playlist_tracks_dic,master_track_dic)

        cat_df = get_track_data_from_playlists(cat_tracks,master_audio_features_dic,
                                               sp,master_playlist_dic)
        cat_df['genre(s)'] = cat_id
        cat_dfs.append(cat_df)
        print("Caching " + cat_id + " requests...")
        # the four dicts above are the very objects stored in
        # all_requests_dic, so entries added to them by the helpers are part
        # of what gets written here
        save_cached_requests(all_requests_dic,cache_dir,all_requests_fname)

    master_df = pd.concat(cat_dfs) if cat_dfs else pd.DataFrame()
    if 'id' not in master_df.columns:
        # no category produced any audio features: the cleaning steps below
        # need the 'id' column, so stop here
        print("No track data was scraped; nothing to save")
        return master_df.reset_index(drop=True)

    # cleaning master df
    master_df = master_df.drop_duplicates(subset=['id','genre(s)'])
    master_df = master_df.reset_index(drop=True)
    master_df = master_df.drop(['type','track_href','uri'],axis=1)
    master_df = get_track_metadata(master_df,master_track_dic).reset_index(drop=True)

    # write df to disk
    save_csv(master_df,csv_save_dir,csv_fname)
    return master_df

def _fetch_next_playlist_page(sp, page, retry_delays=(10, 10, 20, 30, 40, 50, 60)):
    """Return ``sp.next(page)``, retrying after each failed request.

    After each failed attempt the function sleeps for the next value in
    ``retry_delays`` (seconds) and tries again. When all delays are used up,
    one last attempt is made and its exception is re-raised.
    """
    for delay in retry_delays:
        try:
            # api call
            return sp.next(page)
        except Exception:
            print("Error getting playlist tracks, retrying...")
            time.sleep(delay)
    try:
        # api call (last attempt)
        return sp.next(page)
    except Exception:
        print("Too many failed attempts")
        raise


def _collect_playlist_page(playlist_items, master_track_dic):
    """Return the track ids found in ``playlist_items``.

    Items that are empty, have no 'track' value or no track id are skipped.
    Every item whose track id is not yet in ``master_track_dic`` is stored
    there under that id.
    """
    page_ids = []
    for item in playlist_items:
        if item and item['track'] and item['track']['id']:
            track_id = item['track']['id']
            page_ids.append(track_id)
            if track_id not in master_track_dic:
                master_track_dic[track_id] = item
    return page_ids


def get_full_playlist_tracks(playlist_tracks_dic, cur_id, master_playlist_dic,master_track_dic,sp):
    """Return the unique track ids of playlist ``cur_id``.

    Args:
        playlist_tracks_dic: Cache mapping playlist id to its list of track
            ids. A cached list is reused; otherwise the list is built and
            stored here.
        cur_id: Id of the playlist.
        master_playlist_dic: Playlist data keyed by playlist id; must contain
            ``cur_id`` when the playlist is not in ``playlist_tracks_dic``.
        master_track_dic: Cache of playlist items keyed by track id, updated
            in place with every track seen.
        sp: Client object providing ``next``; only used when the playlist has
            more than one page of tracks.

    Returns:
        The list of unique track ids, in first-seen order.

    Raises:
        Exception: Whatever ``sp.next`` raised once all retries for a page
            have failed. Nothing is cached for the playlist in that case.
    """
    if cur_id in playlist_tracks_dic:
        print("already cached playlist",cur_id)
    else:
        print("Scraping playlist",cur_id)
        # the first page comes with the playlist data itself
        cur_data = master_playlist_dic[cur_id]['tracks']
        total_songs = cur_data['total']
        track_ids = _collect_playlist_page(cur_data['items'], master_track_dic)

        # pause only when further requests are actually made; the first page
        # above is read from master_playlist_dic without any request
        if cur_data.get('next'):
            time.sleep(.5)

        # follow the 'next' links until the last page. Each fetched page is
        # processed exactly once; the previous loop re-processed the page it
        # already had and stopped on the inflated count, so the final page
        # was never read
        while cur_data and cur_data.get('next'):
            cur_data = _fetch_next_playlist_page(sp, cur_data)
            if not cur_data:
                break
            track_ids.extend(_collect_playlist_page(cur_data['items'], master_track_dic))
            print("Songs gathered:",len(track_ids),"of",total_songs)
            time.sleep(2)

        # stored only after every page was read, so a failed scrape does not
        # leave a partial list behind that would later count as cached
        playlist_tracks_dic[cur_id] = track_ids

    # remove duplicates while keeping first-seen order
    playlist_tracks_dic[cur_id] = list(dict.fromkeys(playlist_tracks_dic[cur_id]))
    return playlist_tracks_dic[cur_id]

def get_track_metadata(df,tracks_dic):
    """Add track name and artist columns to ``df`` and merge genres per track.

    Args:
        df: DataFrame with an 'id' column (track ids) and a 'genre(s)' column
            (one genre string per row). The columns 'artist_ids',
            'artist_names' and 'name' are added to it and 'genre(s)' is
            overwritten in place.
        tracks_dic: Mapping from track id to a playlist item whose 'track'
            entry provides 'name' and 'artists' (a list of dicts that may
            carry 'id' and 'name').

    Returns:
        A DataFrame with one row per track id (the first occurrence is kept).
        'artist_ids' and 'artist_names' hold the values joined with "//";
        'genre(s)' holds every genre the id appeared with, joined with "//".
        Tracks missing from ``tracks_dic`` get NaN in the three new columns.
    """
    # getting metadata for each track
    artist_ids = []
    track_names = []
    artist_names = []
    for targ_track_id in df['id']:
        cached_item = tracks_dic.get(targ_track_id)
        if cached_item is None:
            # all three lists must stay as long as df, otherwise the column
            # assignments below fail with a length mismatch
            artist_ids.append(np.nan)
            track_names.append(np.nan)
            artist_names.append(np.nan)
            continue
        track_data = cached_item['track']
        artists = track_data['artists']
        track_names.append(track_data['name'])
        # joining only the values that exist never leaves a dangling "//"
        artist_names.append("//".join(a['name'] for a in artists if a.get('name')))
        artist_ids.append("//".join(a['id'] for a in artists if a.get('id')))
    df['artist_ids'] = artist_ids
    df['artist_names'] = artist_names
    df['name'] = track_names

    # accounts for multiple genres (aka same song in different genre playlists):
    # group the genres per id in one pass instead of filtering df once per row
    genres_by_id = {}
    for track_id, genre in zip(df['id'], df['genre(s)']):
        genres_by_id.setdefault(track_id, []).append(genre)
    df['genre(s)'] = ["//".join(genres_by_id[track_id]) for track_id in df['id']]

    df = df.drop_duplicates(subset='id') # new duplicates created after combining genres
    return df

# Loading Vars

In [None]:
# category ids to scrape; passed to main_scrape as cat_ids, where each id is
# also written to the 'genre(s)' column
genres = ['kpop','pop','rock','hiphop','reggae','jazz','rnb','classical','country','edm_dance']
# directory and file name of the CSV written by main_scrape
audio_features_dir = "../data/test/all/metadata/"
audio_features_fname = "all_audio_features.csv"
# JSON credentials file read by get_credentials ('client_id' and 'client_secret' keys)
cred_fp = "../api_cred.json"
# directory and file name of the JSON request cache; the two are concatenated
# directly, so the directory has to end with a slash
cache_dir = "../data/local/cached_requests/"
all_requests_fname = "all_requests.json"

# Pipeline

In [None]:
# run the full scrape: updates the request cache, writes the CSV and returns the cleaned DataFrame
df = main_scrape(genres,audio_features_dir,audio_features_fname,cred_fp,cache_dir,all_requests_fname)

In [None]:
# display the DataFrame returned by main_scrape
df

# Work in Progress

In [None]:
# possibly iterate over albums from the category, or over search-result playlists for the genre keyword

In [None]:
# easy startup
all_requests_dic = load_cached_requests(cache_dir + all_requests_fname,True)
master_track_dic = all_requests_dic['master_tracks']
playlist_tracks_dic = all_requests_dic['playlist_tracks']
master_playlist_dic = all_requests_dic['master_playlists']
master_audio_features_dic = all_requests_dic['master_audio_features']

creds = get_credentials(cred_fp)
client_id = creds['client_id']
client_secret = creds['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# api call: request the categories for country "US" (limit 50)
cats = sp.categories(country="US",limit=50)

In [None]:
# print the id of every category item in the response
for cat in cats['categories']['items']:
    print(cat['id'])

In [None]:
# NOTE(review): bare list literal, it is only displayed and never assigned;
# looks like a leftover subset of `genres` - confirm whether it is still needed
['reggae','jazz','rnb','classical','country','edm_dance']

# Old

In [None]:
# # takes like 30 sec
# df['genre(s)'] = df['id'].apply(lambda x:"//".join(df[df['id'] == x]['genre(s)'].values))

In [None]:
# df = df.drop_duplicates(subset='id')

In [None]:
# df['genre(s)'].apply(lambda x: "//" in x).sum()

In [None]:
# genres = df[]['genre(s)'].value_counts().index
# genres_string = ""
# for i in range(len(genres)):
#     if i == len(genres) - 1:
#         genres_string += genres[i]
#     else:
#         genres_string += genres[i] + "//"
# genres_string    

In [None]:
# t = sp.track("3qqcavKhQkzyyqGC5UDIAL")

In [None]:
# df['genre'].value_counts()

In [None]:
# def test1():
#     main_dic = {"test": {"sub":2}}
#     test2(main_dic)
#     print(main_dic)

# def test2(main_dic):
#     sub_dic = main_dic['test']
#     sub_dic['sub'] = 1
# test1()

In [None]:
# len(tracks_dic.keys())

In [None]:
# all_requests_dic.keys()

In [None]:
# playlist_data = all_requests_dic['master_playlists']

In [None]:
# search_result = sp.category_playlists(category_id="rock",limit=50,country="US")

In [None]:
# search_result['playlists']['total']

In [None]:
# for p in search_result['playlists']['items']:
#     print(p['name'])

In [None]:
# playlist_data["37i9dQZF1DX5JcPJgYjGcf"]['name']

In [None]:
# for t in playlist_data["37i9dQZF1DX5JcPJgYjGcf"]['tracks']['items']:
#     print(t['track']['name'])

In [None]:

# df['artist_ids'].isnull().sum()

In [None]:
# not_in[0]
# track_data = sp.tracks(not_in[0:10])

In [None]:
# playlist_dic = all_requests_dic['master_playlists']


In [None]:
# next_data = sp.next(playlist_dic['660VrDfeGlKRFkMDy15JP0']['tracks'])

In [None]:
# next_data['items'][0]

In [None]:
# for track in next_data['items']:
#     if track['id']:
#         master_track_dic[track['id']] = track

In [None]:
# master_track_dic = load_cached_requests(cache_dir + master_track_fname)
# master_playlist_dic = load_cached_requests(cache_dir + master_playlist_fname)
# master_audio_features_dic = load_cached_requests(cache_dir + master_audio_features_fname)
# playlist_tracks_dic = load_cached_requests(cache_dir + playlist_tracks_fname)
# all_requests_dic = {"playlist_tracks": playlist_tracks_dic, "master_tracks":master_track_dic, 
#                     "master_playlists":master_playlist_dic, "master_audio_features": master_audio_features_dic}

In [None]:
# save_cached_requests(all_requests_dic,cache_dir,all_requests_fname)

In [None]:
# playlist_tracks_dic = load_cached_requests(cache_dir + playlist_tracks_fname)

In [None]:
# c = 0
# unique = []
# for p in playlist_tracks_dic.keys():
#     print(len(playlist_tracks_dic[p]))
#     for track in playlist_tracks_dic[p]:
#         if track in unique:
#             continue
#         else:
#             unique.append(track)
# len(unique)

In [None]:
# temp = [1]*1234
# split = [temp[x:x+100] for x in range(0,len(temp),100)]

In [None]:
# for col in df.columns:
#     print(col, len(df[df[col].apply(lambda x: x == -1)]))

In [None]:
# search_result = sp.category_playlists(category_id="kpop",country="US")

In [None]:
# search_result['playlists'].keys()

In [None]:
# master_playlist_dic['660VrDfeGlKRFkMDy15JP0']['tracks']['items'][98]['track']['id']

In [None]:
# kpop_playlists_data = get_playlists_data_from_category("kpop",master_playlist_dic)
# kpop_df = get_track_data_from_playlists(pop_playlists_data, master_track_dic,master_audio_features_dic)

# pop_playlists_data = get_playlists_data_from_category("pop",master_playlist_dic)
# pop_df = get_track_data_from_playlists(pop_playlists_data, master_track_dic,master_audio_features_dic)

# rock_playlists_data = get_playlists_data_from_category("rock",master_playlist_dic)
# rock_df = get_track_data_from_playlists(rock_playlists_data, master_track_dic,master_audio_features_dic)

In [None]:
# save_cached_requests(master_track_dic,cache_dir,master_track_fname)
# save_cached_requests(master_playlist_dic,cache_dir,master_playlist_fname)
# save_cached_requests(master_audio_features_dic,cache_dir,master_audio_features_fname)

In [None]:
# rock_df['genre'] = 'rock'
# pop_df['genre'] = 'pop'


In [None]:
# kpop_csv_dir = "../data/test/kpop/metadata/"
# kpop_fname = "kpop_metadata.csv"

# pop_csv_dir = "../data/test/pop/metadata/"
# pop_fname = "pop_metadata.csv"

# rock_csv_dir = "../data/test/rock/metadata/"
# rock_fname = "rock_metadata.csv"

# dfs = [kpop_df,pop_df,rock_df]
# csv_dirs = [kpop_csv_dir,pop_csv_dir,rock_csv_dir]
# csv_fnames = [kpop_fname,pop_fname,rock_fname]

# save_csvs(dfs,csv_dirs,csv_fnames)

In [None]:
# analysis_df = pd.read_csv("../data/test/kpop/metadata/kpop_metadata.csv")