# Spotify API Scrape

## Setup

In [None]:
# run this once
#!pip install spotipy --upgrade

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import json
import os

# Library Code

In [None]:
def get_credentials(api_cred_fp):
    """Load API credentials from a JSON file.

    Args:
        api_cred_fp: Path to the JSON credentials file.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``api_cred_fp`` does not exist. Previously the
            function fell through and returned ``None``, which only surfaced
            later as a ``TypeError`` when the caller subscripted the result.
    """
    if not os.path.exists(api_cred_fp):
        raise FileNotFoundError("Credentials File Not Found: " + str(api_cred_fp))
    with open(api_cred_fp) as json_file:
        return json.load(json_file)

def load_cached_requests(fp):
    """Read a JSON cache file from disk.

    Args:
        fp: Path to the JSON cache file.

    Returns:
        The parsed JSON content, or an empty dict when ``fp`` does not exist.
        A status message is printed in both cases.
    """
    # Missing cache is not an error: start from an empty one.
    if not os.path.exists(fp):
        print("Failed to load cached data at:",fp)
        return {}

    with open(fp) as cache_file:
        cached = json.load(cache_file)
        print("Loaded cached data at:",fp)
    return cached

def save_cached_requests(save_dic, save_dir,fname):
    """Write a cache dictionary to disk as JSON.

    Args:
        save_dic: JSON-serializable object to store.
        save_dir: Target directory; created if it does not exist. It is joined
            to ``fname`` by plain string concatenation, so it must end with a
            path separator.
        fname: File name of the cache file inside ``save_dir``.
    """
    directory_missing = not os.path.exists(save_dir)
    if directory_missing:
        os.makedirs(save_dir)

    target_path = save_dir + fname
    with open(target_path, 'w+') as cache_file:
        json.dump(save_dic, cache_file)

def save_csvs(df_list,dir_list,fname_list):
    """Save several DataFrames to CSV, one file per DataFrame.

    The three lists are matched by position: ``df_list[i]`` is written to
    ``dir_list[i] + fname_list[i]`` without the index column. Missing
    directories are created. Nothing is written when the list lengths differ;
    a failure message is printed instead.

    Args:
        df_list: DataFrames to save.
        dir_list: Target directories (each must end with a path separator,
            since directory and file name are concatenated as strings).
        fname_list: CSV file names.
    """
    lengths_match = len(df_list) == len(dir_list) == len(fname_list)
    if not lengths_match:
        print("Save Failed: List lengths must be the same")
        return

    for frame, out_dir, out_name in zip(df_list, dir_list, fname_list):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir + out_name
        frame.to_csv(out_path, index=False)
        print("CSV Saved at:", out_path)

def save_csv(df, save_dir, fname):
    """Write one DataFrame to ``save_dir + fname`` as CSV without the index.

    Args:
        df: DataFrame to save.
        save_dir: Target directory; created if missing. Must end with a path
            separator because it is concatenated directly with ``fname``.
        fname: CSV file name.
    """
    directory_missing = not os.path.exists(save_dir)
    if directory_missing:
        os.makedirs(save_dir)

    out_path = save_dir + fname
    df.to_csv(out_path, index=False)
    print("CSV Saved at:", out_path)
    
def get_playlists_data_from_category(cat_id,master_playlist_dic,sp):
    """Fetch the playlist objects listed under a Spotify category.

    Args:
        cat_id: Category id passed to ``sp.category_playlists`` (country "US").
        master_playlist_dic: Cache mapping playlist id -> playlist data. It is
            updated in place with every playlist fetched here.
        sp: Client object providing ``category_playlists`` and ``playlist``.

    Returns:
        A list of playlist data objects, in the order the category listing
        returned them.
    """
    # api call
    category_listing = sp.category_playlists(category_id=cat_id, country="US")
    listed_ids = [entry['id'] for entry in category_listing['playlists']['items']]

    collected = []
    for pid in listed_ids:
        # A cached playlist is reused as-is, so it can be stale if the
        # playlist changed since it was cached.
        if pid not in master_playlist_dic:
            # api call
            master_playlist_dic[pid] = sp.playlist(pid)
        collected.append(master_playlist_dic[pid])
    return collected

def get_track_data_from_playlists(playlists_data, master_track_dic, master_audio_features_dic,sp):
    """Build a DataFrame of audio features for the tracks of the given playlists.

    Audio features already present in ``master_audio_features_dic`` are reused;
    the rest are requested through ``sp.audio_features`` and cached.

    Args:
        playlists_data: Iterable of playlist objects; each is read through
            ``playlist['tracks']['items']`` and every item through
            ``item['track']['id']``.
        master_track_dic: Cache mapping track id -> playlist item. Updated in
            place for tracks not seen before.
        master_audio_features_dic: Cache mapping track id -> audio-feature
            dict, or ``None`` when no features were returned for the track.
            Updated in place.
        sp: Client object providing ``audio_features(list_of_ids)``.

    Returns:
        A DataFrame with one row per playlist track that has audio features
        (duplicates across playlists are kept), with a fresh 0..n-1 index.
        Empty when no track has features.
    """
    # The audio-features endpoint accepts at most 100 ids per request, so
    # larger sets of uncached ids are requested in chunks.
    max_ids_per_request = 100

    all_track_feats = []
    for cur_playlist_data in playlists_data:
        # Collect usable track ids; entries without a track or id are skipped.
        cur_playlist_track_ids = []
        for item in cur_playlist_data['tracks']['items']:
            track = item['track']
            if not track:
                continue
            cur_track_id = track['id']
            if cur_track_id is None:
                continue
            # caches track metadata (the whole playlist item) if not cached
            if cur_track_id not in master_track_dic:
                master_track_dic[cur_track_id] = item
            cur_playlist_track_ids.append(cur_track_id)

        not_cached_ids = [
            track_id for track_id in cur_playlist_track_ids
            if track_id not in master_audio_features_dic
        ]

        if not_cached_ids:
            # The number printed is the count of tracks being requested.
            print("Making",len(not_cached_ids),"API Calls")
            for start in range(0, len(not_cached_ids), max_ids_per_request):
                chunk = not_cached_ids[start:start + max_ids_per_request]
                # api call
                chunk_feats = sp.audio_features(chunk)
                # Results are matched to requests by position and cached under
                # the REQUESTED id, so the lookup below always finds an entry
                # even if the returned object carries a different id.
                for requested_id, track_feats in zip(chunk, chunk_feats):
                    master_audio_features_dic[requested_id] = track_feats or None

        # Read every track of this playlist back out of the cache; tracks
        # without features (cached as None) produce no row.
        for track_id in cur_playlist_track_ids:
            cur_feats = master_audio_features_dic.get(track_id)
            if cur_feats:
                all_track_feats.append(cur_feats)

    # Build the frame once instead of concatenating per playlist.
    return pd.DataFrame(all_track_feats).reset_index(drop=True)

def main_scrape(cat_ids,csv_save_dir,csv_fname,cred_fp,cache_dir,
                master_track_fname, master_playlist_fname, master_audio_features_fname):
    """Scrape audio features for every category and save them as one CSV.

    For each category id the playlists and their tracks' audio features are
    collected (reusing the on-disk request caches), labelled with the category
    id in a ``genre`` column, and the caches are written back to disk after
    each category so progress survives an interruption.

    Args:
        cat_ids: Iterable of category ids (strings); each is also used as the
            ``genre`` label.
        csv_save_dir: Directory for the output CSV (must end with a separator).
        csv_fname: Output CSV file name.
        cred_fp: Path to the JSON credentials file with ``client_id`` and
            ``client_secret`` keys.
        cache_dir: Directory holding the JSON request caches (must end with a
            separator).
        master_track_fname: File name of the track cache.
        master_playlist_fname: File name of the playlist cache.
        master_audio_features_fname: File name of the audio-features cache.

    Returns:
        The combined DataFrame, de-duplicated on ``id`` (the first occurrence
        wins, so a track found in several categories keeps the genre of the
        first one) with a fresh 0..n-1 index. Empty when nothing was collected.

    Raises:
        FileNotFoundError: If no credentials could be loaded from ``cred_fp``.
    """
    creds = get_credentials(cred_fp)
    if creds is None:
        # Fail here with a clear message instead of a TypeError on the
        # subscript below.
        raise FileNotFoundError("Credentials File Not Found: " + str(cred_fp))
    client_credentials_manager = SpotifyClientCredentials(
        client_id=creds['client_id'],
        client_secret=creds['client_secret'],
    )
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    master_track_dic = load_cached_requests(cache_dir + master_track_fname)
    master_playlist_dic = load_cached_requests(cache_dir + master_playlist_fname)
    master_audio_features_dic = load_cached_requests(cache_dir + master_audio_features_fname)

    category_frames = []
    for cat_id in cat_ids:
        cat_playlists_data = get_playlists_data_from_category(cat_id,master_playlist_dic,sp)
        cat_df = get_track_data_from_playlists(cat_playlists_data, master_track_dic,master_audio_features_dic,sp)
        cat_df['genre'] = cat_id
        category_frames.append(cat_df)
        print("Caching " + cat_id + " requests...")
        save_cached_requests(master_track_dic,cache_dir,master_track_fname)
        save_cached_requests(master_playlist_dic,cache_dir,master_playlist_fname)
        save_cached_requests(master_audio_features_dic,cache_dir,master_audio_features_fname)

    # Concatenate once; pd.concat rejects an empty list, so handle that case.
    if category_frames:
        master_df = pd.concat(category_frames, ignore_index=True)
    else:
        master_df = pd.DataFrame()

    # With no tracks at all there is no 'id' column and drop_duplicates would
    # raise KeyError.
    if 'id' in master_df.columns:
        master_df = master_df.drop_duplicates(subset='id').reset_index(drop=True)

    save_csv(master_df,csv_save_dir,csv_fname)
    return master_df

# Loading Vars

In [None]:
# Category ids to scrape; main_scrape also writes each id into the 'genre' column.
genres = ['kpop','pop','rock']
# Output location of the combined audio-features CSV. The directory keeps its
# trailing '/' because save_csv builds the path as save_dir + fname.
audio_features_dir = "../data/test/all/metadata/"
audio_features_fname = "all_audio_features.csv"
# JSON credentials file; main_scrape reads its 'client_id' and 'client_secret' keys.
cred_fp = "../api_cred.json"
# Directory of the JSON request caches (trailing '/' required: paths are built
# as cache_dir + file name).
cache_dir = "../data/local/cached_requests/"
# Cache file names: track metadata, playlist data, per-track audio features.
master_track_fname = "track_data.json"
master_playlist_fname = "playlist_data.json"
master_audio_features_fname = "track_audio_features.json"

# Pipeline

In [None]:
# Run the full scrape for all configured genres: updates the request caches on
# disk, writes the combined CSV, and keeps the resulting DataFrame in `df`.
df = main_scrape(genres,audio_features_dir,audio_features_fname,cred_fp,cache_dir, 
                 master_track_fname, master_playlist_fname, master_audio_features_fname)

# Work in Progress

In [None]:
# add more songs
# Rebuilds, at notebook level, the same client and caches that main_scrape
# creates internally, so the cells below can query the API and inspect the
# cached playlist data.
creds = get_credentials(cred_fp)
client_id = creds['client_id']
client_secret = creds['client_secret']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
# Each load returns {} (and prints a message) when the cache file does not exist yet.
master_track_dic = load_cached_requests(cache_dir + master_track_fname)
master_playlist_dic = load_cached_requests(cache_dir + master_playlist_fname)
master_audio_features_dic = load_cached_requests(cache_dir + master_audio_features_fname)

In [None]:
# Playlist to inspect; it must already be in the playlist cache, otherwise the
# lookup below raises KeyError.
playlist_id = '660VrDfeGlKRFkMDy15JP0'
# Show the 'next' field of the cached track listing
# (presumably the URL of the next page of tracks -- confirm against the API docs).
master_playlist_dic[playlist_id]['tracks']['next']

In [None]:
# api call: ask the client for the page that follows the cached track listing.
# NOTE(review): the result is only held in `playlist_data`; it is not merged
# into master_playlist_dic or written back to the cache here.
playlist_data = sp.next(master_playlist_dic[playlist_id]['tracks'])

In [None]:
# Number of items in the newly fetched page vs. in the cached first page.
# Written as one tuple so the notebook displays both values: as two separate
# expression statements only the last one is shown and the first is discarded.
len(playlist_data['items']), len(master_playlist_dic[playlist_id]['tracks']['items'])

In [None]:
# Show the 'total' field of the cached track listing
# (presumably the playlist's full track count across all pages -- confirm).
master_playlist_dic[playlist_id]['tracks']['total']

# Old

In [None]:
# kpop_playlists_data = get_playlists_data_from_category("kpop",master_playlist_dic)
# kpop_df = get_track_data_from_playlists(kpop_playlists_data, master_track_dic,master_audio_features_dic)

# pop_playlists_data = get_playlists_data_from_category("pop",master_playlist_dic)
# pop_df = get_track_data_from_playlists(pop_playlists_data, master_track_dic,master_audio_features_dic)

# rock_playlists_data = get_playlists_data_from_category("rock",master_playlist_dic)
# rock_df = get_track_data_from_playlists(rock_playlists_data, master_track_dic,master_audio_features_dic)

In [None]:
# save_cached_requests(master_track_dic,cache_dir,master_track_fname)
# save_cached_requests(master_playlist_dic,cache_dir,master_playlist_fname)
# save_cached_requests(master_audio_features_dic,cache_dir,master_audio_features_fname)

In [None]:
# rock_df['genre'] = 'rock'
# pop_df['genre'] = 'pop'


In [None]:
# kpop_csv_dir = "../data/test/kpop/metadata/"
# kpop_fname = "kpop_metadata.csv"

# pop_csv_dir = "../data/test/pop/metadata/"
# pop_fname = "pop_metadata.csv"

# rock_csv_dir = "../data/test/rock/metadata/"
# rock_fname = "rock_metadata.csv"

# dfs = [kpop_df,pop_df,rock_df]
# csv_dirs = [kpop_csv_dir,pop_csv_dir,rock_csv_dir]
# csv_fnames = [kpop_fname,pop_fname,rock_fname]

# save_csvs(dfs,csv_dirs,csv_fnames)

In [None]:
# analysis_df = pd.read_csv("../data/test/kpop/metadata/kpop_metadata.csv")