link for outline
<a id='acquire_playlist_data_working'></a>

# Imports

In [4]:
import pandas as pd
import numpy as np
import requests

import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src import api


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# What are we doing?

## Data pull stuff
- Get list of playlist urls for specific user to use in API endpoint
- Get songs off a specific playlist and store as a list
- Look up spotify data for songs in specific playlist and store them
- Figure out what raw data to use
- Download raw data for songs in specific playlist
- Operationalize raw data further

### Get user profile + playlist

#### Format POST requests (details in spotify_scratchpad.ipynb)

In [3]:
# Spotify client credentials are read from the environment so that no secret
# is stored in the notebook. The id/secret pair that used to be written here
# should be treated as compromised and rotated in the Spotify dashboard.
CLIENT_ID = os.environ['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIFY_CLIENT_SECRET']

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST the client-credentials grant to obtain a bearer token
auth_response = requests.post(AUTH_URL, {
    'grant_type' : 'client_credentials',
    'client_id' : CLIENT_ID,
    'client_secret' : CLIENT_SECRET,
})

# fail here, rather than with a KeyError below, if the credentials are rejected
auth_response.raise_for_status()

# convert response to json
auth_response_data = auth_response.json()

# save the access token; the `headers` cell below needs it to exist
access_token = auth_response_data['access_token']

#### Set the authorization headers sent with each API request

In [4]:
# Bearer-token authorization header passed to every requests.get call below
headers = dict(
    Authorization='Bearer {token}'.format(token=access_token)
)

#### Set base URL of all spotify API endpoints

In [5]:
# Common prefix that the endpoint paths used in this notebook are appended to
BASE_URL = 'https://api.spotify.com/v1/'

#### Get profile

In [6]:
# Spotify user whose playlists are listed
user_id = 'bothsidesdoit'

# GET the user's playlists and keep only the decoded JSON body
user_profile = requests.get(
    BASE_URL + 'users/' + user_id + '/playlists',
    headers=headers
).json()

#### Get playlist url endpoints for api

In [7]:
# one API href per playlist entry returned for the user
playlist_urls = [playlist['href'] for playlist in user_profile['items']]

playlist_number = len(playlist_urls)

In [7]:
playlist_urls

['https://api.spotify.com/v1/playlists/1oqsGOPQF8TqVtiFdfqkd6',
 'https://api.spotify.com/v1/playlists/2NcJyzjvUlvKQJsScpLOpu',
 'https://api.spotify.com/v1/playlists/1Zd7dmudNrNwpfShtK6ywf',
 'https://api.spotify.com/v1/playlists/1u6zqNx6dIAwvKHedLl0E2',
 'https://api.spotify.com/v1/playlists/5kXBwKZOEVT7PXV6rZuhnV',
 'https://api.spotify.com/v1/playlists/4fcUMfrQr7h3j6UpVndlKq']

#### Get specific playlist from above url

In [8]:
# request the first playlist url; the Response object is decoded in the next cell
single_playlist_request = requests.get(playlist_urls[0], headers=headers)

In [9]:
# Replace the Response object with its decoded JSON body.
# NOTE(review): the name now holds a different kind of object, so re-running
# this cell on its own fails (the decoded body has no .json method).
single_playlist_request = single_playlist_request.json()

#### Get list of song ids from the playlist request

In [10]:
# Collect the track id of every entry in the playlist.
# Iterate over tracks['items'] directly: len() of the 'tracks' object counts
# its keys, not its songs, so using it as the loop bound returned the wrong
# number of ids.
single_playlist_ids = [
    item['track']['id']
    for item in single_playlist_request['tracks']['items']
]

#### Get list of spotify features 

In [11]:
spotify_features_url = BASE_URL + 'audio-features/'

# one GET per track id; decoded JSON bodies are collected in playlist order
single_playlist_spotify_features_data = []
for track_id in single_playlist_ids:
    features_response = requests.get(
        spotify_features_url + track_id,
        headers=headers
    )
    single_playlist_spotify_features_data.append(features_response.json())

#### Got the data!  Turning the above into a single function
#### input playlist url -> output data

In [9]:
def get_playlist_response(playlist_url):
    '''
    Fetch a playlist from the spotify api and decode the reply

    input:
        playlist_url
            - spotify API url for given playlist
            - string

    output:
        decoded JSON body of the GET response for playlist_url

    note:
        uses the notebook-level `headers` dict for authorization
    '''
    response = requests.get(playlist_url, headers=headers)
    return response.json()
    

def parse_playlist_song_ids(playlist_url_response):
    '''
    parse playlist url RESPONSE for the track IDs of the songs in the playlist

    input:
        playlist_url_response
            - decoded RESPONSE from spotify API for the url of a given playlist
            - json (dict with a 'tracks' object holding an 'items' list)

    output:
        list of track IDs, one per entry of tracks['items'], in playlist order

    raises:
        KeyError if the response has no 'tracks'/'items', or an entry has no
        'track'/'id'
    '''
    # Read from the argument (not a notebook global) and walk the items list
    # itself: len() of the 'tracks' object counts its keys, not its songs.
    return [
        item['track']['id']
        for item in playlist_url_response['tracks']['items']
    ]

def get_spotify_features_from_trackid(track_ids):
    '''
    Request the spotify audio features of every track id in a list

    input:
        track_ids
            - list of track ids
            - each element is a string

    output:
        df built from the decoded JSON body of each audio-features response,
        one row per track id

    note:
        uses the notebook-level `headers` dict for authorization
    '''
    endpoint = 'https://api.spotify.com/v1/' + 'audio-features/'

    records = []
    for track_id in track_ids:
        response = requests.get(endpoint + track_id, headers=headers)
        records.append(response.json())

    return pd.DataFrame(records)

def get_spotify_features(playlist_url):
    '''
    Playlist url in, spotify audio features out

    Chains the three helpers: fetch the playlist, parse its track ids,
    then request the features of those ids.

    input:
        playlist_url
            - spotify API url for given playlist
            - string

    output:
        df of spotify features for that playlist
    '''
    return get_spotify_features_from_trackid(
        parse_playlist_song_ids(
            get_playlist_response(playlist_url)
        )
    )

    
    

### Figure out what raw data to use

Goal is to be relatively quick 'n dirty but still provide info

In [2]:
# fetch the playlist through the imported `api` module rather than the notebook-local helper
playlist_response = api.get_playlist_response('https://api.spotify.com/v1/playlists/1oqsGOPQF8TqVtiFdfqkd6')

In [3]:
playlist_response

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/1oqsGOPQF8TqVtiFdfqkd6'},
 'followers': {'href': None, 'total': 0},
 'href': 'https://api.spotify.com/v1/playlists/1oqsGOPQF8TqVtiFdfqkd6',
 'id': '1oqsGOPQF8TqVtiFdfqkd6',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b2730c577c93959ad4f76aab306aab67616d0000b273170683264052615fd667788dab67616d0000b2735cb41ba5f1e01d3a932b2e24ab67616d0000b273faa00320d0021bd65b8613db',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b2730c577c93959ad4f76aab306aab67616d0000b273170683264052615fd667788dab67616d0000b2735cb41ba5f1e01d3a932b2e24ab67616d0000b273faa00320d0021bd65b8613db',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b2730c577c93959ad4f76aab306aab67616d0000b273170683264052615fd667788dab67616d0000b2735cb41ba5f1e01d3a932b2e24ab67616d0000b273faa00320d0021bd65b8613db',
   'width': 60}]

In [180]:
# NOTE(review): the recorded run of this cell raised KeyError: 'tracks' —
# presumably the response held no 'tracks' entry at that point; confirm
# before relying on song_ids in later cells.
song_ids = api.parse_playlist_song_ids(playlist_response)

KeyError: 'tracks'

In [13]:
def get_track_raw(song_id):
    '''
    GET the low-level audio analysis of a single track through an API call

    input:
        song_id - unique track ID in spotify api db
        str

    output:
        df built from the decoded JSON body of the audio-analysis response

    note:
        uses the notebook-level `BASE_URL` and `headers`
    '''

    # request the analysis of the track that was asked for, not a fixed id
    data = requests.get(
        BASE_URL + 'audio-analysis/' + song_id,
        headers=headers
    ).json()

    # NOTE(review): the unpack helpers further down index the raw JSON by key
    # (e.g. track_raw_json['bars']); confirm whether callers want this frame
    # or the decoded dict itself.
    return pd.DataFrame(data)

In [14]:
# fetch data for the first song id through the `api` module's get_raw_data_track
track_data = api.get_raw_data_track(song_ids[0])

In [176]:
# Selection of audio-analysis fields to unpack, keyed by subgroup of the payload.
# A list names the fields to take from that subgroup; the string 'all' is the
# flag unpack_json uses to take every field of every entry in the subgroup.
track_dictionary = {
    'meta': ['status_code'],
    'track': [
        'duration', 'end_of_fade_in',
        'start_of_fade_out', 'window_seconds'
    ],
    'bars': 'all',
    'beats': 'all',
    'sections': 'all',
    'segments': 'all', 
    'tatums': 'all'
}

def unpack_selected_json(track_raw_json, track_raw_json_key, cols):
    '''
    unpack the list of dictionaries stored under one key of a track's low-level audio analysis data into a df

    input:
        track_raw_json - decoded JSON of the track audio analysis response
        json

        track_raw_json_key - specific key to unpack, eg 'sections'
        str

        cols - list of strings of columns to unpack eg ['key', 'tempo']

    output:
        df with one row (the track); columns are named
        '<key><position>_<col>', and vector-valued columns (names containing
        'pitch' or 'timbre') are split further into '<key><position>_<col><i>'

    raises:
        KeyError if track_raw_json_key or one of cols is missing
    '''

    # column-name fragments marking values that are vectors, not scalars
    vector_markers = ('pitch', 'timbre')

    # individual data_groups are eg `sections`, `beats` etc
    data_group = track_raw_json[track_raw_json_key]

    # one entry per (unit, column) pair, eg `sections10_tempo: 6.7`
    unpacked_dict = {
        '{}{}_{}'.format(track_raw_json_key, count, col): unit[col]
        for count, unit in enumerate(data_group)
        for col in cols
    }

    # Vector values would not fit on one df row, so each is replaced by one
    # scalar entry per component. A single membership test handles both
    # markers, so a key matching both is expanded (and removed) only once.
    for key in list(unpacked_dict):
        if any(marker in key for marker in vector_markers):
            vector = unpacked_dict.pop(key)
            unpacked_dict.update({
                key + str(position): item
                for position, item in enumerate(vector)
            })

    # index=[0] keeps all the data on one row
    return pd.DataFrame(unpacked_dict, index=[0])
    

def unpack_json(track_raw_json, columns_dictionary):
    '''
    unpack the nested dictionaries from the result of a GET request for a specific track's low-level audio analysis data

    input:
        track_raw_json - decoded JSON of the track audio analysis response
        json

        columns_dictionary - keys are subgroups of data eg 'beats', items are either:
            'all' (str): flag to unpack all columns in subunit
            list (list): list of strings of columns in subunit to unpack

    output:
        df with one row (the track) and unpacked columns; a one-row df with
        no columns when nothing was selected

    raises:
        KeyError if a subgroup or a listed column is missing from track_raw_json
    '''

    frame_list = []

    for subgroup, cols in columns_dictionary.items():

        # subgroups holding a list of units unpack with unpack_selected_json
        if cols == 'all':
            units = track_raw_json[subgroup]

            # an empty subgroup has no first unit to read column names from
            # (units[0] would raise IndexError), so there is nothing to add
            if not units:
                continue

            frame_list.append(
                unpack_selected_json(
                    track_raw_json, subgroup, list(units[0].keys())
                )
            )

        # subgroups with just key value pairs are unpacked here,
        # w/o that additional nested list
        else:
            group = track_raw_json[subgroup]
            frame_list.append(pd.DataFrame(
                {'{}_{}'.format(subgroup, col): group[col] for col in cols},
                index=[0]
            ))

    # pd.concat rejects an empty list, so hand back an empty one-row frame
    if not frame_list:
        return pd.DataFrame(index=[0])

    # every partial frame has the single index 0; joining column-wise gives
    # the documented one-row df instead of a list of frames
    return pd.concat(frame_list, axis=1)

## Putting track data together to get playlist data

#### Single track data

In [7]:
# GET the playlist-level data for one specific playlist

playlist_response = api.get_playlist_response('https://api.spotify.com/v1/playlists/1oqsGOPQF8TqVtiFdfqkd6')

# parse the song IDs out of that playlist RESPONSE

song_ids = api.parse_playlist_song_ids(playlist_response)

# GET low-level audio data for the first song

track_data = api.get_raw_data_track(song_ids[0])

# NOTE(review): api.unpack_json is called here with (data, song id), unlike
# the notebook-local draft, which takes (data, columns_dictionary)
track_frame = api.unpack_json(track_data, song_ids[0])

### From playlist response, add playlist and song identifying data eg name, album etc

In [82]:
playlist_response.keys()

In [85]:
def unpack_playlist_metadata(playlist_response, track_number=0):
    '''
    collect the identifying metadata of one playlist entry into a one-row df

    input:
        playlist_response: decoded RESPONSE from playlist GET
        json

        track_number: position in tracks['items'] of the entry being unpacked
        (defaults to the first entry)

    output:
        df of track metadata, all one row
    '''

    # every field below lives under this one playlist entry
    track = playlist_response['tracks']['items'][track_number]['track']

    return pd.DataFrame(
        {
            'id': track['id'],
            'popularity': track['popularity'],
            # only the first listed artist is kept
            'artist': track['artists'][0]['name'],
            'artist_type': track['artists'][0]['type'],
            'album_type': track['album']['album_type'],
            'album_release_date': track['album']['release_date'],
            'release_precision': track['album']['release_date_precision'],
            'track_name': track['name'],
            'track_flag': track['track'],
            'track_type': track['type'],
            'episode': track['episode'],
        },
        index=[0]
    )

In [86]:
unpack_playlist_metadata(playlist_response)

Unnamed: 0,id,popularity,artist,artist_type,album_type,album_release_date,release_precision,track_name,track_flag,track_type,episode
0,4B4xOuj22g8WAMvTzPV7Bq,0,The Handsome Family,artist,album,2003-09-29,day,Far From Any Road,True,track,False


### Combine metadata and track together

In [88]:
# NOTE(review): `tracks_data` is not defined anywhere in the visible notebook,
# so this cell presumably depended on kernel state from a removed cell; it is
# expected to fail on Restart & Run All. The unpacking implies an iterable of
# (track_data, song_id) pairs — confirm, or drop this cell in favour of the
# later one that builds track_frames from song_ids.
track_frames = [
    api.unpack_json(track_data)
    for track_data, song_id
    in tracks_data
]

# stack the per-track frames row-wise
tracks_frame = pd.concat(track_frames)

# one-row metadata frame for every entry of the playlist
metadata_list = [
    unpack_playlist_metadata(playlist_response, track_number)
    for track_number
    in range(0, len(playlist_response['tracks']['items'])
            )
] 

In [94]:
# stack the one-row metadata frames; each row keeps its original index 0
# because pd.concat is called without ignore_index
metadata_frames = pd.concat(metadata_list)
metadata_frames

Unnamed: 0,id,popularity,artist,artist_type,album_type,album_release_date,release_precision,track_name,track_flag,track_type,episode
0,4B4xOuj22g8WAMvTzPV7Bq,0,The Handsome Family,artist,album,2003-09-29,day,Far From Any Road,True,track,False
0,6qRmb4uhIUaWE5LySgSfQS,44,Vashti Bunyan,artist,album,2007-10-30,day,Train Song,True,track,False
0,3uroe3JBpsFOOHgOAjyzhG,0,Final Fantasy,artist,album,2005-06-06,day,The CN Tower Belongs To The Dead,True,track,False
0,7f5C7QT51TYQJ7SfOxx0Rm,0,Sun Dula Amen,artist,album,2007-11-20,day,Home,True,track,False
0,6K5GAopLZzzg0ZX8k5GWwh,18,Voxtrot,artist,single,2005-01-01,day,The Start Of Something,True,track,False
0,2ShuJDejHCsnULbn9IYitk,36,Golden Shoulders,artist,album,2004-08-24,day,I Will Light You on Fire,True,track,False
0,0Qty2GKY7Ce1eIsR44QEnu,24,Tilly And The Wall,artist,album,2004,year,Nights of the Living Dead,True,track,False
0,6qVwmzQADVElWXKIfdJvGm,32,Drive-By Truckers,artist,album,2008-01-22,day,Self Destructive Zones,True,track,False
0,2kgxYag8woDOgxFIrkGyYc,44,Apollo Sunshine,artist,album,2017-04-07,day,We Are Born When We Die,True,track,False
0,0vSDvxE6tYOswOvaaoz4PZ,0,D.D Dumbo,artist,single,2014-10-27,day,Tropical Oceans,True,track,False


In [98]:
# audio-analysis frame for every parsed song id
track_frames = []
for song_id in song_ids:
    raw_track = api.get_raw_data_track(song_id)
    track_frames.append(api.unpack_json(raw_track, song_id))

tracks_frame = pd.concat(track_frames)

# one-row metadata frame for every playlist entry
playlist_items = playlist_response['tracks']['items']
metadata_list = [
    unpack_playlist_metadata(playlist_response, track_number)
    for track_number in range(len(playlist_items))
]

metadata_frame = pd.concat(metadata_list)

# pd.merge defaults to an inner join: only ids present in both frames are kept
pd.merge(tracks_frame, metadata_frame, on='id')

Unnamed: 0,meta_status_code,track_duration,track_end_of_fade_in,track_start_of_fade_out,track_window_seconds,bars0_start,bars0_duration,bars0_confidence,bars1_start,bars1_duration,...,popularity,artist,artist_type,album_type,album_release_date,release_precision,track_name,track_flag,track_type,episode
0,0,166.49333,0.0,159.99129,0,0.43111,2.0023,0.869,2.43342,2.00329,...,0,The Handsome Family,artist,album,2003-09-29,day,Far From Any Road,True,track,False
1,0,137.36,0.25161,128.30186,0,2.57234,2.36786,0.585,4.9402,2.3525,...,44,Vashti Bunyan,artist,album,2007-10-30,day,Train Song,True,track,False
2,0,211.84,0.32154,206.94785,0,0.39785,2.40242,0.673,2.80028,2.40705,...,0,Final Fantasy,artist,album,2005-06-06,day,The CN Tower Belongs To The Dead,True,track,False
3,0,211.45332,0.43524,203.01787,0,2.3608,0.63196,0.339,2.99276,2.54491,...,0,Sun Dula Amen,artist,album,2007-11-20,day,Home,True,track,False
4,0,271.74667,0.0,264.48688,0,0.89693,2.21122,0.653,3.10816,2.24216,...,18,Voxtrot,artist,single,2005-01-01,day,The Start Of Something,True,track,False
5,0,190.54668,0.62671,187.53017,0,1.16943,1.92111,0.491,3.09054,1.95775,...,36,Golden Shoulders,artist,album,2004-08-24,day,I Will Light You on Fire,True,track,False
6,0,235.4,0.1683,225.47737,0,0.95497,1.88616,0.513,2.84113,1.88348,...,24,Tilly And The Wall,artist,album,2004,year,Nights of the Living Dead,True,track,False


## Next step notes:

Can turn the above into a series of functions in order to get full sparse raw data

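Also look into: why does the "metadata frames" cell above list 10 tracks when only 7 tracks come back with audio analysis? Likely cause: the song-id parser loops over `range(len(response['tracks']))`, which counts the keys of the `tracks` object rather than the entries in `tracks['items']`. Verify this against `parse_playlist_song_ids` in `src/api.py`.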