# Getting Features for Each Track Using the Spotify API with the Spotipy Library
Before I loop through each track to pull audio features from Spotify, the dataset needs a little cleaning of null values.

### Imports and Credentials

In [1]:
import os

import numpy as np
import pandas as pd

import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [2]:
# Authenticate against the Spotify Web API.
# The app's client id and secret are read from the environment instead of being
# written into the notebook, so they are never committed or shared with it.
# Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting Jupyter;
# a missing variable raises KeyError here rather than failing later inside Spotipy.
# NOTE(review): the secret that used to be hardcoded here should be rotated.
token = util.prompt_for_user_token(
    '222ugx5gr6cxvoyfbs5z2xpvy',  # my user id on spotify
    client_id=os.environ['SPOTIPY_CLIENT_ID'],  # my 'app' information
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
    redirect_uri='http://localhost:8000/')
sp = spotipy.Spotify(auth=token)  # API client authorised with the token above

In [3]:
# Load the previously pulled track list from disk.
tracks_csv_path = './data/all_tracks.csv'
tracks_df = pd.read_csv(tracks_csv_path)

In [4]:
# Preview the first five rows of the raw track list.
tracks_df.head(n=5)

Unnamed: 0,artists,title,track_id,popularity,explicit,release_date
0,Shawn Mendes,Wonder,5KCbr5ndeby4y4ggthdiAb,0,False,2020-10-02
1,Justin Bieber,Holy (feat. Chance The Rapper),5u1n1kITHCxxp8twBcZxWy,92,False,2020-09-18
2,24kGoldn,Mood (feat. Iann Dior),3tjFYV6RSFtuktYl3ZtYcq,100,True,2020-07-24
3,Internet Money,Lemonade,02kDW379Yfd5PzW5A6vuGt,93,True,2020-08-14
4,BLACKPINK,Bet You Wanna (feat. Cardi B),1hPkiovjTqiJAJen4uyNRg,0,False,2020-10-02


### Tracks Dataset Cleanup

There were 46 rows where no value was pulled from Spotify across all 6 columns. In the grand scale of 103k rows, dropping 46 is a good choice.
Alternatively, I would need to repull everything in hopes that I get these 46 rows filled, which in my opinion is not worth it.

In [5]:
# Discard the rows whose release_date is the placeholder string '0'.
# .copy() turns the filtered result into an independent frame, so the column
# assignments and in-place drops in later cells modify it directly instead of
# raising SettingWithCopyWarning on a slice of the original frame.
tracks_df = tracks_df.loc[tracks_df['release_date'] != '0'].copy()

In [6]:
# Count the missing values in each column.
tracks_df.isna().sum()

artists          6
title            5
track_id        18
popularity       0
explicit         0
release_date    25
dtype: int64

I need to fill the missing date values. The information I can gather myself from Spotify is in year-only format, so I will create a release_year column that extracts the first 4 characters from release_date. This will also match the column in the Skip Challenge dataset.

In [7]:
# The year is the first four characters of the release_date string.
release_dates = tracks_df['release_date']
tracks_df['release_year'] = release_dates.str.slice(stop=4)

In [8]:
# Show the (rows, columns) dimensions of the frame at this point.
tracks_df.shape

(103598, 7)

In [9]:
# Preview the first five rows, now including release_year.
tracks_df.head(n=5)

Unnamed: 0,artists,title,track_id,popularity,explicit,release_date,release_year
0,Shawn Mendes,Wonder,5KCbr5ndeby4y4ggthdiAb,0,False,2020-10-02,2020
1,Justin Bieber,Holy (feat. Chance The Rapper),5u1n1kITHCxxp8twBcZxWy,92,False,2020-09-18,2020
2,24kGoldn,Mood (feat. Iann Dior),3tjFYV6RSFtuktYl3ZtYcq,100,True,2020-07-24,2020
3,Internet Money,Lemonade,02kDW379Yfd5PzW5A6vuGt,93,True,2020-08-14,2020
4,BLACKPINK,Bet You Wanna (feat. Cardi B),1hPkiovjTqiJAJen4uyNRg,0,False,2020-10-02,2020


There are 25 missing values in release_date (and therefore in release_year). I will fill these in manually by looking at Spotify's web player. Some of these are also missing the track ID, so I'll fill those in at the same time. The rows that have a track_id but no release year are episodes, not songs, so I'll drop those first.

In [10]:
# release_year replaces the original date column, so remove the latter in place.
tracks_df.drop(columns='release_date', inplace=True)

In [12]:
# Rows with no release year -- SEE IF WE CAN FIX THIS WITH FILLING THE TRACK ID AND RELEASE YEAR
tracks_df[tracks_df['release_year'].isna()]

Unnamed: 0,artists,title,track_id,popularity,explicit,release_year
23979,Ray LaMontagne,I Still Care For You,,0,False,
25580,Shania Twain,Youre Still The One,,0,False,
43607,John Wizards,Lusaka By Night,,0,False,
54197,NTS x Netflix: Top Boy Academy,Abi Leland & Michael Asante: Top Boy Music Tea...,1s1NBYDmNokyUdKGgpjkfo,0,False,
54201,The New Music Business with Ari Herstand,Euphoria Music Supervisor Jen Malone Gives a Sh*t,122ngSFhxBLq02tp5kaehl,0,True,
54205,Showstopper,"Insecure - ft. Kier Lehman, Raphael Saadiq, & ...",4KplnOanGnfyCBV1v0rK5X,0,False,
54209,Mundos Diferentes: El Podcast,Episodio 12 - Supervisión musical con Javier N...,0nnvOmX8BbQszi1X1rGt9l,0,False,
54213,Showstopper,This Is Us - ft. Siddhartha Khosla & Jennifer ...,0rfTSGTzmTpLPSiwI5yTuB,0,False,
54217,Sound Opinions,#673 Film & T.V. Music Supervisor Susan Jacobs...,6BBNDM0W2JD3g3orcyezjM,0,False,
54221,In Conversation,Morgan Rhodes,52QXHOXaFXeHDeuaSPknAs,0,False,


In [13]:
# function to drop specific rows based on artist, all of these were episodes and not songs

def drop_episodes(artist):
    """Remove, in place, every row of the global ``tracks_df`` whose 'artists' value equals ``artist``.

    Parameters
    ----------
    artist : str
        Exact value to match against the 'artists' column.
    """
    rows_to_remove = tracks_df.index[tracks_df['artists'] == artist]
    tracks_df.drop(rows_to_remove, inplace=True)

In [14]:
#Calling the function above with rows to drop

# 'artists' values of the rows that are episodes rather than songs.
episode_artists = (
    'NTS x Netflix: Top Boy Academy',
    'The New Music Business with Ari Herstand',
    'Showstopper',
    'Mundos Diferentes: El Podcast',
    'Sound Opinions',
    'In Conversation',
)
for episode_artist in episode_artists:
    drop_episodes(episode_artist)

In [15]:
# function to fill in specific rows with title match. function will find the row with the title, and fill in the track_id
# and the release_year based on what was shown on Spotify.

def fill_track_year(title, track_id, year):
    """Fill in track_id and release_year for incomplete rows of the global ``tracks_df``.

    Only rows whose 'title' equals ``title`` AND that are missing a track_id or a
    release_year are updated. Matching on title alone would also overwrite complete
    rows for other songs that happen to share the same title.

    Parameters
    ----------
    title : str
        Exact value to match against the 'title' column.
    track_id : str
        Spotify track id to store in 'track_id'.
    year : int or str
        Release year; stored as a string so it matches the values that
        release_year gets from slicing the release_date strings.
    """
    needs_fill = tracks_df['track_id'].isna() | tracks_df['release_year'].isna()
    target_rows = (tracks_df['title'] == title) & needs_fill
    # Assign column by column so an empty selection is a harmless no-op.
    tracks_df.loc[target_rows, 'track_id'] = track_id
    tracks_df.loc[target_rows, 'release_year'] = str(year)

In [16]:
# Calling the function to fill in missing track_id and release_year values

# (title, track_id, release year) values looked up manually on Spotify.
manual_fixes = [
    ('I Still Care For You', '4kQF23eYkc18zXyvyqSFjz', 2009),
    ('Youre Still The One', '1wb4P4F0sxAQ2KXrRvsx6n', 1997),
    ('Lusaka By Night', '3gdvhtMXa3JgkZbrKcoPJx', 2013),
    ('Bass Down Low (feat. The Cataracs)', '7EqcHxisQRcdgx9VgpLFFd', 2012),
    ('Witch Doctor', '1lhYMdj0EqBUvjjRpujknM', 2010),
    ('Comprachicos', '2ZIJUwprFZrAaZCRKYfAno', 2010),
    ('Crush', '6hzwfFKrTabeUsW5SWti17', 2010),
    ('Encoder', '1SgUE0s8KGfJW9JeuN0ns2', 2010),
    ('Genesis', '7wZtIGsr1acUjLQMPIpM9A', 2010),
    ('Immunize (feat. Liam H)', '75Ik1zcZSvXSVCFHPwQr4E', 2010),
    ('Salt In The Wounds', '4ESOC1Jott4MYNH9cYuF8z', 2010),
    ('Set Me On Fire', '5E1UIRcuHd7Rr7PYbQSEOu', 2010),
    ('The Vulture', '3cNsLpqu5TGoGKwM6cyZXB', 2010),
    ('Under The Waves', '5AhJXm6nrCNDjVphyBUeLH', 2010),
    ('Watercolour', '5ami95W9OOWQPwrBb5tud5', 2010),
    ('Witchcraft', '4Y2glvLjQGOb4dXnwm1hQf', 2010),
    ('Blues från Sverige', '38Qhu88rMBRJHjZD7X5DpZ', 2004),
    ('Jag kommer', '2QyTvPHRT9P6ZZSOAzG7eG', 2010),
]
for fix_title, fix_track_id, fix_year in manual_fixes:
    fill_track_year(fix_title, fix_track_id, fix_year)

In [17]:
# Re-count the missing values per column after the manual fills.
tracks_df.isna().sum()

artists         6
title           5
track_id        0
popularity      0
explicit        0
release_year    0
dtype: int64

Looking at the last missing values, which are artists and title, the only thing I had to search with was the track_id. Backtracking these through Spotify led to compilation albums from 'various artists' that have since been removed. Some of the tracks were from Johnny Cash, and one was from Jay McShann. Despite knowing these 2 artists, I will drop them anyway because the tracks technically no longer exist and may cause issues in my feature pulls. Additionally, the 1 track at index 63707 will also be dropped because it is a 20 min long discussion about the Petrushka Ballet.

In [18]:
# Show the rows that have no artist recorded.
tracks_df[tracks_df['artists'].isna()]

Unnamed: 0,artists,title,track_id,popularity,explicit,release_year
40226,,,7DRill2AgyTf9LV7uWAK5E,0,False,2012
41918,,,1e6kD8AXQ152lFZLr7kKPN,0,False,2012
63707,,Leonard Bernstein Discusses Stravinsky and the...,5OJZjoVoyNbzD24C7tkycW,5,False,1962
69181,,,3e6ZqyhDUkiPdVaQ9MiNm7,0,False,2012
82048,,,69gRFGOWY9OMpFJgFol1u0,0,False,2012
86795,,,16TbVkFPNUtNkwCSZIziXJ,0,False,2013


In [19]:
# Drop, in place, every row that has no artist recorded.
missing_artist_rows = tracks_df.index[tracks_df['artists'].isna()]
tracks_df.drop(index=missing_artist_rows, inplace=True)

In [48]:
# Show the (rows, columns) dimensions of the cleaned frame.
tracks_df.shape

(103585, 6)

### Getting Track Features for 103k+ tracks

In [28]:
# Accumulator for the audio-feature results, one entry per track.
track_features = list()

In [43]:
# Request a fresh token and rebuild the client before the long feature pull
# (presumably because the earlier token may no longer be valid -- confirm).
# The app's client id and secret come from the environment (SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET) so that they are never stored in the notebook.
token = util.prompt_for_user_token(
    '222ugx5gr6cxvoyfbs5z2xpvy',  # my user id on spotify
    client_id=os.environ['SPOTIPY_CLIENT_ID'],  # my 'app' information
    client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
    redirect_uri='http://localhost:8000/')
sp = spotipy.Spotify(auth=token)  # API client authorised with the new token

In [45]:
# Pull the audio features for every track id.
# Fixes the original `tracks_df.values[]` (a SyntaxError) and requests the ids in
# batches rather than one HTTP call per track, which cuts ~103k requests to ~1k.
FEATURE_BATCH_SIZE = 100  # most ids Spotipy's audio_features accepts per call

track_ids = tracks_df['track_id'].tolist()

# Start from an empty list so re-running this cell (e.g. after an interrupted
# pull) cannot leave duplicate or misaligned entries behind.
track_features.clear()

for batch_start in range(0, len(track_ids), FEATURE_BATCH_SIZE):
    id_batch = track_ids[batch_start:batch_start + FEATURE_BATCH_SIZE]
    batch_features = sp.audio_features(id_batch)
    # Store each track's result as a one-element list, the same shape a
    # single-id call returns, so code reading `entry[0]` keeps working.
    track_features.extend([features] for features in batch_features)


In [63]:
# Inspect the first stored result (a one-element list wrapping the feature dict).
track_features[0]

[{'danceability': 0.333,
  'energy': 0.637,
  'key': 1,
  'loudness': -4.904,
  'mode': 0,
  'speechiness': 0.0581,
  'acousticness': 0.131,
  'instrumentalness': 1.8e-05,
  'liveness': 0.149,
  'valence': 0.132,
  'tempo': 139.898,
  'type': 'audio_features',
  'id': '5KCbr5ndeby4y4ggthdiAb',
  'uri': 'spotify:track:5KCbr5ndeby4y4ggthdiAb',
  'track_href': 'https://api.spotify.com/v1/tracks/5KCbr5ndeby4y4ggthdiAb',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5KCbr5ndeby4y4ggthdiAb',
  'duration_ms': 172693,
  'time_signature': 4}]

In [55]:
# One empty, independent list per audio feature; each is filled track by track.
(danceability, energy, key, loudness, mode, speechiness, acousticness,
 instrumentalness, liveness, valence, tempo, duration_ms,
 time_signature) = ([] for _ in range(13))

In [56]:
# Split the per-track feature dicts into one list per feature.
# Each feature name is paired with the list its values are appended to.
feature_targets = {
    'danceability': danceability,
    'energy': energy,
    'key': key,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'duration_ms': duration_ms,
    'time_signature': time_signature,
}

for track in track_features:
    # Each entry is expected to be a one-element list holding the feature dict.
    # Only the lookup is guarded, and only against the errors a missing or
    # malformed entry can raise, so unrelated failures are no longer hidden.
    try:
        record = track[0]
    except (TypeError, IndexError, KeyError):
        record = None
    if not isinstance(record, dict):
        record = {}  # no features for this track: every column gets NaN

    # Exactly one value is appended to every list per track, so the lists can
    # never fall out of step with each other (np.NaN was removed in NumPy 2.0).
    for feature_name, feature_values in feature_targets.items():
        feature_values.append(record.get(feature_name, np.nan))

In [57]:
# Attach each feature list to the frame as a new column, in this order.
feature_columns = {
    'danceability': danceability,
    'energy': energy,
    'key': key,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'duration_ms': duration_ms,
    'time_signature': time_signature,
}
for column_name, column_values in feature_columns.items():
    tracks_df[column_name] = column_values

In [58]:
# Count the missing values per column, including the new feature columns.
tracks_df.isna().sum()

artists              0
title                0
track_id             0
popularity           0
explicit             0
release_year         0
danceability        23
energy              23
key                 23
loudness            23
mode                23
speechiness         23
acousticness        23
instrumentalness    23
liveness            23
valence             23
tempo               23
duration_ms         23
time_signature      23
dtype: int64

These missing values (audio features) are for nature sound tracks and Audiobooks. I will drop these as well.

In [59]:
# Show the rows for which no audio features were stored.
tracks_df[tracks_df['danceability'].isna()]

Unnamed: 0,artists,title,track_id,popularity,explicit,release_year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
34805,Nature Sound Retreat,Holiday Yule Log: The Christmas Fireplace (2 H...,18wMrFh4sIB4D8aHOBmOeT,1,False,2014,,,,,,,,,,,,,
34807,Nature Sound Retreat,Just Fire: 2 Hours of Sounds from the Natural ...,0RZVMByDP26ZgkrhLqctba,1,False,2014,,,,,,,,,,,,,
58077,DBS Audiobooks,The Secret Garden - Frances Hodgson Burnett Pa...,5cRkireIUGdPJjD3FF4jXZ,22,False,2013,,,,,,,,,,,,,
58078,DBS Audiobooks,The Secret Garden - Frances Hodgson Burnett Pa...,3MUkNrSJarCpxGMsE5nuDt,16,False,2013,,,,,,,,,,,,,
58079,DBS Audiobooks,The Adventures of Huckleberry Finn - Mark Twai...,3knJwFwAEa1NjsvE8vMcAu,15,False,2013,,,,,,,,,,,,,
58080,DBS Audiobooks,The Adventures of Huckleberry Finn - Mark Twai...,3H9PkmNDnMCb1llejr4gyK,9,False,2013,,,,,,,,,,,,,
58089,DBS Audiobooks,Pride and Prejudice - Jane Austen Part 1 of 2,1n30qkfDcobnWz2YRGGPdL,24,False,2013,,,,,,,,,,,,,
58090,DBS Audiobooks,Pride and Prejudice - Jane Austen Part 2 of 2,0s5hSlD76edvVxoZE35nQh,20,False,2013,,,,,,,,,,,,,
58134,DBS Audiobooks,Alice's Adventures in Wonderland - Lewis Carroll,35Vz5sWeVVs0daA4og1UxR,22,False,2012,,,,,,,,,,,,,
60308,DBS Audiobooks,The Adventures of Sherlock Holmes - Sir Arthur...,5tzZKCPM8K7TsuFAfiyFGO,5,False,2013,,,,,,,,,,,,,


In [61]:
# Drop, in place, every row that has no audio features.
featureless_rows = tracks_df.index[tracks_df['danceability'].isna()]
tracks_df.drop(index=featureless_rows, inplace=True)

In [62]:
# Show the (rows, columns) dimensions after dropping the rows without features.
tracks_df.shape

(103562, 19)

In [73]:
# Renumber the rows 0..n-1 in place, discarding the old index labels.
tracks_df.index = pd.RangeIndex(len(tracks_df))

In [74]:
# Preview the last five rows of the finished frame.
tracks_df.tail(n=5)

Unnamed: 0,artists,title,track_id,popularity,explicit,release_year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
103557,Lukas Graham,7 Years,5kqIPrATaCc2LqxVWzQGbk,79,False,2016,0.765,0.473,10.0,-5.829,1.0,0.0514,0.287,0.0,0.391,0.34,119.992,237300.0,4.0
103558,Billy Joel,Piano Man,3FCto7hnn1shUyZL42YgfO,68,False,2001,0.334,0.472,0.0,-8.792,1.0,0.0277,0.6,4e-06,0.317,0.431,179.167,336093.0,3.0
103559,Alicia Keys,If I Ain't Got You,3XVBdLihbNbxUwZosxcGuJ,79,False,2003,0.609,0.445,7.0,-9.129,1.0,0.106,0.603,7e-06,0.104,0.166,118.393,228707.0,3.0
103560,Adele,Someone Like You,6QPKYGnAW9QozVz2dSWqRg,79,False,2011,0.554,0.321,9.0,-8.251,1.0,0.028,0.893,0.0,0.0996,0.288,135.047,285240.0,4.0
103561,Ed Sheeran,The A Team,1VdZ0vKfR5jneCmWIUAMxK,76,False,2011,0.642,0.289,9.0,-9.918,1.0,0.0367,0.669,0.0,0.18,0.407,84.996,258373.0,4.0


In [75]:
#tracks_df.to_csv('./data/tracks_and_features.csv', index = False)