# Spotify Data Exploration and Modeling
## Jacob Torres
---

In [1]:
# Imports
import os
import sqlite3
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

## Exploratory Data Analysis

In [2]:
# Read the `data` and `song` tables of the local SQLite db into DataFrames
DATA_TABLE_QUERY = 'select * from data;'
SONG_TABLE_QUERY = 'select * from song;'
conn = sqlite3.connect('../app/spotify_db.sqlite3')

# NOTE(review): `conn` is left open here, as in the original cell
data_df = pd.read_sql(sql=DATA_TABLE_QUERY, con=conn)
song_df = pd.read_sql(sql=SONG_TABLE_QUERY, con=conn)

# Report (rows, columns) of each frame
print(f"Data: {data_df.shape}")
print(f"Song: {song_df.shape}")

Data: (169540, 14)
Song: (12, 12)


In [3]:
# Preview the first rows of the data table
data_df.head()

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4BJqT0PrAfrxzMOxytFOIz,0.982,0.279,831667,0.211,80.954,0.878,10,0.665,-20.096,1,0.0594,0.0366
1,Clancy Lowered the Boom,7xPhfUan2yNtyFG0cUWkt8,0.732,0.819,180533,0.341,60.936,0.0,7,0.16,-12.441,1,0.963,0.415
2,Gati Bali,1o6I8BglA6ylDMrIELygv1,0.961,0.328,500062,0.166,110.339,0.913,3,0.101,-14.85,1,0.0394,0.0339
3,Danny Boy,3ftBPsC5vPBKxYSee08FDH,0.967,0.275,210000,0.309,100.109,2.8e-05,5,0.381,-9.316,1,0.165,0.0354
4,When Irish Eyes Are Smiling,4d6HGyGT8e121BsdKmw9v6,0.957,0.418,166693,0.193,101.665,2e-06,3,0.229,-10.096,1,0.253,0.038


In [4]:
# Show up to 12 rows of the song table (its reported shape has 12 rows)
song_df.head(12)

Unnamed: 0,id,name,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,6BAnxKyld909yo6Pk1DO3r,Hello,0.864,7,-4.483,0,0.0348,0.0339,0.216,0.386,0.796,104.997
1,3sSESwCLq4g7WiXk6LJKbI,Wrecking Ball,0.0695,5,-22.837,1,0.0342,0.992,0.934,0.0943,0.109,112.81
2,7iUB3byx9Rcks8zW10JVkP,Feelings,0.778,6,-6.546,0,0.0365,0.0152,0.0,0.121,0.756,123.954
3,0ExiKxfY5rHBW06TcV1xXU,Queens,0.695,0,-5.499,1,0.0279,0.0703,0.0,0.13,0.571,133.949
4,5nvqqLZu75jMK31gDytANZ,Fighter,0.966,11,-3.565,0,0.0714,0.00107,9e-06,0.311,0.191,91.034
5,2ogguaH3LQ2H8gsKKVypYU,Going Home,0.0799,0,-22.987,0,0.0656,0.968,0.922,0.0864,0.124,74.45
6,2gG6IqL2enOyLdluSTcTTS,We Made It,0.745,2,-4.798,1,0.0448,0.0771,0.000141,0.175,0.153,116.092
7,7tEfdhVoIUueORVuyFvgpO,Madman,0.785,0,-6.079,1,0.0778,0.101,1.4e-05,0.115,0.649,150.059
8,6wlOciyEdLDqZXlloy2Fmp,New Cityt,0.607,5,-6.869,1,0.0369,0.189,0.0,0.347,0.716,115.025
9,3YbAvFyjnGgCEu3GybwN8E,Let's fight,0.922,10,-6.193,0,0.0416,0.0499,3e-06,0.162,0.965,79.979


In [5]:
# Add a "liked" label column to data_df, with every row starting at 0 (int8)
data_df['liked'] = np.zeros(len(data_df), dtype=np.int8)
data_df.head()

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness,liked
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4BJqT0PrAfrxzMOxytFOIz,0.982,0.279,831667,0.211,80.954,0.878,10,0.665,-20.096,1,0.0594,0.0366,0
1,Clancy Lowered the Boom,7xPhfUan2yNtyFG0cUWkt8,0.732,0.819,180533,0.341,60.936,0.0,7,0.16,-12.441,1,0.963,0.415,0
2,Gati Bali,1o6I8BglA6ylDMrIELygv1,0.961,0.328,500062,0.166,110.339,0.913,3,0.101,-14.85,1,0.0394,0.0339,0
3,Danny Boy,3ftBPsC5vPBKxYSee08FDH,0.967,0.275,210000,0.309,100.109,2.8e-05,5,0.381,-9.316,1,0.165,0.0354,0
4,When Irish Eyes Are Smiling,4d6HGyGT8e121BsdKmw9v6,0.957,0.418,166693,0.193,101.665,2e-06,3,0.229,-10.096,1,0.253,0.038,0


In [6]:
# Change liked values for songs in song_df
# `x in song_df['id']` tests membership in the Series *index* (its row labels),
# never in its values, so a track id could not match and every row stayed 0.
# `isin` compares against the values; the cast turns the boolean mask into
# the integer 0/1 labels used by the rest of the notebook.
data_df['liked'] = data_df['id'].isin(song_df['id']).astype(np.int64)
data_df['liked'].value_counts()

0    169540
Name: liked, dtype: int64

In [7]:
# Add "liked" column to song_df: every row is labelled 1 (int64)
song_df['liked'] = np.ones(len(song_df), dtype=np.int64)
song_df['liked'].value_counts()

1    12
Name: liked, dtype: int64

In [8]:
# Create joined dataframe
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames. Row labels of both
# frames are kept as-is (no ignore_index), matching what append produced, and
# columns missing from song_df are filled with NaN.
df = pd.concat([data_df, song_df])

print(df.shape)
df['liked'].value_counts()

(169552, 15)


0    169540
1        12
Name: liked, dtype: int64

### Authorize Spotify Access for Further Data Collection

In [9]:
# Read the Spotify API credentials from the environment (.env is loaded first)
load_dotenv()
cid, cs, rduri = (
    os.getenv(var_name)
    for var_name in ('CLIENT_ID', 'CLIENT_SECRET', 'REDIRECT_URI')
)

# Permissions requested from the user during authorization
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played'
]

# Stop here if any credential is unset
assert all(value is not None for value in (cid, cs, rduri)), 'One or more environment variables are missing.'

In [10]:
# Build the OAuth manager from the credentials and scope, then the API client
oauth_settings = {
    'client_id': cid,
    'client_secret': cs,
    'redirect_uri': rduri,
    'scope': scope,
}
auth_manager = SpotifyOAuth(**oauth_settings)
sp = Spotify(auth_manager=auth_manager)

sp

<spotipy.client.Spotify at 0x235d8c0ed30>

In [11]:
# Functions for getting lists of tracks and track ids
def get_tracks(from_index, limit=50):
    """
    Get a page of tracks from the current user library starting at the given
    index position.

    from_index -- index position of the first track to fetch
    limit -- number of tracks to request (default 50, the page size this
             function previously hard-coded)

    returns tracks -- List of dicts of track details
    """
    # Relies on the module-level `sp` client created in the authorization cell
    page = sp.current_user_saved_tracks(limit=limit, offset=from_index)
    return page['items']


def get_track_ids(tracks):
    """
    Pull the track id out of each entry in a list of track-detail dicts.

    Each entry is expected to hold its id under entry['track']['id'].

    returns ids -- numpy array of track ids
    """
    collected = []
    for entry in tracks:
        collected.append(entry['track']['id'])

    return np.array(collected)

In [12]:
# Fetch saved tracks starting at index 0 and check that exactly 50 came back
tracks = get_tracks(0)
assert len(tracks) == 50

In [13]:
# Extract the track ids and display them as a plain Python list
ids = get_track_ids(tracks)
list(ids)

['2RmLOxBT7hzEz2UUaDP5ZJ',
 '1HmhJrabjvTHZlkKDUzZfX',
 '0G05SSrl1EfK3S7rTVob7G',
 '0DJBgBiYeSn6n1AXAkFVE8',
 '5qN4HFkapdAOV94XPryVof',
 '6REbwUNlppTfcnV4d4ZoZi',
 '2HbKqm4o0w5wEeEFXm2sD4',
 '3eSJmGWqoBRx5wbFCtvPtz',
 '6nTIchACfHZWP2zrGOVYFa',
 '1WedZeiezCmCEOzLwhx0hV',
 '0jT8Nl0shPS8115is0wD2Q',
 '6Al0Kpd4VrRZ0Z4kTThNPa',
 '615XWyY2RPfk3iuYcU6qvi',
 '2alIsNw1mqEShbWyO1HXZZ',
 '5EbtodsuLbxrFDH6j5avVS',
 '338ShIVvBwdw9NjH0pYBHP',
 '1q1sjzJpUYEkTbA9Uz0qrU',
 '6RV9pRLIQiXm3qTGeyQluf',
 '5hXlAWrc8L8hYhxUnXlqFf',
 '2FSGUA0gFgGeQdprjtGM2M',
 '0pgj4EzB1XRqgZemoMNG5D',
 '2rCbl9naJYhaxjLsfx88uM',
 '4msyEItsAavVb5pZYCuz4n',
 '0Gx4VrHzS7pZOEAGrmXXBH',
 '42lpuSQmnLUM1ZXJVzIVOi',
 '2f8sB9lQ6Uh1lKKWJNR9rW',
 '6GfqNqXkAofNOakU77MWEb',
 '3YAnNBdk0uHyakXXzMbNJq',
 '5uzAnGIbuAIpaBHQ02X1uN',
 '1pkMzGqBBS10nDiiYrNGGJ',
 '23luOrEVHMfoX0AhfbQuS6',
 '6PGoSes0D9eUDeeAafB2As',
 '6SwRhMLwNqEi6alNPVG00n',
 '1eLSF6HfrRA0AsNmTkUlKx',
 '2tPcTFiQF9MbVUyjZ3zDhA',
 '6IZvVAP7VPPnsGX6bvgkqg',
 '0N3W5peJUQtI4eyR6GJT5O',
 

In [14]:
# Get 50 more tracks starting from index 200
# (rebinds `tracks`; the ids of the first batch were already extracted into `ids`)
tracks = get_tracks(200)
assert len(tracks) == 50

In [15]:
# Extend the id array with the ids of the newly fetched tracks
# NOTE(review): re-running this cell alone appends again and trips the assert
new_ids = get_track_ids(tracks)
ids = np.append(ids, new_ids)
assert len(ids) == 100

In [16]:
# Get audio features of the tracks
# (every collected id is passed in a single call; the bare name below
# displays the full response)
audio_features = sp.audio_features(ids)
audio_features

[{'danceability': 0.853,
  'energy': 0.515,
  'key': 1,
  'loudness': -5.723,
  'mode': 0,
  'speechiness': 0.106,
  'acousticness': 0.00413,
  'instrumentalness': 0,
  'liveness': 0.147,
  'valence': 0.532,
  'tempo': 100.05,
  'type': 'audio_features',
  'id': '2RmLOxBT7hzEz2UUaDP5ZJ',
  'uri': 'spotify:track:2RmLOxBT7hzEz2UUaDP5ZJ',
  'track_href': 'https://api.spotify.com/v1/tracks/2RmLOxBT7hzEz2UUaDP5ZJ',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2RmLOxBT7hzEz2UUaDP5ZJ',
  'duration_ms': 201720,
  'time_signature': 4},
 {'danceability': 0.453,
  'energy': 0.666,
  'key': 3,
  'loudness': -5.439,
  'mode': 0,
  'speechiness': 0.268,
  'acousticness': 0.654,
  'instrumentalness': 0,
  'liveness': 0.184,
  'valence': 0.431,
  'tempo': 92.319,
  'type': 'audio_features',
  'id': '1HmhJrabjvTHZlkKDUzZfX',
  'uri': 'spotify:track:1HmhJrabjvTHZlkKDUzZfX',
  'track_href': 'https://api.spotify.com/v1/tracks/1HmhJrabjvTHZlkKDUzZfX',
  'analysis_url': 'https://api.spotify.