# Spotify Data Exploration and Modeling
## Jacob Torres

In [1]:
# Imports
import os
import re
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

## Exploratory Data Analysis
---

In [2]:
# Load Spotify data from local db
DATA_TABLE_QUERY = 'select * from data;'
conn = sqlite3.connect('../app/spotify_db.sqlite3')

# Dataframe of unliked songs
unliked_df = pd.read_sql(DATA_TABLE_QUERY, conn)
unliked_df = unliked_df.sample(5000)

print(f"Unliked songs: {unliked_df.shape}")
unliked_df.head()

Unliked songs: (5000, 14)


Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
111381,I Like Them All,4Fzc2djdWBQYbmenQntley,0.896,0.593,165333,0.196,153.311,0.0,9,0.21,-15.483,0,0.885,0.0658
118690,Contra Viento Y Marea,3VNYBo5b2n5rbCllOQ9ft1,0.165,0.727,175840,0.489,120.544,0.0,1,0.101,-9.752,0,0.823,0.0291
100960,9th & Hennepin,2fScFKA4WWLTFWVoJ1UQW7,0.831,0.584,116867,0.155,137.858,6.2e-05,9,0.151,-20.666,0,0.47,0.119
129604,Ask Me,36B3wTZu6VFok9MHYrv9yd,0.857,0.365,126973,0.274,79.081,0.962,4,0.169,-14.778,0,0.355,0.0308
123080,Get It On Tonite,0AcLrSfAEBQcUnHOTm5pXg,0.259,0.807,277413,0.498,99.001,3e-05,10,0.0618,-9.91,0,0.857,0.0756


In [3]:
# Create "unliked" vector for later use in the dataframe
unliked_vect = list(
    np.zeros(unliked_df.shape[0], np.int64)
)

### Authorize Spotify Access for Further Data Collection

In [4]:
# Authorize access to Spotify API
load_dotenv()
cid = os.getenv('CLIENT_ID')
cs = os.getenv('CLIENT_SECRET')
rduri = os.getenv('REDIRECT_URI')
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played'
]

assert cid is not None and cs is not None and rduri is not None, 'One or more environment variables are missing.'

In [5]:
auth_manager = SpotifyOAuth(
    client_id=cid,
    client_secret=cs,
    redirect_uri=rduri,
    scope=scope
)
sp = Spotify(auth_manager=auth_manager)

sp

<spotipy.client.Spotify at 0x2346e17b4c0>

### Data Collection

In [6]:
# Functions for data collection via the API
def get_tracks(num=50, index=0):
    """
    Get tracks and track details from current user library starting at given index position.

    returns ids, names, tracks, audio_features -- track details
    """
    tracks = sp.current_user_saved_tracks(limit=num, offset=index)['items']
    names = [track['track']['name'] for track in tracks]
    ids = [track['track']['id'] for track in tracks]
    audio_features = []

    try:
        for id in ids:
            audio_features.append(
                sp.audio_features(id)
            )

        return ids, names, tracks, audio_features

    except:
        return ids, names, tracks, None


def get_new_likes(num=50, index=0):
    """
    Collects features for a number of tracks, starting at given index.

    returns new_likes -- dataframe of num liked tracks and audio features
    """
    ids, names, tracks, audio_features = get_tracks(num=num, index=index)

    # Create dataframe of new likes and audio features
    columns = [
        'name',
        'id',
        'duration_ms',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo'
    ]

    new_likes = pd.DataFrame(columns=columns)

    # Create list of track dicts with audio features
    if audio_features is None:
        print('Something went wrong.')

    elif len(audio_features) > 0:
        track_data = []
        for i, features in enumerate(audio_features):
            track = {
                'name': names[i],
                'id': features[0]['id'],
                'duration_ms': features[0]['duration_ms'],
                'danceability': features[0]['danceability'],
                'energy': features[0]['energy'],
                'key': features[0]['key'],
                'loudness': features[0]['loudness'],
                'mode': features[0]['mode'],
                'speechiness': features[0]['speechiness'],
                'acousticness': features[0]['acousticness'],
                'instrumentalness': features[0]['instrumentalness'],
                'liveness': features[0]['liveness'],
                'valence': features[0]['valence'],
                'tempo': features[0]['tempo']
            }

            track_data.append(track)

        new_likes = new_likes.append(
            track_data,
            ignore_index=True,
            verify_integrity=True
        )

    return new_likes

In [7]:
ids, names, tracks, audio_features = get_tracks()

assert (
    len(ids) == 50 and
    len(names) == 50 and
    len(tracks) == 50 and
    len(audio_features) != 0
), 'Something went wrong.'

In [8]:
# Get audio features of the tracks
audio_features[0]
#[print(i, features[0]['id']) for i, features in enumerate(audio_features)]

[{'danceability': 0.734,
  'energy': 0.712,
  'key': 6,
  'loudness': -6.217,
  'mode': 0,
  'speechiness': 0.201,
  'acousticness': 0.391,
  'instrumentalness': 0,
  'liveness': 0.103,
  'valence': 0.674,
  'tempo': 140.369,
  'type': 'audio_features',
  'id': '4CPcZHzSvIBM8AKhdHUwnL',
  'uri': 'spotify:track:4CPcZHzSvIBM8AKhdHUwnL',
  'track_href': 'https://api.spotify.com/v1/tracks/4CPcZHzSvIBM8AKhdHUwnL',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4CPcZHzSvIBM8AKhdHUwnL',
  'duration_ms': 165405,
  'time_signature': 4}]

### Add New TrackFeatures to Liked Song Dataset

In [9]:
# Add liked tracks to dataframe
indices = np.random.randint(0, 3000, 200)
liked_df = get_new_likes()

for index in indices:
    new_likes = get_new_likes(index=index)
    liked_df = liked_df.append(new_likes, ignore_index=True, verify_integrity=True)

print(liked_df.shape)
liked_df.tail(50)

(2435, 14)


Unnamed: 0,name,id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
2385,Sunshine On My Shoulders,3m8lTUL5GxGUNSaycI77ND,311893,0.322,0.168,10,-13.893,1,0.0336,0.915,0.000223,0.117,0.259,146.68
2386,Told You So,1u76oCKjI9Wkb7DA6Q5LRV,239480,0.613,0.627,5,-6.626,1,0.0503,0.186,0.0,0.0541,0.506,67.489
2387,It's Over,6KLAVFlcTrUt9IteOXzsT8,246907,0.593,0.679,1,-4.68,0,0.0507,0.284,0.0,0.152,0.207,135.931
2388,Beautiful Soul,1HwpWwa6bnqqRhK8agG4RS,214227,0.66,0.666,9,-4.342,1,0.0472,0.0759,0.0,0.0268,0.933,89.975
2389,Black Me Out,6XI33hctgDLUIRKI8wsJS4,187813,0.52,0.885,4,-4.432,1,0.0371,0.00144,7e-06,0.121,0.602,104.646
2390,Two Coffins,5yDqUTEQVuhE3TT3l2ltlB,140187,0.545,0.707,3,-5.302,1,0.0386,0.241,0.0,0.368,0.607,166.692
2391,Landslide,5ihS6UUlyQAfmp48eSkxuQ,199493,0.414,0.161,7,-22.32,0,0.0318,0.883,0.000223,0.117,0.423,159.375
2392,The Chain - 2004 Remaster,5e9TFTbltYBg2xThimr0rU,270213,0.545,0.67,9,-8.81,1,0.0496,0.009,0.000822,0.0451,0.481,151.553
2393,Two Ghosts,4B1rpPmQXwj78wk6aIGwwU,229813,0.386,0.407,6,-7.095,0,0.0268,0.234,2e-06,0.0999,0.456,69.387
2394,"Sunflower, Vol. 6",6iYMfxznTBlcVOgRHab2W0,221827,0.566,0.846,7,-6.904,0,0.035,0.000564,0.000795,0.121,0.86,160.02


In [13]:
# Create "liked" vector
liked_vect = list(
    np.ones(liked_df.shape[0], np.int64)
)

In [14]:
# Combine liked and unliked tracks, and add "liked" column
df = unliked_df.append(liked_df, ignore_index=True, verify_integrity=True)
df['liked'] = unliked_vect + liked_vect

print(df.shape)
df['liked'].value_counts()

(7435, 15)


0    5000
1    2435
Name: liked, dtype: int64

In [15]:
df.sample(50)

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness,liked
3111,Separarla También,2uUimJq18JcrZjgkuvLuMa,0.74,0.483,155634,0.721,85.362,0.79,7,0.1,-5.899,1,0.945,0.0563,0
4949,El Beso de la Muerte - Remasterizado,5wg1nnx7c2kri8kx5xlCFH,0.996,0.493,158587,0.0997,127.023,0.481,5,0.192,-23.446,1,0.587,0.0607,0
142,Chaman Mein Rah Ke Virana,4oxWVUHYUy6WAiTV0m3wm2,0.886,0.622,193507,0.489,125.977,2.1e-05,2,0.0856,-5.19,1,0.511,0.0702,0
1998,Backroad Song,6naYpFIKihKbGg6lGxpveI,0.0352,0.488,236787,0.833,174.006,0.0172,11,0.34,-6.571,1,0.751,0.0437,0
1532,Cruel,3j5DVpcCELigVZrmwGOw3X,0.0104,0.57,214987,0.704,115.041,0.455,1,0.0905,-8.32,1,0.437,0.0314,0
3293,My Funny Valentine - Live at Philharmonic Hall...,176PPnVoT7ZnAl7urhkmoa,0.838,0.492,900933,0.149,128.094,0.578,8,0.97,-15.895,1,0.254,0.0444,0
1836,Band Of Gold - Re-Record,0YgoNapnyIXuruBrzvoCSu,0.451,0.631,182533,0.746,108.875,0.0,7,0.424,-5.439,1,0.93,0.0277,0
1560,"Sonata No. 5, Op. 24 ""Spring"" in F: Allegro",42ScjPBuyHEohFNxsxKmYX,0.956,0.335,395427,0.17,75.815,0.561,0,0.078,-15.961,1,0.343,0.0454,0
6776,Like a Boy,358bOvBiZCS9fRzNYosw6c,0.267,0.701,237053,0.724,132.035,0.0,0,0.0867,-5.751,0,0.425,0.145,1
4272,Not Strong Enough,1mCNkrKh9NEgxJWUqHnEgo,0.00183,0.354,216520,0.829,163.958,0.0,9,0.119,-3.834,1,0.195,0.047,0


In [16]:
df.describe()

Unnamed: 0,acousticness,danceability,energy,tempo,instrumentalness,liveness,loudness,valence,speechiness,liked
count,7435.0,7435.0,7435.0,7435.0,7435.0,7435.0,7435.0,7435.0,7435.0,7435.0
mean,0.473059,0.566529,0.492669,116.659233,0.139648,0.193386,-10.689551,0.52374,0.0951,0.327505
std,0.358654,0.170657,0.252507,30.129818,0.284622,0.159141,5.248577,0.254532,0.146401,0.469335
min,1e-06,0.0,2e-05,0.0,0.0,0.0175,-41.733,0.0,0.0,0.0
25%,0.113,0.453,0.285,93.834,0.0,0.0986,-13.588,0.323,0.0347,0.0
50%,0.452,0.579,0.487,115.003,0.000151,0.128,-9.811,0.523,0.0454,0.0
75%,0.833,0.693,0.696,135.404,0.0579,0.2405,-6.716,0.734,0.0796,1.0
max,0.996,0.978,0.999,217.032,1.0,0.998,0.878,1.0,0.969,1.0


## Data Prep and Modeling
---