# Spotify Data Exploration and Modeling
**Jacob Torres**

In [1]:
# Imports
import os
import re
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

## Exploratory Data Analysis
---

In [2]:
# Load Spotify data from local db
DATA_TABLE_QUERY = 'select * from data;'
conn = sqlite3.connect('../app/spotify_db.sqlite3')

# Dataframe of unliked songs
unliked_df = pd.read_sql(DATA_TABLE_QUERY, conn)
unliked_df = unliked_df.sample(5000)

print(f"Unliked songs: {unliked_df.shape}")
unliked_df.head()

Unliked songs: (5000, 14)


Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
5737,Conflict,1afaBH70jAwIC2qz5DVBmN,0.945,0.151,263560,0.0611,181.341,0.577,0,0.376,-18.26,1,0.0356,0.0425
60268,"Four Russian Peasant Songs ""Saucers"" - For Equ...",1eMwDNXvEYCTlMFaeZlhs6,0.994,0.698,44600,0.11,93.135,0.278,6,0.181,-20.665,1,0.413,0.0638
43415,"Die Borger, Kapitel 5",3uGV9HQz802MXmIEROFrr1,0.513,0.617,454747,0.359,76.615,0.00016,1,0.115,-20.985,1,0.534,0.952
60680,"Violin Concerto No. 3 in G Major, K. 216: I. A...",03D78yKHYmnZdFmuX157BL,0.96,0.426,553533,0.181,123.162,0.132,7,0.106,-15.668,1,0.375,0.0432
156450,"Der Rosenkavalier, Op.59 / Act 3: ""Zur Stelle!...",0yRjRgvO8kR6E9fehn07tE,0.976,0.424,308600,0.448,82.35,0.000112,5,0.436,-13.29,1,0.578,0.0954


In [3]:
# Create "unliked" vector for later use in the dataframe
unliked_vect = list(
    np.zeros(unliked_df.shape[0], np.int64)
)

### Authorize Spotify Access for Further Data Collection

In [4]:
# Authorize access to Spotify API
load_dotenv()
cid = os.getenv('CLIENT_ID')
cs = os.getenv('CLIENT_SECRET')
rduri = os.getenv('REDIRECT_URI')
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played'
]

assert cid is not None and cs is not None and rduri is not None, 'One or more environment variables are missing.'

In [5]:
auth_manager = SpotifyOAuth(
    client_id=cid,
    client_secret=cs,
    redirect_uri=rduri,
    scope=scope
)
sp = Spotify(auth_manager=auth_manager)

sp

<spotipy.client.Spotify at 0x1907ca062b0>

### Data Collection

In [6]:
# Functions for data collection via the API
def get_tracks(index=0):
    """
    Get 50 tracks and track details from current user library starting at given index position.

    returns ids, names, tracks, audio_features -- track details
    """
    tracks = sp.current_user_saved_tracks(limit=50, offset=index)['items']
    names = [track['track']['name'] for track in tracks]
    ids = [track['track']['id'] for track in tracks]
    audio_features = []

    try:
        for id in ids:
            audio_features.append(
                sp.audio_features(id)
            )

        return ids, names, tracks, audio_features

    except:
        return ids, names, tracks, None


def get_new_likes(num=50, index=0):
    """
    Collects features for 50 tracks, starting at given index.

    returns new_likes -- dataframe of num liked tracks and audio features
    """
    ids, names, tracks, audio_features = get_tracks(index=index)

    # Create dataframe of new likes and audio features
    columns = [
        'name',
        'id',
        'duration_ms',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo'
    ]

    new_likes = pd.DataFrame(columns=columns)

    # Create list of track dicts with audio features
    if audio_features is None:
        print('Something went wrong.')

    elif len(audio_features) > 0:
        track_data = []
        for i, features in enumerate(audio_features):
            track = {
                'name': names[i],
                'id': features[0]['id'],
                'duration_ms': features[0]['duration_ms'],
                'danceability': features[0]['danceability'],
                'energy': features[0]['energy'],
                'key': features[0]['key'],
                'loudness': features[0]['loudness'],
                'mode': features[0]['mode'],
                'speechiness': features[0]['speechiness'],
                'acousticness': features[0]['acousticness'],
                'instrumentalness': features[0]['instrumentalness'],
                'liveness': features[0]['liveness'],
                'valence': features[0]['valence'],
                'tempo': features[0]['tempo']
            }

            track_data.append(track)

        new_likes = new_likes.append(
            track_data,
            ignore_index=True,
            verify_integrity=True
        )

    return new_likes

In [7]:
ids, names, tracks, audio_features = get_tracks()

assert (
    len(ids) == 50 and
    len(names) == 50 and
    len(tracks) == 50 and
    len(audio_features) != 0
), 'Something went wrong.'

In [8]:
# Get audio features of the tracks
audio_features[0]
#[print(i, features[0]['id']) for i, features in enumerate(audio_features)]

[{'danceability': 0.855,
  'energy': 0.71,
  'key': 8,
  'loudness': -4.64,
  'mode': 1,
  'speechiness': 0.0945,
  'acousticness': 0.0142,
  'instrumentalness': 0.00579,
  'liveness': 0.125,
  'valence': 0.22,
  'tempo': 144.974,
  'type': 'audio_features',
  'id': '4ds41ycY4UZaxApauzIg40',
  'uri': 'spotify:track:4ds41ycY4UZaxApauzIg40',
  'track_href': 'https://api.spotify.com/v1/tracks/4ds41ycY4UZaxApauzIg40',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/4ds41ycY4UZaxApauzIg40',
  'duration_ms': 252488,
  'time_signature': 4}]

### Add New TrackFeatures to Liked Song Dataset

In [9]:
# Add liked tracks to dataframe
indices = np.random.randint(50, 5000, 300)
liked_df = get_new_likes()

for index in indices:
    new_likes = get_new_likes(index=index)
    liked_df = liked_df.append(new_likes, ignore_index=True, verify_integrity=True)

print(liked_df.shape)
liked_df.tail(50)

(2331, 14)


Unnamed: 0,name,id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
2281,Under the Bridge,3d9DChrdc6BOeFsbrZ3Is0,264307,0.559,0.345,4,-13.496,1,0.0459,0.0576,0.000105,0.141,0.458,84.581
2282,Demons,3LlAyCYU26dvFZBDUIMb7a,175200,0.505,0.71,3,-3.015,1,0.0321,0.19,0.00025,0.329,0.428,89.938
2283,Right Before My Eyes - Unpeeled,7d6qX28ITEgBUhCJ86SeYq,280213,0.494,0.185,2,-10.275,1,0.0305,0.829,0.0,0.113,0.272,123.97
2284,Cigarette Daydreams,2tznHmp70DxMyr2XhWLOW0,208760,0.636,0.676,2,-3.442,1,0.0263,0.0807,0.0,0.0831,0.273,113.98
2285,Swing Life Away,6GrrkiCRO3HYdgRpO4eKEL,200067,0.477,0.426,6,-6.961,1,0.043,0.544,0.0,0.0903,0.789,184.866
2286,"Mrs. Robinson - From ""The Graduate"" Soundtrack",0iOZM63lendWRTTeKhZBSC,244027,0.606,0.457,6,-14.035,0,0.0497,0.713,2.5e-05,0.0747,0.813,92.033
2287,The Sound of Silence - Acoustic Version,5y788ya4NvwhBznoDIcXwK,185413,0.525,0.216,6,-13.551,1,0.0301,0.837,0.0,0.107,0.328,106.761
2288,The Boxer,76TZCvJ8GitQ2FA1q5dKu0,308520,0.439,0.488,11,-14.464,1,0.0615,0.702,0.000339,0.16,0.629,93.017
2289,Homeward Bound,4Xl2PrS3DJqqSKXCo6Uhv9,149613,0.485,0.378,3,-11.773,1,0.0337,0.831,0.0,0.103,0.528,92.928
2290,America,6dfhF1BDGmhM69fnCb6wSC,215293,0.259,0.241,2,-15.955,1,0.042,0.554,7e-06,0.0849,0.275,178.453


In [10]:
# Create "liked" vector
liked_vect = list(
    np.ones(liked_df.shape[0], np.int64)
)

In [11]:
# Combine liked and unliked tracks, and add "liked" column
df = unliked_df.append(liked_df, ignore_index=True, verify_integrity=True)
df['liked'] = unliked_vect + liked_vect

print(df.shape)
df['liked'].value_counts(normalize=True) * 100

(7331, 15)


0    68.203519
1    31.796481
Name: liked, dtype: float64

In [12]:
df.sample(50)

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness,liked
6207,Aulon Raid,38DXZf5olllY3QPC8A9UYf,0.862,0.805,133520,0.372,128.98,0.0054,9,0.349,-5.774,1,0.227,0.0701,1
4498,Getaway,0g7UEpNuvLg5NNpwT4eq7n,0.135,0.692,181507,0.855,119.966,0.0,8,0.185,-4.704,1,0.636,0.0344,0
5974,Corsican Mastiff Stride,7aAC1GiDyzu0MKgoXkF5ng,0.32,0.65,140187,0.582,90.294,0.000258,7,0.506,-9.298,1,0.955,0.0497,1
4905,Butterfly On A Wheel,2d1vGDzaGWbCkupDsPniA7,0.0142,0.583,340800,0.616,110.397,0.000161,11,0.107,-13.533,0,0.436,0.0278,0
6157,Work Song,5szmwG86IFASvYrca21MEz,0.74,0.484,229667,0.381,118.567,1e-06,10,0.111,-10.732,1,0.296,0.0625,1
1935,I Can Never Go Home Anymore,6Dmrx1BOxZXTc2p3W8ezTh,0.627,0.527,192907,0.354,96.022,0.0,11,0.283,-12.386,0,0.164,0.0397,0
2263,Voice-Over Intro Rod Temperton Interview #2 / ...,1xU5NiN3ZKUa3J3S93aYd5,0.718,0.619,115760,0.227,181.398,0.0,10,0.624,-17.923,0,0.647,0.95,0
1905,No More Mr. Nice Guy,5D2eCwqbHcqOnfHOCM6TnV,0.0936,0.498,187893,0.515,127.761,0.0,11,0.1,-11.985,0,0.616,0.0395,0
1324,Song Groove (a.k.a. Abortion Papers),3YkgiRmt9aNzfxXgcij7eY,0.266,0.922,266773,0.922,133.208,0.18,1,0.0437,-3.464,1,0.963,0.0488,0
6508,Both Sides Now,3NW1YMA8kfNVTzGJCGBS8m,0.912,0.623,274560,0.156,97.235,0.0,6,0.141,-15.098,1,0.329,0.0308,1


In [13]:
df.describe()

Unnamed: 0,acousticness,danceability,energy,tempo,instrumentalness,liveness,loudness,valence,speechiness,liked
count,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0
mean,0.472345,0.56541,0.489798,117.018465,0.144263,0.19368,-10.8447,0.524925,0.092824,0.317965
std,0.360264,0.169368,0.252283,29.710909,0.287125,0.16371,5.243682,0.255587,0.150648,0.465717
min,0.0,0.0,0.000809,0.0,0.0,0.0211,-44.638,0.0,0.0,0.0
25%,0.103,0.454,0.285,94.004,0.0,0.0985,-13.7225,0.3195,0.0344,0.0
50%,0.452,0.576,0.483,114.814,0.000222,0.127,-9.941,0.529,0.0441,0.0
75%,0.832,0.693,0.692,135.284,0.07175,0.235,-6.904,0.7355,0.0716,1.0
max,0.996,0.97,0.999,220.192,1.0,0.996,0.878,0.986,0.969,1.0


## Data Prep and Modeling
---