# Spotify Exploratory Data Analysis
**Jacob Torres**

In [1]:
"""Imports"""

# Environment variables and authorization
import os
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

# Data collection and cleansing
import sqlite3
import numpy as np
import pandas as pd

---
## Data Collection
### SQLite3 Database

In [2]:
# Load Spotify data from local db
DB_FILE = '../app/spotify_db.sqlite3'
DATA_TABLE_QUERY = 'select * from data;'
UNLIKED_SAMPLE_SIZE = 10000  # rows of the data table kept for the analysis
UNLIKED_SAMPLE_SEED = 42     # fixed seed so Restart & Run All draws the same sample

# Dataframe of unliked songs; the connection is closed even if the query fails
conn = sqlite3.connect(DB_FILE)
try:
    unliked_df = pd.read_sql(DATA_TABLE_QUERY, conn)
finally:
    conn.close()

# sample() raises when asked for more rows than exist, so cap the request
unliked_df = unliked_df.sample(
    min(UNLIKED_SAMPLE_SIZE, len(unliked_df)),
    random_state=UNLIKED_SAMPLE_SEED
)

print(f"Unliked songs: {unliked_df.shape}")
unliked_df.head(10)

Unliked songs: (10000, 14)


Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
44055,Smoke Signal,1wTKEEiHFYIfYfMqiPWFEV,0.742,0.454,222307,0.384,133.24,0.463,10,0.195,-12.123,0,0.296,0.0488
56459,Love It If We Made It,6WmIyn2fx1PKQ0XDpYj4VR,0.000607,0.472,252926,0.745,180.06,2.7e-05,4,0.0957,-5.297,1,0.0631,0.0356
33768,That Don't Impress Me Much,4FUfoWMypAyWbOavmYyeNu,0.235,0.757,218867,0.757,124.956,0.0,1,0.0474,-7.598,1,0.967,0.0389
54692,I Look So Good (Without You),1NxVd04cN2UoNtayFjDmnX,0.201,0.316,231840,0.866,189.88,0.0,6,0.109,-1.592,1,0.541,0.139
70162,Take Ten,3qtpVUsQzvHdMzrmqkdOzP,0.534,0.467,187106,0.184,74.123,0.000743,0,0.188,-15.771,1,0.518,0.036
49933,"Symphony No.9 in E Minor, Op.95, B. 178 ""From ...",6nqHzwOdGIaX57U6VU6kMO,0.901,0.249,685000,0.163,136.037,0.741,4,0.124,-15.638,0,0.112,0.0351
159963,Dream Lady,49sgtbb4QKFMaSmOM2GVuF,0.736,0.255,202333,0.459,192.398,0.232,8,0.114,-12.271,0,0.444,0.0531
162233,One Of The Lonely Ones,07lSaq7jP0I6GW3iEI1ItF,0.0256,0.517,241533,0.554,124.847,0.000461,2,0.128,-12.351,1,0.888,0.0335
89225,Woman's Gotta Have It,20HCbY9qPpi1H2pP2MRUWy,0.567,0.506,210453,0.59,83.627,2.1e-05,11,0.17,-8.922,0,0.735,0.0302
151738,El Poderoso de Israel,63TNDMbqRlJKBr8YcyaBgU,0.176,0.521,79213,0.928,151.303,0.0,2,0.428,-4.154,1,0.761,0.125


### Spotify API

In [3]:
# Read the Spotify API credentials from the environment (a .env file is loaded first)
load_dotenv()
cid, cs, rduri = (
    os.getenv(var_name)
    for var_name in ('CLIENT_ID', 'CLIENT_SECRET', 'REDIRECT_URI')
)

# Permissions requested from the Spotify account
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played'
]

# Stop here if any credential is missing from the environment
assert None not in (cid, cs, rduri), 'One or more environment variables are missing.'

In [4]:
# Build the OAuth manager from the credentials and scope, then the API client
oauth_settings = {
    'client_id': cid,
    'client_secret': cs,
    'redirect_uri': rduri,
    'scope': scope,
}
auth_manager = SpotifyOAuth(**oauth_settings)
sp = Spotify(auth_manager=auth_manager)

# Display the client to confirm it was created
sp

<spotipy.client.Spotify at 0x260c500f100>

In [5]:
# Functions for data collection via the API
def get_tracks(index=0):
    """
    Get up to 50 saved tracks and their audio features from the current user
    library, starting at the given index position.

    index -- offset into the user's saved tracks at which the page starts

    returns ids, names, tracks, audio_features -- track details
        ids            -- list of track ids
        names          -- list of track names
        tracks         -- raw saved-track items returned by the API
        audio_features -- list holding one single-element list per track
                          (the same shape as calling sp.audio_features once
                          per id), or None if the feature request failed
    """
    tracks = sp.current_user_saved_tracks(limit=50, offset=index)['items']
    names = [item['track']['name'] for item in tracks]
    ids = [item['track']['id'] for item in tracks]

    # An empty page needs no audio-feature request
    if not ids:
        return ids, names, tracks, []

    try:
        # One batched request for the whole page instead of one request per
        # track; each entry is wrapped in a list so callers that index
        # features[0] keep working.
        audio_features = [[features] for features in sp.audio_features(ids)]

    except Exception as err:
        # Best effort: report the failure and signal it with None rather
        # than aborting the whole collection run.
        print(f'Could not fetch audio features: {err}')
        return ids, names, tracks, None

    return ids, names, tracks, audio_features


def get_new_likes(num=50, index=0):
    """
    Collect audio features for liked tracks, starting at given index.

    num   -- maximum number of tracks to return. A single get_tracks call
             supplies the data, so no more than one page of tracks is
             available however large num is.
    index -- offset into the user's saved tracks at which to start

    returns new_likes -- dataframe of up to num liked tracks and audio
                         features; empty (columns only) when nothing could
                         be collected
    """
    _, names, _, audio_features = get_tracks(index=index)

    # Column order of the returned dataframe
    columns = [
        'name',
        'id',
        'duration_ms',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo'
    ]
    # Every column except 'name' is read from the audio-feature dict
    feature_columns = columns[1:]

    if audio_features is None:
        print('Something went wrong.')
        return pd.DataFrame(columns=columns)

    # Create list of track dicts with audio features
    track_data = []
    for name, features in zip(names, audio_features):
        details = features[0] if features else None
        if details is None:
            # No feature dict came back for this track; skip it instead of
            # failing on a None subscript.
            continue

        track = {'name': name}
        track.update({column: details[column] for column in feature_columns})
        track_data.append(track)

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0,
    # and constructing from records lets pandas infer numeric dtypes.
    return pd.DataFrame(track_data[:max(num, 0)], columns=columns)

In [6]:
ids, names, tracks, audio_features = get_tracks()

# Sanity check: each of the four results should describe a full page of 50 tracks
assert all(
    len(result) == 50
    for result in (ids, names, tracks, audio_features)
), 'Something went wrong.'

In [7]:
# Inspect the audio features entry for the first track returned by get_tracks
audio_features[0]

[{'danceability': 0.674,
  'energy': 0.608,
  'key': 8,
  'loudness': -6.628,
  'mode': 0,
  'speechiness': 0.0334,
  'acousticness': 0.0119,
  'instrumentalness': 0,
  'liveness': 0.103,
  'valence': 0.464,
  'tempo': 124.952,
  'type': 'audio_features',
  'id': '7tGlzXJv6GD5e5qlu5YmDg',
  'uri': 'spotify:track:7tGlzXJv6GD5e5qlu5YmDg',
  'track_href': 'https://api.spotify.com/v1/tracks/7tGlzXJv6GD5e5qlu5YmDg',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7tGlzXJv6GD5e5qlu5YmDg',
  'duration_ms': 253440,
  'time_signature': 4}]

---
## Data Wrangling and Exploration

In [8]:
%%time
# Add liked tracks to dataframe
indices = np.random.randint(50, 5000, 1000)
liked_df = get_new_likes()

for index in indices:
    new_likes = get_new_likes(index=index)
    liked_df = liked_df.append(new_likes, ignore_index=True, verify_integrity=True)

print(f"Liked songs: {liked_df.shape}")
liked_df.tail(10)

Something went wrong.
Liked songs: (6610, 14)
Wall time: 12min 34s


Unnamed: 0,name,id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
6600,I Don’t Wanna Live Forever (Fifty Shades Darke...,3NdDpSvN911VPGivFlV5d0,245200,0.735,0.451,0,-8.374,1,0.0585,0.0631,1.3e-05,0.325,0.0862,117.973
6601,"Bust It Baby, Pt. 2 (feat. Ne-Yo)",0exb0K7vsjf5bXWLJJhAuF,240760,0.648,0.801,5,-7.24,0,0.167,0.154,0.0,0.339,0.807,78.946
6602,Sexy Love,10aWGOqSDBqvNzJ9NeKDbK,220853,0.693,0.516,8,-6.446,1,0.0413,0.297,0.000127,0.0604,0.494,94.02
6603,Hate That I Love You,7iu0WYLdo4yksKf3seaxzI,218947,0.637,0.73,5,-5.38,0,0.0874,0.323,0.0,0.0981,0.732,93.867
6604,Miss Independent,34ceTg8ChN5HjrqiIYCn9Q,232000,0.673,0.683,1,-5.693,1,0.115,0.522,0.0,0.235,0.713,171.86
6605,BOP,6Ozh9Ok6h4Oi1wUSLtBseN,159715,0.769,0.787,11,-3.909,1,0.367,0.189,0.0,0.129,0.836,126.77
6606,Alright,3iVcZ5G6tvkXZkZKlMpIUs,219333,0.796,0.766,7,-5.974,1,0.238,0.0742,0.0,0.0827,0.558,110.034
6607,F Slo (feat. Lil Dicky),0Od15R8yRYuXnXiuAwaAfH,195354,0.863,0.533,0,-7.183,1,0.0997,0.32,9e-06,0.0988,0.771,139.965
6608,Me and Your Mama,31tf1qEai5o5f4r66Kd0pU,379227,0.534,0.433,5,-12.628,1,0.0357,0.0116,0.0452,0.0667,0.235,117.834
6609,Redbone,0wXuerDYiBnERgIpbb3JBR,326933,0.743,0.347,1,-11.174,1,0.121,0.167,0.00951,0.103,0.572,160.143


In [9]:
# Create combined dataset: unliked songs first, liked songs after, with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([unliked_df, liked_df], ignore_index=True)

In [10]:
# Count missing values in each column of the combined dataset
df.isna().sum()

name                0
id                  0
acousticness        0
danceability        0
duration_ms         0
energy              0
tempo               0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
valence             0
speechiness         0
dtype: int64

In [11]:
# Preview 10 randomly chosen rows of the combined dataset
df.sample(10)

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
11462,No Time To Die,73SpzrcaHk0RQPFP73vqVR,0.917,0.38,242265,0.219,73.537,0.0104,4,0.0827,-13.273,0,0.0517,0.0358
16103,Dark in Here,1gpkRjay3yoFotVbdUz0Bh,0.363,0.642,203360,0.602,125.126,0.495,9,0.0705,-7.665,0,0.31,0.0325
2380,"Part Of Me, Part Of You",5wsflCTlGxNn2z91O7KVgK,0.0494,0.663,358000,0.694,127.305,3.4e-05,9,0.0581,-8.333,1,0.401,0.0282
11112,All on My Mind,5GRPAVCMwnUNvIOKzDYYSb,0.252,0.584,224521,0.788,138.102,8e-06,0,0.0654,-4.162,0,0.364,0.0691
4955,"Swing Low, Sweet Chariot",72ZSWeKxB5yDrsDdWrGh9T,0.277,0.616,167467,0.385,115.904,3.7e-05,11,0.132,-13.586,1,0.664,0.0359
5595,夢中人,46mRvGnQNAIRniX5xCX9Ar,0.898,0.197,203467,0.223,169.586,5e-06,8,0.14,-15.977,0,0.115,0.0414
1485,Aggio Perduto 'O Suonno,0aGDRKRP9KLMAotTAQ9V0K,0.989,0.492,220453,0.0232,131.673,1.4e-05,2,0.107,-19.809,1,0.166,0.0428
12251,La Bicicleta,0Gx4VrHzS7pZOEAGrmXXBH,0.21,0.745,229510,0.944,179.991,1e-06,0,0.333,-3.959,1,0.951,0.134
7501,My Fight,5HDG2PPLki6wowJvZbAH9d,0.000994,0.498,201340,0.972,74.99,0.0,8,0.213,-3.443,1,0.334,0.196
4403,Часть 62.2 - На Западном фронте без перемен,4DIiCMEXAAdj09PFT1T5jv,0.014,0.673,104900,0.261,82.153,0.0,0,0.0741,-18.782,1,0.476,0.959


In [12]:
# Summary statistics of the combined dataset
df.describe()

Unnamed: 0,acousticness,danceability,energy,tempo,instrumentalness,liveness,loudness,valence,speechiness
count,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0
mean,0.462809,0.570025,0.495101,116.776362,0.12253,0.188872,-10.504243,0.519277,0.093196
std,0.354704,0.168414,0.245836,30.333933,0.269373,0.157127,5.218244,0.251389,0.139699
min,3e-06,0.0,2e-05,0.0,0.0,0.015,-52.22,0.0,0.0
25%,0.105,0.459,0.3,93.717,0.0,0.0972,-13.369,0.321,0.0345
50%,0.434,0.5825,0.49,114.016,8.5e-05,0.124,-9.561,0.514,0.0454
75%,0.823,0.694,0.688,135.306,0.0285,0.228,-6.615,0.723,0.0802
max,0.996,0.977,1.0,221.741,0.999,0.996,0.101,1.0,0.968


---
## Commit Datasets to SQLite3 Database

In [13]:
# Schema for the "liked_songs_jt" table: one row per liked track
LIKED_TABLE_QUERY = """create table if not exists liked_songs_jt (
    name varchar(3000),
    id varchar(50),
    duration_ms int,
    danceability float,
    energy float,
    key int,
    loudness float,
    mode int,
    speechiness float,
    acousticness float,
    instrumentalness float,
    liveness float,
    valence float,
    tempo float
)"""

# Open the database; conn and cur stay open for the cells that follow
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Create the table if it is not there yet; a failure is reported, not raised
try:
    cur.execute(LIKED_TABLE_QUERY)
    conn.commit()

except Exception as err:
    print(err)

In [14]:
# Insert liked song data into liked_songs_jt table.
# if_exists='replace' drops and recreates the table on every run, so re-running
# the cell does not duplicate rows. index=False keeps the DataFrame's row index
# out of the table; otherwise it is written as an extra "index" column that the
# declared liked_songs_jt schema does not have.
try:
    liked_df.to_sql('liked_songs_jt', conn, if_exists='replace', index=False)

except Exception as err:
    print(err)

In [15]:
# Schema for the "train" table: one row per track of the combined dataset
TRAIN_TABLE_QUERY = """create table if not exists train (
    name varchar(3000),
    id varchar(50),
    acousticness float,
    danceability float,
    duration_ms int,
    energy float,
    tempo float,
    instrumentalness float,
    key int,
    liveness float,
    loudness float,
    mode int,
    valence float,
    speechiness float
)"""

# Create the table if it is not there yet; a failure is reported, not raised
try:
    cur.execute(TRAIN_TABLE_QUERY)
    conn.commit()

except Exception as err:
    print(err)

In [16]:
# Insert entire training dataset into train table.
# if_exists='replace' drops and recreates the table on every run, so re-running
# the cell does not duplicate rows. index=False keeps the DataFrame's row index
# out of the table; otherwise it is written as an extra "index" column that the
# declared train schema does not have.
try:
    df.to_sql('train', conn, if_exists='replace', index=False)

except Exception as err:
    print(err)

finally:
    # Last database cell: release the cursor and the connection either way
    cur.close()
    conn.close()