# Spotify Exploratory Data Analysis
**Jacob Torres**

In [1]:
"""Imports"""

# Environment variables and authorization
import os
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

# Data collection and cleansing
import sqlite3
import numpy as np
import pandas as pd

---
## Data Collection
### SQLite3 Database

In [2]:
# Load Spotify data from local db
DB_FILE = '../app/spotify_db.sqlite3'
DATA_TABLE_QUERY = 'select * from data;'
UNLIKED_SAMPLE_SIZE = 10000

# Read the whole `data` table. The try/finally releases the connection even
# when the query itself fails (for example when the table is missing).
conn = sqlite3.connect(DB_FILE)
try:
    unliked_all = pd.read_sql(DATA_TABLE_QUERY, conn)
finally:
    conn.close()

# Dataframe of unliked songs: a random sample, capped at the table size
# because sampling more rows than exist (without replacement) raises.
unliked_df = unliked_all.sample(min(UNLIKED_SAMPLE_SIZE, len(unliked_all)))

print(f"Unliked songs: {unliked_df.shape}")
unliked_df.head(10)

Unliked songs: (10000, 14)


Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
112332,"Intermezzo No. 6 in E-Flat Minor, Op. 118 - An...",6pyvAjYuzyHmb7UWdxz0YD,0.988,0.408,358613,0.0361,121.813,0.872,10,0.164,-21.516,0,0.072,0.0515
49600,Work for Love,5E6bjaFJZszZHSphdOPHpE,0.108,0.789,283667,0.602,107.659,0.00378,9,0.368,-12.335,0,0.948,0.039
107429,Sweet Love,3Z6DV0jsGqCs4whsfda7lX,0.36,0.504,266201,0.728,89.255,0.0,10,0.159,-5.699,0,0.683,0.0234
34336,Frontier Psychiatrist,0E2UdlzNQP9I70MYumYlFo,0.122,0.711,288347,0.905,98.931,0.000166,6,0.395,-7.459,1,0.437,0.218
122353,Hey Mister,3ORgbqQVR5oj2QlX1nML8l,3.4e-05,0.384,235640,0.992,160.038,7.8e-05,10,0.565,-3.249,0,0.0988,0.125
9434,Why I Sing The Blues,2bIfM0ZOF4gxK1BeqH3Djm,0.239,0.792,517400,0.495,118.066,0.00175,0,0.074,-11.603,1,0.719,0.0527
33662,Leflaur Leflah Eshkoshka,3RvFnZZz1x00l8FIHE5wxx,0.0241,0.797,302974,0.547,92.12,0.0,11,0.313,-10.875,0,0.71,0.355
108611,Часть 13.2 - Зеленые холмы Африки,6V8wYYBaBpwzYmecmpENwF,0.27,0.706,99200,0.0885,105.431,0.0,11,0.228,-17.794,0,0.69,0.927
16159,Nemo Egg (Main Title),7jSQaXLzpLHjSQTev2krvT,0.98,0.164,76133,0.0248,79.083,0.967,5,0.103,-27.25,1,0.0553,0.0462
52132,It's A Little Too Late - 1996 Greatest Hits Ve...,0D5L0ZxGyhEFTqhfiXwYxw,0.317,0.512,163560,0.909,172.975,0.0,9,0.0681,-7.505,1,0.791,0.0504


### Spotify API

In [3]:
# Read the Spotify API credentials from the environment (.env file supported)
load_dotenv()
cid, cs, rduri = (
    os.getenv(variable)
    for variable in ('CLIENT_ID', 'CLIENT_SECRET', 'REDIRECT_URI')
)

# Permissions requested from the user during authorization
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played',
]

# Stop here if any credential was not found in the environment
assert all(
    value is not None for value in (cid, cs, rduri)
), 'One or more environment variables are missing.'

In [4]:
# Build the OAuth manager from the loaded credentials, then the API client
oauth_settings = {
    'client_id': cid,
    'client_secret': cs,
    'redirect_uri': rduri,
    'scope': scope,
}
auth_manager = SpotifyOAuth(**oauth_settings)
sp = Spotify(auth_manager=auth_manager)

sp

<spotipy.client.Spotify at 0x1a7c96d6400>

In [5]:
# Functions for data collection via the API
def get_tracks(index=0):
    """
    Get up to 50 tracks and track details from current user library starting at given index position.

    Audio features are requested with a single batched API call and then
    re-wrapped so every entry keeps the one-element-list shape that callers
    index with ``features[0]``.

    index -- offset into the current user's saved tracks

    returns ids, names, tracks, audio_features -- track details;
    audio_features is None when the audio features request fails
    """
    tracks = sp.current_user_saved_tracks(limit=50, offset=index)['items']
    names = [track['track']['name'] for track in tracks]
    ids = [track['track']['id'] for track in tracks]

    # Nothing saved at this offset, so there is nothing to request.
    if not ids:
        return ids, names, tracks, []

    try:
        # One request for the whole page instead of one request per track.
        batch = sp.audio_features(ids)

    # Only ordinary errors are handled: KeyboardInterrupt/SystemExit must
    # still be able to stop a long-running collection loop.
    except Exception as err:
        import warnings
        warnings.warn(f"Audio features request failed at offset {index}: {err}")
        return ids, names, tracks, None

    return ids, names, tracks, [[features] for features in batch]


def get_new_likes(num=50, index=0):
    """
    Collects audio features for up to num liked tracks, starting at given index.

    num -- maximum number of rows to return; get_tracks supplies one page of
           tracks per call, so no more than that page is ever available
    index -- offset into the saved tracks, forwarded to get_tracks

    returns new_likes -- dataframe of liked tracks and audio features; it has
    the expected columns but no rows when no features could be retrieved
    """
    ids, names, tracks, audio_features = get_tracks(index=index)

    # Column order of the resulting dataframe
    columns = [
        'name',
        'id',
        'duration_ms',
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo'
    ]
    feature_columns = [column for column in columns if column != 'name']

    # Build one record per track; `audio_features` is None when the request
    # failed, which is treated the same as "no tracks".
    records = []
    for name, features in zip(names, audio_features or []):
        # Each entry is a one-element list; skip entries with no payload
        # instead of failing on None['id'].
        details = features[0] if features else None
        if details is None:
            continue

        record = {'name': name}
        record.update({column: details[column] for column in feature_columns})
        records.append(record)

    # Build the frame once from the records (DataFrame.append no longer
    # exists in pandas 2.x) and honour the requested row limit.
    return pd.DataFrame(records[:num], columns=columns)

In [6]:
# Smoke check: the first page must contain 50 entries in every returned list
ids, names, tracks, audio_features = get_tracks()

assert all(
    len(collection) == 50
    for collection in (ids, names, tracks, audio_features)
), 'Something went wrong.'

In [7]:
# Inspect the audio features entry of the first track returned by get_tracks()
audio_features[0]

[{'danceability': 0.748,
  'energy': 0.705,
  'key': 4,
  'loudness': -4.547,
  'mode': 0,
  'speechiness': 0.485,
  'acousticness': 0.204,
  'instrumentalness': 0,
  'liveness': 0.246,
  'valence': 0.483,
  'tempo': 189.891,
  'type': 'audio_features',
  'id': '1EaKU4dMbesXXd3BrLCtYG',
  'uri': 'spotify:track:1EaKU4dMbesXXd3BrLCtYG',
  'track_href': 'https://api.spotify.com/v1/tracks/1EaKU4dMbesXXd3BrLCtYG',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1EaKU4dMbesXXd3BrLCtYG',
  'duration_ms': 208733,
  'time_signature': 4}]

---
## Data Wrangling and Exploration

In [8]:
%%time
# Add liked tracks to dataframe
indices = np.random.randint(50, 5000, 1000)
liked_df = get_new_likes()

for index in indices:
    new_likes = get_new_likes(index=index)
    liked_df = liked_df.append(new_likes, ignore_index=True, verify_integrity=True)

print(f"Liked songs: {liked_df.shape}")
liked_df.tail(10)

Liked songs: (5874, 14)
Wall time: 11min 21s


Unnamed: 0,name,id,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
5864,If You See Light,2WizTIQY0T1hLouDdF2xJX,118013,0.572,0.928,0,-8.047,1,0.0614,0.0031,0.85,0.199,0.325,119.785
5865,Genesis 30:3,1LajReP0NLtT239Dzw6ih6,206707,0.406,0.142,1,-20.087,1,0.0378,0.931,0.179,0.111,0.142,94.985
5866,Stench of the Unburied,0UU8JLF577uvFiZkrEkpIz,269640,0.808,0.445,6,-11.596,1,0.0286,0.367,0.191,0.239,0.871,108.072
5867,Half Dead,6S5YciPXi9wZEVB4TpuzfC,206213,0.764,0.588,4,-9.032,1,0.029,0.163,0.749,0.13,0.901,129.447
5868,Hopeful Assassins of Zeno,6AQmNbVZncy1ZURVNfthLz,195133,0.868,0.256,4,-13.685,1,0.0674,0.829,0.548,0.193,0.617,115.84
5869,Deuteronomy 2:10,2N8PYzWvirAUrdKBay7h8s,205507,0.48,0.0669,2,-20.818,1,0.0556,0.93,0.00136,0.0996,0.225,105.311
5870,Romans 10:9,4l7wSoHzKRVh8NJWVRFWGb,164853,0.597,0.535,7,-11.128,1,0.0363,0.406,0.252,0.0998,0.901,127.803
5871,1 Samuel 15:23,71zJ6ki9ve3qV7JHMs93qY,249013,0.782,0.071,4,-21.523,0,0.0812,0.667,0.409,0.348,0.147,80.586
5872,Paid in Cocaine,0Lcql3NYZSEpvpN2Aat6ee,237293,0.759,0.333,3,-12.547,1,0.0289,0.69,0.155,0.109,0.47,118.073
5873,1 John 4:16,7LybhrlWEwxGxI97JBh9cV,191093,0.437,0.103,4,-14.979,1,0.0417,0.943,6e-06,0.112,0.215,78.064


In [9]:
# Create combined dataset
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows of the combined frame from 0.
df = pd.concat([unliked_df, liked_df], ignore_index=True)

In [10]:
# Number of missing values in each column of the combined dataset
df.isna().sum()

name                0
id                  0
acousticness        0
danceability        0
duration_ms         0
energy              0
tempo               0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
valence             0
speechiness         0
dtype: int64

In [11]:
# Show ten randomly chosen rows of the combined dataset
df.sample(n=10)

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
12360,Getting Into Knives,2b3XA03SZ0ns3Lgc1Y8Xin,0.753,0.791,314333,0.202,125.63,0.0697,4,0.1,-16.653,1,0.558,0.0612
6559,Great Are You Lord,2YgtXx9OVBOzja9Fckb4FY,0.0832,0.492,232379,0.656,147.948,0.0,9,0.202,-7.361,1,0.358,0.0365
6027,Buena Idea - Instrumental (Remasterizado),7Hs53qjCKGLwSDi0IvtwfO,0.995,0.883,162920,0.108,114.458,0.343,2,0.118,-19.619,1,0.827,0.19
8682,"Künstlerleben, Op.316",0kEcGmZzt2ShQlX4seciWG,0.95,0.251,523867,0.0695,176.777,0.803,0,0.624,-22.376,1,0.199,0.0368
4354,Dance The Night Away,3t723PlEADna6GjgquoMRW,0.106,0.711,180800,0.861,118.004,4.4e-05,0,0.113,-2.905,1,0.78,0.0924
2601,Las flores,0oakHTPj4SF7Si0NAsLwi8,0.186,0.615,135133,0.928,156.926,4e-06,10,0.147,-4.218,1,0.962,0.0412
13673,No Children,5cxnSTLzGD1t9xcdmJYFVB,0.603,0.607,168307,0.411,84.94,0.000671,1,0.103,-7.566,1,0.412,0.0414
9405,Roses Of Picardy - 2001 Digital Remaster,7GQUvfQ8hqczDDVmqyZJ8k,0.55,0.407,131227,0.259,140.22,0.0,8,0.0902,-12.366,1,0.356,0.0344
15177,Ivy,2ZWlPOoWh0626oTaHrnl2a,0.782,0.567,249191,0.388,116.362,0.000309,9,0.248,-9.579,0,0.452,0.0384
8293,Going Bad (feat. Drake),2IRZnDFmlqMuOrYOLnZZyc,0.259,0.889,180522,0.496,86.003,0.0,4,0.252,-6.365,0,0.544,0.0905


In [12]:
# Summary statistics (count, mean, std, quartiles, min/max) of the combined dataset
df.describe()

Unnamed: 0,acousticness,danceability,energy,tempo,instrumentalness,liveness,loudness,valence,speechiness
count,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0
mean,0.459044,0.569318,0.494837,117.251375,0.134743,0.191452,-10.701819,0.523939,0.094003
std,0.358577,0.169377,0.249254,29.89741,0.276817,0.15934,5.201994,0.254266,0.143338
min,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0
25%,0.0959,0.458,0.295,94.451,0.0,0.0988,-13.538,0.319,0.0343
50%,0.428,0.582,0.49,115.138,0.000189,0.127,-9.79,0.527,0.045
75%,0.822,0.696,0.692,135.7385,0.0602,0.23,-6.8275,0.734,0.079
max,0.996,0.977,1.0,217.943,1.0,0.996,0.878,0.985,0.969


---
## Commit Datasets to SQLite3 Database

In [13]:
# Schema of the "liked_songs_jt" table: track name, Spotify id and audio features
LIKED_TABLE_QUERY = """create table if not exists liked_songs_jt (
    name varchar(3000),
    id varchar(50),
    duration_ms int,
    danceability float,
    energy float,
    key int,
    loudness float,
    mode int,
    speechiness float,
    acousticness float,
    instrumentalness float,
    liveness float,
    valence float,
    tempo float
)"""

# Connection and cursor stay open: the following cells reuse them
conn = sqlite3.connect(DB_FILE)
cur = conn.cursor()

# Create the table if it does not exist yet; a failure is reported, not raised
try:
    cur.execute(LIKED_TABLE_QUERY)
    conn.commit()

except Exception as err:
    # sqlite3.Error derives from Exception, so this single clause covers both
    print(err)

In [14]:
# Insert liked song data into liked_songs_jt table
try:
    # index=False keeps the dataframe's row index out of the table; by
    # default to_sql writes it as an extra "index" column, which is not one of
    # the fourteen columns declared for liked_songs_jt.
    liked_df.to_sql('liked_songs_jt', conn, if_exists='replace', index=False)

except Exception as err:
    # sqlite3.Error derives from Exception, so this single clause covers both
    print(err)

In [15]:
# Schema of the "train" table: track name, Spotify id and audio features
TRAIN_TABLE_QUERY = """create table if not exists train (
    name varchar(3000),
    id varchar(50),
    acousticness float,
    danceability float,
    duration_ms int,
    energy float,
    tempo float,
    instrumentalness float,
    key int,
    liveness float,
    loudness float,
    mode int,
    valence float,
    speechiness float
)"""

# Create the table with the already open cursor; a failure is reported, not raised
try:
    cur.execute(TRAIN_TABLE_QUERY)
    conn.commit()

except Exception as err:
    # sqlite3.Error derives from Exception, so this single clause covers both
    print(err)

In [16]:
# Insert entire training dataset into train table
try:
    # index=False keeps the dataframe's row index out of the table; by
    # default to_sql writes it as an extra "index" column, which is not one of
    # the fourteen columns declared for train.
    df.to_sql('train', conn, if_exists='replace', index=False)

except Exception as err:
    # sqlite3.Error derives from Exception, so this single clause covers both
    print(err)

finally:
    # Last database cell: release the cursor and the connection in every case
    cur.close()
    conn.close()