# Spotify Data Exploration and Modeling
## Jacob Torres
---

In [1]:
# Imports
import os
import re
import sqlite3
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyOAuth
from spotipy import Spotify

## Exploratory Data Analysis

In [2]:
# Load Spotify data from local db
DATA_TABLE_QUERY = 'select * from data;'
SONG_TABLE_QUERY = 'select * from song;'
conn = sqlite3.connect('../app/spotify_db.sqlite3')

data_df = pd.read_sql(DATA_TABLE_QUERY, conn)
liked_df = pd.read_sql(SONG_TABLE_QUERY, conn)

print(f"Data: {data_df.shape}")
print(f"Liked songs: {liked_df.shape}")

Data: (169540, 14)
Liked songs: (12, 12)


In [3]:
print(data_df.shape)
data_df.head()

(169540, 14)


Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4BJqT0PrAfrxzMOxytFOIz,0.982,0.279,831667,0.211,80.954,0.878,10,0.665,-20.096,1,0.0594,0.0366
1,Clancy Lowered the Boom,7xPhfUan2yNtyFG0cUWkt8,0.732,0.819,180533,0.341,60.936,0.0,7,0.16,-12.441,1,0.963,0.415
2,Gati Bali,1o6I8BglA6ylDMrIELygv1,0.961,0.328,500062,0.166,110.339,0.913,3,0.101,-14.85,1,0.0394,0.0339
3,Danny Boy,3ftBPsC5vPBKxYSee08FDH,0.967,0.275,210000,0.309,100.109,2.8e-05,5,0.381,-9.316,1,0.165,0.0354
4,When Irish Eyes Are Smiling,4d6HGyGT8e121BsdKmw9v6,0.957,0.418,166693,0.193,101.665,2e-06,3,0.229,-10.096,1,0.253,0.038


In [4]:
liked_df.head(12)

Unnamed: 0,id,name,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,6BAnxKyld909yo6Pk1DO3r,Hello,0.864,7,-4.483,0,0.0348,0.0339,0.216,0.386,0.796,104.997
1,3sSESwCLq4g7WiXk6LJKbI,Wrecking Ball,0.0695,5,-22.837,1,0.0342,0.992,0.934,0.0943,0.109,112.81
2,7iUB3byx9Rcks8zW10JVkP,Feelings,0.778,6,-6.546,0,0.0365,0.0152,0.0,0.121,0.756,123.954
3,0ExiKxfY5rHBW06TcV1xXU,Queens,0.695,0,-5.499,1,0.0279,0.0703,0.0,0.13,0.571,133.949
4,5nvqqLZu75jMK31gDytANZ,Fighter,0.966,11,-3.565,0,0.0714,0.00107,9e-06,0.311,0.191,91.034
5,2ogguaH3LQ2H8gsKKVypYU,Going Home,0.0799,0,-22.987,0,0.0656,0.968,0.922,0.0864,0.124,74.45
6,2gG6IqL2enOyLdluSTcTTS,We Made It,0.745,2,-4.798,1,0.0448,0.0771,0.000141,0.175,0.153,116.092
7,7tEfdhVoIUueORVuyFvgpO,Madman,0.785,0,-6.079,1,0.0778,0.101,1.4e-05,0.115,0.649,150.059
8,6wlOciyEdLDqZXlloy2Fmp,New Cityt,0.607,5,-6.869,1,0.0369,0.189,0.0,0.347,0.716,115.025
9,3YbAvFyjnGgCEu3GybwN8E,Let's fight,0.922,10,-6.193,0,0.0416,0.0499,3e-06,0.162,0.965,79.979


In [5]:
# Create "liked" column
data_df['liked'] = np.zeros(data_df.shape[0], np.int8)
data_df.head()

Unnamed: 0,name,id,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,mode,valence,speechiness,liked
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4BJqT0PrAfrxzMOxytFOIz,0.982,0.279,831667,0.211,80.954,0.878,10,0.665,-20.096,1,0.0594,0.0366,0
1,Clancy Lowered the Boom,7xPhfUan2yNtyFG0cUWkt8,0.732,0.819,180533,0.341,60.936,0.0,7,0.16,-12.441,1,0.963,0.415,0
2,Gati Bali,1o6I8BglA6ylDMrIELygv1,0.961,0.328,500062,0.166,110.339,0.913,3,0.101,-14.85,1,0.0394,0.0339,0
3,Danny Boy,3ftBPsC5vPBKxYSee08FDH,0.967,0.275,210000,0.309,100.109,2.8e-05,5,0.381,-9.316,1,0.165,0.0354,0
4,When Irish Eyes Are Smiling,4d6HGyGT8e121BsdKmw9v6,0.957,0.418,166693,0.193,101.665,2e-06,3,0.229,-10.096,1,0.253,0.038,0


In [6]:
# Change liked values for songs in song_df
data_df['liked'] = data_df['id'].apply(
    lambda x: 1 if x in liked_df['id'] else 0
)
data_df['liked'].value_counts()

0    169540
Name: liked, dtype: int64

In [7]:
# Add "liked" columns to liked_df
liked_df['liked'] = np.ones(liked_df.shape[0], np.int64)
liked_df['liked'].value_counts()

1    12
Name: liked, dtype: int64

### Authorize Spotify Access for Further Data Collection

In [8]:
# Authorize access to Spotify API
load_dotenv()
cid = os.getenv('CLIENT_ID')
cs = os.getenv('CLIENT_SECRET')
rduri = os.getenv('REDIRECT_URI')
scope = [
    'playlist-read-private',
    'playlist-modify-public',
    'user-library-read',
    'user-read-recently-played'
]

assert cid is not None and cs is not None and rduri is not None, 'One or more environment variables are missing.'

In [9]:
auth_manager = SpotifyOAuth(
    client_id=cid,
    client_secret=cs,
    redirect_uri=rduri,
    scope=scope
)
sp = Spotify(auth_manager=auth_manager)

sp

<spotipy.client.Spotify at 0x24428fbf460>

### Data Collection

In [10]:
# Functions for data collection via the API
def get_tracks(num=50, index=0):
    """
    Get tracks from current user library starting at given index position.

    returns ids, names, tracks -- track details
    """
    tracks = sp.current_user_saved_tracks(limit=num, offset=index)['items']
    names = [track['track']['name'] for track in tracks]
    ids = [track['track']['id'] for track in tracks]

    return ids, names, tracks


def get_new_likes(num=50, index=0):
    """
    Collects audio features for tracks starting at from_index.

    returns new_likes -- dataframe of liked tracks and audio features
    """
    ids, names, tracks = get_tracks(num=num, index=index)
    audio_features = sp.audio_features(ids)

    # Create list of track dicts with audio features
    track_data = []
    for i, features in enumerate(audio_features):
        track_features = {
            'id': features['id'],
            'name': names[i],
            'duration_ms': features['duration_ms'],
            'danceability': features['danceability'],
            'energy': features['energy'],
            'key': features['key'],
            'loudness': features['loudness'],
            'mode': features['mode'],
            'speechiness': features['speechiness'],
            'acousticness': features['acousticness'],
            'instrumentalness': features['instrumentalness'],
            'liveness': features['liveness'],
            'valence': features['valence'],
            'tempo': features['tempo']
        }

        track_data.append(track_features)

    # Create dataframe of new likes and audio features
    new_likes = pd.DataFrame(data=track_data)

    # Add "liked" column (all tracks are liked)
    new_likes['liked'] = np.ones(new_likes.shape[0], np.int64)

    return new_likes

In [11]:
ids, names, tracks = get_tracks()
assert len(ids) and len(names) and len(tracks) != 0, 'Something went wrong.'

In [12]:
# Get audio features of the tracks
audio_features = sp.audio_features(ids)
audio_features[0]

{'danceability': 0.54,
 'energy': 0.508,
 'key': 7,
 'loudness': -7.868,
 'mode': 0,
 'speechiness': 0.29,
 'acousticness': 0.522,
 'instrumentalness': 1.13e-05,
 'liveness': 0.0755,
 'valence': 0.653,
 'tempo': 82.606,
 'type': 'audio_features',
 'id': '0K3dGPXHVALEqW8EEQGc3T',
 'uri': 'spotify:track:0K3dGPXHVALEqW8EEQGc3T',
 'track_href': 'https://api.spotify.com/v1/tracks/0K3dGPXHVALEqW8EEQGc3T',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0K3dGPXHVALEqW8EEQGc3T',
 'duration_ms': 203137,
 'time_signature': 4}

### Add New Track Analysis to Liked Song Dataset

In [13]:
# Function for adding new likes to liked_df
def add_new_likes(new_likes):
    """
    Appends dataframe of new likes and audio features to liked_df.
    
    returns liked_df -- Modified dataframe of likes with 1000 new likes appended
    """
    likes = liked_df.append(new_likes)
    return likes

In [14]:
new_likes = get_new_likes()
new_likes

Unnamed: 0,id,name,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,liked
0,0K3dGPXHVALEqW8EEQGc3T,DELITO,203137,0.54,0.508,7,-7.868,0,0.29,0.522,1.1e-05,0.0755,0.653,82.606,1
1,4DHDIdeayp8xvlyg22wREO,Ram Pam Pam,200785,0.908,0.813,5,-2.749,0,0.0777,0.305,0.0,0.278,0.924,97.018,1
2,3qqHroTNyW69IQAkbKc0v8,Rosario Tijeras,208067,0.563,0.809,6,-6.377,1,0.409,0.269,0.0,0.21,0.413,187.97,1
3,411nmd0QMzty1UjCWSo3rc,No Bailes Sola,182813,0.585,0.75,10,-3.974,1,0.145,0.203,0.0,0.3,0.619,179.881,1
4,2Ao6tuwPEgKbTRhNw5DUZ1,Cynicism,148227,0.364,0.0946,4,-16.25,1,0.0307,0.888,0.000765,0.102,0.22,101.085,1
5,1bvERTuePaoVjQ3NpJq9aH,June Hymn,237547,0.593,0.365,9,-6.407,1,0.0252,0.511,0.0,0.139,0.159,98.345,1
6,6AIKEQvWItx9NcjtoNDOjh,Up the Wolves,207400,0.49,0.365,2,-9.301,1,0.0268,0.494,7.2e-05,0.0892,0.474,157.522,1
7,2o2stv4uGKSYuUaghBTVX3,Dark in Here,203360,0.643,0.602,9,-7.653,0,0.0323,0.367,0.5,0.0835,0.308,125.114,1
8,3s7MCdXyWmwjdcWh7GWXas,Violent Crimes,215320,0.669,0.419,1,-6.724,0,0.522,0.376,0.0,0.187,0.0397,109.813,1
9,5vuJuBqwzHJgCA1ysRfwxZ,good kid,214120,0.451,0.831,5,-8.641,0,0.307,0.0623,0.0,0.185,0.41,176.162,1


In [15]:
# Add new liked tracks to liked_df
indices = np.random.randint(0, 3000, 2000)
for ind in indices:
    liked_df = add_new_likes(new_likes)

print(liked_df.shape)
liked_df.tail(50)

(100012, 15)


Unnamed: 0,id,name,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,liked,duration_ms,danceability
0,0K3dGPXHVALEqW8EEQGc3T,DELITO,0.508,7,-7.868,0,0.29,0.522,1.1e-05,0.0755,0.653,82.606,1,203137.0,0.54
1,4DHDIdeayp8xvlyg22wREO,Ram Pam Pam,0.813,5,-2.749,0,0.0777,0.305,0.0,0.278,0.924,97.018,1,200785.0,0.908
2,3qqHroTNyW69IQAkbKc0v8,Rosario Tijeras,0.809,6,-6.377,1,0.409,0.269,0.0,0.21,0.413,187.97,1,208067.0,0.563
3,411nmd0QMzty1UjCWSo3rc,No Bailes Sola,0.75,10,-3.974,1,0.145,0.203,0.0,0.3,0.619,179.881,1,182813.0,0.585
4,2Ao6tuwPEgKbTRhNw5DUZ1,Cynicism,0.0946,4,-16.25,1,0.0307,0.888,0.000765,0.102,0.22,101.085,1,148227.0,0.364
5,1bvERTuePaoVjQ3NpJq9aH,June Hymn,0.365,9,-6.407,1,0.0252,0.511,0.0,0.139,0.159,98.345,1,237547.0,0.593
6,6AIKEQvWItx9NcjtoNDOjh,Up the Wolves,0.365,2,-9.301,1,0.0268,0.494,7.2e-05,0.0892,0.474,157.522,1,207400.0,0.49
7,2o2stv4uGKSYuUaghBTVX3,Dark in Here,0.602,9,-7.653,0,0.0323,0.367,0.5,0.0835,0.308,125.114,1,203360.0,0.643
8,3s7MCdXyWmwjdcWh7GWXas,Violent Crimes,0.419,1,-6.724,0,0.522,0.376,0.0,0.187,0.0397,109.813,1,215320.0,0.669
9,5vuJuBqwzHJgCA1ysRfwxZ,good kid,0.831,5,-8.641,0,0.307,0.0623,0.0,0.185,0.41,176.162,1,214120.0,0.451


In [16]:
# Combine liked and unliked tracks
df = data_df.append(liked_df)
print(df.shape)
df['liked'].value_counts()

(269552, 15)


0    169540
1    100012
Name: liked, dtype: int64