In [1]:
# (Re)load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats code cells (the `# fmt: off` /
# `# fmt: on` markers in the join cell suggest a formatter is active) — confirm.
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import json

import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from tqdm.notebook import tqdm


# Read the Spotify API credentials from a local JSON file rather than
# hardcoding them in the notebook.
with open("spotify_keys.json") as key_file:
    keys = json.load(key_file)

# Export both credentials as environment variables. SpotifyClientCredentials()
# is constructed without arguments below, so it presumably reads them from the
# environment -- confirm against the spotipy docs.
for env_name in ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET"):
    os.environ[env_name] = keys[env_name]

credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=credentials_manager)

<IPython.core.display.Javascript object>

In [3]:
def get_audio_features(track_artist):
    """Return Spotify's audio features for the top hit of a track/artist search.

    Parameters
    ----------
    track_artist : str
        Free-text search query, e.g. ``"Track Name - Artist Name"``.

    Returns
    -------
    The first element of ``sp.audio_features([track_id])`` for the best
    matching track, passed through unchanged.
    NOTE(review): presumably a dict of features, and possibly ``None`` when
    Spotify has no features for the track -- confirm against the spotipy docs.

    Raises
    ------
    IndexError
        If the search returns no tracks. The exception type is the same one
        the bare ``[0]`` lookup used to raise, but the message now names the
        query that failed.
    """
    # Uses the module-level `sp` client; only the single best hit is requested.
    search_hits = sp.search(q=track_artist, limit=1)["tracks"]["items"]
    if not search_hits:
        raise IndexError(f"no Spotify track found for query: {track_artist!r}")

    track_id = search_hits[0]["id"]
    return sp.audio_features([track_id])[0]

<IPython.core.display.Javascript object>

In [4]:
# Listening history: the file's own header row (row 0) is discarded and
# replaced with these column names.
spotify = pd.read_csv(
    "data/spotify_data.csv",
    header=0,
    names=["end_time", "artist_name", "track_name", "time_played"],
)

# Emotion readings, loaded with the column names found in the file.
face = pd.read_csv("data/emotion_data.csv")

<IPython.core.display.Javascript object>

In [5]:
# Parse the raw timestamp column into timezone-aware (UTC) datetimes.
face = face.assign(timestamp=pd.to_datetime(face["timestamp"], utc=True))

# Quick sanity summary of the emotion frame.
for summary_line in (
    f"\nface.shape: {face.shape}",
    f"face.columns: {list(face.columns)}",
):
    print(summary_line)


face.shape: (42165, 7)
face.columns: ['timestamp', 'angry', 'scared', 'happy', 'sad', 'surprised', 'neutral']


<IPython.core.display.Javascript object>

In [6]:
# end_time    -> timezone-aware (UTC) datetime
# time_played -> timedelta, the raw numbers being read as milliseconds
# start_time  -> derived: when the play began (end_time - time_played)
# Later keyword arguments of `assign` see the columns created by earlier ones.
spotify = spotify.assign(
    end_time=lambda plays: pd.to_datetime(plays["end_time"], utc=True),
    time_played=lambda plays: pd.to_timedelta(plays["time_played"], unit="ms"),
    start_time=lambda plays: plays["end_time"] - plays["time_played"],
)

# Quick sanity summary of the listening-history frame.
for summary_line in (
    f"spotify.shape: {spotify.shape}",
    f"spotify.columns: {list(spotify.columns)}",
):
    print(summary_line)

spotify.shape: (28494, 5)
spotify.columns: ['end_time', 'artist_name', 'track_name', 'time_played', 'start_time']


<IPython.core.display.Javascript object>

In [7]:
# INNER JOIN each face row to the song that was playing at its timestamp:
#     spotify.start_time < face.timestamp < spotify.end_time   (strict bounds)
# pandas has no built-in "between" join, so every timestamp is tested against
# all plays. The test runs on plain datetime arrays and only the matching row
# positions are recorded; the joined frame is then assembled in one step
# instead of building and concatenating one tiny DataFrame per face row.
# This also keeps the original column dtypes and yields an empty frame
# (rather than an error from pd.concat) when nothing matches.

# dtype="datetime64[ns]" converts timezone-aware columns to UTC wall times, so
# all three arrays are directly comparable.
song_starts = spotify["start_time"].to_numpy(dtype="datetime64[ns]")
song_ends = spotify["end_time"].to_numpy(dtype="datetime64[ns]")
face_times = face["timestamp"].to_numpy(dtype="datetime64[ns]")

face_positions = []
song_positions = []
for face_pos, moment in enumerate(tqdm(face_times, total=len(face_times))):
    playing = ((song_starts < moment) & (song_ends > moment)).nonzero()[0]

    if playing.size == 0:
        # nothing was playing at this moment -> row is dropped (inner join)
        continue

    face_positions.append(face_pos)
    # cop out: when several plays overlap this moment, keep the first in file order
    song_positions.append(playing[0])

# Face columns first, then the matching spotify columns, aligned row by row.
joined_df = pd.concat(
    (
        face.iloc[face_positions].reset_index(drop=True),
        spotify.iloc[song_positions].reset_index(drop=True),
    ),
    axis=1,
)
joined_df = joined_df.dropna()

print(f"joined_df.shape: {joined_df.shape}")
joined_df.to_csv("data/joined_data.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=42165.0), HTML(value='')))


joined_df.shape: (14857, 12)


<IPython.core.display.Javascript object>

In [8]:
# Build one "track - artist" search string per row, then look up each distinct
# song only once.
joined_df["track_artist"] = joined_df["track_name"] + " - " + joined_df["artist_name"]
track_artists = joined_df["track_artist"].unique()


feats = []
for track_artist in tqdm(track_artists):
    try:
        feat = get_audio_features(track_artist)
        # Copy the features and tag them with the search string so they can be
        # matched back to joined_df["track_artist"].
        feats.append(dict(feat, track_artist=track_artist))
    except Exception as exc:
        # Best effort: a song that cannot be looked up is reported and skipped.
        # Catching Exception (not a bare `except:`) lets KeyboardInterrupt
        # still stop the loop, and the reason is shown next to the song.
        print(f"FAILURE: {track_artist} ({type(exc).__name__}: {exc})")

song_feats_df = pd.DataFrame(feats)
song_feats_df.to_csv("data/song_features.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=513.0), HTML(value='')))

FAILURE: Bedtime Story - GoldLink
FAILURE: Stoned Love - dubé



<IPython.core.display.Javascript object>