This notebook processes the output of Spotify's Track Audio Features API. The code below splits numeric features into quantiles and assigns each value a categorical label such as "low", "medium", or "high".

In [158]:
import pandas as pd
import uuid

def split_quantiles(n, series: pd.Series):
    """Return the ``n - 1`` cut points that split ``series`` into ``n`` groups.

    The values are sorted and the last value of each of the first ``n - 1``
    equally sized groups is used as that group's upper bound.

    Parameters
    ----------
    n : int
        Number of groups to split the values into.
    series : pd.Series
        Values to split. Missing values are not removed here: they count
        toward the length and are placed wherever ``sort_values`` puts them.

    Returns
    -------
    list
        ``n - 1`` cut points in sorted order, or an empty list when
        ``series`` is empty.
    """
    values = series.sort_values().tolist()
    if not values:
        # Nothing to split; indexing into the empty list would raise.
        return []
    size = len(values) / n
    # Clamp at 0: when a group holds less than one value, ``int(...) - 1``
    # is -1, which would wrap around to the *largest* value and produce
    # cut points that are no longer in ascending order.
    return [values[max(int(size * (i + 1)) - 1, 0)] for i in range(n - 1)]

def encode_lowmedhigh(series):
    """Label every value of ``series`` as 'low', 'medium' or 'high'.

    The two cut points come from ``split_quantiles(3, ...)`` computed on the
    non-missing values only, so missing entries do not shift the boundaries.
    A value at or below the first cut point is 'low', at or below the second
    is 'medium', and anything larger is 'high'.

    Parameters
    ----------
    series : pd.Series
        Numeric values to encode.

    Returns
    -------
    list
        One label per element of ``series``, in the same order. Missing
        values are encoded as ``pd.NA`` rather than falling through to 'high'.
    """
    valid = series.dropna()
    if valid.empty:
        # No usable values (empty or all-missing input): nothing to rank.
        return [pd.NA] * len(series)

    low_cut, high_cut = split_quantiles(3, valid)

    encoded = []
    for value in series:
        if pd.isna(value):
            # NaN fails every comparison, so without this guard it would
            # land in the final ``else`` branch and be labelled 'high'.
            encoded.append(pd.NA)
        elif value <= low_cut:
            encoded.append('low')
        elif value <= high_cut:
            encoded.append('medium')
        else:
            encoded.append('high')
    return encoded

# Input table: produced from the Spotify API by the transform_track_audio_json
# function (see spotify_api.ipynb).
# It contains track ids and audio features -- all the information needed to
# create AudioFeature nodes and their edges.
# The path is relative, so the file must be present in the working directory.
af = pd.read_csv('audio_features.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'audio_features.csv'

The following constants define how each audio feature is encoded.

In [None]:
# Integer key code -> note name (0 is C, rising one semitone per step).
map_key = dict(enumerate(
    ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
))

# Integer mode code -> mode name.
map_mode = dict(enumerate(['Minor', 'Major']))

# Features bucketed into three levels (low / medium / high).
lmh = [
    'danceability',
    'energy',
    'speechiness',
    'acousticness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
]

# Features translated through a lookup table, keyed by feature name.
mapped = {'mode': map_mode, 'key': map_key}

# Features bucketed into two levels (low / high), each paired with its cutoff.
# NOTE(review): 0.5 for loudness looks suspicious if loudness is expressed in
# decibels (every track would then be 'low') -- confirm against the data.
lh = dict.fromkeys(['loudness', 'instrumentalness'], 0.5)

# Features kept as-is.
unchanged = ['time_signature']


In [153]:
def manufacture_nodes():
    """Generate node entities for all combinations of audio features and their encoded groupings.

    One node is created for every (feature, encoding) pair, in this order:

    * each feature in ``lmh`` with 'low', 'medium' and 'high';
    * each feature in ``mapped`` with every value of its lookup table;
    * each feature in ``lh`` with 'low' and 'high';
    * 'time_signature' with '1' through '4' (as strings).

    NOTE(review): only time signatures 1-4 get a node; confirm this covers
    every value that occurs in the data.

    Returns
    -------
    pd.DataFrame
        Columns ``feature_id`` (a fresh random UUID string per node, so ids
        differ between calls), ``feature``, ``encoding`` and ``name`` (a
        display label such as "Danceability (Low)").
    """
    pairs = []
    pairs.extend((n, level) for n in lmh for level in ('low', 'medium', 'high'))
    pairs.extend((n, value) for n, map_ in mapped.items() for value in map_.values())
    pairs.extend((n, level) for n in lh for level in ('low', 'high'))
    pairs.extend(('time_signature', str(level)) for level in range(1, 5))

    def _label(feature, encoding):
        # "duration_ms" + "low" -> "Duration Ms (Low)" -> "Duration (Low)".
        title = ' '.join(word.capitalize() for word in feature.split('_'))
        return f'{title} ({encoding.capitalize()})'.replace('Ms ', '')

    node_features = pd.DataFrame(
        [[str(uuid.uuid4()), feature, encoding] for feature, encoding in pairs],
        columns=['feature_id', 'feature', 'encoding'],
    )
    # The label is built in one step instead of patching the column afterwards
    # with ``node_features.name.replace(..., inplace=True)``: that chained
    # in-place call acts on a temporary Series and is not guaranteed to update
    # the DataFrame (it is a no-op under pandas Copy-on-Write).
    node_features['name'] = [
        _label(feature, encoding)
        for feature, encoding in zip(node_features['feature'], node_features['encoding'])
    ]
    #node_features.to_csv('../clean/node_feature.csv', index=False)
    return node_features

Applying groupings and encodings

In [154]:
# Three-level features: cut points are computed separately for each feature.
for feature in lmh:
    af[feature+'_encoded'] = encode_lowmedhigh(af[feature])

# Two-level features: compare against the cutoff configured in ``lh`` rather
# than a repeated literal. Missing values stay missing instead of being
# labelled 'low' (NaN > threshold is always False).
for feature, threshold in lh.items():
    af[feature+'_encoded'] = [
        pd.NA if pd.isna(value) else ('high' if value > threshold else 'low')
        for value in af[feature]
    ]

# Lookup-table features: ``.get`` leaves missing values and any code without
# an entry in the table (e.g. a "not detected" sentinel such as key == -1)
# as pd.NA instead of raising KeyError.
for feature, map_ in mapped.items():
    af[feature+'_encoded'] = [map_.get(code, pd.NA) for code in af[feature]]

# Nodes for time_signature are generated with string encodings ('1'..'4'),
# so the value must be a string too or the later join on
# (feature, encoding) can never match. ``int`` first so 4.0 becomes '4'.
af['time_signature_encoded'] = [
    pd.NA if pd.isna(value) else str(int(value))
    for value in af['time_signature']
]

#af.to_csv('encoded_features.csv', index=False)

Pivoting to a "tall" format wherein each record is an audio feature edge.

In [None]:
# Every audio feature that becomes one edge per track.
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]

# Build one frame per feature with the columns
# spotify_id / value / feature_name / encoded_value, then stack them.
union_dfs = []
for feature in features:
    subset = (
        af[['spotify_id', feature]]
        .rename(columns={feature: 'value'})
        .assign(feature_name=feature, encoded_value=af[feature+'_encoded'])
    )
    union_dfs.append(subset)

# The original row index of ``af`` is kept, so it repeats once per feature.
tall = pd.concat(union_dfs)

Joining the "tall" features to the nodes generated by the `manufacture_nodes` function to produce track-audioFeature edges.

In [None]:
# Node table: one row per (feature, encoding) pair with its generated id.
fdf = manufacture_nodes()

# Left-join each tall record to its node on (feature, encoding); records
# without a matching node keep missing values in the node columns.
merged = (
    tall
    .rename(columns={'feature_name': 'feature', 'encoded_value': 'encoding'})
    .set_index(['feature', 'encoding'])
    .join(fdf.set_index(['feature', 'encoding']))
    .reset_index()
)