In [1]:
import sys
sys.path.append("..") # To get storm code

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import joblib

from scipy.spatial import distance

import plotly.express as px

from typing import List, Dict

In [2]:
from pymongo import MongoClient

In [3]:
# Internal imports
from storm.db import StormDB
from storm.client import StormUserClient
from storm.modeling import StormTrackClusterizer, FeatureSelector

# Shared Storm database handle used by the cells below; on construction the
# storm.db logger reports that the MongoDB backend was initialised.
sdb = StormDB()

2022-11-25 13:51:10,995 - storm.db - DEBUG - Storm MongoDB Backend Successfully Initialized.


In [4]:
# Configuration

# Clustering hyper-parameters.
seed = 43            # random_state passed to every KMeans fit
n_clusters = 4       # number of clusters built for each audio feature
distance_cutoff = 5  # NOTE(review): not referenced by any cell visible in this notebook - confirm still needed

# Audio features to cluster; each one gets its own model.
simple_columns = ['valence', 'danceability', 'acousticness', 'energy', 'tempo', 'duration_ms']

# Source playlist and the name the model is stored under in the model registry.
# Active selection: Instrumental
target_playlist = '3K9no6AflSDYiiMzignAm7'
model_name = 'film_vg_instrumental'

# Alternative selection: Lyrical
# target_playlist = '2zngrEiplX6Z1aAaIWgZ4m'
# model_name = 'contemporary_lyrical'

## Source

In [5]:
# Field selection for the track lookup: only the run metadata (`last_updated`)
# is switched off, everything else is wanted.
fields = dict(last_updated=0)

# Resolve the playlist to its loaded track ids, then pull the track records.
target_track_ids = sdb.get_loaded_playlist_tracks(target_playlist)
target_tracks = sdb.get_track_info(target_track_ids, fields=fields)

In [11]:
# `df` stays as the untouched model input; `predicted` is an independent copy
# that the clustering cell extends with one `<feature>_cluster` column per feature.
df = pd.DataFrame.from_records(target_tracks)
predicted = df.copy(deep=True)

## Individually Clustered Results
This is similar to RFM-style (recency, frequency, monetary) clustering, where each metric (in this case, each audio feature) is clustered separately. This keeps similarity explainable and avoids heuristically similar clusters that wash out in the middle due to dimensionality.

In [12]:
# Fit one KMeans pipeline per audio feature and record its labels in
# `predicted` as `<feature>_cluster`, then chart each result.
#
# Every fitted pipeline is kept in `feature_pipelines` (keyed by feature name)
# so earlier fits are not lost when the loop variable is rebound. After the
# loop, `cluster_pipeline` still refers to the pipeline of the last feature,
# exactly as before.
feature_pipelines = {}
for feature_name in simple_columns:
    cluster_column = f'{feature_name}_cluster'

    cluster_pipeline = Pipeline([
        # The selector is given only this feature's column name.
        ('feature_selection', FeatureSelector([feature_name])),
        #('simple_scaling', StandardScaler()),
        # Fill missing values with the feature's most frequent value.
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('kmeans', KMeans(
            n_clusters=n_clusters,
            random_state=seed,
            init='random'
            )
        )
    ]).fit(df)
    feature_pipelines[feature_name] = cluster_pipeline

    predicted[cluster_column] = cluster_pipeline.predict(df)

    # Titles name the feature so the charts can be told apart when skimming
    # the output of the whole loop.
    px.bar(
        predicted[cluster_column].value_counts(),
        title=f'{feature_name}: tracks per cluster',
    ).show()
    px.box(
        predicted,
        x=cluster_column,
        y=feature_name,
        points='all',
        hover_data=['name', feature_name],
        title=f'{feature_name}: distribution by cluster',
    ).show()

In [13]:
# Inspect the track frame with the per-feature `<feature>_cluster` columns
# appended; the notebook shows an abbreviated view of the rows and columns.
predicted

Unnamed: 0,_id,album_id,artists,duration_ms,explicit,name,track_number,acousticness,audio_features,danceability,...,time_signature,valence,audio_analysis,audio_analysis_flag,valence_cluster,danceability_cluster,acousticness_cluster,energy_cluster,tempo_cluster,duration_ms_cluster
0,00AIvOuQcYACm33xqdoV3R,1GUXN088ad0bOWQ6f6wWAY,[7kxdoQTibsQW5pOim1p2i3],137000,False,Kage,6,0.939,True,0.2700,...,4,0.1300,,,3,0,0,1,1,0
1,00F8q1aENo9WsFAMSQo46X,16jtLkbdr6ry4xV3H7lRh3,[0360rTDeUjEyBXaz2Ki00a],142693,False,Somethings Wrong,2,0.919,True,0.2940,...,4,0.0388,,,1,0,0,1,1,0
2,00L9p3dKK3ufNyua3pzVeC,18Jd3iAcykSIDGTA0vanR3,[7E3BRXV9ZbCt5lQTCXMTia],196440,False,Not Alone,13,0.950,True,0.0613,...,4,0.0374,,,1,1,0,1,3,1
3,00QyHVsz7Jf3mhOTTQC4YY,66R08h7Wu28pW4ePyhz71s,[5JF5ys1276wnCORFlQ3laz],60373,False,Narrative Therapy I,1,0.884,True,0.1660,...,4,0.0375,,,1,1,0,1,1,0
4,00vZPeb31ckUwcHRWwdZQv,5o6LRy15mnzE8M0AjkKBdw,[3dnf5MC1UPGuedb4cAA1sn],103221,False,Enzo,2,0.488,True,0.2000,...,4,0.0452,,,1,1,1,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1733,7yuTp2jOCN1dTxCSt68TUe,6U5z9EiIl1iBdwfzAGxS0B,[2VZNmg4vCnew4Pavo8zDdW],176039,False,The Inexorable Advance Of Mr. Delaney,2,0.347,True,0.0737,...,4,0.0301,"{'meta': {'analyzer_version': '4.0.0', 'platfo...",True,1,1,1,3,1,0
1734,7z88hLSesdUzt84MsPuAil,69iw6Zy6fh2dSnEgBTZKJE,"[2P6ygesd9xg5DPOBnda2jg, 7ku4vnPiSrXkUarPFjlzw...",552866,False,Spiegel im Spiegel - Version for Cello and Piano,3,0.989,True,0.3660,...,3,0.0400,,,1,3,0,1,1,3
1735,7z8IVlpwHziC0bFq4gZMxT,3iMEn4oNhkAj70YRaJGMWJ,"[6UEYawMcp2M4JFoXVOtZEq, 4QSmk1Bll3nbQfsjTe7vIR]",128048,False,What Gently Flutters,1,0.972,True,0.3020,...,4,0.3820,,,0,0,0,1,2,0
1736,7zTaLLMF9A170FfPtBaL3m,3qFNuf0WssUVPXTPdrxGho,[7E3BRXV9ZbCt5lQTCXMTia],137680,False,"Video Tape - From ""Surface""",1,0.936,True,0.3730,...,4,0.0377,,,1,3,0,3,2,0


## Write to Spotify
The easiest way to interpret the clusters is to write them back into Spotify for reference.

In [None]:
# Register the fitted pipeline under `model_name` and keep whatever name the
# registry hands back for the load step that follows.
# NOTE(review): on a top-to-bottom run `cluster_pipeline` is the pipeline left
# bound by the last iteration of the per-feature loop (the final entry of
# `simple_columns`), so only that single-feature model is registered - confirm
# this is intended.
# NOTE(review): `model_name` is rebound to the call's return value, so running
# this cell again passes the returned name back in - confirm that is safe.
model_name = StormTrackClusterizer.register_model(model_name, cluster_pipeline, n_clusters)

In [11]:
# Naming pattern for the output playlists. The `{cluster_number}` placeholder
# is not filled in here; the string is handed to the clusterizer unchanged.
playlist_name_template = '{cluster_number} - Testing - Film, VG and Instrumental'

# Load the registered model from the local model directory and arrange the
# predictions for writing.
model = StormTrackClusterizer(storm_db_client=sdb, dir='../models')
model.load_model_by_name(model_name)
results = model.format_track_predictions_for_writing(predicted, playlist_name_template)

In [12]:
# Client used to create the playlists and write their tracks; its logger
# reports the Spotify connection when it is constructed.
storm_client = StormUserClient()

# One playlist description per key in `results`; only the name is supplied.
playlist_info = [{'name': playlist_name} for playlist_name in results.keys()]
storm_client.create_many_playlists(playlist_info, overwrite_info=True)

# Write each group's tracks to the playlist carrying the same name.
for playlist_name, tracks in results.items():
    storm_client.write_playlist_tracks_by_name(playlist_name, tracks)

2022-11-25 10:33:53,205 - storm.client - DEBUG - Storm User Client successfully connected to Spotify.



The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`.

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  6.96it/s]

2022-11-25 10:34:00,481 - storm.client - DEBUG - Successfully Wrote 299 Tracks to 52zcyr16QPCT8ztay3u1zH



100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00,  6.70it/s]

2022-11-25 10:34:04,118 - storm.client - DEBUG - Successfully Wrote 313 Tracks to 3ICtk4ZfkAZkUhYrGdxyVF



100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  6.33it/s]

2022-11-25 10:34:07,837 - storm.client - DEBUG - Successfully Wrote 228 Tracks to 5ObbMIHyvttsNb9eyor34j



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.54it/s]

2022-11-25 10:34:12,835 - storm.client - DEBUG - Successfully Wrote 96 Tracks to 63iBMxqa1Ws5mFVOqqD0J4



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.55it/s]

2022-11-25 10:34:15,680 - storm.client - DEBUG - Successfully Wrote 93 Tracks to 1DtSxMDawqsh3roMf0uL2E



100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  6.25it/s]

2022-11-25 10:34:18,905 - storm.client - DEBUG - Successfully Wrote 150 Tracks to 0r6xImJvFeZLaT22SBqxNt



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.32it/s]

2022-11-25 10:34:22,025 - storm.client - DEBUG - Successfully Wrote 86 Tracks to 1SA7hpjhUu4pEIJFM3wFVs



100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.71it/s]

2022-11-25 10:34:25,786 - storm.client - DEBUG - Successfully Wrote 197 Tracks to 0gxcp40Lry0EatEEDylgMr



100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.65it/s]

2022-11-25 10:34:28,980 - storm.client - DEBUG - Successfully Wrote 136 Tracks to 5SuB54CEF2oJuZ2jaWwqDf



100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.53it/s]

2022-11-25 10:34:31,954 - storm.client - DEBUG - Successfully Wrote 125 Tracks to 5voh3pV7C4E5jz56RZHHkJ



0it [00:00, ?it/s]

2022-11-25 10:34:35,057 - storm.client - DEBUG - Successfully Wrote 15 Tracks to 5IBDM0AhCOdP9m0xPVgecQ



