In [15]:
import sys
sys.path.append("..") # To get storm code

import pandas as pd
import numpy as np
import os

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import joblib
from uuid import uuid4

import plotly.express as px

In [16]:
# Internal imports
from pymongo import MongoClient
from src.db import StormDB
from src.storm_client import StormUserClient
from src.runner import StormRunner
from src.modeling import FeatureSelector

# Shared handles for the rest of the notebook. `sdb` is what later cells use to
# fetch playlist tracks and track info. The user client is built from the
# `user_id` environment variable (os.getenv yields None when it is unset).
sdb = StormDB()
storm_user_id = os.getenv('user_id')
suc = StormUserClient(storm_user_id)

In [17]:
# Model Params

# Number of clusters requested from KMeans for every playlist.
n_clusters = 6

# Track feature columns handed to FeatureSelector as the clustering inputs.
# The order of the list is kept as-is.
simple_columns = [
    'liveness', 'speechiness', 'valence', 'danceability',
    'acousticness', 'energy', 'instrumentalness',
]

## Helpers

In [18]:
# Build the runner for the 'film_vg_instrumental_v2' storm, then call
# load_last_run() followed by collect_playlist_info() on it.
# The chain is wrapped in parentheses because a continuation line that starts
# with `.` is a SyntaxError unless it sits inside brackets.
# NOTE(review): assumes load_last_run() returns the runner so the next call can
# be chained -- confirm against src.runner.
(
    StormRunner('film_vg_instrumental_v2')
    .load_last_run()
    .collect_playlist_info()
)

Using `localhost` as redirect URI without a port. Specify a port (e.g. `localhost:8080`) to allow automatic retrieval of authentication code instead of having to copy and paste the URL your browser is redirected to.


Enter the URL you were redirected to: http://localhost/?code=<REDACTED>


## Source

In [19]:
# Playlist id pairs, each laid out as (playlist_to_cluster, playlist_of_new_tracks_to_score).
inst = ('0R1gw1JbcOFD0r8IzrbtYP', '7fnvajjUoWBQDo8iFNMH3s')
lyr = ('2zngrEiplX6Z1aAaIWgZ4m', '6v7JJ3RJZQbJaA9ImezP3F')
tiaptp = ('2TngS1Exm3C5F10XVM1B4L',) * 2  # the same playlist fills both roles

playlist_pairs = [inst, lyr, tiaptp]

# One placeholder per pair; the fitting loop fills these in by index.
playlist_predictions = [None] * len(playlist_pairs)

# Passed as `fields` to sdb.get_track_info. Per the original note, the run
# metadata is the only thing not needed, so `last_updated` is switched off.
# NOTE(review): presumably a Mongo-style exclusion projection -- confirm in src.db.
fields = dict(last_updated=0)

## Simple Clustering -  Fitting
Take each track as a flat representation and group the target playlist. The model will be output and can be validated in a separate location. Retraining and validation will always need to be manual, since the value of the groups is mostly based on their meaning to the end user.

In [20]:
# Fit one clustering pipeline per (target, to-score) playlist pair, then label
# the to-score tracks with the cluster predicted by that pair's pipeline.

# Fixed seed: KMeans initialisation is random, so without it the cluster labels
# can change between re-runs of this cell.
RANDOM_STATE = 42

# Fitted pipeline for every pair, index-aligned with playlist_pairs, so each
# model stays available after the loop (cluster_pipeline alone only ever holds
# the most recent fit).
playlist_pipelines = [None] * len(playlist_pairs)

for i, playlist_pair in enumerate(playlist_pairs):

    # Get Target Playlist for clustering
    target_tracks = sdb.get_track_info(sdb.get_loaded_playlist_tracks(playlist_pair[0]), fields=fields)
    df = pd.DataFrame.from_records(target_tracks)

    # Get Playlist tracks to score
    output = sdb.get_track_info(sdb.get_loaded_playlist_tracks(playlist_pair[1]), fields=fields)
    output_df = pd.DataFrame.from_records(output)

    cluster_pipeline = Pipeline([
        # Keep only the columns listed in simple_columns
        ('feature_selection', FeatureSelector(simple_columns)),
        # Missing feature values are replaced with the constant 0.5
        ('impute', SimpleImputer(strategy='constant', fill_value=.5)),
        ('kmeans', KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)),
    ]).fit(df)

    # Score the second playlist of the pair with the model fit on the first
    output_df['cluster'] = cluster_pipeline.predict(output_df)
    playlist_predictions[i] = output_df
    playlist_pipelines[i] = cluster_pipeline

In [21]:
# Preview the scored tracks for the first playlist pair (`inst`). head() keeps
# the rendered output to a few rows instead of dumping the whole frame.
playlist_predictions[0].head()

Unnamed: 0,_id,album_id,artists,duration_ms,explicit,name,track_number,acousticness,audio_features,danceability,...,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,cluster
0,3UcZSfsPaWmyRJPsWxV6RJ,3in5dg17c0pM5pIG17WRjd,"[3gGbSXSwHWmrUBIG9IUAau, 3C4MmUJYQN9svNdedAR2BK]",275242,False,It's Not Too Late,1,0.968,True,0.24,...,0.667,2,0.0611,-17.419,0,0.0424,109.324,4,0.0388,4


## Registering and Saving
Based on the results above, you can save the model below, which can then be run within a clusterizer workflow.

'film_vg_instrumental_v2__track_feature_clusterizer__912681f5-926b-4da3-8bb8-d9bbb7259953'