In [41]:
import os
import sys
from uuid import uuid4

sys.path.append("..") # To get storm code

import joblib
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [29]:
# Internal imports
from pymongo import MongoClient
from src.db import StormDB
from src.modeling import FeatureSelector

# Storm database handle backed by a MongoDB client pointed at localhost:27017.
sdb = StormDB(
    MongoClient(host='localhost', port=27017),
)

In [23]:
# Configuration.
# MODEL_TYPE identifies this kind of model in the model registry; saved model
# names follow friendly_name__storm_class__version (version preferably auto-generated).
MODEL_TYPE = 'track_feature_clusterizer'

# Model params
# Track feature columns handed to the feature-selection step of the pipeline.
simple_columns = [
    'liveness', 'speechiness',
    'valence', 'danceability',
    'acousticness', 'energy',
    'instrumentalness',
]
# Number of clusters passed to KMeans.
n_clusters = 6

## Helpers

In [51]:
class ModelManager:
    """
    Shared object for managing the preservation and movement of models
    for different storms within the same directory.

    Models are stored as ``<name>.pkl`` files in sub-folders of ``dir``
    (only the ``dev`` folder is read from / written to by the implemented
    methods). Names are produced from ``base_format``, which is filled with
    the storm name, the model type and a random UUID.
    """

    def __init__(self, dir: str='../models', base_format: str = '{0}__{1}__{2}'):
        """
        Parameters
        ----------
        dir : root directory holding the model sub-folders.
        base_format : ``str.format`` template receiving
            (storm_name, storm_model_type, uuid).
        """
        # `dir` shadows the builtin, but it is part of the public signature.
        self.dir = dir
        self.base_format = base_format

    def _model_path(self, name: str, location: str) -> str:
        """Return the path of ``<name>.pkl`` inside the ``location`` sub-folder of ``self.dir``."""
        # Strip separators so that both '/dev' (the historical default) and
        # 'dev' resolve to a folder inside self.dir; os.path.join would
        # otherwise treat '/dev' as an absolute path and discard self.dir.
        return os.path.join(self.dir, location.strip('/\\'), f'{name}.pkl')

    def load_model_by_name(self, name: str, location ='/dev'):
        """
        Loads a model from prod or dev given exact name

        Parameters
        ----------
        name : model name without the ``.pkl`` extension.
        location : sub-folder of ``self.dir`` to look in (default ``'/dev'``).

        Returns
        -------
        The object deserialized by ``joblib.load``.

        Raises
        ------
        FileNotFoundError : if ``<name>.pkl`` is not present in that folder.
        """

        model_path = self._model_path(name, location)
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"Can't find {name}.pkl")

        # Load from the same folder that was checked above (previously the
        # file was looked up in self.dir+location but loaded from the cwd).
        # NOTE: joblib.load unpickles the file -- only load trusted models.
        return joblib.load(model_path)

    def register_model(self, storm_name: str, storm_model_type: str, fitted_pipeline: "Pipeline"):
        """
        Saves a model to the directory with consistent formatting

        The file is written to ``<self.dir>/dev/<output_name>.pkl`` where
        ``output_name`` is ``base_format`` filled with the storm name, the
        model type and a fresh ``uuid4``. The ``dev`` folder is created if
        it does not exist yet.

        Parameters
        ----------
        storm_name : name of the storm the model belongs to.
        storm_model_type : model type identifier.
        fitted_pipeline : the fitted pipeline object to serialize.

        Returns
        -------
        The generated model name (without the ``.pkl`` extension).
        """

        output_name = self.base_format.format(storm_name, storm_model_type, uuid4())
        # Honour self.dir instead of a hard-coded '../models/dev/' path.
        output_path = self._model_path(output_name, 'dev')
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        joblib.dump(fitted_pipeline, output_path, compress = 1)

        return output_name

    def promote_model(self, model_name, retire: bool=True):
        """
        Moves a model from dev to prod. This allows for preservation of dev and prod models.
        If a model of the type already exists in prod it will move it to archive.
        dev -> prod, if in prod then prod -> archive

        TODO: not implemented yet -- currently a no-op that returns None.
        """

        return None

    def restore_model(self, retire: bool=True):
        """
        Moves a model from archive back to prod. This allows for preservation of dev and prod models.
        If a model of the type already exists in prod it will move it to dev.
        archive -> prod, if in prod then prod -> dev

        TODO: not implemented yet -- currently a no-op that returns None.
        """

        return None

    def get_prod_model(self, storm_name, storm_model_type):
        """
        Returns the loaded model from prod

        TODO: not implemented yet -- currently always returns None.
        """

        return None

## Source

In [43]:
# Storm whose target playlist is clustered in this notebook.
storm_name = 'film_vg_instrumental_v2'

# Passed as `fields=` when fetching track info. The only fields not needed
# are the metadata for runs, hence `last_updated` is flagged with 0.
fields = dict(last_updated=0)

In [44]:
# Start with the targets (these will be the 1s in the prediction):
# playlist id from the storm config -> its track ids -> per-track info.
target_playlist = sdb.get_config(storm_name)['great_targets']
target_track_ids = sdb.get_loaded_playlist_tracks(
    target_playlist,
)
target_tracks = sdb.get_track_info(
    target_track_ids,
    fields=fields,
)

In [45]:
# Tabular view of the target tracks: one row per track record.
df = pd.DataFrame.from_records(data=target_tracks)

## Simple Clustering -  Fitting
Treat each track as a flat feature vector and cluster the tracks of the target playlist. The fitted model is saved and can be validated in a separate location. Both retraining and validation need to be done manually, since the value of the groups depends mostly on their meaning to the end user.

In [46]:
# Fixed seed so that re-running the notebook reproduces the same clusters;
# without a random_state KMeans picks its initial centroids randomly.
KMEANS_RANDOM_STATE = 42

# Pipeline steps:
#   1. FeatureSelector (project class) receives the configured feature columns,
#   2. missing values are replaced by the constant 0.5,
#   3. KMeans groups the tracks into `n_clusters` clusters.
cluster_pipeline = Pipeline([
    ('feature_selection', FeatureSelector(simple_columns)),
    ('impute', SimpleImputer(strategy='constant', fill_value=.5)),
    ('kmeans', KMeans(n_clusters=n_clusters, random_state=KMEANS_RANDOM_STATE)),
]).fit(df)

In [52]:
# Persist the fitted pipeline; the generated model name is the cell's output.
ModelManager().register_model(
    storm_name=storm_name,
    storm_model_type=MODEL_TYPE,
    fitted_pipeline=cluster_pipeline,
)

'film_vg_instrumental_v2__track_feature_clusterizer__912681f5-926b-4da3-8bb8-d9bbb7259953'