In [1]:
## Create Sagemaker Session and Get Execution Role ##
import sagemaker

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [2]:
## Upload Training Data to s3 Bucket ##

bucket = sagemaker_session.default_bucket()

data_dir = 'data' 
prefix = 'rcmdKNN'
input_data = sagemaker_session.upload_data(data_dir, key_prefix=prefix)

In [3]:
## Instantiate Estimator Object ##

from sagemaker.sklearn.estimator import SKLearn

estimator = SKLearn(entry_point='train.py',
                    framework_version='0.23-1',
                    role=role,
                    train_instance_count=1,
                    train_instance_type='ml.c4.xlarge',
                    py_version='py3',
                    source_dir='source',
                    image_uri=None,
                    hyperparameters = {'n_neighbors':101,
                                       'metric':'cosine',
                                       'algorithm':'brute'})

train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [4]:
## Train the estimator ##
estimator.fit({'train': input_data})

2021-12-15 15:18:12 Starting - Starting the training job...
2021-12-15 15:18:40 Starting - Launching requested ML instancesProfilerReport-1639581492: InProgress
......
2021-12-15 15:19:41 Starting - Preparing the instances for training.........
2021-12-15 15:21:05 Downloading - Downloading input data...
2021-12-15 15:21:41 Training - Downloading the training image...
2021-12-15 15:22:14 Uploading - Uploading generated training model[34m2021-12-15 15:22:05,771 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-12-15 15:22:05,773 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-12-15 15:22:05,785 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-12-15 15:22:06,891 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-12-15 15:22:07,509 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus in

In [5]:
## Define new model object pointing to custom inference code ##

from sagemaker.sklearn import SKLearnModel

sklearn_model = SKLearnModel(model_data=estimator.model_data,
                             role=role,
                             entry_point="predict.py",
                             source_dir='source',
                             framework_version='0.23-1')

In [6]:
## Deploy the estimator ##

predictor = sklearn_model.deploy(instance_type="ml.m4.xlarge", initial_instance_count=1)

---------!

In [7]:
## Load in training data and mapping files ##
import scipy
import numpy as np
import pickle

train_data = scipy.sparse.load_npz('./data/artist_user_mtrx.npz')
train_data = np.array(train_data.todense())

with open('./datasources/artist_to_idx.pkl', 'rb') as f:
    artist_to_idx = pickle.load(f)
    
with open('./datasources/idx_to_artist.pkl', 'rb') as f:
    idx_to_artist = pickle.load(f)

In [8]:
## Define utility functions to process input/output ##

def process_input(artists, input_data):
    artist_ids = [artist_to_idx[i] for i in artists]
    artist_ids = input_data[artist_ids].reshape(1,-1)
    return artist_ids

def process_output(reponse, n, verbose=True):
    artists = [idx_to_artist[i] for i in reponse]
    if verbose:
        print ('Recommended Artists: {}'.format(artists[1:n+1]))
    return artists[1:n+1]

In [9]:
## Generate some predictions for select artists ##

for artist in ['the beatles', 'eagles', 'genesis', 'nirvana', 'the strokes']:
    print ('\nInput artist: {} \n'.format(artist))
    input_data = process_input([artist], train_data)
    response = predictor.predict(input_data)
    process_output(response, 15)


Input artist: the beatles 

Recommended Artists: ['bob dylan', 'led zeppelin', 'the rolling stones', 'pink floyd', 'radiohead', 'the who', 'john lennon', 'david bowie', 'simon & garfunkel', 'beck', 'the white stripes', 'the beach boys', 'paul mccartney', 'modest mouse', 'the shins']

Input artist: eagles 

Recommended Artists: ['elton john', 'billy joel', 'lynyrd skynyrd', 'chicago', 'eric clapton', 'boston', 'creedence clearwater revival', 'jimmy buffett', 'fleetwood mac', 'tom petty and the heartbreakers', 'the rolling stones', 'led zeppelin', 'james taylor', 'journey', 'pink floyd']

Input artist: genesis 

Recommended Artists: ['yes', 'peter gabriel', 'jethro tull', 'rush', 'king crimson', 'styx', 'pink floyd', 'porcupine tree', 'the police', 'the moody blues', 'queensrÿche', 'dream theater', 'journey', 'ayreon', 'black sabbath']

Input artist: nirvana 

Recommended Artists: ['alice in chains', 'pearl jam', 'soundgarden', 'the smashing pumpkins', 'stone temple pilots', 'nine inch 

In [13]:
## Calculate 'hit rate' ##
def rcmnd_from_fav(user, data, num_preds = 100):
    play_history = data[:,user]
    artist_idx = (-play_history).argsort()[:1]
    predictions = predictor.predict(data[artist_idx].reshape(1,-1))
    return predictions[1:num_preds+1]

def hit_rate(user, data, predictions):
    hits = train_data[:, user][predictions]
    return hits

hits = []
num_users = 100
preds_per_user = 10

for user in np.random.randint(0, train_data.shape[1], size=num_users):
    predictions = rcmnd_from_fav(user, train_data)
    hits = hit_rate(user, train_data, predictions)
    hits+=hits
    
rate = (np.count_nonzero(hits) / len(hits))

print ('Artist Hit Rate for {} Users ({} Recommendations per User): {:.0%}'\
       .format(num_users, preds_per_user, rate))

Artist Hit Rate for 100 Users (10 Recommendations per User): 29%
