In this tutorial, we build a simple matrix factorization model on the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TensorFlow Recommenders (TFRS), trained on Amazon SageMaker.

We will use this model to recommend movies for a given user.

In [1]:
!pip install -q sagemaker==2.9.2
!pip install -q sagemaker-experiments==0.1.24
!pip install -q tensorflow==2.3.0
!pip install -q tensorflow-recommenders==0.2.0
!pip install -q tensorflow-datasets==4.0.0

In [2]:
# Show every entry (hidden ones included) of the current working directory in long format.
!ls -al .

total 56
drwx------ 12 root nogroup 6144 Nov  2 21:14 .
drwxr-xr-x  1 root root      39 Nov  2 21:24 ..
-rw-------  1 root root    2230 Nov  2 21:14 .bash_history
drwxr-xr-x  7 root root    6144 Nov  2 20:59 .cache
drwxr-xr-x  3 root root    6144 Nov  2 15:44 .config
-rw-r--r--  1 root root      54 Nov  2 15:39 .gitconfig
drwxr-xr-x  2 root root    6144 Nov  1 19:31 .ipynb_checkpoints
drwxr-xr-x  5 root root    6144 Oct 10 23:13 .ipython
drwxr-xr-x  3 root root    6144 Nov  2 21:24 .jupyter
drwxr-xr-x  2 root root    6144 Nov  1 19:59 .keras
drwxr-xr-x  3 root root    6144 Oct 10 22:41 .local
drwxr-xr-x  2 root root    6144 Nov  1 19:33 .ssh
-rw-r--r--  1 root root     111 Oct 10 22:41 .yarnrc
drwxr-xr-x  3 root root    6144 Nov  2 21:12 exported_models
drwxr-xr-x 19 root root    6144 Nov  1 19:35 workshop


In [3]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, plus the default bucket and execution role used by the
# rest of the notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session provides both the region name and the low-level
# SageMaker client (used later for experiments and trial components).
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Specify Input Data S3 URI and `Distribution Strategy`

In [4]:
from sagemaker.inputs import TrainingInput

# Build the training-data prefix from the session's default bucket (`bucket`)
# instead of hard-coding an account-specific bucket name, so the notebook runs
# unchanged in other accounts and regions.
input_train_data_s3_uri = 's3://{}/tensorflow_datasets/train/'.format(bucket)

# 'ShardedByS3Key' asks SageMaker to split the S3 objects under the prefix
# across the training instances instead of replicating all of them to each one.
s3_input_train_data = TrainingInput(s3_data=input_train_data_s3_uri,
                                    distribution='ShardedByS3Key')

# Print the resulting channel configuration for inspection.
print(s3_input_train_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/tensorflow_datasets/train/', 'S3DataDistributionType': 'ShardedByS3Key'}}}


# Setup Metrics To Track Model Performance

These sample log lines...
```
499/500 [=====>..] - ETA: 3s - root_mean_squared_error: 1.1198 - factorized_top_k/top_10_categorical_accuracy: 0.481 - factorized_top_k/top_50_categorical_accuracy: 0.607 - factorized_top_k/top_100_categorical_accuracy: 0.885
```
...will produce the following metrics in CloudWatch (published under the names given in `metrics_definitions` below, i.e. without the `factorized_top_k/` prefix):

`root_mean_squared_error` = 1.1198

`top_10_categorical_accuracy` = 0.481

`top_50_categorical_accuracy` = 0.607

`top_100_categorical_accuracy` = 0.885

In [5]:
# Regex tail shared by every metric: captures the number that follows
# "<metric name>: " in a training log line.
_VALUE_CAPTURE = ': ([0-9\\.]+)'

# RMSE is logged under its plain name.
metrics_definitions = [
    {'Name': 'root_mean_squared_error',
     'Regex': 'root_mean_squared_error' + _VALUE_CAPTURE},
]

# The top-k accuracies are logged with a "factorized_top_k/" prefix, but are
# published under the shorter name without it.
metrics_definitions += [
    {'Name': 'top_{}_categorical_accuracy'.format(k),
     'Regex': 'factorized_top_k/top_{}_categorical_accuracy'.format(k) + _VALUE_CAPTURE}
    for k in (10, 50, 100)
]

# Set Up Hyper-Parameters for the Recommender Model

In [6]:
# Optimisation settings passed to the training script.
epochs = 1000
learning_rate = 0.5

# Dataset and model settings.
dataset_variant = '100k'   # movielens 100k, 1m, 20m, 25m, etc
embedding_dimension = 256  # dimension (k) of our user and item embeddings
enable_tensorboard = True

# Training cluster used by the SageMaker estimator.
train_instance_type = 'ml.p3.2xlarge'
train_instance_count = 1

# Set Up Our TensorFlow Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service.

In [7]:
# List the working directory again (long format, hidden entries included) before referencing the training script.
!ls -al .

total 56
drwx------ 12 root nogroup 6144 Nov  2 21:14 .
drwxr-xr-x  1 root root      39 Nov  2 21:24 ..
-rw-------  1 root root    2230 Nov  2 21:14 .bash_history
drwxr-xr-x  7 root root    6144 Nov  2 20:59 .cache
drwxr-xr-x  3 root root    6144 Nov  2 15:44 .config
-rw-r--r--  1 root root      54 Nov  2 15:39 .gitconfig
drwxr-xr-x  2 root root    6144 Nov  1 19:31 .ipynb_checkpoints
drwxr-xr-x  5 root root    6144 Oct 10 23:13 .ipython
drwxr-xr-x  3 root root    6144 Nov  2 21:24 .jupyter
drwxr-xr-x  2 root root    6144 Nov  1 19:59 .keras
drwxr-xr-x  3 root root    6144 Oct 10 22:41 .local
drwxr-xr-x  2 root root    6144 Nov  1 19:33 .ssh
-rw-r--r--  1 root root     111 Oct 10 22:41 .yarnrc
drwxr-xr-x  3 root root    6144 Nov  2 21:12 exported_models
drwxr-xr-x 19 root root    6144 Nov  1 19:35 workshop


In [8]:
# Print the training script with syntax highlighting; this is the file the estimator uses as its entry point.
!pygmentize /root/workshop/02_usecases/sagemaker_recommendations/src/train_multitask.py

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mglob[39;49;00m [34mimport[39;49;00m glob
[34mimport[39;49;00m [04m[36mpprint[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m

subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mscikit-learn==0.23.1[39;49;00m[33m'[39;49;00m])
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m,

In [9]:
from sagemaker.tensorflow import TensorFlow

# Values handed to train_multitask.py; the failed-job output later in this
# notebook shows them arriving as `--name value` command-line arguments.
training_hyperparameters = {
    'epochs': epochs,
    'learning_rate': learning_rate,
    'dataset_variant': dataset_variant,
    'embedding_dimension': embedding_dimension,
    'enable_tensorboard': enable_tensorboard,
}

# TensorFlow 2.3.0 / Python 3.7 estimator that runs the training script from
# the workshop source directory and publishes the metrics defined earlier.
estimator = TensorFlow(
    entry_point='train_multitask.py',
    source_dir='/root/workshop/02_usecases/sagemaker_recommendations/src',
    role=role,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    framework_version='2.3.0',
    py_version='py37',
    hyperparameters=training_hyperparameters,
    metric_definitions=metrics_definitions,
    debugger_hook_config=False,
)

# Create the Experiment

In [10]:
import time
from smexperiments.experiment import Experiment

# The current epoch second is appended to the experiment name.
timestamp = int(time.time())

recommender_experiment = Experiment.create(
    experiment_name=f'MovieLens-Recommender-{timestamp}',
    description='MovieLens Recommender',
    sagemaker_boto_client=sm,
)

# Keep the name for the trial and the analytics cells further down.
recommender_experiment_name = recommender_experiment.experiment_name
print(f'Experiment name: {recommender_experiment_name}')

Experiment name: MovieLens-Recommender-1604352479


In [11]:
import time
from smexperiments.trial import Trial

timestamp = int(time.time())

# The trial name encodes the timestamp and the key hyper-parameters of this run.
trial_name = f'trial-{timestamp}-{epochs}-{dataset_variant}-{embedding_dimension}'

trial = Trial.create(
    trial_name=trial_name,
    experiment_name=recommender_experiment_name,
    sagemaker_boto_client=sm,
)

# Read the name back from the created trial object before reporting it.
trial_name = trial.trial_name
print(f'Trial name: {trial_name}')

Trial name: trial-1604352479-1000-100k-256


In [12]:
# Ties the training job to the experiment and trial created in the previous
# cells; the job is recorded as a trial component displayed as "train".
recommender_experiment_config = dict(
    ExperimentName=recommender_experiment_name,
    TrialName=trial.trial_name,
    TrialComponentDisplayName='train',
)

# Train the Model on SageMaker

In [13]:
# Start the training job with the single 'train' channel and return
# immediately (wait=False); a later cell blocks until the job finishes.
estimator.fit(inputs={'train': s3_input_train_data},
              experiment_config=recommender_experiment_config,
              wait=False)

INFO:sagemaker:Creating training-job with name: tensorflow-training-2020-11-02-21-27-59-737


In [14]:
# Remember the name of the job that was just launched; the console links,
# the S3 download and the final %store cell all use it.
recommender_training_job_name = estimator.latest_training_job.name
print(f'Training Job Name:  {recommender_training_job_name}')

Training Job Name:  tensorflow-training-2020-11-02-21-27-59-737


In [15]:
# Import from IPython.display: importing `display` from IPython.core.display
# is deprecated.
from IPython.display import display, HTML

# Console page of the training job in the current region.
training_job_url = 'https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}'.format(
    region, recommender_training_job_name)

# target="_blank" (with the underscore) opens the link in a new tab; the
# original "blank" named a browsing context instead.
display(HTML('<b>Review <a target="_blank" href="{}">Training Job</a></b>'.format(training_job_url)))


In [16]:
# Import from IPython.display: importing `display` from IPython.core.display
# is deprecated.
from IPython.display import display, HTML

# CloudWatch log streams of the training job, filtered by the job-name prefix.
cloudwatch_logs_url = 'https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix'.format(
    region, recommender_training_job_name)

# target="_blank" (with the underscore) opens the link in a new tab; the
# original "blank" named a browsing context instead.
display(HTML('<b>Review <a target="_blank" href="{}">CloudWatch Logs</a></b>'.format(cloudwatch_logs_url)))


In [17]:
# Import from IPython.display: importing `display` from IPython.core.display
# is deprecated.
from IPython.display import display, HTML

# S3 console view of the prefix the training job writes its output to.
s3_output_url = 'https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview'.format(
    bucket, recommender_training_job_name, region)

# target="_blank" (with the underscore) opens the link in a new tab; the
# original "blank" named a browsing context instead.
display(HTML('<b>Review <a target="_blank" href="{}">S3 Output Data</a> After The Training Job Has Completed</b>'.format(s3_output_url)))


# Wait for Training Job to Finish

In [18]:
%%time

# Block until the training job started with wait=False finishes. With
# logs=False only the status transitions are printed. As the recorded output
# shows, a failed job raises UnexpectedStatusException here.
estimator.latest_training_job.wait(logs=False)


2020-11-02 21:28:01 Starting - Starting the training job
2020-11-02 21:28:03 Starting - Launching requested ML instances.................
2020-11-02 21:29:33 Starting - Preparing the instances for training.........
2020-11-02 21:30:23 Downloading - Downloading input data...............
2020-11-02 21:31:46 Training - Downloading the training image.....
2020-11-02 21:32:14 Training - Training image download completed. Training in progress..........
2020-11-02 21:33:05 Uploading - Uploading generated training model
2020-11-02 21:33:12 Failed - Training job failed


UnexpectedStatusException: Error for Training job tensorflow-training-2020-11-02-21-27-59-737: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/local/bin/python3.7 train_multitask.py --dataset_variant 100k --embedding_dimension 256 --enable_tensorboard True --epochs 1000 --learning_rate 0.5 --model_dir s3://sagemaker-us-east-1-835319576252/tensorflow-training-2020-11-02-21-27-59-737/model"
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade

# Copy the Trained Model from S3

In [19]:
# Download the job's model archive; IPython substitutes $bucket and $recommender_training_job_name from the notebook namespace.
!aws s3 cp s3://$bucket/$recommender_training_job_name/output/model.tar.gz ./model.tar.gz

download: s3://sagemaker-us-east-1-835319576252/tensorflow-training-2020-11-02-21-27-59-737/output/model.tar.gz to ./model.tar.gz


In [20]:
# Create ./model/ if it does not exist yet, then unpack the downloaded archive into it (listing each extracted entry).
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

tensorboard/
tensorboard/train/
tensorboard/train/events.out.tfevents.1604352782.ip-10-0-235-59.ec2.internal.33.543.v2


# Inspect the Model

In [21]:
# Print everything saved_model_cli can report about the SavedModel expected under ./model/tensorflow/saved_model/0/.
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

2020-11-02 21:33:17.524011: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-11-02 21:33:17.524168: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
  File "/opt/conda/bin/saved_model_cli", line 8, in <module>
    sys.exit(main())
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 1185, in main
    args.func(args)
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 715, in show
    _show_all(args.dir)
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 296, in _show_all
    tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
  File "/opt/conda/lib/python3.7/site-pa

# Make a Sample Prediction

In [22]:
# ID of the user to request a prediction for; kept as a string because it is spliced into the saved_model_cli input expression.
user_id = "42"

In [23]:
# Run the `serving_default` signature of the exported model, feeding `input_1` a one-element string array that holds user_id (IPython substitutes $user_id).
!saved_model_cli run --input_exprs 'input_1=np.array(["$user_id"])' --tag_set serve --signature_def serving_default --dir ./model/tensorflow/saved_model/0

2020-11-02 21:33:23.992231: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2020-11-02 21:33:23.992519: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
  File "/opt/conda/bin/saved_model_cli", line 8, in <module>
    sys.exit(main())
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 1185, in main
    args.func(args)
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 748, in run
    init_tpu=args.init_tpu, tf_debug=args.tf_debug)
  File "/opt/conda/lib/python3.7/site-packages/tensorflow/python/tools/saved_model_cli.py", line 408, in run_saved_model_with_feed_dict
    tag_set)
  File "/opt/conda/lib/python3.7/site-packages/tensorf

# Show the Experiment Tracking Lineage

In [24]:
from sagemaker.analytics import ExperimentAnalytics

# Accuracy metrics to request, using the names published by the training job.
tracked_metric_names = [
    'top_10_categorical_accuracy',
    'top_50_categorical_accuracy',
    'top_100_categorical_accuracy',
]

# Query the trial components of this experiment, oldest first.
lineage_table = ExperimentAnalytics(
    experiment_name=recommender_experiment_name,
    sagemaker_session=sess,
    metric_names=tracked_metric_names,
    sort_by="CreationTime",
    sort_order="Ascending",
)

# Materialise the result as a DataFrame and show its (rows, columns) shape.
lineage_df = lineage_table.dataframe()
lineage_df.shape

(1, 24)

In [25]:
# List the column names available in the lineage table.
lineage_df.columns

Index(['TrialComponentName', 'DisplayName', 'SourceArn', 'SageMaker.ImageUri',
       'SageMaker.InstanceCount', 'SageMaker.InstanceType',
       'SageMaker.VolumeSizeInGB', 'dataset_variant', 'embedding_dimension',
       'enable_tensorboard', 'epochs', 'learning_rate', 'model_dir',
       'sagemaker_container_log_level', 'sagemaker_job_name',
       'sagemaker_program', 'sagemaker_region', 'sagemaker_submit_directory',
       'train - MediaType', 'train - Value',
       'SageMaker.ModelArtifact - MediaType',
       'SageMaker.ModelArtifact - Value', 'Trials', 'Experiments'],
      dtype='object')

In [26]:
# Display the lineage table itself (one row per trial component).
lineage_df

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,SageMaker.VolumeSizeInGB,dataset_variant,embedding_dimension,enable_tensorboard,...,sagemaker_job_name,sagemaker_program,sagemaker_region,sagemaker_submit_directory,train - MediaType,train - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value,Trials,Experiments
0,tensorflow-training-2020-11-02-21-27-59-737-aw...,train,arn:aws:sagemaker:us-east-1:835319576252:train...,763104351884.dkr.ecr.us-east-1.amazonaws.com/t...,1.0,ml.p3.2xlarge,30.0,"""100k""",256.0,True,...,"""tensorflow-training-2020-11-02-21-27-59-737""","""train_multitask.py""","""us-east-1""","""s3://sagemaker-us-east-1-835319576252/tensorf...",,s3://sagemaker-us-east-1-835319576252/tensorfl...,,s3://sagemaker-us-east-1-835319576252/tensorfl...,[trial-1604352479-1000-100k-256],[MovieLens-Recommender-1604352479]


In [27]:
# Fetch the full record of the trial component in the first row of the lineage table.
first_trial_component_name = lineage_df['TrialComponentName'][0]
sm.describe_trial_component(TrialComponentName=first_trial_component_name)

{'TrialComponentName': 'tensorflow-training-2020-11-02-21-27-59-737-aws-training-job',
 'TrialComponentArn': 'arn:aws:sagemaker:us-east-1:835319576252:experiment-trial-component/tensorflow-training-2020-11-02-21-27-59-737-aws-training-job',
 'DisplayName': 'train',
 'Source': {'SourceArn': 'arn:aws:sagemaker:us-east-1:835319576252:training-job/tensorflow-training-2020-11-02-21-27-59-737',
  'SourceType': 'SageMakerTrainingJob'},
 'Status': {'PrimaryStatus': 'Failed',
 'StartTime': datetime.datetime(2020, 11, 2, 21, 30, 23, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2020, 11, 2, 21, 33, 12, tzinfo=tzlocal()),
 'CreationTime': datetime.datetime(2020, 11, 2, 21, 28, 2, 25000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:835319576252:user-profile/d-dsxoghy6ztwy/default-1602368497083',
  'UserProfileName': 'default-1602368497083',
  'DomainId': 'd-dsxoghy6ztwy'},
 'LastModifiedTime': datetime.datetime(2020, 11, 2, 21, 33, 13, 168000, tzinfo=tzlocal(

# Pass Variables to the Next Notebook(s)

In [28]:
# Persist the training job name with IPython's %store magic so that later notebooks can restore it with `%store -r`.
%store recommender_training_job_name

Stored 'recommender_training_job_name' (str)
