In [457]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [458]:
# Install the parent directory as an editable package into the kernel's environment.
%pip install -e ../ --quiet

Note: you may need to restart the kernel to use updated packages.


Log into Docker registry with ECR credentials

In [459]:
# Pipe a temporary ECR password into `docker login` for the eu-west-1 registry;
# the scikit-learn image used by the pipeline steps is hosted on this registry.
!aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 141502667606.dkr.ecr.eu-west-1.amazonaws.com

Login Succeeded


In [460]:
# Create the directory that the %%writefile cells below write the pipeline scripts into.
!mkdir -p pipelines/recommendations/code

In [461]:
import os
import sys

# Make the parent of the current working directory importable from this notebook.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [462]:
%%writefile pipelines/recommendations/code/user_item_matrix_transformer.py

import numpy as np
import pandas as pd
import logging
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity

# Emit INFO-level records so the directory listings logged by the entry point below are visible.
logging.basicConfig(level=logging.INFO)


class UserItemMatrixTransformer(TransformerMixin, BaseEstimator):
    """
    Custom scikit-learn transformer that pivots a pandas dataframe of
    user/item interactions into a user/item matrix (one row per user,
    one column per item).

    :param user (str): Column name for user id
    :param item (str): Column name for item id
    :param rating (str): Column name for user/item rating
    :param agg (str): pandas aggregation function used to combine duplicate user/item interactions
    :param binary (bool): If True, the output holds 1 where an interaction exists and 0 elsewhere;
        otherwise it holds the aggregated rating, with 0 for missing interactions
    """

    def __init__(
        self, user="user_id", item="item_id", rating="rating", agg="max", binary=False
    ):
        # Plain attribute assignment only; no validation happens here.
        self.user = user
        self.item = item
        self.rating = rating
        self.agg = agg
        self.binary = binary

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X: pd.DataFrame):
        """
        Build the user/item matrix from the interactions in ``X``.

        :param X (pd.DataFrame): Interactions containing the configured user, item and rating columns
        :return: DataFrame indexed by user id with one column per item id
        """
        # Collapse duplicate (user, item) pairs with the configured aggregation ...
        ratings_by_pair = X.groupby([self.user, self.item])[self.rating]
        aggregated = ratings_by_pair.agg(self.agg)
        # ... then spread the item level of the index out into columns.
        matrix = aggregated.unstack()
        # Missing interactions are NaN after unstack: either flag presence or zero-fill.
        return matrix.notnull().astype(int) if self.binary else matrix.fillna(0)


if __name__ == "__main__":
    # Script entry point: read the ratings CSV from the processing directory,
    # pivot it into a user/item matrix and write the matrix back as CSV.
    # NOTE(review): the run recorded in the notebook fails at the read_csv call;
    # the two listings logged below showed only ['input'] and ['code'] in the
    # container, i.e. the ratings file was not present under base_dir.
    base_dir = "/opt/ml/processing"
    input_file = "test_user_item_ratings.csv"
    output_file = "user_item_matrix.csv"
    data_types = {"user_id": str, "item_id": str, "rating": np.float64}

    # Log what is present in the container before reading anything.
    for listed_dir in (base_dir, base_dir + "/input"):
        logging.info(os.listdir(listed_dir))

    source_path = f"{base_dir}/{input_file}"
    target_path = f"{base_dir}/{output_file}"

    user_item_ratings = pd.read_csv(source_path, dtype=data_types, engine="python")
    transformer = UserItemMatrixTransformer()
    user_item_matrix = transformer.transform(user_item_ratings)

    # index=False: the user-id index is not written, only the item columns.
    user_item_matrix.to_csv(target_path, header=True, index=False)

Overwriting pipelines/recommendations/code/user_item_matrix_transformer.py


In [463]:
%%writefile pipelines/recommendations/code/similarity_matrix_transformer.py

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity
import argparse

class SimilarityTransformer(TransformerMixin, BaseEstimator):
    """
    Custom scikit-learn transformer that turns a user/item matrix
    (user ids as index, item ids as columns) into a square similarity
    matrix, either between users or between items.

    :param kind (str): "user" for user-user similarity, "item" for item-item similarity
    :param metric (str): "cosine", "dot" or "euclidean" are accepted, but only
        "cosine" is implemented by ``transform``
    :param normalise (bool): If True, each column of the result is min-max scaled
    """

    def __init__(self, kind="user", metric="cosine", normalise=False):
        if kind not in ("user", "item"):
            raise ValueError("kind must be 'user' or 'item'")
        if metric not in ("cosine", "dot", "euclidean"):
            raise ValueError("metric must be 'cosine', 'dot', or 'euclidean'")
        self.kind = kind
        self.metric = metric
        self.normalise = normalise

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X: pd.DataFrame):
        """
        Compute the similarity matrix for ``X``.

        :param X (pd.DataFrame): User/item matrix
        :return: Square DataFrame whose index and columns are the compared entities
        :raises NotImplementedError: If ``metric`` is anything other than "cosine"
        """
        # Similarity is computed between rows, so items have to become the rows
        # when item-item similarity is requested.
        vectors = X.T if self.kind == "item" else X

        if self.metric != "cosine":
            raise NotImplementedError("Only cosine similarity is currently supported")

        labels = vectors.index
        similarity = pd.DataFrame(
            cosine_similarity(vectors), index=labels, columns=labels
        )

        if not self.normalise:
            return similarity

        # Column-wise min-max scaling of the similarity values.
        lowest = similarity.min()
        highest = similarity.max()
        return (similarity - lowest) / (highest - lowest)



def parse_args(argv=None):
    """
    Parse the command-line options of the similarity script.

    :param argv (list[str] | None): Arguments to parse; ``None`` means ``sys.argv[1:]``
    :return: Namespace with ``kind`` (default "user") and ``metric`` (default "cosine")
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--kind", type=str, default="user")
    parser.add_argument("--metric", type=str, default="cosine")
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Arguments are parsed only when the file is run as a script. Parsing at
    # import time would consume the importing process's argv and could make it
    # exit on unknown options.
    args = parse_args()

    base_dir = "/opt/ml/processing"
    input_file = "user_item_matrix.csv"
    output_file = f"{args.kind}_similarity_matrix.csv"

    # Every column of the input matrix is read as float64.
    user_item_matrix = pd.read_csv(f"{base_dir}/{input_file}", dtype=np.float64)
    transformer = SimilarityTransformer(kind=args.kind, metric=args.metric)
    similarity_matrix = transformer.transform(user_item_matrix)

    # index=False: only the header row identifies the compared entities in the output.
    similarity_matrix.to_csv(f"{base_dir}/{output_file}", header=True, index=False)

Overwriting pipelines/recommendations/code/similarity_matrix_transformer.py


In [464]:
%%writefile pipelines/recommendations/code/item_recommender_train.py



Overwriting pipelines/recommendations/code/item_recommender_train.py


In [465]:
import sagemaker
import boto3

# Resolve the IAM role ARN that is passed to the pipeline.
# sagemaker.get_execution_role() is tried first; if it raises ValueError, the ARN
# of the IAM role named "pipeliner" is looked up through boto3 instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="pipeliner")["Role"]["Arn"]

# Display the resolved ARN as the cell output.
role



'arn:aws:iam::477807511636:role/pipeliner'

In [466]:
# Imported in this cell because it runs before the pipeline-definition cell that
# also imports LocalPipelineSession; without it a fresh kernel raises NameError here.
from sagemaker.workflow.pipeline_context import LocalPipelineSession

# Session used to build and run the pipeline on this machine.
session = LocalPipelineSession()
session.config = {"local": {"local_code": True}}

# S3 location under the session's default bucket where the input data is uploaded.
region = session.boto_region_name
default_bucket = session.default_bucket()
prefix = "recommendations"
base_uri = f"s3://{default_bucket}/{prefix}/user_item"



INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [467]:
# Local test fixture holding the user/item ratings.
ratings_data_path = "../tests/test_data/test_user_item_ratings.csv"

# Upload the ratings file under base_uri; the returned S3 URI is used as the
# default value of the pipeline's "user_item_ratings" parameter.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=ratings_data_path,
    desired_s3_uri=base_uri,
)
# Display the uploaded object's URI as the cell output.
input_data_uri

's3://sagemaker-eu-west-1-477807511636/recommendations/user_item/test_user_item_ratings.csv'

In [468]:
from sagemaker.workflow.pipeline_context import LocalPipelineSession
import sagemaker
from sagemaker import ScriptProcessor
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep


class RecommenderPipeline:
    """
    Builds the SageMaker pipeline that turns user/item ratings into a
    user/item matrix and then into user-user and item-item similarity matrices.
    """

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
        framework_version: str = "1.2-1",
    ) -> Pipeline:
        """
        Assemble the pipeline definition.

        :param role (str): ARN of the IAM role the processing jobs run with
        :param name (str): Pipeline name
        :param session (sagemaker.Session): Session the pipeline is bound to; when it is a
            LocalPipelineSession the instance type defaults to "local"
        :param framework_version (str): scikit-learn container version
        :return: Pipeline with the matrix step followed by the two similarity steps
        """
        self.local = isinstance(session, LocalPipelineSession)
        self.framework_version = framework_version

        instance_type = ParameterString(
            name="InstanceType",
            default_value="local" if self.local else "ml.m5.large",
        )

        # The default comes from the notebook-level `input_data_uri` produced by the upload cell.
        input_data = ParameterString(
            name="user_item_ratings",
            default_value=input_data_uri,
        )

        # image_uri and cache_config are only consumed by the training step that is
        # currently commented out below. The image follows `framework_version`
        # instead of repeating a hard-coded version.
        image_uri = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version=framework_version,
        )

        cache_config = CacheConfig(
            enable_caching=True,
            expire_after="P30d",  # 30 days
        )

        processor = SKLearnProcessor(
            framework_version=framework_version,
            instance_type=instance_type,
            instance_count=1,
            base_job_name="sklearn-preprocess",
            role=role,
            sagemaker_session=session,
        )

        # NOTE(review): inputs and outputs all use /opt/ml/processing; the local run
        # recorded in this notebook failed to find the ratings CSV there. Changing
        # these paths requires changing the scripts' base_dir handling as well.
        user_item_matrix_step = ProcessingStep(
            name="user_item_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    ProcessingInput(source=input_data, input_name="user_item_ratings", destination="/opt/ml/processing"),
                ],
                outputs=[
                    ProcessingOutput(output_name="user_item_matrix", source="/opt/ml/processing"),
                ],
                code="pipelines/recommendations/code/user_item_matrix_transformer.py",
            ),
        )

        # Referencing the matrix step's output property makes both similarity steps depend on it.
        user_item_matrix_uri = user_item_matrix_step.properties.ProcessingOutputConfig.Outputs[
            "user_item_matrix"
        ].S3Output.S3Uri

        user_similarity_matrix_step = self._similarity_step(processor, "user", user_item_matrix_uri)
        item_similarity_matrix_step = self._similarity_step(processor, "item", user_item_matrix_uri)

        # sklearn_estimator = SKLearn(
        #     entry_point="pipelines/recommendations/code/item_recommender_train.py",
        #     role=role,
        #     image_uri=image_uri,
        #     instance_type=instance_type,
        #     sagemaker_session=session,
        #     base_job_name="training_job",
        #     # hyperparameters=hyperparameters,
        #     enable_sagemaker_metrics=True,
        # )

        # training_step = TrainingStep(
        #     name="Train", estimator=sklearn_estimator, cache_config=cache_config
        # )

        return Pipeline(
            name=name,
            # steps=[user_item_matrix_step, item_similarity_matrix_step, training_step],
            steps=[
                user_item_matrix_step,
                user_similarity_matrix_step,
                item_similarity_matrix_step,
            ],
            sagemaker_session=session,
            parameters=[input_data, instance_type],
        )

    def _similarity_step(self, processor, kind: str, user_item_matrix_uri) -> ProcessingStep:
        """Build the processing step that computes the `kind` ("user" or "item") similarity matrix."""
        return ProcessingStep(
            name=f"{kind}_similarity_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    ProcessingInput(
                        source=user_item_matrix_uri,
                        destination="/opt/ml/processing"),
                ],
                outputs=[
                    ProcessingOutput(output_name=f"{kind}_similarity_matrix", source="/opt/ml/processing"),
                ],
                code="pipelines/recommendations/code/similarity_matrix_transformer.py",
                # Script arguments must go through processor.run(): ProcessingStep does
                # not use its own `job_arguments` when `step_args` is supplied, so the
                # script would otherwise always run with its default --kind.
                arguments=["--kind", kind],
            ),
        )

In [469]:
# Build the Pipeline object from the role and session resolved in the cells above.
pipeline = RecommenderPipeline().create(role=role, name="recommender", session=session)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [470]:
import json

# Parse the pipeline's JSON definition into a dict so it can be inspected as the cell output.
definition = json.loads(pipeline.definition())
definition



{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'user_item_ratings',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-eu-west-1-477807511636/recommendations/user_item/test_user_item_ratings.csv'},
  {'Name': 'InstanceType', 'Type': 'String', 'DefaultValue': 'local'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'user_item_matrix_transformer',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.InstanceType'},
      'InstanceCount': 1,
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/user_item_matrix_transformer.py']},
    'RoleArn': 'arn:aws:iam::477807511636:role/pipeliner',
    'ProcessingInputs': [

In [471]:
# Register the pipeline definition with the session (created if new, updated otherwise).
pipeline.upsert(role_arn=role)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.


{'PipelineArn': 'recommender'}

In [472]:
# Start a pipeline execution; with the local session the steps run in Docker
# containers on this machine and their logs stream into the cell output.
execution = pipeline.start()

INFO:sagemaker.local.entities:Starting execution for pipeline recommender. Execution ID is c8705c30-491f-471a-af02-5feb576f1667
INFO:sagemaker.local.entities:Starting pipeline step: 'user_item_matrix_transformer'
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting processing job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-hvz2m:
    container_name: x

 Container xvpg1pwt7f-algo-1-hvz2m  Creating
 Container xvpg1pwt7f-algo-1-hvz2m  Created
Attaching to xvpg1pwt7f-algo-1-hvz2m
xvpg1pwt7f-algo-1-hvz2m  | INFO:root:['input']
xvpg1pwt7f-algo-1-hvz2m  | INFO:root:['code']
xvpg1pwt7f-algo-1-hvz2m  | Traceback (most recent call last):
xvpg1pwt7f-algo-1-hvz2m  |   File "/opt/ml/processing/input/code/user_item_matrix_transformer.py", line 54, in <module>
xvpg1pwt7f-algo-1-hvz2m  |     user_item_ratings = pd.read_csv(f"{base_dir}/{input_file}", dtype=data_types, engine='python')
xvpg1pwt7f-algo-1-hvz2m  |   File "/miniconda3/lib/python3.8/site-packages/pandas/io/parsers.py", line 686, in read_csv
xvpg1pwt7f-algo-1-hvz2m  |     return _read(filepath_or_buffer, kwds)
xvpg1pwt7f-algo-1-hvz2m  |   File "/miniconda3/lib/python3.8/site-packages/pandas/io/parsers.py", line 452, in _read
xvpg1pwt7f-algo-1-hvz2m  |     parser = TextFileReader(fp_or_buf, **kwds)
xvpg1pwt7f-algo-1-hvz2m  |   File "/miniconda3/lib/python3.8/site-packages/pandas/io/parsers

INFO:sagemaker.local.entities:Pipeline step 'user_item_matrix_transformer' FAILED. Failure message is: RuntimeError: Failed to run: ['docker', 'compose', '-f', '/private/var/folders/w1/813808f13m14dv7gd5ystxbc0000gn/T/tmpqq2j092p/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit']. Process exited with code: 1
INFO:sagemaker.local.entities:Pipeline execution c8705c30-491f-471a-af02-5feb576f1667 FAILED because step 'user_item_matrix_transformer' failed.


In [473]:
# Inspect the per-step status (and failure reason, if any) of the execution.
steps = execution.list_steps()
steps

{'PipelineExecutionSteps': [{'EndTime': 1721772258.914985,
   'FailureReason': "RuntimeError: Failed to run: ['docker', 'compose', '-f', '/private/var/folders/w1/813808f13m14dv7gd5ystxbc0000gn/T/tmpqq2j092p/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit']. Process exited with code: 1",
   'StartTime': 1721772251.837451,
   'StepName': 'user_item_matrix_transformer',
   'StepStatus': 'Failed'}]}