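Create or update a `.env` file in the project root that defines the environment variables listed below. The notebook reads `AWS_DEFAULT_SAGEMAKER_BUCKET` from the process environment (`os.environ`) and no cell here loads `.env` itself, so make sure the file is loaded into the kernel's environment before running the cells.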

AWS_DEFAULT_SAGEMAKER_BUCKET

AWS_PROFILE

In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
# Install the project in the parent directory into the kernel's environment in editable mode.
%pip install -e ../ --quiet

In [None]:
# Build the project's distribution archives; the following cell expects a .tar.gz under ../dist.
!python -m build ../

In [None]:
import os

dist_path = "../dist"

# os.listdir returns entries in arbitrary order, so order the source archives by
# modification time and take the most recent one instead of "whichever comes first".
sdists = sorted(
    (f for f in os.listdir(dist_path) if f.endswith(".tar.gz")),
    key=lambda f: os.path.getmtime(os.path.join(dist_path, f)),
)
if not sdists:
    # Fail with an actionable message rather than a bare IndexError.
    raise FileNotFoundError(
        f"No .tar.gz source distribution found in {dist_path!r}; run the build cell first"
    )

package_path = os.path.join(dist_path, sdists[-1])
package_path

In [None]:
import os

# Bucket used as the SageMaker session's default bucket; must come from the environment.
AWS_DEFAULT_SAGEMAKER_BUCKET = os.environ.get("AWS_DEFAULT_SAGEMAKER_BUCKET")
# Key prefix under which this notebook stores its objects in that bucket.
DEFAULT_BUCKET_PREFIX = "pipelines"

# An empty value (e.g. `AWS_DEFAULT_SAGEMAKER_BUCKET=` in .env) is as unusable as a
# missing one, so reject both instead of passing an empty bucket name to the session.
if not AWS_DEFAULT_SAGEMAKER_BUCKET:
    raise ValueError("AWS_DEFAULT_SAGEMAKER_BUCKET is not set")

Log in to the Docker registry with ECR credentials.

In [None]:
# Pipe a temporary ECR password into `docker login` for the registry named below.
# NOTE(review): the registry account and the eu-west-1 region are hard-coded here; confirm they
# match the region of the SageMaker session created later in this notebook.
!aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 141502667606.dkr.ecr.eu-west-1.amazonaws.com

In [None]:
# Make sure the directory that the pipeline steps reference for their entry-point scripts exists.
!mkdir -p pipelines/recommendations/code

In [None]:
import os
import sys

# Absolute path of the parent directory, added to the import search path (once)
# so modules that live there can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import sagemaker
import boto3

# Resolve the IAM role ARN that is later handed to the pipeline and its jobs.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # The SDK could not resolve an execution role (signalled by ValueError), so
    # look up the ARN of the IAM role named "pipeliner" instead.
    # NOTE(review): the role name is hard-coded; confirm it exists in the target account.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="pipeliner")["Role"]["Arn"]

role

In [None]:
from sagemaker.workflow.pipeline_context import LocalPipelineSession

# Local pipeline session bound to the bucket and key prefix configured above.
session = LocalPipelineSession(
    default_bucket=AWS_DEFAULT_SAGEMAKER_BUCKET,
    default_bucket_prefix=DEFAULT_BUCKET_PREFIX,
)
# NOTE(review): presumably the local-mode switch that lets jobs use code straight from
# disk instead of uploading it first — confirm against the SageMaker local mode docs.
session.config = {"local": {"local_code": True}}

# Values resolved by the session; `default_bucket` is used below to build the upload URI.
region = session.boto_region_name
default_bucket = session.default_bucket()

In [None]:
import pandas as pd
import numpy as np

# Sample ratings file from the test data; the same path is uploaded to S3 in the next cell.
ratings_data_path = "../tests/test_data/user_item_ratings.csv"
# Read ids as strings and ratings as 64-bit floats rather than letting pandas infer the types.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

# Load the file and preview the first rows.
user_item_ratings = pd.read_csv(ratings_data_path, dtype=data_types, engine='python')
user_item_ratings.head(5)

In [None]:
# Upload the ratings file to the session's default bucket. The resulting value is used as the
# default of the pipeline's `user_item_ratings` parameter.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=ratings_data_path,
    desired_s3_uri=f"s3://{default_bucket}/{DEFAULT_BUCKET_PREFIX}/recommender/data",
)
input_data_uri

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep


class RecommenderPipeline:
    """Factory for the recommender SageMaker pipeline.

    The pipeline consists of four steps:

    1. ``user_item_matrix_transformer`` - processing step fed with the ratings input.
    2. ``user_similarity_matrix_transformer`` - processing step fed with the output of step 1.
    3. ``item_similarity_matrix_transformer`` - processing step fed with the output of step 1.
    4. ``Train`` - training step fed with the outputs of steps 1 and 2.
    """

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
        framework_version="1.2-1",
    ) -> Pipeline:
        """Assemble the pipeline definition.

        Args:
            role: IAM role ARN handed to the processor and the estimator.
            name: Name of the resulting pipeline.
            session: Session all jobs and the pipeline are bound to. When it is a
                ``LocalPipelineSession`` the default instance type is ``"local"``,
                otherwise ``"ml.m5.large"``.
            framework_version: scikit-learn framework version used for both the
                processing container and the training image.

        Returns:
            The pipeline with its four steps and the ``user_item_ratings`` and
            ``InstanceType`` parameters.

        Note:
            The default of the ``user_item_ratings`` parameter is read from the
            notebook-level variable ``input_data_uri``, which must be defined
            before this method is called.
        """
        self.local = isinstance(session, LocalPipelineSession)
        self.framework_version = framework_version

        instance_type = ParameterString(
            name="InstanceType",
            default_value="local" if self.local else "ml.m5.large",
        )

        input_data = ParameterString(
            name="user_item_ratings",
            default_value=input_data_uri,
        )

        # Resolve the training image for the requested framework version so it stays
        # in sync with the processor (previously pinned to "1.2-1" regardless of the
        # `framework_version` argument).
        image_uri = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version=framework_version,
        )

        cache_config = CacheConfig(
            enable_caching=True,
            expire_after="P30d",  # 30 days
        )

        processor = SKLearnProcessor(
            framework_version=framework_version,
            instance_type=instance_type,
            instance_count=1,
            base_job_name="sklearn-preprocess",
            role=role,
            sagemaker_session=session,
        )

        user_item_ratings_input = ProcessingInput(
            source=input_data,
            input_name="user_item_ratings",
            destination="/opt/ml/processing/input/data",
        )

        # The package sources are supplied to every processing job as an extra input.
        pipeliner_input = ProcessingInput(
            source="../src/pipeliner",
            input_name="pipeliner",
            destination="/opt/ml/processing/input/code/pipeliner",
        )

        user_item_matrix_step = ProcessingStep(
            name="user_item_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    user_item_ratings_input,
                    pipeliner_input,
                ],
                outputs=[
                    ProcessingOutput(
                        output_name="user_item_matrix",
                        source="/opt/ml/processing/output/data",
                    ),
                ],
                code="pipelines/recommendations/code/user_item_matrix_transformer.py",
            ),
        )

        user_similarity_matrix_step = self._similarity_matrix_step(
            kind="user",
            processor=processor,
            user_item_matrix_step=user_item_matrix_step,
            pipeliner_input=pipeliner_input,
        )

        item_similarity_matrix_step = self._similarity_matrix_step(
            kind="item",
            processor=processor,
            user_item_matrix_step=user_item_matrix_step,
            pipeliner_input=pipeliner_input,
        )

        sklearn_estimator = SKLearn(
            entry_point="pipelines/recommendations/code/user_based_recommender.py",
            role=role,
            image_uri=image_uri,
            instance_type=instance_type,
            sagemaker_session=session,
            base_job_name="training_job",
            # hyperparameters=hyperparameters,
            enable_sagemaker_metrics=True,
        )

        training_step = TrainingStep(
            name="Train",
            estimator=sklearn_estimator,
            cache_config=cache_config,
            inputs={
                "user_item_matrix": TrainingInput(
                    s3_data=user_item_matrix_step.properties.ProcessingOutputConfig.Outputs[
                        "user_item_matrix"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
                "similarity_matrix": TrainingInput(
                    s3_data=user_similarity_matrix_step.properties.ProcessingOutputConfig.Outputs[
                        "user_similarity_matrix"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
            },
        )

        return Pipeline(
            name=name,
            steps=[
                user_item_matrix_step,
                user_similarity_matrix_step,
                item_similarity_matrix_step,
                training_step,
            ],
            sagemaker_session=session,
            parameters=[input_data, instance_type],
        )

    def _similarity_matrix_step(
        self,
        kind,
        processor,
        user_item_matrix_step,
        pipeliner_input,
    ) -> ProcessingStep:
        """Build the processing step that produces the ``kind`` similarity matrix.

        Args:
            kind: ``"user"`` or ``"item"``; used in the step name, the output name and
                forwarded to the script as ``--kind <kind>``.
            processor: Processor whose ``run`` call supplies the step arguments.
            user_item_matrix_step: Upstream step whose ``user_item_matrix`` output is
                the input of this step.
            pipeliner_input: Processing input carrying the package sources.

        Returns:
            The configured ``ProcessingStep``.
        """
        return ProcessingStep(
            name=f"{kind}_similarity_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    ProcessingInput(
                        source=user_item_matrix_step.properties.ProcessingOutputConfig.Outputs[
                            "user_item_matrix"
                        ].S3Output.S3Uri,
                        input_name="user_item_matrix",
                        destination="/opt/ml/processing/input/data",
                    ),
                    pipeliner_input,
                ],
                outputs=[
                    ProcessingOutput(
                        output_name=f"{kind}_similarity_matrix",
                        source="/opt/ml/processing/output/data",
                    ),
                ],
                code="pipelines/recommendations/code/similarity_matrix_transformer.py",
                # Script arguments must travel with `processor.run`: when a step is
                # built from `step_args`, `ProcessingStep(job_arguments=...)` is not
                # used, so `--kind` would otherwise never reach the script.
                arguments=["--kind", kind],
            ),
        )

In [None]:
# Build the pipeline with the role and session resolved in the cells above.
pipeline = RecommenderPipeline().create(role=role, name="recommender", session=session)

In [None]:
import json

# Parse the pipeline's JSON definition into Python objects so the notebook displays it structured.
definition = json.loads(pipeline.definition())
definition

In [None]:
# Create the pipeline, or update it if it already exists, using the resolved role.
pipeline.upsert(role_arn=role)

In [None]:
# Start a pipeline execution and keep the handle so its steps can be inspected below.
execution = pipeline.start()

In [None]:
# List the steps of the execution started above and display them.
steps = execution.list_steps()
steps