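Create or update a `.env` file in the project root that defines the following environment variables, and make sure they are loaded into the kernel's environment (the notebook reads them through `os.environ`):

- `AWS_DEFAULT_SAGEMAKER_BUCKET` – the S3 bucket used as the SageMaker session's default bucket; the notebook raises an error if it is not set.

- `AWS_PROFILE` – the AWS credentials profile to use.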

In [1]:
%load_ext autoreload
%autoreload 1

In [3]:
# Recreate an empty build directory two levels above the notebook.
!mkdir -p ../../build
!rm -rf ../../build/*
# Stage the recommendations_np pipeline scripts and the pipeliner package side by side.
!cp ../pipelines/recommendations_np/code/* ../../build/
!cp -r ../../src/pipeliner ../../build/

In [4]:
# Directory holding the staged pipeline code (the same location the shell commands populate).
build_path = "../../build"

In [5]:
import os
# Key prefix used under the default bucket for everything this notebook writes.
DEFAULT_BUCKET_PREFIX = "pipelines"

# The bucket name must come from the environment; fail fast when it is missing.
AWS_DEFAULT_SAGEMAKER_BUCKET = os.environ.get("AWS_DEFAULT_SAGEMAKER_BUCKET")
if AWS_DEFAULT_SAGEMAKER_BUCKET is None:
    raise ValueError("AWS_DEFAULT_SAGEMAKER_BUCKET is not set")

Log in to the Docker registry with ECR credentials.

In [None]:
# Fetch an ECR login password for eu-west-1 and pipe it into `docker login` via stdin,
# so the password is never passed as a command-line argument.
!aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 141502667606.dkr.ecr.eu-west-1.amazonaws.com

In [9]:
import os
import sys

# Make the directory two levels above the notebook importable, adding it only once.
module_path = os.path.abspath("../../")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [10]:
import sagemaker
import boto3

# Prefer the execution role that the SageMaker SDK reports for this environment.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role raised ValueError: fall back to the ARN of the IAM role named "pipeliner".
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="pipeliner")["Role"]["Arn"]

# Display the resolved role ARN.
role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/pappa/.config/sagemaker/config.yaml


Couldn't call 'get_role' to get Role ARN from role name pappa to get Role path.


'arn:aws:iam::477807511636:role/pipeliner'

In [12]:
from sagemaker.workflow.pipeline_context import LocalPipelineSession

# Local pipeline session bound to the configured default bucket and key prefix.
session = LocalPipelineSession(
    default_bucket=AWS_DEFAULT_SAGEMAKER_BUCKET,
    default_bucket_prefix=DEFAULT_BUCKET_PREFIX,
)
# Sets the "local_code" flag in the session's "local" config section.
# NOTE(review): presumably this makes local mode use code from disk rather than S3 — confirm against the SDK docs.
session.config = {"local": {"local_code": True}}

# Keep the session's region and default bucket name for later cells.
region = session.boto_region_name
default_bucket = session.default_bucket()

'sagemaker-eu-west-1-477807511636'

In [16]:
import pandas as pd
import numpy as np

# Column dtypes for the interactions export. The file's columns are user_id,
# item_id, date and interactions, so the numeric dtype is keyed on
# "interactions"; the previous "rating" key matched no column in this file.
data_types = {"user_id": str, "item_id": str, "interactions": np.float32}

user_item_interactions = pd.read_csv(
    "../../tests/test_data/user_item_interactions_30_days.csv",
    dtype=data_types,
    engine="python",
)
# Preview the first rows only; the full frame is large.
user_item_interactions.head(3)

Unnamed: 0,user_id,item_id,date,interactions
0,U007714,I00372373,2024-09-08,1.0
1,U007714,I00605528,2024-09-08,1.0
2,U013522,I01182960,2024-09-08,2.0


In [None]:
# Save the data for later use as a gzip-compressed CSV.
# The destination is defined once and reused for both the directory creation
# and the write, so the saved file and `ratings_data_path` cannot drift apart.
ratings_data_path = "../pipelines/recommendations_np/data/user_item_interactions.csv.gz"

# Create the target directory if it does not exist yet.
os.makedirs(os.path.dirname(ratings_data_path), exist_ok=True)

user_item_interactions.to_csv(ratings_data_path, compression="gzip", index=False)

In [38]:
# Temporarily recreate the code in preprocessor.py so its intermediate frames
# can be inspected here. Keep this cell in step with that script.

# Alias only (no copy): `df` and `user_item_interactions` are the same frame.
df = user_item_interactions


# Users with fewer rows than this are left out of the train/test split.
MIN_USER_RATINGS = 5
# Upper bound applied to the summed interactions before they become a rating.
INTERACTION_CAP = 5

# Number of rows per user_id.
user_id_value_counts = df.user_id.value_counts()

# Ids of the users that have fewer than MIN_USER_RATINGS rows.
excluded_users = (
    user_id_value_counts[user_id_value_counts < MIN_USER_RATINGS]
    .index.to_series()
    .reset_index(drop=True)
)

# Rows belonging to the excluded users.
excluded_data_df = df[
    df.user_id.isin(excluded_users)
]

# Rows of the remaining users, ordered by date and then by user_id.
test_train_data_df = df[
    ~df.user_id.isin(excluded_users)
].sort_values(by=["date", "user_id"], ascending=True)

# Hold-out set: one (user_id, item_id) pair per user, taken with groupby().last()
# over the date-sorted rows and indexed by the original row label.
# NOTE(review): groupby().last() takes the last non-null value per column, which
# equals the last row only when there are no missing values — confirm.
test_data_df = (
    test_train_data_df
    .reset_index()
    .groupby(["user_id"], as_index=False)
    .last()
    .set_index("index")
)[["user_id", "item_id"]]
test_data_df.index.names = [None]

# Training set: every row whose label is not in the hold-out set, with the
# interactions summed per (user_id, item_id) pair.
train_data_df = (
    test_train_data_df[~test_train_data_df.index.isin(test_data_df.index)]
    .groupby(["user_id", "item_id"])
    .agg({"interactions": "sum"})
    .reset_index()
)


# Cap the summed interactions at INTERACTION_CAP.
train_data_df["interactions_capped"] = np.minimum(
    train_data_df.interactions, INTERACTION_CAP
)
# rating = 1 + log(1 + capped interactions) ...
train_data_df["rating"] = 1 + np.log1p(
    train_data_df["interactions_capped"]
)
# ... divided by the largest rating in the frame and rounded to 5 decimals.
train_data_df["rating"] = (
    train_data_df["rating"] / train_data_df["rating"].max()
).round(5)

# Keep only the columns of the final training frame.
train_data_df = train_data_df[["user_id", "item_id", "rating"]]

# Show a small sample of every intermediate frame.
display(excluded_data_df.head(3))
display(test_train_data_df.head(3))


display(test_data_df.head(3))
display(train_data_df.head(3))

# Row/column counts of the excluded rows and of the rows used for the split.
excluded_data_df.shape, test_train_data_df.shape


Unnamed: 0,user_id,item_id,date,interactions
13,U020471,I00004763,2024-09-08,1.0
14,U006722,I00544482,2024-09-08,1.0
16,U068565,I01127785,2024-09-08,1.0


Unnamed: 0,user_id,item_id,date,interactions
274092,U000029,I00343676,2024-09-01,1.0
11548,U000048,I00647513,2024-09-01,1.0
22449,U000086,I00139088,2024-09-01,1.0


Unnamed: 0,user_id,item_id
1504178,U000003,I01111027
1111144,U000004,I00440954
1810652,U000005,I00209959


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.60648
1,U000003,I00189384,0.60648
2,U000003,I00256366,0.60648


((49952, 4), (1777406, 4))

In [17]:
# Upload the saved interactions file below the recommender/data prefix of the default bucket.
# The returned value is kept because the pipeline uses it as the default for its input-data parameter.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=ratings_data_path,
    desired_s3_uri=f"s3://{default_bucket}/{DEFAULT_BUCKET_PREFIX}/recommender/data",
)
# Display the resulting URI.
input_data_uri

's3://sagemaker-eu-west-1-477807511636/pipelines/recommender/data/user_item_interactions_30_days.csv'

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep
from sagemaker.workflow.properties import PropertyFile


class RecommenderPipeline:
    """Builds the SageMaker pipeline for the item-based recommender.

    The pipeline consists of three steps: a preprocessing job, a training job
    for the item-based recommender, and an evaluation job whose
    ``evaluation.json`` output is registered as a property file.

    Note:
        ``create`` reads the notebook-level names ``input_data_uri`` and
        ``build_path``; the cells defining them must have run first.
    """

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
        framework_version: str = "1.2-1",
    ) -> Pipeline:
        """Assemble the pipeline definition.

        Args:
            role: ARN of the IAM role passed to the processors and the estimator.
            name: Name given to the pipeline.
            session: Session used by every job and by the pipeline itself. When
                it is a ``LocalPipelineSession`` the instance type parameter
                defaults to ``"local"``, otherwise to ``"ml.m5.large"``.
            framework_version: scikit-learn framework version used for both
                processors and for resolving the training image.

        Returns:
            A ``Pipeline`` containing the preprocessing, training and evaluation
            steps, parameterised by the input data URI and the instance type.
        """
        self.local = isinstance(session, LocalPipelineSession)
        self.framework_version = framework_version

        # Pipeline parameters: both can be overridden per execution.
        instance_type = ParameterString(
            name="InstanceType",
            default_value="local" if self.local else "ml.m5.large",
        )

        input_data = ParameterString(
            name="user_item_interactions",
            default_value=input_data_uri,
        )

        # Resolve the training image for the requested framework version, so the
        # estimator runs on the same scikit-learn version as the processors
        # (previously the version was hard-coded and the argument was ignored).
        image_uri = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version=framework_version,
        )

        cache_config = CacheConfig(
            enable_caching=True,
            expire_after="P30d",  # 30 days
        )

        # --- Preprocessing -------------------------------------------------
        processor = SKLearnProcessor(
            framework_version=framework_version,
            instance_type=instance_type,
            instance_count=1,
            base_job_name="sklearn-processor",
            role=role,
            sagemaker_session=session,
        )

        user_item_interactions_input = ProcessingInput(
            source=input_data,
            input_name="user_item_interactions",
            destination="/opt/ml/processing/input/data",
        )

        # The staged pipeliner package is placed next to the processing script;
        # the same input is shared by the preprocessing and evaluation jobs.
        pipeliner_input = ProcessingInput(
            source=build_path + "/pipeliner",
            input_name="pipeliner",
            destination="/opt/ml/processing/input/code/pipeliner",
        )

        preprocessor_step = ProcessingStep(
            name="preprocessor",
            step_args=processor.run(
                inputs=[
                    user_item_interactions_input,
                    pipeliner_input,
                ],
                outputs=[
                    ProcessingOutput(
                        output_name="user_item_matrix",
                        source="/opt/ml/processing/output/user_item_matrix",
                    ),
                    ProcessingOutput(
                        output_name="user_similarity_matrix",
                        source="/opt/ml/processing/output/user_similarity_matrix",
                    ),
                    ProcessingOutput(
                        output_name="item_similarity_matrix",
                        source="/opt/ml/processing/output/item_similarity_matrix",
                    ),
                    ProcessingOutput(
                        output_name="test",
                        source="/opt/ml/processing/output/test",
                    ),
                ],
                code=build_path + "/preprocessor.py",
            ),
        )

        # --- Training ------------------------------------------------------
        sklearn_estimator = SKLearn(
            entry_point="item_based_recommender.py",
            source_dir=build_path,
            role=role,
            image_uri=image_uri,
            instance_type=instance_type,
            sagemaker_session=session,
            base_job_name="training_job",
            # hyperparameters=hyperparameters,
            enable_sagemaker_metrics=True,
        )

        # The training channels are wired to the preprocessing outputs of the
        # same names.
        item_training_step = TrainingStep(
            name="item_based_recommender",
            estimator=sklearn_estimator,
            cache_config=cache_config,
            inputs={
                "user_item_matrix": TrainingInput(
                    s3_data=preprocessor_step.properties.ProcessingOutputConfig.Outputs[
                        "user_item_matrix"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
                "item_similarity_matrix": TrainingInput(
                    s3_data=preprocessor_step.properties.ProcessingOutputConfig.Outputs[
                        "item_similarity_matrix"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
            },
        )

        # --- Evaluation ----------------------------------------------------
        evaluation_processor = SKLearnProcessor(
            framework_version=framework_version,
            instance_type=instance_type,
            instance_count=1,
            base_job_name="sklearn-evaluation",
            role=role,
            sagemaker_session=session,
        )

        evaluation_report = PropertyFile(
            name="item_based_evaluation",
            output_name="evaluation",
            path="evaluation.json",
        )

        evaluation_step = ProcessingStep(
            name="item_based_evaluation",
            step_args=evaluation_processor.run(
                inputs=[
                    ProcessingInput(
                        source=item_training_step.properties.ModelArtifacts.S3ModelArtifacts,
                        destination="/opt/ml/processing/model",
                        input_name="model",
                    ),
                    ProcessingInput(
                        source=preprocessor_step.properties.ProcessingOutputConfig.Outputs[
                            "test"
                        ].S3Output.S3Uri,
                        destination="/opt/ml/processing/test",
                    ),
                    pipeliner_input,
                ],
                outputs=[
                    ProcessingOutput(
                        output_name="evaluation", source="/opt/ml/processing/evaluation"
                    ),
                ],
                # Use the staged build directory like the other steps do; the
                # former path ("pipelines/recommendations/code/...") was relative
                # to a different working directory and pipeline folder.
                # NOTE(review): assumes item_based_evaluation.py is among the
                # scripts staged into build_path — confirm.
                code=build_path + "/item_based_evaluation.py",
            ),
            property_files=[evaluation_report],
        )

        return Pipeline(
            name=name,
            steps=[
                preprocessor_step,
                item_training_step,
                evaluation_step,
            ],
            sagemaker_session=session,
            parameters=[input_data, instance_type],
        )

In [None]:
# Build the pipeline object with the role and session prepared in the earlier cells.
pipeline = RecommenderPipeline().create(role=role, name="recommender", session=session)

In [None]:
import json

# Parse the generated pipeline definition and list each step by name and type.
definition = json.loads(pipeline.definition())
[
    {field: step.get(field) for field in ("Name", "Type")}
    for step in definition.get("Steps")
]

In [None]:
# Register the pipeline definition under the resolved role.
# NOTE(review): `upsert` presumably creates the pipeline or updates an existing one — confirm.
pipeline.upsert(role_arn=role)

In [None]:
# Start a pipeline run and keep the returned handle for inspection.
execution = pipeline.start()

In [None]:
# Fetch the steps recorded for this execution and display them.
steps = execution.list_steps()
steps