In [21]:
# Create the directory that the %%writefile cells below write into. Done in
# Python rather than through a shell escape so the cell does not depend on a
# POSIX `mkdir` being available; parents/exist_ok mirror `mkdir -p`.
from pathlib import Path

Path("pipelines/code").mkdir(parents=True, exist_ok=True)

In [22]:
import os
import sys

# Resolve the parent of the current working directory and append it to
# sys.path (once) so modules located there become importable.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [23]:
%%writefile pipelines/code/user_item_matrix_transformer.py

import numpy as np
import pandas as pd

from pipeliner.recommendations.transformer import UserItemMatrixTransformer

# Column dtypes applied when reading the ratings CSV: both ids are kept as
# strings, the rating is read as a 64-bit float.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

if __name__ == "__main__":
    # Input and output files both sit directly under the processing root.
    base_dir = "/opt/ml/processing"
    ratings_path = f"{base_dir}/user_item_ratings.csv"
    matrix_path = f"{base_dir}/user_item_matrix.csv"

    ratings = pd.read_csv(ratings_path, dtype=data_types)
    matrix = UserItemMatrixTransformer().transform(ratings)

    # Written with a header row and without the index column.
    matrix.to_csv(matrix_path, header=True, index=False)

Writing pipelines/code/user_item_matrix_transformer.py


In [24]:
%%writefile pipelines/code/similarity_matrix_transformer.py

import numpy as np
import pandas as pd

from pipeliner.recommendations.transformer import SimilarityTransformer

import argparse

def parse_args(argv=None):
    """Parse the command-line options of the similarity-matrix job.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``kind`` (default ``"user"``) and
        ``metric`` (default ``"cosine"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--kind", type=str, default="user")
    parser.add_argument("--metric", type=str, default="cosine")
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Parse arguments only when executed as a script, so that importing this
    # module never consumes (or exits on) the importing process's sys.argv.
    args = parse_args()

    base_dir = "/opt/ml/processing"
    input_file = "user_item_matrix.csv"
    # NOTE(review): the output name is fixed to "item_..." regardless of
    # --kind; confirm this is intended when the job is run with kind="user".
    output_file = "item_similarity_matrix.csv"

    # Every column of the matrix file is read as float64.
    user_item_matrix = pd.read_csv(f"{base_dir}/{input_file}", dtype=np.float64)
    transformer = SimilarityTransformer(kind=args.kind, metric=args.metric)
    similarity_matrix = transformer.transform(user_item_matrix)

    similarity_matrix.to_csv(f"{base_dir}/{output_file}", header=True, index=False)

Writing pipelines/code/similarity_matrix_transformer.py


In [None]:
%%writefile pipelines/code/item_recommender_train.py



In [25]:
%%writefile pipelines/code/item_recommender_pipeline.py
import sagemaker
from sagemaker import ScriptProcessor
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep

from pipeliner.factory import SagemakerPipelineFactory


class RecommenderPipeline(SagemakerPipelineFactory):
    """Factory that assembles the item-recommender SageMaker pipeline.

    The pipeline runs two processing scripts (user-item matrix, then item
    similarity matrix) followed by an SKLearn training step.
    """

    # Set by create(): True when the session is a LocalPipelineSession.
    local: bool

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
    ) -> Pipeline:
        """Build the pipeline definition.

        Args:
            role: IAM role handed to the script processor and the estimator.
            name: Name of the returned pipeline.
            session: Session used to look up the image region and attached to
                the processor, the estimator and the pipeline. When it is a
                ``LocalPipelineSession`` the default instance type is
                ``"local"``, otherwise ``"ml.m5.large"``.

        Returns:
            A ``Pipeline`` containing the two processing steps and the
            training step, parameterized by ``InstanceType``.
        """
        self.local = isinstance(session, LocalPipelineSession)

        instance_type = ParameterString(
            name="InstanceType",
            default_value="local" if self.local else "ml.m5.large",
        )

        # The same scikit-learn image is used for processing and training.
        image_uri = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version="1.2-1",
        )

        # Only the training step below is given this cache configuration.
        cache_config = CacheConfig(
            enable_caching=True,
            expire_after="P30d",  # 30 days
        )

        processor = ScriptProcessor(
            image_uri=image_uri,
            command=["python3"],
            instance_type=instance_type,
            instance_count=1,
            role=role,
            sagemaker_session=session,
        )

        # NOTE(review): no inputs/outputs or step dependencies are declared
        # for the processing steps -- confirm how data is expected to flow
        # between them.
        user_item_matrix_step = ProcessingStep(
            name="user_item_matrix_transformer",
            step_args=processor.run(
                code="pipelines/code/user_item_matrix_transformer.py",
            ),
        )

        # Script arguments have to be given to `processor.run`: when a
        # ProcessingStep is built from `step_args`, its `job_arguments`
        # parameter is not used for the job request, so "--kind item" would
        # never reach the script and it would fall back to its default kind.
        item_similarity_matrix_step = ProcessingStep(
            name="similarity_matrix_transformer",
            step_args=processor.run(
                code="pipelines/code/similarity_matrix_transformer.py",
                arguments=["--kind", "item"],
            ),
        )

        sklearn_estimator = SKLearn(
            entry_point="pipelines/code/item_recommender_train.py",
            role=role,
            image_uri=image_uri,
            instance_type=instance_type,
            sagemaker_session=session,
            base_job_name="training_job",
            # hyperparameters=hyperparameters,
            enable_sagemaker_metrics=True,
        )

        training_step = TrainingStep(
            name="Train",
            estimator=sklearn_estimator,
            cache_config=cache_config,
        )

        return Pipeline(
            name=name,
            steps=[
                user_item_matrix_step,
                item_similarity_matrix_step,
                training_step,
            ],
            sagemaker_session=session,
            parameters=[instance_type],
        )

Writing pipelines/code/item_recommender_pipeline.py
