Create or update a `.env` file in the project root that defines the following environment variables:

AWS_DEFAULT_SAGEMAKER_BUCKET

AWS_PROFILE

In [None]:
# Enable IPython's autoreload extension. Mode 1 only reloads modules that
# were registered with %aimport before each cell is executed.
%load_ext autoreload
%autoreload 1

In [None]:
# Install the parent project in editable mode into the kernel's environment.
%pip install -e ../ --quiet

In [None]:
# Build the parent project's distribution archives (the next cell looks for the .tar.gz in ../dist).
!python -m build ../

In [None]:
import os

dist_path = "../dist"

# Collect every source distribution produced by the build step.
sdists = [
    os.path.join(dist_path, name)
    for name in os.listdir(dist_path)
    if name.endswith(".tar.gz")
]
if not sdists:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .tar.gz source distribution found in {dist_path!r}; run the build cell first"
    )

# os.listdir returns entries in arbitrary order, so choose the most recently
# written archive rather than whichever entry happens to come first.
package_path = max(sdists, key=os.path.getmtime)
package_path

In [None]:
import os

# Prefix handed to the SageMaker session and used in the S3 upload URI below.
DEFAULT_BUCKET_PREFIX = "pipelines"

# Read the bucket name from the process environment.
# NOTE(review): nothing in this notebook loads the `.env` file itself, so the
# variable must already be present in the kernel's environment — confirm how
# it is injected.
AWS_DEFAULT_SAGEMAKER_BUCKET = os.environ.get("AWS_DEFAULT_SAGEMAKER_BUCKET")
if not AWS_DEFAULT_SAGEMAKER_BUCKET:
    # An empty value is as unusable as a missing one, so reject both.
    raise ValueError("AWS_DEFAULT_SAGEMAKER_BUCKET is not set")

Log in to the Docker registry using ECR credentials.

In [None]:
# Pipe a temporary ECR password into `docker login` for the registry below.
# NOTE(review): region and registry account are hard-coded — presumably the
# registry hosting the SageMaker framework images for eu-west-1; confirm
# before running against another region.
!aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin 141502667606.dkr.ecr.eu-west-1.amazonaws.com

In [None]:
# Create the directory that the %%writefile cells below write the pipeline scripts into.
!mkdir -p pipelines/recommendations/code

In [None]:
import os
import sys

# Make the parent directory of the working directory importable, once.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
%%writefile pipelines/recommendations/code/user_item_matrix_transformer.py

import numpy as np
import pandas as pd
import logging
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(level=logging.INFO)


class UserItemMatrixTransformer(TransformerMixin, BaseEstimator):
    """
    Custom scikit-learn transformer that pivots a long-format pandas
    dataframe of user/item interactions into a wide user/item matrix
    (users on the index, items on the columns).

    :param user (str): Column name for user id
    :param item (str): Column name for item id
    :param rating (str): Column name for user/item rating
    :param agg (str): Pandas aggregation function used to combine duplicate user/item interactions
    :param binary (bool): If True, the output holds 1 where an aggregated rating exists and 0 elsewhere
    """

    def __init__(
        self, user="user_id", item="item_id", rating="rating", agg="max", binary=False
    ):
        self.user, self.item, self.rating = user, item, rating
        self.agg, self.binary = agg, binary

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X: pd.DataFrame):
        """Return the user/item matrix built from the interaction rows in ``X``."""
        # One aggregated rating per (user, item) pair; unstacking moves the
        # item level onto the columns.
        ratings_by_pair = X.groupby([self.user, self.item])[self.rating]
        wide = ratings_by_pair.agg(self.agg).unstack()
        if not self.binary:
            # Pairs without any interaction become 0.
            return wide.fillna(0)
        return wide.notnull().astype(int)


if __name__ == "__main__":
    # Script entry point: read the ratings CSV, pivot it, write the matrix CSV.
    base_dir = "/opt/ml/processing"
    data_types = {"user_id": str, "item_id": str, "rating": np.float64}

    user_item_ratings = pd.read_csv(
        f"{base_dir}/input/data/user_item_ratings.csv",
        dtype=data_types,
        engine='python',
    )

    user_item_matrix = UserItemMatrixTransformer().transform(user_item_ratings)

    # NOTE(review): index=False omits the index, which holds the user ids after
    # the pivot — confirm the downstream readers do not need them.
    user_item_matrix.to_csv(
        f"{base_dir}/output/data/user_item_matrix.csv",
        header=True,
        index=False,
    )

In [None]:
%%writefile pipelines/recommendations/code/similarity_matrix_transformer.py

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity
import argparse

class SimilarityTransformer(TransformerMixin, BaseEstimator):
    """
    Custom scikit-learn transformer that turns a user/item matrix
    (user ids on the index, item ids on the columns) into a square
    similarity matrix, either user-user or item-item.

    :param kind (str): "user" or "item"; "item" transposes the input first
    :param metric (str): "cosine", "dot" or "euclidean" are accepted, but only "cosine" is implemented
    :param normalise (bool): If True, min-max scale each column of the result
    """

    def __init__(self, kind="user", metric="cosine", normalise=False):
        if kind not in ("user", "item"):
            raise ValueError("kind must be 'user' or 'item'")
        if metric not in ("cosine", "dot", "euclidean"):
            raise ValueError("metric must be 'cosine', 'dot', or 'euclidean'")
        self.kind, self.metric, self.normalise = kind, metric, normalise

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X: pd.DataFrame):
        """Return the similarity matrix, labelled on both axes by the compared entities."""
        # Rows are the entities being compared, so items need a transpose.
        matrix = X.T if self.kind == "item" else X

        if self.metric != "cosine":
            raise NotImplementedError("Only cosine similarity is currently supported")

        labels = matrix.index
        df = pd.DataFrame(cosine_similarity(matrix), index=labels, columns=labels)

        if self.normalise:
            # Column-wise min-max scaling.
            col_min, col_max = df.min(), df.max()
            df = (df - col_min) / (col_max - col_min)

        return df


if __name__ == "__main__":
    # Command-line options choose which similarity matrix is produced.
    cli = argparse.ArgumentParser()
    cli.add_argument("--kind", type=str, default="user")
    cli.add_argument("--metric", type=str, default="cosine")
    args = cli.parse_args()

    base_dir = "/opt/ml/processing"

    user_item_matrix = pd.read_csv(
        f"{base_dir}/input/data/user_item_matrix.csv",
        dtype=np.float64,
    )

    similarity_matrix = SimilarityTransformer(
        kind=args.kind, metric=args.metric
    ).transform(user_item_matrix)

    # The output file name carries the kind, e.g. user_similarity_matrix.csv.
    similarity_matrix.to_csv(
        f"{base_dir}/output/data/{args.kind}_similarity_matrix.csv",
        header=True,
        index=False,
    )

In [None]:
%%writefile pipelines/recommendations/code/item_based_recommender.py

import argparse
import os
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
import joblib
from collections.abc import Sequence


class ItemBasedRecommender(BaseEstimator):
    """Item-based collaborative filtering recommender.

    Recommends the items most similar to a given item, optionally leaving
    out items the requesting user has already rated.

    Set by ``__init__``:
      n: Maximum number of recommendations returned per input.
      threshold: Similarity a candidate must exceed to be recommended.

    Set by ``fit``:
      similarity_matrix: Similarity matrix looked up by item id.
      user_item_matrix: User/item matrix (only when fitted with a tuple).
    """

    n: int
    threshold: float
    similarity_matrix: pd.DataFrame
    user_item_matrix: pd.DataFrame

    def __init__(self, n=5, threshold=0.1):
        self.n = n
        self.threshold = threshold

    def fit(self, X: pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame], y=None):
        """Fits the recommender to the given data.

        Args:
          X (pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]):
            Single DataFrame with similarity matrix
            or tuple of (similarity matrix, user/item matrix)
          y: Ignored; present for scikit-learn API compatibility.

        Returns:
          self

        Raises:
          ValueError: If X is neither a DataFrame nor a tuple.
        """
        if isinstance(X, pd.DataFrame):
            self.similarity_matrix = X
        elif isinstance(X, tuple):
            self.similarity_matrix = X[0]
            self.user_item_matrix = X[1]
        else:
            raise ValueError("Input should be DataFrame or (DataFrame, DataFrame)")

        return self

    def _get_exclusions(self, item_id: str, user_id: str | None) -> list[str]:
        """Return the item ids that must not be recommended.

        The queried item is always excluded; when a user id is given, every
        item that user rated above 0 is excluded as well.
        """
        if user_id is None:
            return [item_id]
        if not hasattr(self, "user_item_matrix"):
            # Same exception type as the implicit failure, with a usable message.
            raise AttributeError(
                "user_item_matrix is required to exclude a user's rated items; "
                "fit with a (similarity matrix, user/item matrix) tuple"
            )
        single_user_matrix = self.user_item_matrix.loc[user_id]
        user_rated_items = single_user_matrix[single_user_matrix > 0]
        return [item_id] + user_rated_items.index.to_list()

    def _get_recommendations(self, item: str | tuple[str, str]) -> np.ndarray:
        """Return up to ``n`` item ids most similar to the given item.

        Args:
          item: Either an item_id, or an (item_id, user_id) tuple.

        Raises:
          ValueError: If item is neither a str nor a tuple.
        """
        if isinstance(item, str):
            item_id, user_id = item, None
        elif isinstance(item, tuple):
            item_id, user_id = item[0], item[1]
        else:
            raise ValueError("Input items should be str or (str, str)")

        exclusions = self._get_exclusions(item_id, user_id)

        # Keep candidates above the threshold, drop the exclusions, and rank
        # the remainder by similarity (highest first).
        item_recommendations = (
            self.similarity_matrix[self.similarity_matrix[item_id] > self.threshold][
                item_id
            ]
            .drop(exclusions, errors="ignore")
            .sort_values(ascending=False)
        )
        return np.array(item_recommendations.head(self.n).index)

    def predict(self, X: Sequence[str] | Sequence[tuple[str, str]]) -> np.ndarray:
        """Predicts up to n item recommendations for each item_id provided.
        If tuples of (item_id, user_id) are provided, items previously
        rated by the user will be excluded from the recommendations.

        Args:
          X (Sequence): List of item_id or (item_id, user_id)

        Returns:
          np.ndarray with one row of recommended item ids per element of X
          (fewer than n ids are returned for an input when fewer candidates
          pass the threshold).
        """
        return np.array([self._get_recommendations(item) for item in X])


if __name__ == "__main__":
    # Script entry point: fit the item-based recommender and persist it.
    base_dir = "/opt/ml/processing"

    user_item_matrix = pd.read_csv(f"{base_dir}/input/data/user_item_matrix.csv", dtype=np.float64)
    # The recommender looks candidates up by item id, so it needs the
    # item-item matrix (written as "item_similarity_matrix.csv" by the
    # similarity script when run with --kind item), not the user-user one.
    similarity_matrix = pd.read_csv(f"{base_dir}/input/data/item_similarity_matrix.csv", dtype=np.float64)

    # This file only defines ItemBasedRecommender; the previous reference to
    # UserBasedRecommender raised NameError here.
    rec = ItemBasedRecommender(n=5, threshold=0.1)
    rec.fit((similarity_matrix, user_item_matrix))

    joblib.dump(rec, os.path.join(f"{base_dir}/output/model/", "rec.joblib"))

In [None]:
%%writefile pipelines/recommendations/code/user_based_recommender.py

import argparse
import os
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
import joblib
from collections.abc import Sequence
import logging

logging.basicConfig(level=logging.INFO)


class UserBasedRecommender(BaseEstimator):
    """User-based collaborative filtering recommender.

    Recommends items rated by the users most similar to the requesting
    user, leaving out items that user has already rated.
    """

    n: int
    n_users: int
    threshold: float
    similarity_matrix: pd.DataFrame
    user_item_matrix: pd.DataFrame

    def __init__(self, n=5, n_users=5, threshold=0.1):
        self.n, self.n_users, self.threshold = n, n_users, threshold

    def fit(self, X, y=None):
        """Stores the matrices the recommender predicts from.

        Args:
          X (tuple[pd.DataFrame, pd.DataFrame]):
            tuple of (similarity matrix, user/item matrix)

        Raises:
          ValueError: If X is not a tuple.
        """
        if not isinstance(X, tuple):
            raise ValueError("Input should be tuple of (DataFrame, DataFrame)")

        self.similarity_matrix = X[0]
        self.user_item_matrix = X[1]
        return self

    def _get_similar_users(self, user_id: str):
        """Similarity scores above the threshold for ``user_id``, best first, excluding the user itself."""
        above_threshold = self.similarity_matrix[user_id] > self.threshold
        scores = self.similarity_matrix[above_threshold][user_id]
        return scores.drop(user_id, errors="ignore").sort_values(ascending=False)

    def _get_exclusions(self, user_id):
        """Ids of the items ``user_id`` has rated above 0."""
        own_ratings = self.user_item_matrix.loc[user_id]
        return own_ratings[own_ratings > 0].index.to_list()

    def _get_recommendations(self, user_id):
        """Up to ``n`` item ids for one user, ranked by the best rating among similar users."""
        if not isinstance(user_id, str):
            raise ValueError("Input items should be str")

        exclusions = self._get_exclusions(user_id)
        neighbours = self._get_similar_users(user_id).head(self.n_users).index

        # Items on the rows, the selected similar users on the columns.
        matrix = self.user_item_matrix.T[neighbours]

        not_excluded = ~matrix.index.isin(exclusions)
        rated_by_neighbour = (matrix > 0).any(axis="columns")
        candidates = matrix[not_excluded & rated_by_neighbour]

        ranked = candidates.max(axis=1).sort_values(ascending=False)
        return np.array(ranked.head(self.n).index)

    def predict(self, X):
        """Predicts n item recommendations for each user_id provided.

        Args:
          X (Sequence): List of user_id

        Returns:
          np.array of shape (X.shape[0], n)
        """
        return np.array(list(map(self._get_recommendations, X)))

    # def predict_proba(self, X):
    #     raise NotImplementedError("predict_proba not implemented yet")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Every option defaults to the environment variable listed next to it.
    for flag, env_var in (
        ("--output_data_dir", "SM_OUTPUT_DATA_DIR"),
        ("--model_dir", "SM_MODEL_DIR"),
        ("--train", "SM_CHANNEL_TRAIN"),
        ("--input", "SM_INPUT_DIR"),
        ("--output", "SM_OUTPUT_DIR"),
    ):
        parser.add_argument(flag, type=str, default=os.environ.get(env_var))

    args = parser.parse_args()

    # Log the resolved locations to help debug the container layout.
    logging.info(f"SM_OUTPUT_DATA_DIR: {args.output_data_dir}")
    logging.info(f"SM_MODEL_DIR: {args.model_dir}")
    logging.info(f"SM_CHANNEL_TRAIN: {args.train}")
    logging.info(f"SM_INPUT_DIR: {args.input}")
    logging.info(f"SM_OUTPUT_DIR: {args.output}")

    base_dir = "/opt/ml"

    logging.info(os.listdir(base_dir))
    logging.info(os.listdir(args.input))
    logging.info(os.listdir(f"{args.input}/data"))

    # Each matrix is read from its own sub-directory under <input>/data.
    user_item_matrix = pd.read_csv(
        f"{args.input}/data/user_item_matrix/user_item_matrix.csv",
        dtype=np.float64,
    )
    similarity_matrix = pd.read_csv(
        f"{args.input}/data/similarity_matrix/user_similarity_matrix.csv",
        dtype=np.float64,
    )

    rec = UserBasedRecommender(5, 5, 0.1)
    rec = rec.fit((similarity_matrix, user_item_matrix))

    model_path = os.path.join(args.model_dir, "rec.joblib")
    joblib.dump(rec, model_path)


In [None]:
import sagemaker
import boto3

# Use the SageMaker execution role when one can be resolved; on ValueError
# fall back to looking up the "pipeliner" IAM role by name.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="pipeliner")
    role = role_description["Role"]["Arn"]

role

In [None]:
from sagemaker.workflow.pipeline_context import LocalPipelineSession

# Local pipeline session bound to the bucket and prefix configured earlier in the notebook.
session = LocalPipelineSession(
    default_bucket=AWS_DEFAULT_SAGEMAKER_BUCKET,
    default_bucket_prefix=DEFAULT_BUCKET_PREFIX,
)
# NOTE(review): presumably makes local mode use the code on disk instead of
# uploading it — confirm against the SageMaker local-mode documentation.
session.config = {"local": {"local_code": True}}

region = session.boto_region_name
default_bucket = session.default_bucket()

In [None]:
import pandas as pd
import numpy as np

# Sample ratings used as the pipeline input; ids are kept as strings.
ratings_data_path = "../tests/test_data/user_item_ratings.csv"
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

user_item_ratings = pd.read_csv(
    ratings_data_path,
    dtype=data_types,
    engine='python',
)
# Preview the first five rows.
user_item_ratings.head()

In [None]:
# Upload the local ratings file to S3; the returned URI is the default
# pipeline input used by RecommenderPipeline below.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=ratings_data_path,
    desired_s3_uri=f"s3://{default_bucket}/{DEFAULT_BUCKET_PREFIX}/recommender/data",
)
input_data_uri

In [None]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.steps import CacheConfig, ProcessingStep, TrainingStep


class RecommenderPipeline:
    """Builds the SageMaker pipeline that prepares data for and trains the user-based recommender.

    The pipeline has four steps: build the user/item matrix, compute the
    user and the item similarity matrices from it, then train on the
    user/item matrix together with the user similarity matrix.
    """

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
        framework_version: str = "1.2-1",
    ) -> Pipeline:
        """Assemble the pipeline definition.

        Args:
          role: IAM role ARN passed to the processor and the estimator.
          name: Name of the pipeline.
          session: Session the steps are bound to; a LocalPipelineSession
            switches the default instance type to "local".
          framework_version: scikit-learn framework version used for both
            the processing container and the training image.

        Returns:
          The assembled Pipeline (neither upserted nor started).
        """
        self.local = isinstance(session, LocalPipelineSession)
        self.framework_version = framework_version

        instance_type = ParameterString(
            name="InstanceType",
            default_value="local" if self.local else "ml.m5.large",
        )

        # NOTE: input_data_uri is a notebook-level variable set by the upload
        # cell; that cell must have run before create() is called.
        input_data = ParameterString(
            name="user_item_ratings",
            default_value=input_data_uri,
        )

        # Honour the framework_version argument instead of a hard-coded
        # version, so processing and training use the same image version.
        image_uri = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version=framework_version,
        )

        cache_config = CacheConfig(
            enable_caching=True,
            expire_after="P30d",  # 30 days
        )

        processor = SKLearnProcessor(
            framework_version=framework_version,
            instance_type=instance_type,
            instance_count=1,
            base_job_name="sklearn-preprocess",
            role=role,
            sagemaker_session=session,
        )

        user_item_matrix_step = ProcessingStep(
            name="user_item_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    ProcessingInput(
                        source=input_data,
                        input_name="user_item_ratings",
                        destination="/opt/ml/processing/input/data",
                    ),
                    ProcessingInput(
                        source="../src/pipeliner",
                        input_name="pipeliner",
                        destination="/opt/ml/processing/input/code/pipeliner",
                    ),
                ],
                outputs=[
                    ProcessingOutput(
                        output_name="user_item_matrix",
                        source="/opt/ml/processing/output/data",
                    ),
                ],
                code="pipelines/recommendations/code/user_item_matrix_transformer.py",
            ),
        )

        # S3 location of the user/item matrix, shared by all downstream steps.
        user_item_matrix_uri = user_item_matrix_step.properties.ProcessingOutputConfig.Outputs[
            "user_item_matrix"
        ].S3Output.S3Uri

        user_similarity_matrix_step = self._similarity_step(
            processor, user_item_matrix_uri, "user"
        )
        item_similarity_matrix_step = self._similarity_step(
            processor, user_item_matrix_uri, "item"
        )

        sklearn_estimator = SKLearn(
            entry_point="pipelines/recommendations/code/user_based_recommender.py",
            role=role,
            image_uri=image_uri,
            instance_type=instance_type,
            sagemaker_session=session,
            base_job_name="training_job",
            # hyperparameters=hyperparameters,
            enable_sagemaker_metrics=True,
        )

        training_step = TrainingStep(
            name="Train",
            estimator=sklearn_estimator,
            cache_config=cache_config,
            inputs={
                "user_item_matrix": TrainingInput(
                    s3_data=user_item_matrix_uri,
                    content_type="text/csv",
                ),
                "similarity_matrix": TrainingInput(
                    s3_data=user_similarity_matrix_step.properties.ProcessingOutputConfig.Outputs[
                        "user_similarity_matrix"
                    ].S3Output.S3Uri,
                    content_type="text/csv",
                ),
            },
        )

        return Pipeline(
            name=name,
            steps=[
                user_item_matrix_step,
                user_similarity_matrix_step,
                item_similarity_matrix_step,
                training_step,
            ],
            sagemaker_session=session,
            parameters=[input_data, instance_type],
        )

    def _similarity_step(self, processor, user_item_matrix_uri, kind: str) -> ProcessingStep:
        """Build the processing step that computes the ``kind`` ("user" or "item") similarity matrix."""
        return ProcessingStep(
            name=f"{kind}_similarity_matrix_transformer",
            step_args=processor.run(
                inputs=[
                    ProcessingInput(
                        source=user_item_matrix_uri,
                        input_name="user_item_matrix",
                        destination="/opt/ml/processing/input/data",
                    ),
                ],
                outputs=[
                    ProcessingOutput(
                        output_name=f"{kind}_similarity_matrix",
                        source="/opt/ml/processing/output/data",
                    ),
                ],
                code="pipelines/recommendations/code/similarity_matrix_transformer.py",
                # Script arguments go through run(): ProcessingStep's own
                # job_arguments belongs to the legacy `processor=` form and is
                # not applied when step_args is supplied, so "--kind" would
                # otherwise never reach the script.
                arguments=["--kind", kind],
            ),
        )

In [None]:
# Build the pipeline object with the role and session resolved above.
pipeline = RecommenderPipeline().create(role=role, name="recommender", session=session)

In [None]:
import json

# Parse the pipeline's JSON definition into a dict so it can be inspected.
definition = json.loads(pipeline.definition())
definition

In [None]:
# Create the pipeline, or update it if it already exists, using the resolved role.
pipeline.upsert(role_arn=role)

In [None]:
# Start an execution with the pipeline's default parameter values.
execution = pipeline.start()

In [None]:
# List the steps of the execution started above and display them.
steps = execution.list_steps()
steps