In [13]:
# Create the directory that the %%writefile cells below write into
# (-p: no error if it already exists, parents are created as needed).
!mkdir -p notebooks/pipelines/code

In [14]:
import os
import sys
from sagemaker import Session

# Put the parent of the current working directory on the import path
# (presumably where the local `pipeliner` package lives -- verify against the
# repo layout). The membership check keeps re-running this cell from adding
# duplicate entries.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from pipeliner.exceptions import SagemakerSessionException
from pipeliner.sagemaker.session import create_pipeline_session
from pipeliner.sagemaker.pipeline import PipelineFactory

In [15]:
%%writefile notebooks/pipelines/code/transform.py
import argparse
import os
import requests
import tempfile

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


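# NOTE: everything below is commented out, so the transform.py written by this
# cell currently contains only the imports above and performs no processing.
# # Since we get a headerless CSV file, we specify the column names here.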
# feature_columns_names = [
#     "sex",
#     "length",
#     "diameter",
#     "height",
#     "whole_weight",
#     "shucked_weight",
#     "viscera_weight",
#     "shell_weight",
# ]
# label_column = "rings"

# feature_columns_dtype = {
#     "sex": str,
#     "length": np.float64,
#     "diameter": np.float64,
#     "height": np.float64,
#     "whole_weight": np.float64,
#     "shucked_weight": np.float64,
#     "viscera_weight": np.float64,
#     "shell_weight": np.float64,
# }
# label_column_dtype = {"rings": np.float64}


# def merge_two_dicts(x, y):
#     z = x.copy()
#     z.update(y)
#     return z


# if __name__ == "__main__":
#     base_dir = "/opt/ml/processing"

#     df = pd.read_csv(
#         f"{base_dir}/input/abalone-dataset.csv",
#         header=None,
#         names=feature_columns_names + [label_column],
#         dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype),
#     )
#     numeric_features = list(feature_columns_names)
#     numeric_features.remove("sex")
#     numeric_transformer = Pipeline(
#         steps=[
#             ("imputer", SimpleImputer(strategy="median")),
#             ("scaler", StandardScaler()),
#         ]
#     )

#     categorical_features = ["sex"]
#     categorical_transformer = Pipeline(
#         steps=[
#             ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
#             ("onehot", OneHotEncoder(handle_unknown="ignore")),
#         ]
#     )

#     preprocess = ColumnTransformer(
#         transformers=[
#             ("num", numeric_transformer, numeric_features),
#             ("cat", categorical_transformer, categorical_features),
#         ]
#     )

#     y = df.pop("rings")
#     X_pre = preprocess.fit_transform(df)
#     y_pre = y.to_numpy().reshape(len(y), 1)

#     X = np.concatenate((y_pre, X_pre), axis=1)

#     np.random.shuffle(X)
#     train, validation, test = np.split(X, [int(0.7 * len(X)), int(0.85 * len(X))])

#     pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
#     pd.DataFrame(validation).to_csv(
#         f"{base_dir}/validation/validation.csv", header=False, index=False
#     )
#     pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)

Overwriting notebooks/pipelines/code/transform.py


In [16]:
%%writefile notebooks/pipelines/code/recommender_pipeline.py
import sagemaker
from sagemaker import ScriptProcessor
from sagemaker.workflow.pipeline_context import LocalPipelineSession
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep

from pipeliner.factory import SagemakerPipelineFactory


class RecommenderPipeline(SagemakerPipelineFactory):
    """Factory producing a one-step SageMaker pipeline.

    The single step is a processing job that runs a Python script with
    ``python3`` inside the ``sklearn`` framework image (version ``1.2-1``)
    resolved for the session's region.
    """

    # Set on every ``create`` call: True when the supplied session is a
    # LocalPipelineSession, False otherwise.
    local: bool

    def create(
        self,
        role: str,
        name: str,
        session: sagemaker.Session,
    ) -> Pipeline:
        """Assemble and return the pipeline definition.

        Args:
            role: Role handed to the ``ScriptProcessor``.
            name: Name assigned to the returned pipeline.
            session: Session used by both the processor and the pipeline.
                When it is a ``LocalPipelineSession`` the ``InstanceType``
                parameter defaults to ``"local"``; otherwise it defaults to
                ``"ml.m5.large"``.

        Returns:
            A ``Pipeline`` holding the ``processing-example`` step and
            exposing ``InstanceType`` as its only parameter.
        """
        self.local = isinstance(session, LocalPipelineSession)

        # Pick the default for the overridable instance-type parameter.
        if self.local:
            default_instance = "local"
        else:
            default_instance = "ml.m5.large"
        instance_type_param = ParameterString(
            name="InstanceType",
            default_value=default_instance,
        )

        # Resolve the container image for the session's region.
        sklearn_image = sagemaker.image_uris.retrieve(
            framework="sklearn",
            region=session.boto_region_name,
            version="1.2-1",
        )

        script_processor = ScriptProcessor(
            image_uri=sklearn_image,
            command=["python3"],
            instance_type=instance_type_param,
            instance_count=1,
            role=role,
            sagemaker_session=session,
        )

        # The result of ``run`` is passed to the step as ``step_args``.
        run_args = script_processor.run(
            code="pipelines/sources/example_pipeline/evaluate.py",
        )
        example_step = ProcessingStep(
            name="processing-example",
            step_args=run_args,
        )

        return Pipeline(
            name=name,
            steps=[example_step],
            sagemaker_session=session,
            parameters=[instance_type_param],
        )

Overwriting notebooks/pipelines/code/recommender_pipeline.py
