# Create a Kubeflow Pipeline with BERT and Amazon SageMaker

# Install Dependencies

In [1]:
!pip install sagemaker==1.72.0
!pip install https://storage.googleapis.com/ml-pipeline/release/0.1.29/kfp.tar.gz --upgrade

Collecting sagemaker==1.72.0
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/75ea837e2bd704b1567bdf55f7e768745da4fcf1e3b3e061e41ba7d7f399/sagemaker-1.72.0.tar.gz (297kB)
[K    100% |████████████████████████████████| 307kB 34.8MB/s ta 0:00:01
[?25hCollecting boto3>=1.14.12 (from sagemaker==1.72.0)
[?25l  Downloading https://files.pythonhosted.org/packages/eb/e4/1668f3e325b53a0dce17fd25fce1a14de15a70c3f13d60b4852dc80a55e2/boto3-1.15.6-py2.py3-none-any.whl (129kB)
[K    100% |████████████████████████████████| 133kB 58.2MB/s ta 0:00:01
Collecting protobuf3-to-dict>=0.1.5 (from sagemaker==1.72.0)
  Downloading https://files.pythonhosted.org/packages/6b/55/522bb43539fed463275ee803d79851faaebe86d17e7e3dbc89870d0322b9/protobuf3-to-dict-0.1.5.tar.gz
Collecting smdebug-rulesconfig==0.1.4 (from sagemaker==1.72.0)
  Downloading https://files.pythonhosted.org/packages/2c/d7/80252c50e8848101914457d1cf58ef7e20f34406fc660d26108a1fec866d/smdebug_rulesconfig-0.1.4-py2.py3-none-any.

Building wheels for collected packages: kfp
  Building wheel for kfp (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-t4_e3_8p/wheels/81/b7/33/00ef9dd992b13add014c4875a2c130d9d70288127a793c4af6
Successfully built kfp
Installing collected packages: kfp
  Found existing installation: kfp 0.1.29
    Uninstalling kfp-0.1.29:
      Successfully uninstalled kfp-0.1.29
Successfully installed kfp-0.1.29
[33mYou are using pip version 19.0.1, however version 20.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
!pip install awscli==1.18.140

Collecting awscli==1.18.140
[?25l  Downloading https://files.pythonhosted.org/packages/6d/67/c87f1b25c27b7970e83814dacb1353a8a21862e6dd15c24f50c4e0a66fa0/awscli-1.18.140-py2.py3-none-any.whl (3.3MB)
[K    100% |████████████████████████████████| 3.3MB 15.3MB/s eta 0:00:01
Collecting botocore==1.17.63 (from awscli==1.18.140)
[?25l  Downloading https://files.pythonhosted.org/packages/d1/96/5f11cca11d08703a5149bf668b2d455d5e633e12ae8e58a860f442b02112/botocore-1.17.63-py2.py3-none-any.whl (6.6MB)
[K    100% |████████████████████████████████| 6.6MB 8.2MB/s eta 0:00:011
[31mboto3 1.15.6 has requirement botocore<1.19.0,>=1.18.6, but you'll have botocore 1.17.63 which is incompatible.[0m
Installing collected packages: botocore, awscli
  Found existing installation: botocore 1.18.6
    Uninstalling botocore-1.18.6:
      Successfully uninstalled botocore-1.18.6
  Found existing installation: awscli 1.17.8
    Uninstalling awscli-1.17.8:
      Successfully uninstalled awscli-1.17.8
Successf

# _Note: Ignore any pip install warnings or errors above._

In [None]:
# Restart the kernel to pick up pip installed libraries.
# HTML is imported from IPython.display, the documented public module for
# display classes, instead of reaching into the internal IPython.core package.
from IPython.display import HTML

# The HTML object is the cell's last expression, so the notebook front end
# renders it and runs the embedded script that asks the kernel to restart.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Setup Environment Variables

In [1]:
import boto3

# Look up the availability zone from the instance metadata endpoint and strip
# its trailing zone letter with sed to obtain the region name. IPython captures
# the shell output into `aws_region_as_slist`.
# NOTE(review): this is a plain, token-less GET against the metadata endpoint —
# presumably requires IMDSv1 to be reachable from the notebook host; confirm.
aws_region_as_slist=!curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/\(.*\)[a-z]/\1/'
# `.s` exposes the captured shell output as a single string.
region = aws_region_as_slist.s
print('Region: {}'.format(region))

# Account ID of the credentials the notebook is running with, via STS.
account_id=boto3.client('sts').get_caller_identity().get('Account')
print('Account ID: {}'.format(account_id))

# Bucket name built as sagemaker-<region>-<account_id>; nothing in this cell
# creates the bucket, so it is assumed to exist already.
bucket='sagemaker-{}-{}'.format(region, account_id)
print('S3 Bucket: {}'.format(bucket))

# Execution role ARN handed to the SageMaker pipeline steps.
# NOTE(review): assumes an IAM role named "TeamRole" exists in this account.
role='arn:aws:iam::{}:role/TeamRole'.format(account_id)
print('SageMaker Role ARN: {}'.format(role))

Region: us-east-1
Account ID: 146478650644
S3 Bucket: sagemaker-us-east-1-146478650644
SageMaker Role ARN: arn:aws:iam::146478650644:role/TeamRole


# Copy Data from Public S3 to Private S3

In [2]:
# Public S3 prefix that holds the source review TSV files.
s3_public_path_tsv = 's3://amazon-reviews-pds/tsv'

In [3]:
# Destination prefix inside this account's bucket; mirrors the public layout.
s3_private_path_tsv = 's3://{}/amazon-reviews-pds/tsv'.format(bucket)
print(s3_private_path_tsv)

s3://sagemaker-us-east-1-146478650644/amazon-reviews-pds/tsv


In [4]:
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Software_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz"
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/ --exclude "*" --include "amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz"

copy: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz to s3://sagemaker-us-east-1-146478650644/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz
copy: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz to s3://sagemaker-us-east-1-146478650644/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz
copy: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz to s3://sagemaker-us-east-1-146478650644/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz


In [5]:
# S3 prefix (with trailing slash) of the copied TSV files; used as the default
# value of the pipeline's raw_input_data_s3_uri parameter.
raw_input_data_s3_uri = 's3://{}/amazon-reviews-pds/tsv/'.format(bucket)

# Build Pipeline

In [6]:
import kfp
from kfp import components
from kfp import dsl
from kfp.aws import use_aws_secret

In [7]:
# Load the SageMaker "process" component from its component.yaml, pinned to a
# specific commit of the kubeflow/pipelines repository.
sagemaker_process_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/process/component.yaml')

In [8]:
# Load the SageMaker "train" component from its component.yaml, pinned to a
# specific commit of the kubeflow/pipelines repository.
sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/train/component.yaml')

In [9]:
# Load the SageMaker "model" component from its component.yaml, pinned to a
# specific commit of the kubeflow/pipelines repository.
sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/model/component.yaml')

In [10]:
# Load the SageMaker "deploy" component from its component.yaml, pinned to a
# specific commit of the kubeflow/pipelines repository.
sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/3ebd075212e0a761b982880707ec497c36a99d80/components/aws/sagemaker/deploy/component.yaml')

# Setup Pre-Processing Code 

In [11]:
# S3 location of the pre-processing script; the pipeline's processing step
# reads this URI as its "code" input.
processing_code_s3_uri = 's3://{}/processing_code/preprocess-scikit-text-to-bert.py'.format(bucket)
print(processing_code_s3_uri)

# Upload the local script to that location.
!aws s3 cp ./preprocess-scikit-text-to-bert.py $processing_code_s3_uri

s3://sagemaker-us-east-1-146478650644/processing_code/preprocess-scikit-text-to-bert.py
upload: ./preprocess-scikit-text-to-bert.py to s3://sagemaker-us-east-1-146478650644/processing_code/preprocess-scikit-text-to-bert.py


# Package and Upload Training Code to S3

In [12]:
# Package the contents of ./code into a gzipped tarball; -C makes the archive
# paths relative to that directory.
# NOTE(review): the recorded listing shows test_data/ is included in the
# archive — confirm that shipping it with the training code is intended.
!tar -cvzf sourcedir.tar.gz -C ./code .

./
./inference.py
./test-local.sh
./tf_bert_reviews.py
./requirements.txt
./test_data/
./test_data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz


In [13]:
# S3 location of the packaged training code; the pipeline passes this URI to
# the training job as the 'sagemaker_submit_directory' hyperparameter.
training_code_s3_uri = 's3://{}/training_code/sourcedir.tar.gz'.format(bucket)
print(training_code_s3_uri)

# Upload the tarball to that location.
!aws s3 cp sourcedir.tar.gz $training_code_s3_uri

s3://sagemaker-us-east-1-146478650644/training_code/sourcedir.tar.gz
upload: ./sourcedir.tar.gz to s3://sagemaker-us-east-1-146478650644/training_code/sourcedir.tar.gz


In [14]:
def processing_input(input_name, s3_uri, local_path, s3_data_distribution_type):
    """Build one input entry for the processing step's ``input_config`` list.

    Args:
        input_name: Value stored under "InputName".
        s3_uri: S3 location stored under "S3Uri".
        local_path: Path stored under "LocalPath".
        s3_data_distribution_type: Value stored under "S3DataDistributionType".

    Returns:
        A dict with "InputName" and a nested "S3Input" dict. "S3DataType" is
        always "S3Prefix" and "S3InputMode" is always "File".
    """
    # Describe the S3 side first, then wrap it with the input's name.
    s3_input_spec = {
        "LocalPath": local_path,
        "S3Uri": s3_uri,
        "S3DataType": "S3Prefix",
        "S3DataDistributionType": s3_data_distribution_type,
        "S3InputMode": "File",
    }
    input_entry = {"InputName": input_name, "S3Input": s3_input_spec}
    return input_entry

def processing_output(output_name, s3_uri, local_path, s3_upload_mode):
    """Build one output entry for the processing step's ``output_config`` list.

    Args:
        output_name: Value stored under "OutputName".
        s3_uri: S3 destination stored under "S3Uri".
        local_path: Path stored under "LocalPath".
        s3_upload_mode: Value stored under "S3UploadMode".

    Returns:
        A dict with "OutputName" and a nested "S3Output" dict.
    """
    # Describe the S3 side first, then wrap it with the output's name.
    s3_output_spec = {
        "LocalPath": local_path,
        "S3Uri": s3_uri,
        "S3UploadMode": s3_upload_mode,
    }
    output_entry = {"OutputName": output_name, "S3Output": s3_output_spec}
    return output_entry

In [15]:
def training_input(input_name, s3_uri, s3_data_distribution_type):
    """Build one channel entry for the training step's ``channels`` list.

    Args:
        input_name: Value stored under "ChannelName".
        s3_uri: S3 location stored under "S3Uri".
        s3_data_distribution_type: Value stored under "S3DataDistributionType".

    Returns:
        A dict with "ChannelName" and a nested "DataSource" -> "S3DataSource"
        dict. "S3DataType" is always "S3Prefix".
    """
    # Innermost description of where the channel's data lives.
    s3_data_source = {
        "S3Uri": s3_uri,
        "S3DataType": "S3Prefix",
        "S3DataDistributionType": s3_data_distribution_type,
    }
    data_source = {"S3DataSource": s3_data_source}
    return {"ChannelName": input_name, "DataSource": data_source}

# Setup Pipeline

In [16]:
@dsl.pipeline(
    name="BERT Pipeline",
    description="BERT Pipeline",
)
def bert_pipeline(role=role, 
                  bucket=bucket, 
                  region=region,
                  raw_input_data_s3_uri=raw_input_data_s3_uri):
    # Pipeline definition with four chained steps:
    #   process -> training -> create_model -> deploy_model
    #
    # Parameters (each defaults to the notebook-level variable of the same name):
    #   role                  - IAM role ARN passed to the process, train and model steps
    #   bucket                - bucket name used to build the output S3 URIs
    #   region                - region passed to every step and used in the image URIs
    #   raw_input_data_s3_uri - S3 prefix used as the processing step's "raw_input"
    #
    # The body also reads the notebook-level names processing_code_s3_uri,
    # training_code_s3_uri, processing_input, processing_output, training_input
    # and the four sagemaker_*_op component factories.
    #
    # NOTE(review): inside a KFP pipeline function the parameters are presumably
    # placeholder objects at compile time, so the strings formatted from them
    # would carry placeholders resolved at run time — confirm.
    # (Documented with comments rather than a docstring so the function's
    # __doc__ stays unchanged.)

    import time
    import json

    # Container images; the same TensorFlow training image is used for both the
    # processing and the training step.
    processing_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04'.format(region)
    train_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.1.0-cpu-py36-ubuntu18.04'.format(region)        
    serve_image='763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-inference:2.1.0-cpu'.format(region)

    # Timestamped name used as the S3 key prefix for this pipeline's outputs.
    # time.time() is evaluated whenever this function body runs.
    pipeline_name = 'kubeflow-pipeline-sagemaker-{}'.format(int(time.time()))

    network_isolation=False
    
    ########################
    # FEATURE ENGINEERING
    ########################    
    
    # Values forwarded to the pre-processing script as command-line arguments;
    # max_seq_length is also forwarded to training as a hyperparameter.
    max_seq_length=64
    train_split_percentage=0.90
    validation_split_percentage=0.05
    test_split_percentage=0.05
    balance_dataset=True

    # S3 destinations of the three processing outputs; the training channels
    # below read from these same URIs.
    processed_train_data_s3_uri = 's3://{}/{}/processing/output/bert-train'.format(bucket, pipeline_name)
    processed_validation_data_s3_uri = 's3://{}/{}/processing/output/bert-validation'.format(bucket, pipeline_name)
    processed_test_data_s3_uri = 's3://{}/{}/processing/output/bert-test'.format(bucket, pipeline_name)

    processing_instance_type = 'ml.c5.2xlarge'
    processing_instance_count = 2

    # Processing step: runs the uploaded pre-processing script as the container
    # entrypoint. The raw data input requests "ShardedByS3Key" distribution, the
    # code input "FullyReplicated"; the three outputs are uploaded at "EndOfJob".
    process = sagemaker_process_op(
        role=role,
        region=region,
        image=processing_image,
        network_isolation=network_isolation,
        instance_type=processing_instance_type,
        instance_count=processing_instance_count,
        container_arguments=['--train-split-percentage', str(train_split_percentage),
                             '--validation-split-percentage', str(validation_split_percentage),
                             '--test-split-percentage', str(test_split_percentage),
                             '--max-seq-length', str(max_seq_length),
                             '--balance-dataset', str(balance_dataset)],
        container_entrypoint=[
            "python3",
            "/opt/ml/processing/input/code/preprocess-scikit-text-to-bert.py",
        ],
        input_config=[
            processing_input(
                input_name="raw_input",
                s3_uri="{}".format(raw_input_data_s3_uri),
                local_path="/opt/ml/processing/input/data/",
                s3_data_distribution_type="ShardedByS3Key"
            ),
            processing_input(
                input_name="code",
                s3_uri="{}".format(processing_code_s3_uri),
                local_path="/opt/ml/processing/input/code",
                s3_data_distribution_type="FullyReplicated"
            ),
        ],
        output_config=[
            processing_output(
                output_name="bert-train",
                s3_uri="{}".format(processed_train_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/train",
                s3_upload_mode="EndOfJob"
            ),
            processing_output(
                output_name="bert-validation",
                s3_uri="{}".format(processed_validation_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/validation",
                s3_upload_mode="EndOfJob"
            ),
            processing_output(
                output_name="bert-test",
                s3_uri="{}".format(processed_test_data_s3_uri),
                local_path="/opt/ml/processing/output/bert/test",
                s3_upload_mode="EndOfJob"
            ),
        ],
    )


    ########################
    # TRAIN
    ########################
    
    # One training channel per processed split, pointing at the processing
    # step's output locations.
    train_channels = [
        training_input(input_name="train", 
                       s3_uri=processed_train_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        ),
        training_input(input_name="validation", 
                       s3_uri=processed_validation_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        ),                       
        training_input(input_name="test", 
                       s3_uri=processed_test_data_s3_uri,
                       s3_data_distribution_type="ShardedByS3Key"
        )
    ]

    # Training settings; most are serialized into `hyperparameters` below.
    epochs=1
    learning_rate=0.00001
    epsilon=0.00000001
    train_batch_size=128
    validation_batch_size=128
    test_batch_size=128
    train_steps_per_epoch=100
    validation_steps=100
    test_steps=100
    # NOTE: train_volume_size is assigned but not referenced again in this function.
    train_volume_size=1024
    use_xla=True
    use_amp=True
    freeze_bert_layer=False
    enable_sagemaker_debugger=False
    enable_checkpointing=False
    enable_tensorboard=False
    # Passed to the training step as training_input_mode.
    input_mode='Pipe'
    run_validation=True
    run_test=True
    run_sample_predictions=True

    train_instance_type='ml.c5.9xlarge'   
    train_instance_count=1

    # Used both as the 'model_dir' hyperparameter and as model_artifact_path.
    train_output_location = "s3://{}/{}/output".format(bucket, pipeline_name)
    
    # Every value is rendered to a string before JSON serialization. The
    # sagemaker_* keys name the entry script, the region and the uploaded
    # training code archive.
    hyperparameters={
        'epochs': '{}'.format(epochs),
        'learning_rate': '{}'.format(learning_rate),
        'epsilon': '{}'.format(epsilon),
        'train_batch_size': '{}'.format(train_batch_size),
        'validation_batch_size': '{}'.format(validation_batch_size),
        'test_batch_size': '{}'.format(test_batch_size),                                             
        'train_steps_per_epoch': '{}'.format(train_steps_per_epoch),
        'validation_steps': '{}'.format(validation_steps),
        'test_steps': '{}'.format(test_steps),
        'use_xla': '{}'.format(use_xla),
        'use_amp': '{}'.format(use_amp),                                             
        'max_seq_length': '{}'.format(max_seq_length),
        'freeze_bert_layer': '{}'.format(freeze_bert_layer),
        'enable_sagemaker_debugger': '{}'.format(enable_sagemaker_debugger),
        'enable_checkpointing': '{}'.format(enable_checkpointing),
        'enable_tensorboard': '{}'.format(enable_tensorboard),                                        
        'run_validation': '{}'.format(run_validation),
        'run_test': '{}'.format(run_test),
        'run_sample_predictions': '{}'.format(run_sample_predictions),
        'model_dir': '{}'.format(train_output_location),
        'sagemaker_program': 'tf_bert_reviews.py',
        'sagemaker_region': '{}'.format(region),
        'sagemaker_submit_directory': training_code_s3_uri
    }
    hyperparameters_json = json.dumps(hyperparameters)
   
    # metric_definitions='{"val_acc": "val_accuracy: ([0-9\\\\.]+)"}',
    # Metric definitions are built and printed here but are NOT sent to the
    # training step: the metric_definitions argument below is commented out.
    metrics_definitions = [
        {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
        {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
        {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
        {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
    ]
    metrics_definitions_json = json.dumps(metrics_definitions)
    # This print runs whenever the function body is executed (the recorded
    # output of the compile cell shows it).
    print(metrics_definitions_json)

    # .after(process) is explicitly appended below
    # (no output of `process` is passed as an argument to this step, so the
    # ordering has to be declared explicitly)
    training = sagemaker_train_op(
        region=region,
        image=train_image,
        network_isolation=network_isolation,        
        instance_type=train_instance_type,
        instance_count=train_instance_count,
        hyperparameters=hyperparameters_json,
        training_input_mode=input_mode,    
        channels=train_channels,        
        model_artifact_path=train_output_location,
        # metric_definitions=metrics_definitions_json,
        # TODO:  Add rules
        role=role        
    ).after(process)


    ########################
    # DEPLOY
    ########################
    
    # .after(training) is implied because we depend on training.outputs[]
    # The model is named after the training job and built from its artifact URL.
    create_model = sagemaker_model_op(
        region=region,
        model_name=training.outputs["job_name"],
        image=serve_image,
        network_isolation=network_isolation,       
        model_artifact_url=training.outputs["model_artifact_url"],
        role=role
    )

    deploy_instance_type='ml.c5.9xlarge'
    deploy_instance_count=1

    # .after(create_model) is implied because we depend on create_model.outputs
    # A single variant named 'AllTraffic' serves the created model.
    # NOTE: `deploy_model` is assigned but not referenced again in this function.
    deploy_model = sagemaker_deploy_op(
        region=region,
        variant_name_1='AllTraffic',
        model_name_1=create_model.output,
        instance_type_1=deploy_instance_type,
        initial_instance_count_1=deploy_instance_count        
    )

# Compile Kubeflow Pipeline

In [17]:
# Compile the pipeline function into the package file 'bert-pipeline.zip'.
# Compiling runs the body of bert_pipeline, which is why its print() of the
# metric definitions shows up in this cell's output.
kfp.compiler.Compiler().compile(bert_pipeline, 'bert-pipeline.zip')

[{"Name": "train:loss", "Regex": "loss: ([0-9\\.]+)"}, {"Name": "train:accuracy", "Regex": "accuracy: ([0-9\\.]+)"}, {"Name": "validation:loss", "Regex": "val_loss: ([0-9\\.]+)"}, {"Name": "validation:accuracy", "Regex": "val_accuracy: ([0-9\\.]+)"}]


  serialized_value),


In [18]:
# Confirm the compiled package was written and check its size.
!ls -al ./bert-pipeline.zip

-rw-r--r-- 1 root users 2440 Sep 26 22:44 ./bert-pipeline.zip


In [19]:
# Extract the package for inspection; -o overwrites any previously extracted
# pipeline.yaml without prompting.
!unzip -o ./bert-pipeline.zip

Archive:  ./bert-pipeline.zip
  inflating: pipeline.yaml           


In [20]:
# Display the extracted workflow definition with syntax highlighting.
!pygmentize pipeline.yaml

[94mapiVersion[39;49;00m: argoproj.io/v1alpha1
[94mkind[39;49;00m: Workflow
[94mmetadata[39;49;00m:
  [94mannotations[39;49;00m:
    [94mpipelines.kubeflow.org/pipeline_spec[39;49;00m: [33m'[39;49;00m[33m{"description":[39;49;00m[31m [39;49;00m[33m"BERT[39;49;00m[31m [39;49;00m[33mPipeline",[39;49;00m[31m [39;49;00m[33m"inputs":[39;49;00m
      [33m[{"default":[39;49;00m[31m [39;49;00m[33m"arn:aws:iam::146478650644:role/TeamRole",[39;49;00m[31m [39;49;00m[33m"name":[39;49;00m[31m [39;49;00m[33m"role"},[39;49;00m[31m [39;49;00m[33m{"default":[39;49;00m
      [33m"sagemaker-us-east-1-146478650644",[39;49;00m[31m [39;49;00m[33m"name":[39;49;00m[31m [39;49;00m[33m"bucket"},[39;49;00m[31m [39;49;00m[33m{"default":[39;49;00m[31m [39;49;00m[33m"us-east-1",[39;49;00m
      [33m"name":[39;49;00m[31m [39;49;00m[33m"region"},[39;49;00m[31m [39;49;00m[33m{"default":[39;49;00m[31m [39;49;00m[33m"s3://sagemaker-us-east

# Launch Pipeline on Kubernetes Cluster

In [21]:
# Client for the Kubeflow Pipelines API; no host is given, so the client's
# default connection settings are used.
client = kfp.Client()

# Experiment named 'kubeflow' under which the run is recorded.
experiment = client.create_experiment(name='kubeflow')

# Submit the compiled package as a run called 'bert-pipeline'. No parameter
# overrides are passed, so the defaults baked into the pipeline apply.
my_run = client.run_pipeline(experiment.id, 
                             'bert-pipeline', 
                             'bert-pipeline.zip')

# Review Training Job

_Note:  The above training job may take 5-10 minutes.  Please be patient._

In the meantime, open the SageMaker Console to monitor the progress of your training job.

![SageMaker Training Job Console](img/sagemaker-training-job-console.png)

# Review the Endpoint
First, we need to get the endpoint name of our newly-deployed SageMaker Prediction Endpoint.

Open the AWS console, go to the SageMaker service, and find the endpoint name as shown in the following picture.

![download-pipeline](img/sm-endpoint.jpg)

# Make a Prediction

In [22]:
import boto3

# Create a session bound to the notebook's region, then derive the SageMaker
# runtime client (used to invoke the endpoint) from that session.
runtime_session = boto3.Session(region_name=region)
sm_runtime = runtime_session.client('sagemaker-runtime')

## _YOU MUST COPY/PASTE THE `endpoint_name` BEFORE CONTINUING_
Make sure to preserve the single quotes as shown below.

![](img/sm-endpoint-kubeflow.png)

In [23]:
# Name of the deployed SageMaker endpoint. This value is specific to one
# pipeline run and must be replaced with the endpoint name from your own run.
endpoint_name = 'Endpoint-20200926225117-50FH'

In [24]:
import json

# Raw review text, sent to the endpoint as UTF-8 encoded bytes.
review = "This is great!".encode('utf-8')

# NOTE(review): the body is plain text while the content type says JSON —
# presumably the endpoint's inference handler expects exactly this; confirm.
prediction_request = {
    'EndpointName': endpoint_name,
    'ContentType': 'application/json',
    'Body': review,
}
response = sm_runtime.invoke_endpoint(**prediction_request)

# The response body is a stream: read it once, decode it, then parse the JSON.
response_text = response['Body'].read().decode()
predicted_class = json.loads(response_text)
print('Predicted Star Rating: {}'.format(predicted_class))

Predicted Star Rating: [5]
