In [8]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import json
import tarfile
import os

In [9]:
def train_my_xgboost(train, code_files, script, hyperparameters=None, role=None, prefix=None, bucket=None, train_instance_type='ml.m5.xlarge'):
    """Package the training code, upload it to S3 and run a SageMaker training job.

    Args:
        train: Input passed to ``Estimator.fit`` as the ``"train"`` channel
            (the notebook passes an S3 URI).
        code_files: List of local source files to bundle into
            ``sourcedir.tar.gz``. If the first entry starts with ``s3://`` it
            is used directly as the already-uploaded code location.
        script: Entry-point file name. Used as the ``sagemaker_program``
            hyperparameter when the caller did not set one.
        hyperparameters: Optional dict of hyperparameters. It is copied, so the
            caller's dict is never modified. Every value is JSON-encoded before
            being handed to the estimator.
        role: IAM role ARN; defaults to ``get_execution_role()``.
        prefix: S3 key prefix for the uploaded code and base name of the
            training job.
        bucket: S3 bucket for the code upload; defaults to the session's
            default bucket.
        train_instance_type: Instance type used for the training job.

    Returns:
        The ``sagemaker.estimator.Estimator`` after ``fit`` has been called.

    Raises:
        ValueError: If ``code_files`` is empty.
    """
    # Local import: the notebook's import cell does not bring in ``tempfile``.
    import tempfile

    # Build a tar.gz archive from the given files.
    def create_tar_file(source_files, target=None):
        if target:
            filename = target
        else:
            fd, filename = tempfile.mkstemp()
            # Only the path is needed; tarfile reopens the file below.
            os.close(fd)

        with tarfile.open(filename, mode="w:gz") as t:
            for sf in source_files:
                # Every file is stored at the root of the archive.
                t.add(sf, arcname=os.path.basename(sf))
        return filename

    # JSON-encode every hyperparameter value.
    def json_encode_hyperparameters(hyperparameters):
        return {str(k): json.dumps(v) for (k, v) in hyperparameters.items()}

    if not code_files:
        raise ValueError("code_files must contain at least one path or S3 URI")

    # Work on a copy so neither the caller's dict nor a shared default is mutated.
    hyperparameters = dict(hyperparameters or {})

    sagemaker_session = sagemaker.session.Session()

    # Fall back to the session's default bucket.
    if not bucket:
        print('Using default bucket ', end='')
        bucket = sagemaker_session.default_bucket()
        print(bucket)

    if not code_files[0].startswith('s3://'):
        print('Uploading code to S3:', end='')
        # Pack the code files.
        create_tar_file(code_files, "sourcedir.tar.gz")
        # Upload the archive; tolerate prefix=None (the declared default).
        key_prefix = prefix + "/code" if prefix else "code"
        sources = sagemaker_session.upload_data("sourcedir.tar.gz", bucket, key_prefix)
        print(sources)
    else:
        # The code is already on S3: pass the URI itself, not the whole list.
        sources = code_files[0]

    # Record the S3 location of the code in the hyperparameters.
    hyperparameters['sagemaker_submit_directory'] = sources
    # Honour the ``script`` argument unless the caller already chose an entry point.
    if script:
        hyperparameters.setdefault('sagemaker_program', script)

    # Encode the hyperparameters.
    hyperparameters = json_encode_hyperparameters(
        hyperparameters
    )

    if not role:
        print('Getting default Role', end='')
        role = get_execution_role()
        print(role)

    # Estimator arguments, in order:
    # docker ECR image URI
    # role
    # number of training instances
    # instance type
    # training job name prefix
    # hyperparameters
    est = sagemaker.estimator.Estimator(
        '337058716437.dkr.ecr.ca-central-1.amazonaws.com/xgboost_001',
        role,
        train_instance_count=1,
        train_instance_type=train_instance_type,
        # train_instance_type="local",
        base_job_name=prefix,
        hyperparameters=hyperparameters,
    )

    # The channel name maps to a mount point: a channel named 666 would be
    # mounted at /opt/ml/input/data/666/ (per the original author's note).
    est.fit({"train": train})
    return est
    
    
# Inputs for the training run: S3 location of the training data, the IAM role,
# the local source file to package, and the job/S3 prefix.
train = 's3://sagemaker-ca-central-1-337058716437/script-mode-container-2/train/'
script = 'train.py'

role = 'arn:aws:iam::337058716437:role/SageMaker-Execution'
code_files = ["code/source_dir/train.py"]
# NOTE(review): `script` is assigned twice in this cell with the same value.
script = 'train.py'
prefix = 'test-mlops'

# Values are JSON-encoded by train_my_xgboost, so nested dicts are allowed.
hyperparameters = {
                     "sagemaker_program": "train.py",
                     "hp1": {'xgboost':'123',
                             'test':'ttt'
                            },
                     "hp2": 300,
                     "hp3": 0.001,
                   }
train_my_xgboost(train, code_files, script, hyperparameters=hyperparameters,
                 role=role,
                 prefix=prefix
                )

UnknownServiceError: Unknown service: 'sagemaker-featurestore-runtime'. Valid service names are: accessanalyzer, acm, acm-pca, alexaforbusiness, amplify, apigateway, apigatewaymanagementapi, apigatewayv2, appconfig, appflow, application-autoscaling, application-insights, appmesh, appstream, appsync, athena, autoscaling, autoscaling-plans, backup, batch, braket, budgets, ce, chime, cloud9, clouddirectory, cloudformation, cloudfront, cloudhsm, cloudhsmv2, cloudsearch, cloudsearchdomain, cloudtrail, cloudwatch, codeartifact, codebuild, codecommit, codedeploy, codeguru-reviewer, codeguruprofiler, codepipeline, codestar, codestar-connections, codestar-notifications, cognito-identity, cognito-idp, cognito-sync, comprehend, comprehendmedical, compute-optimizer, config, connect, connectparticipant, cur, dataexchange, datapipeline, datasync, dax, detective, devicefarm, directconnect, discovery, dlm, dms, docdb, ds, dynamodb, dynamodbstreams, ebs, ec2, ec2-instance-connect, ecr, ecs, efs, eks, elastic-inference, elasticache, elasticbeanstalk, elastictranscoder, elb, elbv2, emr, es, events, firehose, fms, forecast, forecastquery, frauddetector, fsx, gamelift, glacier, globalaccelerator, glue, greengrass, groundstation, guardduty, health, honeycode, iam, identitystore, imagebuilder, importexport, inspector, iot, iot-data, iot-jobs-data, iot1click-devices, iot1click-projects, iotanalytics, iotevents, iotevents-data, iotsecuretunneling, iotsitewise, iotthingsgraph, ivs, kafka, kendra, kinesis, kinesis-video-archived-media, kinesis-video-media, kinesis-video-signaling, kinesisanalytics, kinesisanalyticsv2, kinesisvideo, kms, lakeformation, lambda, lex-models, lex-runtime, license-manager, lightsail, logs, machinelearning, macie, macie2, managedblockchain, marketplace-catalog, marketplace-entitlement, marketplacecommerceanalytics, mediaconnect, mediaconvert, medialive, mediapackage, mediapackage-vod, mediastore, mediastore-data, mediatailor, meteringmarketplace, mgh, 
migrationhub-config, mobile, mq, mturk, neptune, networkmanager, opsworks, opsworkscm, organizations, outposts, personalize, personalize-events, personalize-runtime, pi, pinpoint, pinpoint-email, pinpoint-sms-voice, polly, pricing, qldb, qldb-session, quicksight, ram, rds, rds-data, redshift, redshift-data, rekognition, resource-groups, resourcegroupstaggingapi, robomaker, route53, route53domains, route53resolver, s3, s3control, s3outposts, sagemaker, sagemaker-a2i-runtime, sagemaker-runtime, savingsplans, schemas, sdb, secretsmanager, securityhub, serverlessrepo, service-quotas, servicecatalog, servicediscovery, ses, sesv2, shield, signer, sms, sms-voice, snowball, sns, sqs, ssm, sso, sso-admin, sso-oidc, stepfunctions, storagegateway, sts, support, swf, synthetics, textract, timestream-query, timestream-write, transcribe, transfer, translate, waf, waf-regional, wafv2, workdocs, worklink, workmail, workmailmessageflow, workspaces, xray

In [54]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import json
import tarfile
import os

def train_my_xgboost(train, code_files, script, hyperparameters=None, role=None, prefix=None, bucket=None, train_instance_type='ml.m5.xlarge'):
    """Package the training code, upload it to S3 and run a SageMaker training job.

    Args:
        train: Input passed to ``Estimator.fit`` as the ``"train"`` channel
            (the notebook passes an S3 URI).
        code_files: List of local source files to bundle into
            ``sourcedir.tar.gz``. If the first entry starts with ``s3://`` it
            is used directly as the already-uploaded code location.
        script: Entry-point file name. Used as the ``sagemaker_program``
            hyperparameter when the caller did not set one.
        hyperparameters: Optional dict of hyperparameters. It is copied, so the
            caller's dict is never modified. Every value is JSON-encoded before
            being handed to the estimator.
        role: IAM role ARN; defaults to ``get_execution_role()``.
        prefix: S3 key prefix for the uploaded code and base name of the
            training job.
        bucket: S3 bucket for the code upload; defaults to the session's
            default bucket.
        train_instance_type: Instance type used for the training job.

    Returns:
        The ``sagemaker.estimator.Estimator`` after ``fit`` has been called.

    Raises:
        ValueError: If ``code_files`` is empty.
    """
    # Local import: the notebook's import cell does not bring in ``tempfile``.
    import tempfile

    # Build a tar.gz archive from the given files.
    def create_tar_file(source_files, target=None):
        if target:
            filename = target
        else:
            fd, filename = tempfile.mkstemp()
            # Only the path is needed; tarfile reopens the file below.
            os.close(fd)

        with tarfile.open(filename, mode="w:gz") as t:
            for sf in source_files:
                # Every file is stored at the root of the archive.
                t.add(sf, arcname=os.path.basename(sf))
        return filename

    # JSON-encode every hyperparameter value.
    def json_encode_hyperparameters(hyperparameters):
        return {str(k): json.dumps(v) for (k, v) in hyperparameters.items()}

    if not code_files:
        raise ValueError("code_files must contain at least one path or S3 URI")

    # Work on a copy so neither the caller's dict nor a shared default is mutated.
    hyperparameters = dict(hyperparameters or {})

    sagemaker_session = sagemaker.session.Session()

    # Fall back to the session's default bucket.
    if not bucket:
        print('Using default bucket ', end='')
        bucket = sagemaker_session.default_bucket()
        print(bucket)

    if not code_files[0].startswith('s3://'):
        print('Uploading code to S3:', end='')
        # Pack the code files.
        create_tar_file(code_files, "sourcedir.tar.gz")
        # Upload the archive; tolerate prefix=None (the declared default).
        key_prefix = prefix + "/code" if prefix else "code"
        sources = sagemaker_session.upload_data("sourcedir.tar.gz", bucket, key_prefix)
        print(sources)
    else:
        # The code is already on S3: pass the URI itself, not the whole list.
        sources = code_files[0]

    # Record the S3 location of the code in the hyperparameters.
    hyperparameters['sagemaker_submit_directory'] = sources
    # Honour the ``script`` argument unless the caller already chose an entry point.
    if script:
        hyperparameters.setdefault('sagemaker_program', script)

    # Encode the hyperparameters.
    hyperparameters = json_encode_hyperparameters(
        hyperparameters
    )

    if not role:
        print('Getting default Role', end='')
        role = get_execution_role()
        print(role)

    # Estimator arguments, in order:
    # docker ECR image URI
    # role
    # number of training instances
    # instance type
    # training job name prefix
    # hyperparameters
    est = sagemaker.estimator.Estimator(
        '337058716437.dkr.ecr.ca-central-1.amazonaws.com/xgboost_001',
        role,
        train_instance_count=1,
        train_instance_type=train_instance_type,
        # train_instance_type="local",
        base_job_name=prefix,
        hyperparameters=hyperparameters,
    )

    # The channel name maps to a mount point: a channel named 666 would be
    # mounted at /opt/ml/input/data/666/ (per the original author's note).
    est.fit({"train": train})
    return est
    
# Inputs for the training run: S3 location of the training data, the local
# source file to package, and the job/S3 prefix. No role is passed, so
# train_my_xgboost falls back to get_execution_role().
train = 's3://ca-central-sagemaker-test/iris-data/'
script = 'train.py'

# role = 'arn:aws:iam::337058716437:role/SageMaker-Execution'
code_files = ["code/train.py"]
# NOTE(review): `script` is assigned twice in this cell with the same value.
script = 'train.py'
prefix = 'test-mlops'

# Values are JSON-encoded by train_my_xgboost, so nested dicts are allowed.
hyperparameters = {
                     "sagemaker_program": script,
                     "hp1": {'xgboost':'123',
                             'test':'ttt'
                            },
                     "hp2": 300,
                     "hp3": 0.001,
                   }

# Keep the returned estimator; a later cell reads est.model_data.
est = train_my_xgboost(train, code_files, script, hyperparameters=hyperparameters,
                 # role=role,
                 prefix=prefix
                )

print("Training Finished!")

Using default bucket sagemaker-ca-central-1-337058716437
Uploading code to S3:s3://sagemaker-ca-central-1-337058716437/test-mlops/code/sourcedir.tar.gz
Getting default Rolearn:aws:iam::337058716437:role/SageMaker-Execution
2021-08-12 06:13:34 Starting - Starting the training job...
2021-08-12 06:13:36 Starting - Launching requested ML instances......
2021-08-12 06:14:47 Starting - Preparing the instances for training......
2021-08-12 06:15:51 Downloading - Downloading input data
2021-08-12 06:15:51 Training - Downloading the training image.....[34m2021-08-12 02:16:37,563 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-12 02:16:43,807 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-12 02:16:43,832 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-12 02:16:43,840 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraini

In [58]:
# Display where the training job stored its model artifact.
est.model_data

's3://sagemaker-ca-central-1-337058716437/test-mlops-2021-08-12-06-13-33-859/output/model.tar.gz'

In [33]:
from sklearn.datasets import load_iris
import pandas as pd

In [31]:
# NOTE(review): rebinds `train`, which the training cells use for the S3
# training-data URI; from here on it holds the scikit-learn iris dataset.
train = load_iris()

In [35]:
# Feature matrix as a DataFrame (no column names are supplied).
df = pd.DataFrame(train['data'])

In [38]:
# Append the labels; this mutates `df` in place, so every later use of `df`
# sees the extra 'target' column.
df['target'] = train.target

In [52]:
# Write features plus target to iris.csv without the index column.
df.to_csv("iris.csv", index=None)

In [41]:
import xgboost as xgb

# Fit a local multi-class classifier on every column except 'target'.
clf = xgb.XGBClassifier(objective='multi:softprob')
clf.fit(df.drop('target', axis=1), df['target'])

In [59]:
# Save the locally trained classifier to the file 'testmodel'.
clf.save_model('testmodel')

In [60]:
!ls

code	  iris.csv	    sourcedir.tar.gz  training.py
data	  model.pkl	    testmodel	      Untitled1.ipynb
irid.csv  requirements.txt  test.py	      Untitled.ipynb


In [None]:
# Write a placeholder model file. The context manager closes the handle even
# if the write fails.
# NOTE(review): the two literals concatenate to 'xx.pklmodel.dummy' — confirm
# that this file name is intended.
with open('xx.pkl' + "model.dummy", "w") as model_file:
    model_file.write("Dummy model.")


In [50]:
import pickle
# Serialize the locally trained classifier to model.pkl.
with open('model.pkl', 'wb') as fid:
    pickle.dump(clf, fid)

In [66]:
%%time

import os
import boto3
import re
import json
import sagemaker
from sagemaker import get_execution_role

# Region of the default boto3 session.
region = boto3.Session().region_name

# Execution role; reused as ExecutionRoleArn when the model is created.
role = get_execution_role()

# Default bucket of the SageMaker session.
bucket = sagemaker.Session().default_bucket()

CPU times: user 171 ms, sys: 15.9 ms, total: 186 ms
Wall time: 242 ms


In [64]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Image URI for the "xgboost" algorithm, version "0.90-2", in the session's region.
container = get_image_uri(boto3.Session().region_name, "xgboost", "0.90-2")

# Used as the prefix of the model name in the create_model cell; the
# create_endpoint cell later reassigns endpoint_name.
endpoint_name = 'test'

In [68]:
%%time
from time import gmtime, strftime

# Timestamped model name so repeated runs do not reuse the same name.
model_name = endpoint_name + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# NOTE(review): this hard-coded URI is the same value the notebook shows for
# est.model_data; it goes stale after a new training run.
model_url = 's3://sagemaker-ca-central-1-337058716437/test-mlops-2021-08-12-06-13-33-859/output/model.tar.gz'
sm_client = boto3.client("sagemaker")

print(model_url)

# Serving image plus the model artifact it should load.
primary_container = {
    "Image": container,
    "ModelDataUrl": model_url,
}

create_model_response2 = sm_client.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response2["ModelArn"])

s3://sagemaker-ca-central-1-337058716437/test-mlops-2021-08-12-06-13-33-859/output/model.tar.gz
arn:aws:sagemaker:ca-central-1:337058716437:model/test2021-08-12-06-25-18
CPU times: user 54.2 ms, sys: 3.79 ms, total: 58 ms
Wall time: 419 ms


In [70]:
from time import gmtime, strftime

# Timestamped endpoint-config name.
endpoint_config_name = "DEMO-CICD-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_config_name)
# One production variant: a single ml.m4.xlarge instance serving `model_name`.
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "InitialVariantWeight": 1,
            "ModelName": model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-CICD-2021-08-12-06-25-52
Endpoint Config Arn: arn:aws:sagemaker:ca-central-1:337058716437:endpoint-config/demo-cicd-2021-08-12-06-25-52


In [71]:
import time

# Reassigns endpoint_name (previously 'test') to a timestamped endpoint name.
endpoint_name = "DEMO-CICD-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(endpoint_name)
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(create_endpoint_response["EndpointArn"])

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

# Poll once a minute while the endpoint is still "Creating". Any other status
# ends the loop, and there is no upper bound on the wait.
while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

DEMO-CICD-2021-08-12-06-26-08
arn:aws:sagemaker:ca-central-1:337058716437:endpoint/demo-cicd-2021-08-12-06-26-08
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:ca-central-1:337058716437:endpoint/demo-cicd-2021-08-12-06-26-08
Status: InService


In [72]:
# Client used by the invoke_endpoint cell.
# NOTE(review): the multi-model cell creates its client as 'sagemaker-runtime';
# consider using the same service name here.
runtime_client = boto3.client("runtime.sagemaker")

In [75]:
import numpy as np

# Build a single-row feature matrix from row 1. The 'target' column is the
# label, not a feature (the local classifier is fitted on
# df.drop('target', axis=1)), so it is dropped before the row is saved and
# sent for inference.
point_X = df.drop('target', axis=1).iloc[1]
point_X = np.expand_dims(point_X, axis=0)

In [77]:
# Label of the same row (position 1) that point_X is taken from.
point_y = df['target'].iloc[1]

In [78]:
# Write the single test row as comma-separated text; the invoke cell reads it back.
np.savetxt("test_point.csv", point_X, delimiter=",")

In [80]:
import json


file_name = (
    "test_point.csv"  # the CSV written by the np.savetxt cell
)

# Read the file as text and strip surrounding whitespace.
with open(file_name, "r") as f:
    payload = f.read().strip()

# Send the row to the endpoint as text/csv.
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="text/csv", Body=payload
)
# The response body is decoded as ASCII text.
result = response["Body"].read().decode("ascii")
print("Predicted Class Probabilities: {}.".format(result))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from model with message "Unable to load model: Model at /opt/ml/model cannot be loaded:
Can't get attribute 'XGBoostLabelEncoder' on <module 'xgboost.compat' from '/miniconda3/lib/python3.6/site-packages/xgboost/compat.py'>
[06:36:51] /workspace/src/learner.cc:349: Check failed: fi->Read(&name_obj_[0], len) == len (94544 vs. 126635668) : BoostLearner: wrong model format
Stack trace:
  [bt] (0) /miniconda3/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x24) [0x7f23ef4d7cb4]
  [bt] (1) /miniconda3/xgboost/libxgboost.so(xgboost::LearnerImpl::Load(dmlc::Stream*)+0x688) [0x7f23ef573b58]
  [bt] (2) /miniconda3/xgboost/libxgboost.so(XGBoosterLoadModel+0x37) [0x7f23ef4cc417]
  [bt] (3) /miniconda3/lib/python3.6/lib-dynload/../../libffi.so.7(+0x69dd) [0x7f241eb459dd]
  [bt] (4) /miniconda3/lib/python3.6/lib-dynload/../../libffi.so.7(+0x6067) [0x7f241eb45067]
  [bt] (5) /miniconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f241eb5dd0e]
  [bt] (6) /miniconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x13745) [0x7f241eb5e745]
  [bt] (7) /miniconda3/bin/python3(_PyObject_FastCallDict+0x8b) [0x55c5eec240bb]
  [bt] (8) /miniconda3/bin/python3(+0x199c4e) [0x55c5eecabc4e]

". See https://ca-central-1.console.aws.amazon.com/cloudwatch/home?region=ca-central-1#logEventViewer:group=/aws/sagemaker/Endpoints/DEMO-CICD-2021-08-12-06-26-08 in account 337058716437 for more information.

In [2]:
import sagemaker
import subprocess
import sys
import random
import math
import pandas as pd
import os
import boto3
import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sagemaker.pytorch import PyTorch
from sagemaker.xgboost import XGBoost
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.serializers import NumpySerializer, JSONSerializer, CSVSerializer
from sagemaker.deserializers import NumpyDeserializer, JSONDeserializer
from sagemaker.predictor import Predictor
# from generate_synthetic_housing_data import *

'1.50.18.post0'

In [3]:
import boto3
import base64
import docker

def build_docker(tag, path='.'):
    """Build a Docker image and echo the build log.

    Args:
        tag: Tag applied to the built image.
        path: Build context directory handed to the Docker client.

    Returns:
        The image object returned by ``images.build``.
    """
    banner = '*' * 30
    print(banner)
    print('Start building')
    built_image, log_entries = docker.from_env().images.build(
        path=path, tag=tag, rm=True)
    # Only log entries that carry a 'stream' key are echoed.
    for entry in log_entries:
        if 'stream' not in entry:
            continue
        print(entry['stream'], end='')
    return built_image

def push_to_ecr(image, ecr_repo_name):
    """Tag a locally built image for ECR and push it.

    The ECR repository is created when missing. Any other failure to create it
    is reported as a warning and the push is still attempted.

    Args:
        image: Image object exposing ``tag(repository, tag=...)``.
        ecr_repo_name: Name of the ECR repository (without the registry host).

    Returns:
        The full repository name, ``<registry host>/<ecr_repo_name>``.
    """
    print('*' * 30)
    print('Start pushing')
    # One ECR client serves both the auth token and the repository creation.
    ecr_client = boto3.Session().client('ecr')
    resp = ecr_client.get_authorization_token()
    auth_data = resp['authorizationData'][0]
    token = base64.b64decode(auth_data['authorizationToken']).decode()
    # Split on the first ':' only so a password containing ':' stays intact.
    username, password = token.split(':', 1)
    auth_config = {'username': username, 'password': password}

    ecr_url = auth_data['proxyEndpoint']

    client = docker.from_env()

    try:
        ecr_client.create_repository(
            repositoryName=ecr_repo_name,
        )
        print('[Info]Repository {} created'.format(ecr_repo_name))
    except ecr_client.exceptions.RepositoryAlreadyExistsException:
        print('[Info]Repository {} existed'.format(ecr_repo_name))
    except Exception as exc:
        # Stay best-effort, but report the real cause instead of claiming the
        # repository exists; KeyboardInterrupt is no longer swallowed.
        print('[Warn]Could not create repository {}: {}'.format(ecr_repo_name, exc))

    # Fully qualified repository name: registry host without the scheme + repo.
    ecr_repo_name = '{}/{}'.format(
        ecr_url.replace('https://', ''), ecr_repo_name)
    print(ecr_repo_name)

    image.tag(ecr_repo_name, tag='latest')

    push_log = client.images.push(ecr_repo_name, auth_config=auth_config)
    # Strip JSON punctuation from the push log to make it easier to read.
    print(push_log.replace('"status":"', '').replace('{', '').replace('}', '').replace(']', '').replace('"', ''))
    return ecr_repo_name

def build_and_push(tag, dockerfile_path, ecr_repo_name):
    """Build the image found at ``dockerfile_path`` and push it to ECR.

    Returns:
        The full repository name returned by ``push_to_ecr``.
    """
    built = build_docker(tag, dockerfile_path)
    # Blank lines separate the build log from the push log.
    print('\n\n\n')
    return push_to_ecr(built, ecr_repo_name)



In [5]:
# from docker_utils import build_and_push

# Build the multi-model serving image and push it to the 'xgboost-multi' ECR repo.
# NOTE(review): the build context is a hard-coded absolute path.
image = build_and_push(tag='xgboost-multi', dockerfile_path='/home/ec2-user/SageMaker/make_multi-endpoint/container', ecr_repo_name='xgboost-multi')

******************************
Start building
Step 1/13 : FROM ubuntu:18.04
 ---> 39a8cfeef173
Step 2/13 : LABEL com.amazonaws.sagemaker.capabilities.multi-models=true
 ---> Using cache
 ---> 66c8dc55527e
Step 3/13 : LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
 ---> Using cache
 ---> 4a1f3ce3f717
Step 4/13 : RUN apt-get update &&     apt-get -y install --no-install-recommends     build-essential     ca-certificates     openjdk-8-jdk-headless     python3-dev     curl     vim     && rm -rf /var/lib/apt/lists/*     && curl -O https://bootstrap.pypa.io/get-pip.py     && python3 get-pip.py
 ---> Using cache
 ---> 5c8f66d263ab
Step 5/13 : RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
 ---> Using cache
 ---> 15df7e7db6d3
Step 6/13 : RUN update-alternatives --install /usr/local/bin/pip pip /usr/local/bin/pip3 1
 ---> Using cache
 ---> 4f10dba8efdd
Step 7/13 : RUN pip3 --no-cache-dir install multi-model-server                                

In [None]:
from time import gmtime, strftime
import boto3
import os
import tarfile

# Endpoint name (timestamped)
endpoint_name = 'test-mlops2' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# S3 prefix that holds the models; note that it must end with a trailing /
model_url = 's3://sagemaker-ca-central-1-337058716437/test-mlops-2021-08-12-06-13-33-859/output/'
container = '337058716437.dkr.ecr.ca-central-1.amazonaws.com/xgboost-multi'
role = 'arn:aws:iam::337058716437:role/SageMaker-Execution'

sm_client = boto3.client(service_name='sagemaker')
runtime_sm_client = boto3.client(service_name='sagemaker-runtime')

# NOTE(review): `container` is rebound from the image URI string to the
# container definition dict passed to create_model.
container = {
    'Image': container,
    'ModelDataUrl': model_url,
    'Mode': 'MultiModel'
}

# The model, the endpoint config and the endpoint all share one name.
model_name = endpoint_name

create_model_response = sm_client.create_model(
    ModelName = model_name,
    ExecutionRoleArn = role,
    Containers = [container])

endpoint_config_name = model_name
print('Endpoint config name: ' + endpoint_config_name)

# One production variant: two ml.m5.large instances serving `model_name`.
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'InstanceType': 'ml.m5.large',
        'InitialInstanceCount': 2,
        'InitialVariantWeight': 1,
        'ModelName': model_name,
        'VariantName': 'AllTraffic'}])

print('Endpoint name: ' + endpoint_name)

create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name)

print('Endpoint Arn: ' + create_endpoint_response['EndpointArn'])


resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp['EndpointStatus']
print("Endpoint Status: " + status)

# Wait with the 'endpoint_in_service' waiter rather than a manual polling loop.
print('Waiting for {} endpoint to be in service...'.format(endpoint_name))
waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

Endpoint config name: test-mlops22021-08-12-08-43-49
Endpoint name: test-mlops22021-08-12-08-43-49
Endpoint Arn: arn:aws:sagemaker:ca-central-1:337058716437:endpoint/test-mlops22021-08-12-08-43-49
Endpoint Status: Creating
Waiting for test-mlops22021-08-12-08-43-49 endpoint to be in service...
