## AWS and Intel Hackathon: Model Training

### Install Python SDKs

In [1]:
import sys

In [2]:
# Install the pinned SageMaker Experiments SDK using the interpreter that runs this kernel.
!{sys.executable} -m pip install sagemaker-experiments==0.1.24

Collecting sagemaker-experiments==0.1.24
  Downloading sagemaker_experiments-0.1.24-py3-none-any.whl (36 kB)
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.24


### Install PyTorch

In [3]:
# Pinned local PyTorch stack (torch, torchvision) plus a pinned Pillow release.
!{sys.executable} -m pip install torch==1.1.0
!{sys.executable} -m pip install torchvision==0.3.0
!{sys.executable} -m pip install pillow==6.2.2
# NOTE(review): the two installs below are not version-pinned, so re-running this
# cell at a later date may pull different releases than the ones used originally.
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install torchsummary

Collecting torch==1.1.0
  Downloading torch-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (676.9 MB)
     |████████████████████████████████| 676.9 MB 1.9 kB/s             
Installing collected packages: torch
Successfully installed torch-1.1.0
Collecting torchvision==0.3.0
  Downloading torchvision-0.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.6 MB)
     |████████████████████████████████| 2.6 MB 27.7 MB/s            
Installing collected packages: torchvision
Successfully installed torchvision-0.3.0
Collecting pillow==6.2.2
  Downloading Pillow-6.2.2-cp36-cp36m-manylinux1_x86_64.whl (2.1 MB)
     |████████████████████████████████| 2.1 MB 26.0 MB/s            
[?25hInstalling collected packages: pillow
  Attempting uninstall: pillow
    Found existing installation: Pillow 8.4.0
    Uninstalling Pillow-8.4.0:
      Successfully uninstalled Pillow-8.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the

### Setup

In [4]:
import time

import boto3
import numpy as np
import pandas as pd
from IPython.display import set_matplotlib_formats, display
from matplotlib import pyplot as plt
from torchvision import datasets, transforms, models

import torch

import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

from tqdm.notebook import tqdm

from torchsummary import summary
import glob
from PIL import Image

import random

# Select the "retina" (high-resolution) figure format for inline matplotlib output.
set_matplotlib_formats("retina")

### Download the data

In [5]:
!mkdir data

### Original dataset
Run the two cells below to use the original dataset. It should give good results in training and testing, although the training process will take several minutes.

In [None]:
# Download the original (full) dataset archive into the current working directory.
!wget https://www.dropbox.com/s/zhljom0hth586p9/dataset_original.zip

In [None]:
# Store the archive under the common name data/dataset.zip used by the rest of the notebook.
!mv dataset_original.zip data/dataset.zip
# Extract into data/ (-q quiet, -u update existing files, -o overwrite without prompting).
!unzip -quo data/dataset.zip -d data/

# Local path of the zip archive; a later cell uploads this file to S3.
dataset_path = "./data/dataset.zip"

### Reduced Dataset
Run the two cells below to use the reduced dataset. It should provide worse results than the original but it will reduce the training process time.

In [None]:
# Download the reduced dataset archive into the current working directory.
!wget https://www.dropbox.com/s/evm0ts2obk7n3cb/dataset_reduced.zip

In [None]:
# Store the archive under the common name data/dataset.zip used by the rest of the notebook.
!mv dataset_reduced.zip data/dataset.zip
# Extract into data/ (-q quiet, -u update existing files, -o overwrite without prompting).
!unzip -quo data/dataset.zip -d data/

# Local path of the zip archive; a later cell uploads this file to S3.
dataset_path = "./data/dataset.zip"


### Dataset_for_tests
Run the two cells below to use a very reduced dataset. It is suitable for very fast tests, although it will yield very poor prediction results and training accuracy.

In [6]:
# Download the very small test dataset archive into the current working directory.
!wget https://www.dropbox.com/s/zivlm0skt19k3wh/dataset_for_tests.zip

--2022-04-21 09:14:21--  https://www.dropbox.com/s/zivlm0skt19k3wh/dataset_for_tests.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.64.18, 2620:100:6028:18::a27d:4712
Connecting to www.dropbox.com (www.dropbox.com)|162.125.64.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/zivlm0skt19k3wh/dataset_for_tests.zip [following]
--2022-04-21 09:14:22--  https://www.dropbox.com/s/raw/zivlm0skt19k3wh/dataset_for_tests.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf77a09d255ca6f19baace4fd86.dl.dropboxusercontent.com/cd/0/inline/BjzHv6defNXDCrBCkUC-YzD-DXK5hXZSrJWLDyZe8ASyqg4e_qaEqq2TdRev2BEDRRw0c_BZkB7Qx4UgWNjeuyYi02F2SE8H9xBnHkBvLeUnTX6gntR5rJ7lTT9LwuYDKFwODK-4RoapLRH8NqjU87MbwM8APAu_akLakNYC76Trqw/file# [following]
--2022-04-21 09:14:22--  https://ucf77a09d255ca6f19baace4fd86.dl.dropboxusercontent.com/cd/0/inline/BjzHv6defNXDCrBCkUC-YzD-DXK5hXZSrJWLDy

In [7]:
# Store the archive under the common name data/dataset.zip used by the rest of the notebook.
!mv dataset_for_tests.zip data/dataset.zip
# Extract into data/ (-q quiet, -u update existing files, -o overwrite without prompting).
!unzip -quo data/dataset.zip -d data/

# Local path of the zip archive; a later cell uploads this file to S3.
dataset_path = "./data/dataset.zip"

### Upload dataset to S3 as zip file

In [8]:
# One SageMaker session for the whole notebook; the boto3 session and the
# SageMaker API client are both taken from it.
sm_sess = sagemaker.Session()
sess, sm = sm_sess.boto_session, sm_sess.sagemaker_client

# Execution role that is later handed to the estimator and the hosted model.
role = get_execution_role()

In [9]:
# The bucket name embeds the session's region and the caller's account id.
account_id = sess.client("sts").get_caller_identity()["Account"]
bucket = "sagemaker-hackathon-demo-{}-{}".format(sess.region_name, account_id)
prefix = "hackathon"

# Best-effort creation: any failure is printed and the notebook carries on.
try:
    create_bucket_kwargs = {"Bucket": bucket}
    # A LocationConstraint is only sent for regions other than us-east-1.
    if sess.region_name != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": sess.region_name
        }
    sess.client("s3").create_bucket(**create_bucket_kwargs)
except Exception as e:
    print(e)

In [10]:
# Display the name of the S3 bucket used for the dataset upload.
bucket

'sagemaker-hackathon-demo-eu-west-1-017233837209'

In [11]:
# NOTE(review): s3_resource is not used by any cell visible in this notebook;
# it is kept in case something outside this view relies on it.
s3_resource = boto3.resource("s3", region_name=sess.region_name)

inputs = None

try:
    # Reuse the session created during setup rather than constructing a second one.
    inputs = sm_sess.upload_data(path=dataset_path, bucket=bucket, key_prefix=prefix)
    print("input spec: {}".format(inputs))
except Exception as exp:
    print("exp: ", exp)
    # The training cell needs `inputs`; stop here instead of letting a None
    # value surface later as an unrelated-looking error in estimator.fit().
    raise


input spec: s3://sagemaker-hackathon-demo-eu-west-1-017233837209/hackathon/dataset.zip


### Training

In [12]:
# Print model.py (the training entry point used by the estimator) with syntax highlighting.
!pygmentize model.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m

[37m#import sagemaker_containers[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36mdistributed[39;49;00m [34mas[39;49;00m [04m[36mdist[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36mnn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36mnn[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctional[39;49;00m [34mas[39;49;00m [04m[36mF[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[04m[36m.[39;49;00m[04m[36moptim[39;49;00m [34mas[39;49;00m [04m[36moptim[39;49;00m
[34mimport[39;49;00m [04m[36mt

In [13]:
from sagemaker.pytorch import PyTorch

# Hyperparameters forwarded to the model.py entry point.
training_hyperparameters = {"epochs": 6, "backend": "gloo"}

# Training job definition: model.py on two ml.m4.xlarge instances.
estimator = PyTorch(
    entry_point="model.py",
    role=role,
    py_version="py3",
    framework_version="1.4.0",
    instance_type="ml.m4.xlarge",
    instance_count=2,
    hyperparameters=training_hyperparameters,
)

In [14]:
# Start the training job; the uploaded dataset archive is passed as the "training" channel.
training_channels = {"training": inputs}
estimator.fit(training_channels)

2022-04-21 09:14:36 Starting - Starting the training job...
2022-04-21 09:15:02 Starting - Preparing the instances for trainingProfilerReport-1650532475: InProgress
.........
2022-04-21 09:16:23 Downloading - Downloading input data......
2022-04-21 09:17:19 Training - Downloading the training image...
2022-04-21 09:18:04 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2022-04-21 09:18:07,655 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[35m2022-04-21 09:18:07,658 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2022-04-21 09:18:07,674 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succee

In [16]:
# Display the S3 URI of the model artifact recorded on the estimator after training.
estimator.model_data

's3://sagemaker-eu-west-1-017233837209/pytorch-training-2022-04-21-09-14-35-467/output/model.tar.gz'

### Deployment

In [17]:
# S3 URI of the trained model artifact; the download cell that follows copies it locally.
estimator.model_data

's3://sagemaker-eu-west-1-017233837209/pytorch-training-2022-04-21-09-14-35-467/output/model.tar.gz'

First we create a folder to store the trained model, download the model.tar.gz file into it, and extract the archive.

In [18]:
%%sh -s $estimator.model_data
# $1 is the S3 URI of the trained model artifact, passed in from the notebook.
# -p keeps the cell re-runnable when the model/ directory already exists.
mkdir -p model
# Quote the URI so it is always passed to the CLI as a single argument.
aws s3 cp "$1" model/
tar xvzf model/model.tar.gz --directory ./model

download: s3://sagemaker-eu-west-1-017233837209/pytorch-training-2022-04-21-09-14-35-467/output/model.tar.gz to model/model.tar.gz
model.pth
model.pth


Convert your model into the TorchScript format using torch.jit.trace or torch.jit.script.

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN classifier: two 5x5 convolutions followed by two linear layers.

    Consumes 3-channel image batches and returns log-probabilities over 5
    classes. The flatten step reshapes to 320 features per row, which is
    20 channels * 4 * 4 for 28x28 inputs.

    The attribute names define the state-dict keys, so they must not change
    if previously saved weights are to be loaded.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 5)

    def forward(self, x):
        """Return per-class log-probabilities for the batch ``x``."""
        # Stage 1: convolution -> 2x2 max-pool -> ReLU.
        stage1 = self.conv1(x)
        stage1 = F.max_pool2d(stage1, 2)
        stage1 = F.relu(stage1)

        # Stage 2: convolution -> channel dropout -> 2x2 max-pool -> ReLU.
        stage2 = self.conv2(stage1)
        stage2 = self.conv2_drop(stage2)
        stage2 = F.max_pool2d(stage2, 2)
        stage2 = F.relu(stage2)

        # Classifier head on the flattened feature maps.
        features = stage2.view(-1, 320)
        hidden = F.relu(self.fc1(features))
        # Functional dropout follows the module's train/eval mode explicitly.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

# Map every tensor in the checkpoint onto the CPU so loading also works when
# the weights were saved from a GPU training job.
model_loaded = torch.load("model/model.pth", map_location="cpu")
model = Net().to("cpu")
# The network is wrapped in DataParallel before the weights are applied.
# NOTE(review): presumably the checkpoint keys carry the "module." prefix that
# this wrapper adds — confirm against how model.py saves the model.
model = torch.nn.DataParallel(model)
model.load_state_dict(model_loaded)
print("completed")

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [20]:
import subprocess

# Trace the network in eval mode with a random single-image batch (1 x 3 x 28 x 28).
trace_input = torch.rand(1, 3, 28, 28)
model.eval()
traced_model = torch.jit.trace(model, trace_input)

# Save the TorchScript module as model.pth, then pack it into the tarball used for deployment.
torch.jit.save(traced_model, "model.pth")
tar_command = ["tar", "-czvf", "traced_hackathon_model.tar.gz", "model.pth"]
subprocess.call(tar_command)

0

In [87]:
# Print deploy_ei.py (the inference entry point used by the hosted model) with syntax highlighting.
!pygmentize deploy_ei.py

[37m# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.[39;49;00m
[37m#[39;49;00m
[37m# Licensed under the Apache License, Version 2.0 (the "License"). You[39;49;00m
[37m# may not use this file except in compliance with the License. A copy of[39;49;00m
[37m# the License is located at[39;49;00m
[37m#[39;49;00m
[37m#     http://aws.amazon.com/apache2.0/[39;49;00m
[37m#[39;49;00m
[37m# or in the "license" file accompanying this file. This file is[39;49;00m
[37m# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF[39;49;00m
[37m# ANY KIND, either express or implied. See the License for the specific[39;49;00m
[37m# language governing permissions and limitations under the License.[39;49;00m
[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m absolute_import

[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00

In [88]:
from sagemaker.pytorch import PyTorchModel
from datetime import datetime

# Instance type that hosts the endpoint.
instance_type = "ml.m4.xlarge"

# Local tarball containing the TorchScript model; it is used directly as the model data.
tar_filename = "traced_hackathon_model.tar.gz"
model_data = tar_filename
# Alternative: upload the tarball to S3 first and use the returned URI instead, e.g.
#   model_data = sm_sess.upload_data(path=tar_filename, bucket=bucket, key_prefix=prefix)

# The name contains neither "." nor "_", so no character stripping is needed.
endpoint_name = "hackathon-ei-endpoint"

In [89]:
# Hosted-model definition: the traced tarball served through the deploy_ei.py script.
pytorch = PyTorchModel(
    entry_point="deploy_ei.py",
    model_data=model_data,
    framework_version="1.3.1",
    py_version="py3",
    role=role,
    sagemaker_session=sm_sess,
)

In [90]:
# Deploy the model to a real-time endpoint backed by a single instance.
# NOTE(review): no accelerator_type is passed here, so this call does not attach
# an Elastic Inference accelerator despite the "ei" in the endpoint/script names.

# wait=True is passed, so the call is expected to block until the endpoint has been created — confirm against the SDK docs.
predictor = pytorch.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    wait=True,
)


-------!

In [56]:
# Display the predictor object returned by deploy().
predictor

<sagemaker.pytorch.model.PyTorchPredictor at 0x7f0594676320>

### Access the endpoint from outside for real-time inference

In [91]:
import boto3

import json

import numpy as np

# Endpoint to call, and a boto3 runtime client built from a default session.
endpoint = "hackathon-ei-endpoint"
runtime = boto3.Session().client("sagemaker-runtime")

# Read the sample image as raw bytes; they are sent unchanged as the request body.
with open("plastic171.jpg", "rb") as image_file:
    payload = image_file.read()

print("type payload", type(payload))

type payload <class 'bytes'>


In [92]:
# Send image via InvokeEndpoint API
response = runtime.invoke_endpoint(EndpointName=endpoint, ContentType='application/x-image', Body=payload)
result = json.loads(response['Body'].read().decode())
result = np.array(result)
prediction = result.argmax(axis=1)[0]
print("class predicted", prediction + 1)


class predicted 5


### Delete endpoint

In [86]:
# Delete the endpoint behind the predictor once inference testing is finished.
predictor.delete_endpoint()