## AWS and Intel Hackathon: Model Training

### Install Python SDKs

In [1]:
import sys

In [2]:
# Invoke pip through the interpreter backing this kernel (sys.executable) so the
# package is installed into the environment the notebook actually runs in.
!{sys.executable} -m pip install sagemaker-experiments==0.1.24



### Install PyTorch

In [3]:
!{sys.executable} -m pip install torch==1.1.0
!{sys.executable} -m pip install torchvision==0.3.0
!{sys.executable} -m pip install pillow==6.2.2
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install torchsummary



### Setup

In [4]:
import time

import boto3
import numpy as np
import pandas as pd
from IPython.display import set_matplotlib_formats, display
from matplotlib import pyplot as plt
from torchvision import datasets, transforms, models

import torch

import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

from tqdm.notebook import tqdm

from torchsummary import summary
import glob
from PIL import Image

import random

set_matplotlib_formats("retina")

### Download the data

In [6]:
!mkdir data

#### Original dataset
Run the two cells below to use the original dataset. It should provide good results in training and test although the training process will take several minutes.

In [None]:
# Download the full dataset archive into the current working directory.
!wget https://www.dropbox.com/s/zhljom0hth586p9/dataset_original.zip

In [None]:
# Move the downloaded archive to the common path used by the rest of the notebook,
# then extract it next to the archive (-q quiet, -u update, -o overwrite without prompting).
!mv dataset_original.zip data/dataset.zip
!unzip -quo data/dataset.zip -d data/

# Local path of the archive; this is the file that gets uploaded to S3.
dataset_path = "./data/dataset.zip"

#### Reduced Dataset

Run the two cells below to use the reduced dataset. It should provide worse results than the original but it will reduce the training process time.

In [None]:
# Download the reduced dataset archive into the current working directory.
!wget https://www.dropbox.com/s/evm0ts2obk7n3cb/dataset_reduced.zip

In [None]:
# Move the downloaded archive to the common path used by the rest of the notebook,
# then extract it next to the archive (-q quiet, -u update, -o overwrite without prompting).
!mv dataset_reduced.zip data/dataset.zip
!unzip -quo data/dataset.zip -d data/

# Local path of the archive; this is the file that gets uploaded to S3.
dataset_path = "./data/dataset.zip"


#### Dataset for tests
Run the two cells below to use a very reduced dataset. It is suitable for very fast tests, although it will yield very poor predictions and training accuracy.

In [7]:
# Download the very small test dataset archive into the current working directory.
!wget https://www.dropbox.com/s/zivlm0skt19k3wh/dataset_for_tests.zip

--2022-04-19 13:57:13--  https://www.dropbox.com/s/zivlm0skt19k3wh/dataset_for_tests.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.69.18, 2620:100:6020:18::a27d:4012
Connecting to www.dropbox.com (www.dropbox.com)|162.125.69.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/zivlm0skt19k3wh/dataset_for_tests.zip [following]
--2022-04-19 13:57:14--  https://www.dropbox.com/s/raw/zivlm0skt19k3wh/dataset_for_tests.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uca0f546b6c536b85e92b8ee6432.dl.dropboxusercontent.com/cd/0/inline/Bjtu8ynqe0r98dTvb1oxEyN_aX12R7jWE9aUTa4bcoL5whNImOvdGceoYoywi28h-9uoYum1nYjmTLHCvqL9HVdbFeVKHs0fhV7nKt-gNLXwfItp1RgRlxN7Z3To4eRj26r7BAOJqmbhb5ouaA1I2bDATfwzy4b_Z-6r6WFpkWU2CA/file# [following]
--2022-04-19 13:57:14--  https://uca0f546b6c536b85e92b8ee6432.dl.dropboxusercontent.com/cd/0/inline/Bjtu8ynqe0r98dTvb1oxEyN_aX12R7jWE9aUTa

In [8]:
# Move the downloaded archive to the common path used by the rest of the notebook,
# then extract it next to the archive (-q quiet, -u update, -o overwrite without prompting).
!mv dataset_for_tests.zip data/dataset.zip
!unzip -quo data/dataset.zip -d data/

# Local path of the archive; this is the file that gets uploaded to S3.
dataset_path = "./data/dataset.zip"

### Upload dataset to S3 as zip file

In [9]:
# One SageMaker session for the notebook; the boto3 session and the low-level
# SageMaker client are taken from it so all AWS calls share the same configuration.
sm_sess = sagemaker.Session()
sess = sm_sess.boto_session
sm = sm_sess.sagemaker_client
# Execution role resolved by the SDK; passed to the estimator as `role`.
role = get_execution_role()

In [10]:
# The bucket name embeds the region and the caller's account id.
account_id = sess.client("sts").get_caller_identity()["Account"]
bucket = "sagemaker-hackathon-demo-{}-{}".format(sess.region_name, account_id)
prefix = "hackathon"

s3_client = sess.client("s3")

# us-east-1 is created without a CreateBucketConfiguration; every other region
# needs an explicit LocationConstraint.
create_bucket_kwargs = {"Bucket": bucket}
if sess.region_name != "us-east-1":
    create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": sess.region_name}

try:
    s3_client.create_bucket(**create_bucket_kwargs)
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    # Expected when the cell is re-run: the bucket is already ours, so reuse it.
    # Any other failure (permissions, name owned by another account, ...) is left
    # to propagate instead of being printed and ignored, because the upload and
    # training cells cannot succeed without a usable bucket.
    print("Bucket {} already exists and is owned by this account; reusing it.".format(bucket))

In [11]:
# Display the resolved bucket name.
bucket

'sagemaker-hackathon-demo-eu-west-1-017233837209'

In [12]:
s3_resource = boto3.resource("s3", region_name=sess.region_name)

# S3 URI of the uploaded archive; stays None only if the upload fails.
inputs = None

try:
    # Reuse the notebook's existing SageMaker session instead of building a new one.
    inputs = sm_sess.upload_data(path=dataset_path, bucket=bucket, key_prefix=prefix)
    print("input spec: {}".format(inputs))
except Exception as exp:
    print("exp: ", exp)
    # Re-raise: carrying on with inputs=None would only surface later as a
    # confusing failure when the training job is started.
    raise


input spec: s3://sagemaker-hackathon-demo-eu-west-1-017233837209/hackathon/dataset.zip


### Training

In [13]:
from sagemaker.pytorch import PyTorch, PyTorchModel

In [14]:
# Fail fast with a clear message if the upload cell did not produce an S3 URI,
# rather than letting the training job start with an empty channel.
if inputs is None:
    raise ValueError(
        "`inputs` is None: the dataset upload did not succeed, so there is nothing to train on."
    )

estimator = PyTorch(
    py_version="py3",
    entry_point="./model.py",
    role=role,
    # Reuse the session created during setup instead of constructing another one.
    sagemaker_session=sm_sess,
    framework_version="1.1.0",
    instance_count=1,
    instance_type="ml.c5.2xlarge",
    # Passed to the training script (model.py) as hyperparameters.
    hyperparameters={
        "epochs": 2,
        "backend": "gloo",
        "dropout": 0.2,
        "kernel_size": 5,
        "optimizer": "sgd",
    },
    # Regexes used to extract metrics from the training log lines.
    metric_definitions=[
        {"Name": "train:loss", "Regex": "Train Loss: (.*?);"},
        {"Name": "test:loss", "Regex": "Test Average loss: (.*?),"},
        {"Name": "test:accuracy", "Regex": "Test Accuracy: (.*?)%;"},
    ],
    enable_sagemaker_metrics=True,
)

# Timestamp suffix keeps the job name unique across re-runs.
cnn_training_job_name = "cnn-training-job-{}".format(int(time.time()))

# NOTE: Estimator.fit() returns None, so `classifier` is None after this call;
# the name is kept only for backward compatibility. Use `estimator` for any
# follow-up work (deploy, model data, job description).
classifier = estimator.fit(
    inputs={"training": inputs},
    job_name=cnn_training_job_name,
    wait=True,
)

# Short pause after the blocking fit() call; presumably gives the service a moment
# before later cells query the finished job. TODO confirm it is still needed.
time.sleep(2)

2022-04-19 13:57:45 Starting - Starting the training job...
2022-04-19 13:58:08 Starting - Preparing the instances for trainingProfilerReport-1650376665: InProgress
......
2022-04-19 13:59:08 Downloading - Downloading input data..
2022-04-19 13:59:28 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-04-19 13:59:29,922 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-04-19 13:59:29,924 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-04-19 13:59:29,935 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-04-19 13:59:29,936 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-04-19 13:59:30,154 sagemaker-containers INFO     Module model does not provide a s

NoneType