# Using External Datasets and Deploying Models
Here we will use **hyperparameter optimization** to train a model, but the **data we will use will be present in an S3 bucket**. We will **also deploy the trained model, query it and get the result.**

First we need to import the things we need. We will be using the MNIST dataset, but we will be uploading it to an S3 bucket.

## `mnist.py`
<details>
  <summary> Click here to see the full script code </summary>
   
```python
import argparse
import json
import logging
import os
import sys


import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


def _get_train_data_loader(batch_size, training_dir):
    logger.info("Get train data loader")
    dataset = datasets.MNIST(
        training_dir,
        train=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True
    )


def _get_test_data_loader(test_batch_size, training_dir):
    logger.info("Get test data loader")
    return torch.utils.data.DataLoader(
        datasets.MNIST(
            training_dir,
            train=False,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=test_batch_size,
        shuffle=True,
    )

def train(args):
    train_loader = _get_train_data_loader(args.batch_size, args.data_dir)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir)

    model = Net()

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
        test(model, test_loader)
    save_model(model, args.model_dir)


def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += F.nll_loss(output, target, size_average=False).item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.info(
        "Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def model_fn(model_dir):
    model = Net()
    with open(os.path.join(model_dir, "model.pth"), "rb") as f:
        model.load_state_dict(torch.load(f))
    return model


def save_model(model, model_dir):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, "model.pth")
    torch.save(model.cpu().state_dict(), path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=5,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)"
    )
    parser.add_argument(
        "--momentum", type=float, default=0.5, metavar="M", help="SGD momentum (default: 0.5)"
    )

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    train(parser.parse_args())
```

</details>

In [2]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.135.1.post0.tar.gz (674 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m674.4/674.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting importlib-metadata<5.0,>=1.4.0
  Using cached importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.135.1.post0-py2.py3-none-any.whl size=911947 sha256=ade89c861d6358f9a841856f5b3c544e95e8319864cb0eda84971b6bcf9943f5
  Stored in directory: /root/.cache/pip/wheels/a7/27/f2/5548fa5c8150562b57808525fe8b695493c51edf625277ea62
Successfully built sagemaker
Installing collected packages: importlib-metadata, sagemaker
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 6.0.0
    Uninstalling importlib-metadata-6.0.0:
      Succe

In [5]:
!pip install torchvision
import torchvision
torchvision.__version__

Collecting torchvision
  Downloading torchvision-0.14.1-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch==1.13.1
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m775.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m00:

'0.14.1+cu117'

### For Hyperparameter Tuning Jobs

In [6]:
import sagemaker
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
prefix = "sagemaker/DEMO-pytorch-mnist"   # Where dataset would be stored in S3 bucket

role = sagemaker.get_execution_role()

## Fetching Data and Uploading it to S3
We will fetch the MNIST dataset from Pytorch. To upload it to an S3 bucket we will use the `sagemaker_session` object. We will need to specify the bucket name as well as a prefix (folder) in which to upload our data. This is important because we will use it later when submitting our job for training.

In [7]:
from torchvision.datasets import MNIST
from torchvision import transforms

local_dir = 'data'
# The 'MNIST.mirrors' attribute is set to a URL that points to a copy of the dataset stored on Amazon S3 in the SageMaker Sample Files bucket
MNIST.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/"]

MNIST(local_dir,
      download=True,
      transform=transforms.Compose([transforms.ToTensor(), 
                                    transforms.Normalize((0.1307,), (0.3081,))])
     )

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

#### Upload the data to S3 and store it in 'inputS'

In [8]:
inputs = sagemaker_session.upload_data(path="data", bucket=bucket, key_prefix=prefix)
print("input spec (in this case, just an S3 path): {}".format(inputs))

input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-293789295245/sagemaker/DEMO-pytorch-mnist


## Hyperparameter Tuning and Deploying
We will create our Pytorch estimator and `HyperparameterTuner` object like before, but this time we will need to specify the the path to our training data when calling fit. We will specify it in a dictionary where the key will be the training channel.

### Creating an estimator to be trained

In [9]:
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="mnist.py",
    role=role,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

### Hyperparameter Ranges

In [11]:
hyperparameter_ranges = {
    "lr": ContinuousParameter(0.001, 0.1),  # Range within an inclusive
    "batch-size": CategoricalParameter([32, 64, 128, 256, 512]),
}

### Metric Optimization

In [12]:
objective_metric_name = "average test loss"   # Name of the objective metric
objective_type = "Minimize"                   # Type of objective for optimization: Minimize or Maximize
metric_definitions = [{"Name": "average test loss", "Regex": "Test set: Average loss: ([0-9\\.]+)"}]  # Use Regex based metric definition as in mnist.py

### HPO: Tuner

In [13]:
tuner = HyperparameterTuner(
    estimator,                        # Pass the estimator to the tuner
    objective_metric_name,            # Pass the objective metric name
    hyperparameter_ranges,            # Hyperparameter Ranges
    metric_definitions,               # Metric definitions
    max_jobs=4,                       # Max number of jobs with hyperparameter combinations to be trained
    max_parallel_jobs=2,              # No. of jobs to be performed using parallelism
    objective_type=objective_type,    # Type of objective to be achieved
)

### Fit the tuner to generate the Hyperparameter Tuning Job models along with the best estimator

In [14]:
tuner.fit({"training": inputs})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...................................................................................................!


### Deploy model

Deploying the model is simple. We need to call the `deploy` method with the instance type and the number of instances. <br>**By default, the best trained model will be deployed.**

In [15]:
predictor = tuner.deploy(initial_instance_count=1, instance_type="ml.t2.medium")


2023-03-02 08:46:18 Starting - Found matching resource for reuse
2023-03-02 08:46:18 Downloading - Downloading input data
2023-03-02 08:46:18 Training - Training image download completed. Training in progress.
2023-03-02 08:46:18 Uploading - Uploading generated training model
2023-03-02 08:46:18 Completed - Resource retained for reuse
--------!

## Query Model
We can now use this predictor to classify hand-written digits. 

We will read a random image and call the `predict` method of our `predictor` with the input image. We can then parse the result for the answer.

In [16]:
import gzip 
import numpy as np
import random
import os

data_dir = 'data/MNIST/raw'
with gzip.open(os.path.join(data_dir, "t10k-images-idx3-ubyte.gz"), "rb") as f:
    images = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28, 28).astype(np.float32)

mask = random.sample(range(len(images)), 16) # randomly select some of the test images
mask = np.array(mask, dtype=np.int)
data = images[mask]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # This is added back by InteractiveShellApp.init_path()


In [17]:
response = predictor.predict(np.expand_dims(data, axis=1))
print("Raw prediction result:")
print(response)
print()

labeled_predictions = list(zip(range(10), response[0]))
print("Labeled predictions: ")
print(labeled_predictions)
print()

labeled_predictions.sort(key=lambda label_and_prob: 1.0 - label_and_prob[1])
print("Most likely answer: {}".format(labeled_predictions[0]))

Raw prediction result:
[[-1681.63000488 -1932.20776367 -1670.44299316 -1313.7434082
   -937.14550781 -1353.66992188 -2189.12182617  -939.8026123
   -944.21130371     0.        ]
 [-1144.87780762 -1185.35852051     0.          -999.53070068
  -2257.35449219 -2794.6484375  -2813.71972656  -850.98846436
  -1141.57678223 -2004.98278809]
 [-1772.26513672 -2107.78271484 -2111.03979492 -1177.6081543
  -1670.1862793      0.         -1099.08740234 -2280.99316406
  -1161.20910645  -939.92712402]
 [-1340.52685547 -1636.41259766 -1139.63830566 -1095.74707031
  -1695.48620605 -1599.33325195 -2577.59277344     0.
  -1462.16101074  -817.81286621]
 [ -854.70410156 -1239.40270996  -990.65203857 -1023.97900391
  -1204.18115234 -1218.88342285 -1837.94799805     0.
  -1054.31152344  -429.64361572]
 [-1880.3215332  -1551.77172852 -1550.35253906 -1792.82287598
      0.         -1459.06933594 -1292.09875488 -1511.66357422
  -1449.56860352 -1120.17883301]
 [    0.         -2545.34179688 -1627.01672363 -2117.5

### Cleanup

After you have finished with this example, remember to delete the prediction endpoint to release the instance(s) associated with it

In [20]:
tuner.delete_endpoint()