# Using External Datasets and Deploying Models
Here we will use hyperparameter optimization to train a model, but the data we will use will be present in an S3 bucket. We will also deploy the trained model, query it and get the result.

First we need to import the things we need. We will be using the MNIST dataset, but we will be uploading it to an S3 bucket.

## `mnist.py`
<details>
  <summary> Click here to see the full script code </summary>
   
```python
import argparse
import json
import logging
import os
import sys


import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


def _get_train_data_loader(batch_size, training_dir):
    logger.info("Get train data loader")
    dataset = datasets.MNIST(
        training_dir,
        train=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True
    )


def _get_test_data_loader(test_batch_size, training_dir):
    logger.info("Get test data loader")
    return torch.utils.data.DataLoader(
        datasets.MNIST(
            training_dir,
            train=False,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        ),
        batch_size=test_batch_size,
        shuffle=True,
    )

def train(args):
    train_loader = _get_train_data_loader(args.batch_size, args.data_dir)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir)

    model = Net()

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
        test(model, test_loader)
    save_model(model, args.model_dir)


def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += F.nll_loss(output, target, size_average=False).item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.info(
        "Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def model_fn(model_dir):
    model = Net()
    with open(os.path.join(model_dir, "model.pth"), "rb") as f:
        model.load_state_dict(torch.load(f))
    return model


def save_model(model, model_dir):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, "model.pth")
    torch.save(model.cpu().state_dict(), path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument(
        "--batch-size",
        type=int,
        default=64,
        metavar="N",
        help="input batch size for training (default: 64)",
    )
    parser.add_argument(
        "--test-batch-size",
        type=int,
        default=1000,
        metavar="N",
        help="input batch size for testing (default: 1000)",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=5,
        metavar="N",
        help="number of epochs to train (default: 10)",
    )
    parser.add_argument(
        "--lr", type=float, default=0.01, metavar="LR", help="learning rate (default: 0.01)"
    )
    parser.add_argument(
        "--momentum", type=float, default=0.5, metavar="M", help="SGD momentum (default: 0.5)"
    )

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    train(parser.parse_args())
```

</details>

In [1]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.229.0-py3-none-any.whl.metadata (4.1 kB)
Collecting boto3<2.0,>=1.34.142 (from sagemaker)
  Downloading boto3-1.35.0-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.36.0,>=1.35.0 (from boto3<2.0,>=1.34.142->sagemaker)
  Downloading botocore-1.35.0-py3-none-any.whl.metadata (5.7 kB)
Downloading sagemaker-2.229.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.0-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.0-py3-none-any.whl (12.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: botocore, boto3, sagemaker
  Attempting uninstall: botocore
    Found exist

## Import libs and prepare environment

In [3]:
import sagemaker
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket = session.default_bucket()
prefix = "hpo_deploy"


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Fetching Data and Uploading it to S3
We will fetch the MNIST dataset from Pytorch. To upload it to an S3 bucket we will use the `session` object. We will need to specify the bucket name as well as a prefix (folder) in which to upload our data. This is important because we will use it later when submitting our job for training.

In [4]:
from torchvision.datasets import MNIST
from torchvision import transforms

# Download training data
local_dir = "data"
MNIST.mirrors = ["https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/"]

# Instance a transformation for our training data
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

MNIST(local_dir, download=True, transform=train_transform)

Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/train-images-idx3-ubyte.gz
Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 192263847.82it/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/train-labels-idx1-ubyte.gz
Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 6023654.59it/s]

Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz





Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 94554310.75it/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz
Downloading https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/MNIST/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 2136907.32it/s]


Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )

In [10]:
print(torchvision.__version__)

NameError: name 'torchvision' is not defined

In [6]:
# Upload data to S3
inputs = session.upload_data(path="data", bucket=bucket, key_prefix=prefix)
print("Input spec (in this case, just an S3 path): {}".format(inputs))

Input spec (in this case, just an S3 path): s3://sagemaker-us-east-1-533701990481/hpo_deploy


## Hyperparameter Tuning
We will create our Pytorch estimator and `HyperparameterTuner` object like before, but this time we will need to specify the the path to our training data when calling fit. We will specify it in a dictionary where the key will be the training channel.

### Instance the estimator

In [7]:
from sagemaker.pytorch import PyTorch

# Create the pytorch estimator using our training script
estimator = PyTorch(
    entry_point="scripts/mnist.py",
    role=role,
    py_version='py36',
    framework_version="1.8",
    instance_count=1,
    instance_type="ml.m5.large"
)

### Configure the hyperparameter tuner

In [8]:
# Set the hyperparameters to try
hyperparameter_ranges = {
    "lr": ContinuousParameter(0.001, 0.1),
    "batch-size": CategoricalParameter([32, 64, 128, 256, 512]), #ASK WHY IT IS CATEGORICAL
}

objective_metric_name = "Average test loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "Average test loss",
                        "Regex": "Test set: Average loss: ([0-9\\.]+)"}] #Where from the logs SM will get the value

# Instance hyperparameter tuner
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions,
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type=objective_type,
)

In [9]:
# Start the training job
tuner.fit({"training": inputs})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................................................................................!


### Deploy the best model

Deploying the model is simple. We need to call the `deploy` method with the instance type and the number of instances. By default, the best trained model will be deployed.

In [11]:
predictor = tuner.deploy(initial_instance_count=1,
                        instance_type="ml.t2.medium")


2024-08-17 01:55:36 Starting - Found matching resource for reuse
2024-08-17 01:55:36 Downloading - Downloading the training image
2024-08-17 01:55:36 Training - Training image download completed. Training in progress.
2024-08-17 01:55:36 Uploading - Uploading generated training model
2024-08-17 01:55:36 Completed - Resource retained for reuse
---------!

## Query Model
We can now use this predictor to classify hand-written digits. 

We will read a random image and call the `predict` method of our `predictor` with the input image. We can then parse the result for the answer.

In [13]:
import gzip 
import numpy as np
import random
import os

data_dir = "data/MNIST/raw"

# UNDERSTAND THE DATA AND ASK FOR FULL EXPLANATION
with gzip.open(os.path.join(data_dir, "t10k-images-idx3-ubyte.gz"), "rb") as f:
    images = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28, 28).astype(np.float32)

mask = random.sample(range(len(images)), 16) # randomly select some of the test images
mask = np.array(mask, dtype=int)
data = images[mask]

In [14]:
response = predictor.predict(np.expand_dims(data, axis=1))
print("Raw prediction result:")
print(response)
print()

labeled_predictions = list(zip(range(10), response[0]))
print("Labeled predictions: ")
print(labeled_predictions)
print()

#EXPLAIN THE NEXT LINE
labeled_predictions.sort(key=lambda label_and_prob: 1.0 - label_and_prob[1])
print("Most likely answer: {}".format(labeled_predictions[0]))

Raw prediction result:
[[-2078.0390625  -1248.00891113 -1382.17858887 -1332.3873291
      0.         -1296.86328125 -1577.96276855 -1145.82995605
   -904.97827148  -752.45788574]
 [-1219.67956543  -741.51977539 -1034.12133789 -1277.19213867
   -853.19671631  -450.61679077     0.         -1556.73779297
   -638.02819824 -1405.20568848]
 [-1352.98010254  -652.11376953 -1116.59460449 -1479.92285156
   -696.0178833   -615.23535156     0.         -1754.4230957
   -738.98962402 -1545.35327148]
 [ -941.46032715 -1082.77661133  -928.97595215  -570.22912598
  -1328.91345215     0.          -473.51397705 -1149.63232422
   -353.77716064  -991.16949463]
 [-1687.76879883 -1637.52893066 -1028.81896973 -1110.0489502
  -1547.65185547 -1283.07312012 -1938.08459473 -1303.72045898
      0.         -1126.125     ]
 [-1251.01818848 -1168.29333496  -706.27801514  -892.3838501
  -1071.16552734  -988.64581299 -1391.57861328 -1047.1348877
      0.          -862.14099121]
 [-2222.14306641 -1432.18896484     0.  

### Cleanup

After you have finished with this example, remember to delete the prediction endpoint to release the instance(s) associated with it.
You can double check in the SageMaker dashboard if u want.

In [16]:
predictor.endpoint_name

'pytorch-training-240817-0147-003-b20f7a6f'

In [17]:
# The next function is no longer operational in SageMaker version 2 and above
#we'll proceed to an alternative
#tuner.delete_endpoint()

import boto3

client = boto3.client("sagemaker")

endpoint_name = predictor.endpoint_name

client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '86ea176b-4ea9-40e1-a04e-f3cae0c6be42',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '86ea176b-4ea9-40e1-a04e-f3cae0c6be42',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 17 Aug 2024 02:18:53 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}