# Download the Data

In [None]:
import os
import io
import tarfile
import urllib
import shutil
import json
import random
import numpy as np
from tqdm import tqdm
from pathlib import Path

import boto3
import sagemaker

from matplotlib import pyplot as plt
from xml.etree import ElementTree as ET
from PIL import Image, ImageDraw, ImageFont

%matplotlib inline

# Archives to fetch: the pet photos and their annotations.
urls = [
    'http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz',
    'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz',
]


In [None]:
def download(download_dir, sources=None):
    """Download every archive that is not already present in ``download_dir``.

    Args:
        download_dir: Existing directory the archives are saved into.
        sources: Optional iterable of URLs to fetch. Defaults to the
            notebook-level ``urls`` list.

    The file name is the last path segment of each URL. An archive is first
    written to ``<name>.part`` and renamed only after the transfer finished,
    so an interrupted download is never mistaken for a complete one.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    for url in (urls if sources is None else sources):
        target_file = url.split('/')[-1]
        target_path = os.path.join(download_dir, target_file)

        if os.path.exists(target_path):
            print(f'Already downloaded {target_file}')
            continue

        print(f'Downloading {target_file} ...')
        partial_path = target_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
        except BaseException:
            # Do not leave a truncated file behind (also on Ctrl-C).
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        os.replace(partial_path, target_path)

def extract(data_dir, download_dir, sources=None):
    """Unpack every downloaded archive into ``data_dir`` unless already extracted.

    Args:
        data_dir: Existing directory the archives are extracted into.
        download_dir: Directory that holds the downloaded ``.tar.gz`` files.
        sources: Optional iterable of URLs whose last path segment names the
            archive. Defaults to the notebook-level ``urls`` list.

    Raises:
        AssertionError: If an expected archive is missing from ``download_dir``.

    An archive ``<name>.tar.gz`` counts as extracted when ``data_dir/<name>``
    already exists.
    """
    for url in (urls if sources is None else sources):
        target_file = url.split('/')[-1]
        target_dir = target_file.replace('.tar.gz', '')
        # Resolve the archive inside download_dir rather than the current working directory.
        archive_path = os.path.join(download_dir, target_file)
        assert os.path.exists(archive_path), f'{target_file} not found in {download_dir}'

        if os.path.exists(os.path.join(data_dir, target_dir)):
            print(f'Already extracted {target_file}')
            continue

        print(f'Extracting {target_file} ...')
        with tarfile.open(archive_path) as tf:
            # The archives come from the network: use the safe 'data' extraction
            # filter on Python versions that provide it.
            if hasattr(tarfile, 'data_filter'):
                tf.extractall(data_dir, filter='data')
            else:
                tf.extractall(data_dir)


# Make sure the extraction target exists, then fetch the archives next to the
# notebook and unpack them into `data/`.
os.makedirs('data', exist_ok=True)

download(download_dir='.')
extract(data_dir='data', download_dir='.')


# Visualize Data

Even though the "annotations" directory has more masks (`.png`) than there are images (`.jpg`) in the "images" directory, we'll only use the ones we need (the ones we have a `.jpg` for).

The structure of the files is like `image_name.jpg` and then the mask with the same name but with a `.png` extension.

In [None]:
trimaps_dir = 'data/annotations/trimaps/'

# Keep only real `.png` masks: skip hidden dot-files (they would otherwise match the
# suffix check) and sort so that `maps[0]` is the same file on every run.
maps = sorted(
    name for name in os.listdir(trimaps_dir)
    if name.endswith('.png') and not name.startswith('.')
)
print(f"num of masks: {len(maps)}")


In [None]:
image_dir = 'data/images/'

# Keep only `.jpg` files, skip hidden dot-files, and sort so that indexing into
# `images` (and any seeded sampling) is reproducible across runs and machines.
images = sorted(
    name for name in os.listdir(image_dir)
    if name.endswith('.jpg') and not name.startswith('.')
)
print(f"num of images:{len(images)}")


In [None]:
# Display one sample entry from the image file list.
images[5]


In [None]:
plt.figure(figsize=(12, 12))

# Four random samples, one per row: trimap on the left, photo on the right.
for row in range(4):
    index = random.randint(0, len(images) - 1)
    image_name = images[index]
    # The mask shares the image's base name but has a `.png` extension.
    map_name = image_name.split('.')[0] + '.png'

    panel_paths = (
        os.path.join(trimaps_dir, map_name),
        os.path.join(image_dir, image_name),
    )
    for col, panel_path in enumerate(panel_paths):
        plt.subplot(4, 2, row * 2 + col + 1)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(plt.imread(panel_path))
plt.show()


In [None]:
# Number of classes in a mask: one for the background, one for the object
# (foreground), and one for the object boundary (also known as the "neither" region).
first_mask_path = os.path.join(trimaps_dir, maps[0])
img = Image.open(first_mask_path)
print(np.unique(img))


# SageMaker Setup

In [None]:
# Look up the execution role this notebook runs under.
role = sagemaker.get_execution_role()

# A bucket is the fundamental storage container of AWS Simple Storage Service (S3).
# The bucket must already exist (create it from the AWS console first), and its name
# must be unique across all AWS accounts, not just your own.
bucket_name = "petdatamy"

# We use SageMaker's built-in semantic segmentation algorithm, which ships as a
# pre-built container hosted on Amazon Elastic Container Registry (ECR).
# `training_image` is the URI of that container image, resolved for the region of
# the current boto3 session.
region = boto3.Session().region_name
training_image = sagemaker.image_uris.retrieve(
    framework='semantic-segmentation',
    region=region,
    version='latest',
)

print(f"Training image: {training_image}")


In [None]:
# train -> holds training images
# validation -> holds validation images

# train_annotation -> holds training trimaps (masks)
# validation_annotation -> holds validation trimaps (masks)

pre = Path('local_bucket')
folders = ['train', 'train_annotation', 'validation', 'validation_annotation']

# Set to True to wipe the local bucket so the images are split and copied again.
# It is off by default: deleting on every run would make the "skip if images already
# exist" check of the split step pointless.
RESET_LOCAL_BUCKET = False

for folder_name in folders:
    folder_path = pre / folder_name
    if RESET_LOCAL_BUCKET and folder_path.exists():
        shutil.rmtree(folder_path)
    folder_path.mkdir(parents=True, exist_ok=True)


# Preparing Data for SageMaker
We split the dataset into local folders that mirror the S3 bucket layout, so that it can then be uploaded to the bucket for SageMaker to train on.

In [None]:
def get_map_file(image):
    """Return the trimap file name that belongs to an image file name.

    The trimap has the same base name as the image, with a ``.png`` extension.
    Only the final extension is replaced, so base names that themselves contain
    dots are kept intact.

    Args:
        image: Image file name, e.g. ``'name.jpg'``.

    Returns:
        The matching trimap file name, e.g. ``'name.png'``.

    Raises:
        AssertionError: If the trimap is not listed in the notebook-level ``maps``.
    """
    stem, _ = os.path.splitext(image)
    map_file = stem + '.png'
    assert map_file in maps, f'no trimap {map_file!r} found for image {image!r}'
    return map_file


# Skip the split when the local bucket already holds training images, to avoid a
# messy second copy. Delete the local bucket to split and copy everything again.
if any((pre / 'train').iterdir()):
    print(f'Images exists in {pre}')
else:
    source_images = Path('data/images')
    source_maps = Path(trimaps_dir)

    for image in tqdm(images):
        # Each image is visited once, so no duplicate handling is needed.
        # The assignment is probabilistic: roughly 75% train / 25% validation,
        # with slightly different proportions on every run.
        draw = random.randint(0, 99)
        target_set = 'validation' if draw >= 75 else 'train'

        map_file = get_map_file(image)

        # Copy the image and its trimap into the matching local bucket folders.
        shutil.copy(source_images / image, pre / target_set / image)
        shutil.copy(source_maps / map_file, pre / (target_set + '_annotation') / map_file)


In [None]:
# Collect the entries of both training folders; the two counts should match.
train_images = {entry for entry in (pre / 'train').iterdir()}
train_annots = {entry for entry in (pre / 'train_annotation').iterdir()}

print(f"{len(train_annots)=}, \n{len(train_images)=}")

# Uploading Data to S3

Technically, S3 doesn't have any folder structure.
It uses a flat namespace to store the objects.
However, the S3 console displays the objects in a folder-like structure by using prefixes.
When we specify a prefix, S3 will create a virtual folder structure for us.

In [None]:
sess = sagemaker.Session()

upload = False  # set to True to upload the data to S3

# Every local folder is uploaded under the key prefix of the same name, so the S3
# URIs are known up front. Defining them here keeps the later cells working when
# `upload` is False (e.g. the data is already in the bucket from an earlier run).
s3_train_path = f's3://{bucket_name}/train'
s3_train_annotation_path = f's3://{bucket_name}/train_annotation'
s3_val_path = f's3://{bucket_name}/validation'
s3_val_annotation_path = f's3://{bucket_name}/validation_annotation'

if upload:
    print("Starting uploading to S3 ...")

    # `upload_data` is documented as taking a string path, so the pathlib paths are
    # converted explicitly before being passed in.
    print("Uploading train images ...")
    s3_train_path = sess.upload_data(path=str(pre / 'train'), bucket=bucket_name, key_prefix='train')

    print("Uploading train annotation ...")
    s3_train_annotation_path = sess.upload_data(path=str(pre / 'train_annotation'), bucket=bucket_name, key_prefix='train_annotation')

    print("Uploading validation images ...")
    s3_val_path = sess.upload_data(path=str(pre / 'validation'), bucket=bucket_name, key_prefix='validation')

    print("Uploading validation annotation ...")
    s3_val_annotation_path = sess.upload_data(path=str(pre / 'validation_annotation'), bucket=bucket_name, key_prefix='validation_annotation')

    print(f"Finished uploading to S3")


In [None]:
# Show the S3 location recorded for the training images.
print(s3_train_path)

# SageMaker Estimator

In [None]:
# Estimator wrapping the built-in algorithm container selected above.
model_api = sagemaker.estimator.Estimator(
    training_image,
    role=role,
    sagemaker_session=sess,

    # Number of training instances (machines), not the number of GPUs.
    instance_count=1,

    # The instance type must be compatible with the algorithm.
    instance_type='ml.p3.2xlarge',  # 16 GB GPU machine

    # Size, in GB, of the EBS (Elastic Block Store) volume attached to the instance.
    volume_size=100,

    # Upper limit, in seconds, on how long the training job may run.
    max_run=36_000,

    # How the input data stored in the bucket is made available to training.
    input_mode='File',

    # Where the trained model artifacts will be stored.
    output_path=f's3://{bucket_name}/output',
)

# Hyperparameters

**What is Pacemaker?**

Pacemaker is an open-source high-availability (HA) resource manager that AWS uses to orchestrate and manage the availability of SAP applications and databases, ensuring minimal downtime in case of failures.

In [None]:
# Collect the hyperparameters first, then hand them to the estimator in one call.
hyperparameters = dict(
    backbone='resnet50',
    use_pretrained_model=True,  # pretrained on the ImageNet dataset
    algorithm='fcn',  # Fully Convolutional Network
    crop_size=240,  # the size of the input images

    # Number of classes in the dataset.
    # Although we have 3 classes, the trimap labels are 1-3 rather than 0-2.
    # Two options: rewrite all trimap PNGs to 0-2, or set num_classes to 4
    # and hope the algorithm learns to ignore class zero.
    num_classes=4,
    num_training_samples=len(train_images),

    epochs=10,
    learning_rate=0.001,
    optimizer='rmsprop',  # adam, sgd, or rmsprop
    lr_scheduler='poly',  # step, cosine, or poly
    # weight_decay=0.0001,

    mini_batch_size=16,
    validation_mini_batch_size=16,

    # early_stopping=True,
    # early_stopping_patience=5,
    # early_stopping_metric='validation:loss',
    # early_stopping_metric_criteria='min',
)

model_api.set_hyperparameters(**hyperparameters)

# Data Channels

These will point to the data locations in S3. We pass them to the estimator so that it knows where to look.

In [None]:
def _s3_channel(s3_uri, content_type):
    """Build a training input channel covering every object under ``s3_uri``.

    Args:
        s3_uri: S3 prefix that holds the files of the channel.
        content_type: MIME type of the files, e.g. ``'image/jpeg'``.

    Returns:
        A ``sagemaker.inputs.TrainingInput`` for the prefix.
    """
    # `sagemaker.session.s3_input` was removed in SageMaker SDK v2, which the rest of
    # this notebook uses; `TrainingInput` is its replacement.
    return sagemaker.inputs.TrainingInput(
        s3_uri,
        # the data is replicated to every instance when training is distributed
        distribution='FullyReplicated',
        content_type=content_type,
        # `S3Prefix` uses all files under the prefix as the channel's data
        s3_data_type='S3Prefix',  # or ManifestFile
    )


train_data = _s3_channel(s3_train_path, 'image/jpeg')
train_ann_data = _s3_channel(s3_train_annotation_path, 'image/png')
val_data = _s3_channel(s3_val_path, 'image/jpeg')
val_ann_data = _s3_channel(s3_val_annotation_path, 'image/png')


In [None]:
# Map each channel name to its S3 input.
data_channels = dict(
    train=train_data,
    train_annotation=train_ann_data,
    validation=val_data,
    validation_annotation=val_ann_data,
)

print(data_channels)


# Model Training

# Deploy Model

# Predictions

In [None]:
# The validation images were copied to `<local bucket>/validation` by the split step,
# so list them from there; a bare `validation` directory is never created.
image_dir = str(pre / 'validation')
# Sorted so that a fixed index picks the same image on every run.
images = sorted(x for x in os.listdir(image_dir) if x.endswith('.jpg'))
print(len(images))

In [None]:
index = 2

image_path = os.path.join(image_dir, images[index])
# image_path = 'dog_cat.jfif'

# Send the raw bytes of the image file to the deployed endpoint.
payload = bytearray(Path(image_path).read_bytes())
results = deployed_model.predict(payload)

In [None]:
# Decode the bytes returned by the endpoint as an image and convert it to an array.
response_stream = io.BytesIO(results)
mask = np.array(Image.open(response_stream))

In [None]:
# Show the image that was sent to the endpoint.
plt.imshow(plt.imread(image_path));

In [None]:
# Show the mask decoded from the endpoint's response.
plt.imshow(mask);

Don't forget!! You need to delete the endpoint, or else you will continue to accrue costs!

In [None]:
# SageMaker SDK v2 (used throughout this notebook) renamed the predictor's `endpoint`
# attribute to `endpoint_name`.
sagemaker.Session().delete_endpoint(deployed_model.endpoint_name)