In [1]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [2]:
# Target size (in pixels) that input images are resized to; square by default
height = width = 448

# Hyperparameters forwarded to the training script
lr, momentum = 0.01, 0.9
T_0, T_mult = 225, 1  # T_0: e.g. 899 / 4 (train_dataset_size / batch_size)
epochs, batch_size = 20, 4  # batch_size is used for both the train and test sets

# Depth of the ResNet network; pick one of [18, 34, 50, 101, 152]
num_layers = 50

# pretrained_weights: start from pretrained weights.
# unfreeze_all_layers: 'False' (the default) unfreezes only the last layer for tuning.
pretrained_weights, unfreeze_all_layers = True, False

# Presumably toggles augmentation of the training set inside the training script
train_augmentation = False

In [3]:
# S3 prefixes: one for the trained-model output, one for the custom training code
model_save_path = 's3://car-tagging-datasets/sagemaker-models/trained-model-output'
model_code_save_path = 's3://car-tagging-datasets/sagemaker-models/trained-model-custom-code'

# Local path of the training script used as the estimator's entry point
model = './model/resnet_sgd_cosineannealing_inference.py'

# Name built from the network depth and the input resolution, e.g. 'ResNet50-448x448-variants-5'
model_save_name = 'ResNet{}-{}x{}-variants-5'.format(
    num_layers,
    height,
    width,
)

In [4]:
# Warm-restart settings.
# Both stay None by default, i.e. no custom pre-trained weights are configured.
bucket = saved_model_path = None

# To warm-restart from custom pre-trained weights, uncomment the lines below
# and amend the bucket / model id / path as needed:

# bucket = 'custom-labels-console-eu-west-1-52749662e9'
# saved_model_id = 'resnet50-sgd-cosineannealing-448x448-2021-01-08-14-07-00-745'
# saved_model_path = 'for-custom-labels/train-dataset-augmented/sagemaker-pytorch-models/{}/output/model.tar.gz'.format(saved_model_id)

In [5]:
# S3 prefixes of the three dataset splits, in the order: train, test-complex, test
train, test_complex, test_showroom = (
    's3://car-tagging-datasets/all-variants/variants-5/train/',
    's3://car-tagging-datasets/all-variants/variants-5/test-complex/',
    's3://car-tagging-datasets/all-variants/variants-5/test/',
)

In [25]:
# Create one SageMaker session and hand it to both the role lookup and the
# estimator, so the session built here is actually the one being used.
sagemaker_session = sagemaker.Session()

# Execution role of the current environment; passed to the estimator below.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

# Configure the PyTorch training job:
#   - entry_point: the local training script defined in `model`
#   - a single 'ml.g4dn.xlarge' instance running the PyTorch 1.4.0 / py3 container
#   - model artefacts are written under `model_save_path`
estimator = PyTorch(entry_point=model,
                    role=role,
                    sagemaker_session=sagemaker_session,
                    framework_version='1.4.0',
                    py_version='py3',
                    instance_count=1,
                    instance_type='ml.g4dn.xlarge',
                    base_job_name=model_save_name,
                    output_path=model_save_path,
                    # Optional: upload the training code to a custom S3 prefix.
                    # code_location=model_code_save_path,
                    # Regexes matched against the training job's log output to
                    # extract metrics. Each one captures the text between
                    # '<label>: ' and the next ';', so the entry-point script is
                    # expected to log in that format (confirm against the script).
                    # NOTE(review): 'classification_report' likely captures
                    # non-numeric text - confirm it is usable as a metric.
                    metric_definitions=[
                        {'Name': 'train:loss', 'Regex': 'train Loss: (.*?);'},
                        {'Name': 'train:acc', 'Regex': 'train Acc: (.*?);'},
                        {'Name': 'validation:loss', 'Regex': 'validation Loss: (.*?);'},
                        {'Name': 'validation:acc', 'Regex': 'validation Acc: (.*?);'},
                        {'Name': 'learning_rate', 'Regex': 'lr: (.*?);'},
                        {'Name': 'epoch', 'Regex': 'epoch: (.*?);'},
                        {'Name': 'train_f1_score', 'Regex': 'train Avg. F1 Score: (.*?);'},
                        {'Name': 'validation_f1_score', 'Regex': 'validation Avg. F1 Score: (.*?);'},
                        {'Name': 'test_f1_score', 'Regex': 'test Avg. F1 Score: (.*?);'},
                        {'Name': 'classification_report', 'Regex': 'classification_report: (.*?);'}
                    ],
                    # Values forwarded to the entry-point script as hyperparameters.
                    # 's3-bucket' and 'warm-restart' are None unless the
                    # warm-restart variables were set in an earlier cell.
                    hyperparameters={
                        'image-height': height,
                        'image-width': width,
                        'batch-size': batch_size,
                        'epochs': epochs,
                        'lr': lr,
                        'momentum': momentum,
                        'T_0': T_0,
                        'T_mult': T_mult,
                        'num-layers': num_layers,
                        'pretrained-weights': pretrained_weights,
                        's3-bucket': bucket,
                        'warm-restart': saved_model_path,
                        'unfreeze-all-layers': unfreeze_all_layers,
                        'train-augmentation': train_augmentation
                    })

In [26]:
# Start the training job. Each key is an input channel name mapped to the
# S3 prefix that holds the data for that channel.
estimator.fit(
    inputs={
        'train': train,
        'validation': test_complex,
        'test': test_showroom,
    },
)

2021-06-02 09:49:08 Starting - Starting the training job...
2021-06-02 09:49:30 Starting - Launching requested ML instancesProfilerReport-1622627347: InProgress
......
2021-06-02 09:50:30 Starting - Preparing the instances for training......
2021-06-02 09:51:31 Downloading - Downloading input data...
2021-06-02 09:51:52 Training - Downloading the training image......
2021-06-02 09:53:05 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-06-02 09:53:07,115 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-06-02 09:53:07,136 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-06-02 09:53:07,139 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-06-02 09:53:07,450 sagemaker-containers INFO     Mod