In [2]:
import sys
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.tensorflow import TensorFlow
from sagemaker import ScriptProcessor
import sagemaker
import  datetime
import boto3
import os

# Number of GPUs per host for each instance type. The original entries are kept
# as-is (both the SageMaker "ml."-prefixed and the bare EC2 spellings); the
# additional "ml." entries cover the other GPU instance types so that the
# processes-per-host lookup does not raise KeyError when one of them is chosen.
gpus_per_host_dict = {
    'ml.p3.2xlarge': 1,
    'p3.2xlarge': 1,
    'ml.p3.8xlarge': 4,
    'p3.8xlarge': 4,
    'ml.p3.16xlarge': 8,
    'p3.16xlarge': 8,
    'p2.8xlarge': 8,
    # "ml." counterpart of the entry above, which was missing.
    'ml.p2.8xlarge': 8,
    # Remaining P-family GPU instance types.
    'ml.p2.xlarge': 1,
    'ml.p2.16xlarge': 16,
    'ml.p3dn.24xlarge': 8,
    'ml.p4d.24xlarge': 8,
    # G4dn family: a single GPU except for the 12xlarge size.
    'ml.g4dn.xlarge': 1,
    'ml.g4dn.2xlarge': 1,
    'ml.g4dn.4xlarge': 1,
    'ml.g4dn.8xlarge': 1,
    'ml.g4dn.12xlarge': 4,
    'ml.g4dn.16xlarge': 1,
}

# Fixed UTC-7 offset (no daylight-saving adjustment) used when time-stamping job names.
timezone = datetime.timezone(offset=datetime.timedelta(hours=-7))

# Show the current wall-clock time in that zone as a quick sanity check.
datetime.datetime.now(tz=timezone).strftime('%H:%M')

'15:33'

In [3]:
# Making sure we are using sagemaker version 2 
!{sys.executable} -m pip install sagemaker -U -q


You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# Naming and storage configuration shared by the cells below.

# Base name for training jobs, and the bucket that receives the uploaded code
# archive and the model/checkpoint output.
training_job_basename, dev_bucket_name = 'simclr-train', 'simclr-dev'

# S3 prefixes of the raw and of the processed training images.
s3_raw_input_tr_images = 's3://data/raw/train/images'
s3_training_input = 's3://data/processed/train/images'

In [4]:
# SageMaker session and the handles derived from it that the rest of the notebook uses.
sess = sagemaker.session.Session()
sm_role = sagemaker.get_execution_role()   # role ARN passed as RoleArn when jobs are created
region = sess.boto_region_name

# Low-level boto3 clients created from the session's own boto session.
sm_boto_client = sess.boto_session.client('sagemaker')
s3_boot_client = sess.boto_session.client('s3')

In [5]:
# Display the resolved region and execution role.
# NOTE: the role ARN embeds the AWS account id - clear this output before sharing the notebook.
region, sm_role

('us-west-2',
 'arn:aws:iam::575348091205:role/service-role/AmazonSageMaker-ExecutionRole-20210616T173263')

## Training in cluster

In [6]:
# Signed decimal number with an optional exponent, e.g. 3, -0.25, 1e-3.
metric_re = '[-+]?[0-9]+[.]?[0-9]*([eE][-+]?[0-9]+)?'

# (metric name, key the regex looks for in the job's log output).
_metric_log_keys = [
    ('weight_decay', 'train/weight_decay'),
    ('total_loss', 'train/total_loss'),
    ('contrast_loss', 'train/contrast_loss'),
    ('contrast_acc', 'train/contrast_acc'),
    ('contrast_entropy', 'train/contrast_entropy'),
    ('supervised_loss', 'train/supervised_loss'),
    ('supervised_acc', 'train/supervised_acc'),
    ('supervised_msens', 'train/supervised_msens'),
    ('val_supervised_loss', 'val/supervised_loss'),
    ('val_supervised_acc', 'val/supervised_acc'),
    ('val_supervised_msens', 'val/supervised_msens'),
]

# Every pattern has the same shape: "<anything> <key> = <number>", with the
# number as the first capture group. Whitespace around "=" is matched with \s*
# so that the amount of padding in the log line does not matter; previously the
# total_loss pattern required exactly three spaces before "=" while all others
# required exactly one.
metric_definitions = [
    {'Name': metric_name, 'Regex': rf'^.+ {log_key}\s*=\s*({metric_re})'}
    for metric_name, log_key in _metric_log_keys
]

### Using boto3 API

In [7]:
import time
from time import strftime

import sagemaker

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [159]:
# Package the entries of src/ (as listed by `ls src`) into sourcedir.tar.gz.
# -v prints every archived entry, which is the file listing shown in the cell output.
!tar -cvzf sourcedir.tar.gz -C src/  `ls src`

# Hyperparameters of the training run. Every value is passed as a string.
base_h_params = dict(
    seed="1234",
    train_epochs="100",
    train_epochs_finetune="10",
    train_batch_size='32',
    oversample='True',                  # '--nooversample', 'True'
    oversample_f='True',                # '--nooversample_f', 'True'
    optimizer='adam',
    learning_rate='0.001',
    learning_rate_finetuning='0.001',
    use_step_decay="True",              # '--nouse_step_decay', 'True'
    num_proj_layers='1',
    ft_proj_selector='0',
    proj_head_mode='nonlinear',
    pretrain_on_train_only='--nopretrain_on_train_only',  # '--nopretrain_on_train_only'
    train_mode='finetune',              # pretrain_then_finetune, finetune
    base_weights='imagenet',            # imagenet, None
    weight_decay="1e-3",
    distrib_library="smdistributed",    # 'None', 'horovod', 'smdistributed'
)

# Comma-separated list of the hyperparameter names above (list of HP parameters
# as shown in Tensorboard). The right-hand side is evaluated before the new key
# is inserted, so 'hparams_header' itself is not part of the list.
base_h_params['hparams_header'] = ",".join(base_h_params)

# Job-name suffix: the current time in the notebook's fixed timezone, e.g. "-21-10-14-09h45m".
timestamp = datetime.datetime.now(tz=timezone).strftime('-%y-%m-%d-%Hh%Mm')

# Hardware for the training job. Other candidates: ml.g4dn.*, ml.p2.*,
# ml.p3.2xlarge / ml.p3.8xlarge, ml.p3dn.24xlarge, ml.p4d.24xlarge, or the CPU
# families ml.m4.*, ml.m5.*, ml.c4.*, ml.c5.*, ml.c5n.*.
training_instance_type = 'ml.p3.16xlarge'
instance_count = 1

t_job_name = f'test-smd-wd1e3-1-16xlarge-buff1{timestamp}'   # alternative: training_job_basename + timestamp

# Upload the packaged source under code/<job name>/ in the dev bucket and print the resulting S3 URI.
s3_code = sess.upload_data(
    path='sourcedir.tar.gz',
    bucket=dev_bucket_name,
    key_prefix=f'code/{t_job_name}',
)
print(s3_code)

# GPUs on the chosen instance type; only the horovod branch below consumes this value.
processes_per_host = gpus_per_host_dict[training_instance_type]
distributed = True

# Build the job's hyperparameters from a *copy* of base_h_params so that the
# SageMaker bookkeeping keys added here do not leak back into base_h_params.
h_parameters = {
    **base_h_params,
    'sagemaker_submit_directory': s3_code,
    'sagemaker_container_log_level': "20",
    'sagemaker_enable_cloudwatch_metrics': "true",
    'sagemaker_job_name': t_job_name,
    'sagemaker_region': region,
    'sagemaker_program': "simclr_run_distrib.py",
    's3_tensorboard_uri': f"s3://{dev_bucket_name}/model/tensorboard/logs_distrib/" + t_job_name,
}

# The custom MPI options are the same for both distribution libraries.
_mpi_options = "-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none"

if base_h_params['distrib_library'] == "horovod":
    h_parameters.update({
        'sagemaker_mpi_custom_mpi_options': _mpi_options,
        'sagemaker_mpi_enabled': str(distributed),
        "sagemaker_mpi_num_of_processes_per_host": str(processes_per_host),
    })
elif base_h_params['distrib_library'] == "smdistributed":
    h_parameters.update({
        'sagemaker_mpi_custom_mpi_options': _mpi_options,
        'sagemaker_distributed_dataparallel_enabled': str(distributed),
    })
# Any other value (e.g. 'None') adds no distribution-related keys.


# Root S3 prefix for model output; checkpoints go to a per-job prefix underneath it.
s3_base_output_uri = f"s3://{dev_bucket_name}/model/checkpoints/"
s3_checkpoints = s3_base_output_uri + t_job_name

# Resolve the TensorFlow 2.4 / py37 training image. The lookup uses the
# session's region (the same one the job is created in and that is passed as
# 'sagemaker_region') instead of a hard-coded 'us-west-2'.
training_container_img = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    image_scope='training',
    version='2.4',
    region=region,
    instance_type=training_instance_type,
    py_version='py37',
)
# Launch the training job through the low-level boto3 SageMaker API.
response = sm_boto_client.create_training_job(
    TrainingJobName=t_job_name,
    RoleArn=sm_role,
    AlgorithmSpecification={
        'TrainingImage': training_container_img,
        'TrainingInputMode': 'File',
        'EnableSageMakerMetricsTimeSeries': True,
        'MetricDefinitions': metric_definitions,
    },
    InputDataConfig=[
        {
            'ChannelName': 'train',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    # Fixed: the original referenced `s3_isic_training_input`, which is
                    # never defined in this notebook; the configured prefix is
                    # `s3_training_input`.
                    'S3Uri': f'{s3_training_input}/folderized',
                    'S3DataDistributionType': 'FullyReplicated',
                },
            },
        },
    ],
    HyperParameters=h_parameters,
    OutputDataConfig={
        'KmsKeyId': '',
        'S3OutputPath': s3_base_output_uri,  # SM automatically adds jobname/output/ to this URI
    },
    ResourceConfig={
        'InstanceType': training_instance_type,
        'InstanceCount': instance_count,
        'VolumeSizeInGB': 200,
    },
    StoppingCondition={'MaxRuntimeInSeconds': 86400},  # 24 hours
    CheckpointConfig={
        "S3Uri": s3_checkpoints,
    },
)

Ensembling.py
Evaluate.py
asymetric_loss.py
feed.py
model.py
preprocess.py
preprocessing/
preprocessing/to_tfrecord.py
preprocessing/folderize.py
preprocessing/__init__.py
preprocessingRefined.py
requirements.txt
resnet.py
simclr_aug_util.py
simclr_config.py
simclr_data.py
simclr_eval.py
simclr_lars_opt.py
simclr_metrics.py
simclr_model.py
simclr_objective.py
simclr_run.py
simclr_run_distrib.py
stats.py
train.py
untitled.flow
visualizer.py
s3://simclr-isic-dev-oregon/code/test-smd-wd1e3-1-16xlarge-buff1-21-10-14-09h45m/sourcedir.tar.gz


## Model Evaluation


In [59]:
# Package the entries of src/ (as listed by `ls src`) into sourcedir.tar.gz.
# -v prints every archived entry, which is the file listing shown in the cell output.
!tar -cvzf sourcedir.tar.gz -C src/  `ls src`


# Path component interpolated (twice) into the model_to_eval S3 URI further down;
# presumably the name of the training job whose model is evaluated - TODO confirm.
jn = "evaluation"

# Hyperparameters of the evaluation run: fixed seed, both oversampling switches off.
base_h_params = dict(seed="1234", oversample='False', oversample_f='False')

# Job-name suffix: the current time in the notebook's fixed timezone, e.g. "-21-10-04-12h06m".
timestamp = datetime.datetime.now(tz=timezone).strftime('-%y-%m-%d-%Hh%Mm')

# Hardware intended for the evaluation job. Other candidates: ml.g4dn.*,
# ml.p2.*, ml.p3.8xlarge / ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p4d.24xlarge,
# or the CPU families ml.m4.*, ml.m5.*, ml.c4.*, ml.c5.*, ml.c5n.*.
eval_instance_type = 'ml.p3.2xlarge'

instance_count = 1

eval_job_name = f'eval-job{timestamp}'   # alternative: training_job_basename + timestamp

# Upload the packaged source under code/<job name>/ in the dev bucket and print the resulting S3 URI.
s3_code = sess.upload_data(
    path='sourcedir.tar.gz',
    bucket=dev_bucket_name,
    key_prefix=f'code/{eval_job_name}',
)
print(s3_code)

processes_per_host = 1 #gpus_per_host_dict[training_instance_type]

# Build the job's hyperparameters from a *copy* of base_h_params so that the
# SageMaker bookkeeping keys added here do not leak back into base_h_params.
h_parameters = {
    **base_h_params,
    'sagemaker_submit_directory': s3_code,
    'sagemaker_container_log_level': "20",
    'sagemaker_enable_cloudwatch_metrics': "true",
    'sagemaker_job_name': eval_job_name,
    'sagemaker_region': region,
    'sagemaker_program': "simclr_eval.py",
}
 
# Resolve the TensorFlow 2.4 / py37 training image for the evaluation job.
# Fixed: the lookup now uses the evaluation instance type (it previously reused
# `training_instance_type` left over from the training cell) and the session's
# region instead of a hard-coded 'us-west-2'.
training_container_img = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    image_scope='training',
    version='2.4',
    region=region,
    instance_type=eval_instance_type,
    py_version='py37',
)
# Launch the evaluation as a SageMaker training job through the low-level boto3 API.
response = sm_boto_client.create_training_job(
    TrainingJobName=eval_job_name,
    RoleArn=sm_role,
    AlgorithmSpecification={
        'TrainingImage': training_container_img,
        'TrainingInputMode': 'File',
        'EnableSageMakerMetricsTimeSeries': True,
        'MetricDefinitions': metric_definitions,
    },
    InputDataConfig=[
        {
            'ChannelName': 'train',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    # Fixed: the original referenced `s3_isic_training_input`, which is
                    # never defined in this notebook; the configured prefix is
                    # `s3_training_input`.
                    'S3Uri': f'{s3_training_input}/folderized',
                    'S3DataDistributionType': 'FullyReplicated',
                },
            },
        },
        {
            'ChannelName': 'model_to_eval',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    # NOTE(review): the bucket is hard-coded here and differs from
                    # dev_bucket_name - confirm this is the intended model location.
                    'S3Uri': f's3://simclr-isic-dev-oregon/model/checkpoints/{jn}/final_model/{jn}/output/model.tar.gz',
                    'S3DataDistributionType': 'FullyReplicated',
                },
            },
        },
    ],
    HyperParameters=h_parameters,
    OutputDataConfig={
        'KmsKeyId': '',
        # Fixed: s3_checkpoints has no trailing slash, so plain concatenation glued
        # "final_model/" onto the job name; insert the missing separator.
        'S3OutputPath': s3_checkpoints.rstrip('/') + "/final_model/",
    },
    ResourceConfig={
        # Fixed: run on the evaluation instance type defined for this job rather
        # than on `training_instance_type` left over from the training cell.
        'InstanceType': eval_instance_type,
        'InstanceCount': instance_count,
        'VolumeSizeInGB': 200,
    },
    StoppingCondition={'MaxRuntimeInSeconds': 86400},  # 24 hours
    # NOTE(review): s3_checkpoints is the value computed in the training cell, so
    # this job shares the training job's checkpoint prefix - confirm this is intended.
    CheckpointConfig={
        "S3Uri": s3_checkpoints,
    },
#    ExperimentConfig = {
#        "TrialName" : demo_trial.trial_name,
#        "TrialComponentDisplayName" : "TrainingJob",
#    }
)

Ensembling.py
Evaluate.py
asymetric_loss.py
feed.py
model.py
preprocess.py
preprocessing/
preprocessing/to_tfrecord.py
preprocessing/folderize.py
preprocessing/__init__.py
preprocessingRefined.py
requirements.txt
resnet.py
simclr_aug_util.py
simclr_config.py
simclr_data.py
simclr_eval.py
simclr_lars_opt.py
simclr_metrics.py
simclr_model.py
simclr_objective.py
simclr_run.py
stats.py
train/
train/.gitignore
train.py
visualizer.py
s3://simclr-isic-dev-oregon/code/eval-job-21-10-04-12h06m/sourcedir.tar.gz
