# Sagemaker Train

This notebook creates a SageMaker estimator from the training image previously uploaded to ECR and uses it to train the model.

## Import modules

In [15]:
import time
import boto3
import sagemaker
from sagemaker import get_execution_role

## Setup

Modify the values below to match your own AWS configuration.

In [16]:
# S3 bucket used to build the input and output paths below
bucket: str = "hermione-sagemaker"

In [17]:
# AWS region for this notebook
region_name = "us-east-1"
# Make the default boto3 session use that region
boto3.setup_default_session(region_name=region_name)

In [18]:
# Execution role of the current environment; passed to the estimator below
role = get_execution_role()

In [19]:
# Ask STS who the caller is and keep the AWS account ID from the response
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account_number = caller_identity["Account"]

In [20]:
# Repository name of the training image previously uploaded to ECR
image_name = "hermione-train"
# Registry host for this account and region, then the full image URI
# (no explicit tag is specified)
ecr_registry = f"{account_number}.dkr.ecr.{region_name}.amazonaws.com"
image_uri = f"{ecr_registry}/{image_name}"

In [21]:
# S3 locations used by the training job: processed train/validation inputs
# and the destination for the model output. All share the same prefix.
preprocessing_prefix = f"s3://{bucket}/PREPROCESSING"
paths = {
    'train_processed': f"{preprocessing_prefix}/TRAIN_PROCESSED",
    'val_processed': f"{preprocessing_prefix}/VAL_PROCESSED",
    'model': f"{preprocessing_prefix}/MODEL",
}

In [22]:
# Instance type on which the training job runs
instance_type: str = "ml.m5.large"

## Train

In [23]:
# Input channel pointing at the processed training data in S3 (CSV)
train_config = sagemaker.inputs.TrainingInput(
    s3_data=paths['train_processed'],
    content_type='text/csv',
)

In [24]:
# Input channel pointing at the processed validation data in S3 (CSV)
val_config = sagemaker.inputs.TrainingInput(
    s3_data=paths['val_processed'],
    content_type='text/csv',
)

In [25]:
# S3 location handed to the estimator as its output path
output_path: str = paths['model']

In [26]:
# Metrics to visualize in the Monitor.
# Each regex captures the text between "<name>=" and the next ";".
metric_names = ("accuracy", "f1", "precision", "recall")
metrics = [
    {"Name": metric_name, "Regex": f"{metric_name}=(.*?);"}
    for metric_name in metric_names
]

In [27]:
# Build the estimator that runs the training image stored in ECR.
# The same 24-hour limit is used for both the run time and the spot wait time.
one_day_in_seconds = 24 * 60 * 60
est = sagemaker.estimator.Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type=instance_type,
    volume_size=30,
    output_path=output_path,
    base_job_name="Hermione-train",
    use_spot_instances=True,
    max_run=one_day_in_seconds,
    max_wait=one_day_in_seconds,  # timeout in seconds. Required if use_spot_instances == True
    metric_definitions=metrics,
)

In [28]:
%%time
# Train the model and validate
est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)

2021-05-26 12:41:29 Starting - Starting the training job...
2021-05-26 12:41:52 Starting - Launching requested ML instancesProfilerReport-1622032889: InProgress
......
2021-05-26 12:42:52 Starting - Preparing the instances for training......
2021-05-26 12:43:52 Downloading - Downloading input data
2021-05-26 12:43:52 Training - Downloading the training image.....[34m2021-05-26 09:44:41,407 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m

2021-05-26 12:45:00 Uploading - Uploading generated training model
2021-05-26 12:45:00 Completed - Training job completed
[34m2021-05-26 09:44:47,642 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-26 09:44:47,653 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-05-26 09:44:47,663 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},