## 1. Upload data to S3
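Here I use the Pokémon dataset as an example, which consists of 833 image-text pairs. To scale up, just process your own data into the same format. (Note: the cell below uploads the local directory `../data/custom_data_v6`; update this description if it no longer matches the dataset you are using.)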

In [1]:
# Optional: uncomment the next line to install the SageMaker Python SDK if it is missing from this kernel.
# !pip install sagemaker

In [2]:
import sagemaker
import boto3
import datetime
import json
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Single SageMaker session shared by every later cell (uploads, region lookup).
sagemaker_session = sagemaker.Session()

# Default S3 bucket of this session; the data, model and checkpoint URIs below
# are all built on top of it.
bucket = sagemaker_session.default_bucket()

# IAM execution role that the training job will assume.
# Pass the session created above so the SDK resolves the role with the same
# session instead of constructing a second, implicit Session internally.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [4]:
# S3 key prefix under which the dataset is stored; the training cell reuses it
# to build the train/ and valid/ data URIs.
prefix_data = 'datasets/midea_data/custom_data_v6'

# Local dataset location, relative to the notebook's working directory.
local_data_path = "../data/custom_data_v6"
# Upload the local data and keep the S3 URI returned by upload_data. No bucket
# is passed here, so the session decides the destination bucket.
input_data = sagemaker_session.upload_data(path=local_data_path, key_prefix=prefix_data)
print(input_data)

s3://sagemaker-us-west-2-452145973879/datasets/midea_data/custom_data_v6


## 2. Upload pretrained models to S3

In [5]:
# S3 key prefix under which the pretrained Whisper weights are stored.
prefix_model = 'models/whisper-large-v3'

# Absolute local path of the pretrained model files.
# NOTE(review): machine-specific path; adjust when running on another host.
local_model_path = "/home/ec2-user/SageMaker/efs/Models/whisper-large-v3"
# Upload the model files and keep the returned S3 URI; the training cell passes
# it to the job as PRETRAINED_MODEL_S3_PATH.
input_model = sagemaker_session.upload_data(path=local_model_path, key_prefix=prefix_model)
print(input_model)

s3://sagemaker-us-west-2-452145973879/models/whisper-large-v3


## 3. Start a training job

In [None]:
import time
from sagemaker.estimator import Estimator

# --- Container image ---------------------------------------------------------
# The training image is pulled from the ECR registry of the session's region.
region = sagemaker_session.boto_session.region_name
# Earlier image (PyTorch 2.2.0 / py310), kept for reference:
# image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'
image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker'

# --- Compute -----------------------------------------------------------------
instance_count = 1
# Alternative instance type:
# instance_type = 'ml.g5.48xlarge'
instance_type = 'ml.p4d.24xlarge'  # 8xA100 40G

# --- Checkpoints -------------------------------------------------------------
# Local checkpoint directory inside the container and the S3 location paired
# with it on the Estimator below.
checkpoint_local_path = "/opt/ml/checkpoints"
checkpoint_s3_uri = f's3://{bucket}/checkpoints/whisper_checkpoint_v6'

# --- Environment variables handed to the training container ------------------
# Presumably read by sm_scripts/entry.py; verify the names against that script.
environment = {
    'NODE_NUMBER': str(instance_count),
    'TRAIN_DATA_PATH': f's3://{bucket}/{prefix_data}/train/',
    'VALID_DATA_PATH': f's3://{bucket}/{prefix_data}/valid/',
    'PRETRAINED_MODEL_S3_PATH': f"{input_model}/",
    # Destination for the trained model; same URI as checkpoint_s3_uri.
    'OUTPUT_MODEL_S3_PATH': checkpoint_s3_uri,
}

# --- Estimator ---------------------------------------------------------------
estimator_kwargs = dict(
    role=role,
    entry_point='entry.py',
    source_dir='./sm_scripts',
    base_job_name='whisper-launch',
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=1024,  # in GB
    image_uri=image_uri,
    environment=environment,
    # Maximum lifetime of the job in seconds (3 days here). Original note: the
    # default limit is 2 days and raising it (up to 28 days) requires a
    # quota-increase ticket. TODO: confirm against current SageMaker quotas.
    max_run=3*24*3600,
    # Profiler and debugger hook are switched off for this job.
    disable_profiler=True,
    debugger_hook_config=False,
    checkpoint_s3_uri=checkpoint_s3_uri,
    checkpoint_local_path=checkpoint_local_path,
)
estimator = Estimator(**estimator_kwargs)

# No input channels are passed; data and model locations travel through
# `environment` instead.
estimator.fit()

INFO:sagemaker:Creating training-job with name: whisper-launch-2024-06-12-03-20-46-211


2024-06-12 03:20:48 Starting - Starting the training job...
2024-06-12 03:21:00 Pending - Training job waiting for capacity............................................
2024-06-12 03:29:18 Pending - Preparing the instances for training...........................
2024-06-12 03:33:50 Downloading - Downloading input data...
2024-06-12 03:34:19 Downloading - Downloading the training image.....................
2024-06-12 03:37:25 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-12 03:37:53,192 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-12 03:37:53,283 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-06-12 03:37:53,292 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2



Training seconds: 11200
Billable seconds: 11200
