# Text to Fashion Images

## 1. Upload data to S3
Here I use the Pokémon dataset as an example, which consists of 833 image-text pairs. To scale up, just process your own data into the same format.

In [5]:
# !pip install sagemaker

In [3]:
import sagemaker
import boto3
import datetime
import json
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [7]:
# Session, bucket and IAM execution role used by the cells below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Local dataset directory and the S3 key prefix it is uploaded under.
local_data_path = "../data/zuimei-radar-cropped-debug"
prefix_data = 'datasets/zuimei-radar-cropped-debug'

# Previously uploaded location, kept for reference:
#input_data = 's3://sagemaker-us-west-2-452145973879/datasets/zuimei-radar-cropped-debug/'

# Upload the directory; the returned value is the S3 URI printed below.
input_data = sagemaker_session.upload_data(
    path=local_data_path,
    key_prefix=prefix_data,
)
print(input_data)

s3://sagemaker-us-west-2-452145973879/datasets/zuimei-radar-cropped-debug


## 2. Upload pretrained models to S3

In [8]:
# Local directory holding the pretrained weights and the S3 key prefix
# they are uploaded under.
local_model_path = "../models"
prefix_model = 'models/dgmr_all'

# Previously uploaded location, kept for reference:
#input_model = 's3://sagemaker-us-west-2-452145973879/models/dgmr_all/'

# Upload the directory; the returned value is the S3 URI printed below.
input_model = sagemaker_session.upload_data(
    path=local_model_path,
    key_prefix=prefix_model,
)
print(input_model)

s3://sagemaker-us-west-2-452145973879/models/dgmr_all


## 3. Start a training job

In [15]:
import time
from sagemaker.estimator import Estimator

# Region of the active session; the training image URI is region-specific.
region = sagemaker_session.boto_session.region_name

# PyTorch 2.2.0 GPU training image (py310, CUDA 12.1).
image_uri = (
    f'763104351884.dkr.ecr.{region}.amazonaws.com/'
    'pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'
)

# Cluster size and hardware.
instance_count = 1
instance_type = 'ml.g5.48xlarge'
# Alternative: 'ml.p4d.24xlarge'  (p4d - 8*40G / p4de - 8*80G)

# Shared S3 roots for the environment values below.
s3_data_root = f's3://{bucket}/{prefix_data}'
s3_checkpoint_root = f's3://{bucket}/checkpoints'

# Environment variables handed to the training container
# (presumably consumed by sm_scripts/entry.py -- confirm against that script).
# Optional entries that are currently disabled:
#     'TRAIN_DIR': '/opt/ml/input/data/train'
#     'LATEST_CHECKPOINT_S3_PATH': f's3://{bucket}/checkpoints/BrushNet_urbanic_random_custommask/checkpoint-60000/'
environment = dict(
    NODE_NUMBER=str(instance_count),
    TRAIN_DATA_PATH=f'{s3_data_root}/train/',
    VALID_DATA_PATH=f'{s3_data_root}/valid/',
    PRETRAINED_MODEL_S3_PATH=f"{input_model}/dgmr/",
    OUTPUT_MODEL_S3_PATH=f'{s3_checkpoint_root}/dgmr_checkpoint',  # destination
)

# Maximum job lifetime in seconds (3 days here). Original note: the default is
# 2 days, and raising the quota (up to 28 days) requires submitting a ticket --
# verify against the current SageMaker quotas.
max_run_seconds = 3 * 24 * 3600

estimator_kwargs = {
    'role': role,
    'entry_point': 'entry.py',
    'source_dir': './sm_scripts',
    'base_job_name': 'dgmr-launch',
    'instance_count': instance_count,
    'instance_type': instance_type,
    'image_uri': image_uri,
    'environment': environment,
    'max_run': max_run_seconds,
    # Profiler and debugger hook are both switched off for this job.
    'disable_profiler': True,
    'debugger_hook_config': False,
}
estimator = Estimator(**estimator_kwargs)

# Launch the job with the uploaded dataset as the 'train' input channel.
train_channels = {'train': input_data}
estimator.fit(train_channels)

INFO:sagemaker:Creating training-job with name: dgmr-launch-2024-05-11-05-55-58-071


2024-05-11 05:56:00 Starting - Starting the training job
2024-05-11 05:56:00 Pending - Training job waiting for capacity......
2024-05-11 05:56:55 Pending - Preparing the instances for training......
2024-05-11 05:57:56 Downloading - Downloading input data......
2024-05-11 05:58:36 Downloading - Downloading the training image.........
2024-05-11 06:00:27 Training - Training image download completed. Training in progress........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-05-11 06:01:35,334 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-05-11 06:01:35,397 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-05-11 06:01:35,408 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-05-11 06:01:35,410 sagemaker_pytorch_container.training INFO 