# 环境准备

In [5]:
!pip install "sagemaker>=2.140.0" "transformers[torch]==4.26.1" "datasets[s3]==2.10.1" --upgrade

Collecting datasets==2.10.1 (from datasets[s3]==2.10.1)
  Using cached datasets-2.10.1-py3-none-any.whl (469 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.16.1
    Uninstalling datasets-2.16.1:
      Successfully uninstalled datasets-2.16.1
Successfully installed datasets-2.10.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import sagemaker
import boto3
# Bootstrap session; it is only used below to look up the default bucket.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. When get_execution_role() raises ValueError,
# fall back to looking up the role named 'sagemaker_execution_role' via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so that it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved settings so they are visible in the cell output.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::051995725733:role/service-role/AmazonSageMaker-ExecutionRole-20240108T152914
sagemaker bucket: sagemaker-us-west-2-051995725733
sagemaker session region: us-west-2


In [7]:
from transformers import AutoProcessor
from datasets import load_dataset
import numpy as np
from PIL import Image
from random import randint

# S3 bucket that receives the preprocessed train/test datasets
bucket_name = 'celianih-urbanic'

# s3 key prefix for the data
s3_prefix = 'datasets/goods'

# Pretrained checkpoint; also forwarded to the training job as the
# 'model_name' hyperparameter
model_name = 'google/vit-base-patch16-224-in21k'

# Processor loaded from the checkpoint; preprocess_images uses it to build 'pixel_values'
image_processor = AutoProcessor.from_pretrained(model_name)

# 训练数据准备

In [25]:
from zipfile import ZipFile
from PIL import Image
import os

def is_image_file(file_path):
    """Return True when Pillow can open ``file_path`` as an image.

    Args:
        file_path: Path of the file to probe.

    Returns:
        True if ``Image.open`` succeeds, False if it raises ``IOError`` or
        ``SyntaxError``.
    """
    try:
        # Open inside a context manager so the underlying file handle is
        # released immediately; probing many files otherwise leaks one
        # descriptor per file until garbage collection.
        with Image.open(file_path):
            return True
    except (IOError, SyntaxError):
        return False
    

def resize_image(input_path, output_path, target_size=(100, 100)):
    """Resize one image and write it to ``output_path``.

    Args:
        input_path: Path of the image to read.
        output_path: Destination path; missing parent directories are created.
        target_size: ``(width, height)`` passed to ``Image.resize``.
    """
    with Image.open(input_path) as img:
        resized_img = img.resize(target_size)
        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create the parent when there is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        resized_img.save(output_path)

def process_zip(zip_file_path, output_zip_path, target_size=(100, 100)):
    """Write a copy of a ZIP archive in which every image has been resized.

    The archive is extracted into a private temporary directory, the macOS
    ``__MACOSX`` metadata folder is dropped, every file that
    ``is_image_file`` accepts is resized with ``resize_image`` (keeping its
    relative path), and the resized files are packed into ``output_zip_path``.
    Files that are not images are not copied to the output archive.

    Args:
        zip_file_path: Path of the source ZIP archive.
        output_zip_path: Path of the ZIP archive to create.
        target_size: ``(width, height)`` forwarded to ``resize_image``.
    """
    # Imported locally so the function does not depend on imports that the
    # surrounding cell does not provide.
    import shutil
    import tempfile

    # A fresh temporary working directory guarantees that leftovers from a
    # previous run never end up in the new archive, and that everything is
    # removed again even when resizing fails half-way.
    with tempfile.TemporaryDirectory(prefix='process_zip_') as work_dir:
        extracted_dir = os.path.join(work_dir, 'extracted')
        resized_dir = os.path.join(work_dir, 'resized')
        os.makedirs(extracted_dir)
        os.makedirs(resized_dir)

        # Extract all files and folders from the source archive.
        with ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_dir)

        # Remove the macOS metadata directory, if the archive contained one.
        shutil.rmtree(os.path.join(extracted_dir, '__MACOSX'), ignore_errors=True)

        # Resize every image, mirroring the directory layout of the archive.
        for root, _dirs, files in os.walk(extracted_dir):
            for file_name in files:
                input_path = os.path.join(root, file_name)
                if is_image_file(input_path):
                    relative_path = os.path.relpath(input_path, extracted_dir)
                    output_path = os.path.join(resized_dir, relative_path)
                    resize_image(input_path, output_path, target_size)

        # Pack the resized images into the new archive.
        with ZipFile(output_zip_path, 'w') as new_zip:
            for root, _dirs, files in os.walk(resized_dir):
                for file_name in files:
                    file_path = os.path.join(root, file_name)
                    arcname = os.path.relpath(file_path, resized_dir)
                    new_zip.write(file_path, arcname=arcname)
# Example usage: resize every image of the source archive to 100x100 pixels
# and write the result to a second archive in the same folder.
zip_file_path = './datasets/urbanic.zip'
output_zip_path = './datasets/urbanic_small.zip'
target_size = (100, 100)

process_zip(zip_file_path, output_zip_path, target_size)

In [26]:
# Load the resized archive with the generic "imagefolder" builder.
# NOTE(review): class labels are presumably derived from the folder names
# inside the zip - confirm against the archive layout.
dataset_result = load_dataset("imagefolder", data_files="./datasets/urbanic_small.zip")

Generating train split: 0 examples [00:00, ? examples/s]

In [29]:
from datasets import Features, Array3D

# Work on the 'train' split of the loaded dataset.
dataset = dataset_result['train']
# Extend the existing schema with a fixed-shape float32 column that will
# hold the processed pixels written by preprocess_images.
features = Features({
    **dataset.features,
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
})

# extractor helper function
def preprocess_images(examples):
    """Add a ``pixel_values`` entry to a batch of examples.

    Args:
        examples: Batch mapping with an ``'image'`` entry holding the images
            of the batch (objects exposing ``convert``, e.g. PIL images).

    Returns:
        The same mapping, with ``'pixel_values'`` set to the output of
        ``image_processor`` for the batch.
    """
    # Force three channels first: grayscale, palette or RGBA files would
    # otherwise not fit the (3, 224, 224) 'pixel_values' feature.
    images = [image.convert("RGB") for image in examples['image']]
    inputs = image_processor(images=images)
    examples['pixel_values'] = inputs['pixel_values']

    return examples

# preprocess dataset in batches, using the extended schema defined above
dataset = dataset.map(preprocess_images, batched=True,features=features)

# set to torch format for training; only these two columns are exposed
dataset.set_format('torch', columns=['pixel_values', 'label'])

# remove unused column (the original images are no longer needed)
dataset = dataset.remove_columns("image")

Map:   0%|          | 0/434 [00:00<?, ? examples/s]

In [30]:
# split up training into training + validation (70% / 30%)
# A fixed seed makes the shuffle deterministic, so the same examples land in
# each split after a kernel restart and results stay comparable across runs.
splits = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = splits['train']
test_dataset = splits['test']

In [33]:
import botocore
from datasets.filesystems import S3FileSystem

# save train_dataset to s3 under <bucket>/<prefix>/train, as a single shard
training_input_path = f's3://{bucket_name}/{s3_prefix}/train'
train_dataset.save_to_disk(training_input_path, num_shards=1)

# save test_dataset to s3 under <bucket>/<prefix>/test, as a single shard
test_input_path = f's3://{bucket_name}/{s3_prefix}/test'
test_dataset.save_to_disk(test_input_path, num_shards=1)

# Report the final locations; they are reused as inputs of the training job.
print(f"train dataset is uploaded to {training_input_path}")
print(f"test dataset is uploaded to {test_input_path}")

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/303 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/131 [00:00<?, ? examples/s]

train dataset is uploaded to s3://celianih-urbanic/datasets/goods/train
test dataset is uploaded to s3://celianih-urbanic/datasets/goods/test


# 模型训练

In [34]:
from sagemaker.huggingface import HuggingFace

# Settings handed to the training job as hyperparameters
hyperparameters = {
    'num_train_epochs': 4,             # number of training epochs
    'per_device_train_batch_size': 3,  # batch size per device
    'model_name': model_name,          # checkpoint the job fine-tunes
}

In [36]:
# Estimator describing the training job: which script to run, on what
# instance, and with which framework versions.
huggingface_estimator = HuggingFace(
    entry_point='train.py',         # training script ...
    source_dir='./scripts',         # ... located in this directory
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    hyperparameters=hyperparameters,
)

In [39]:
# Start the training job, passing the two uploaded S3 dataset locations as
# the inputs named 'train' and 'test'.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-01-08-13-26-03-732


2024-01-08 13:26:04 Starting - Starting the training job......
2024-01-08 13:26:38 Starting - Preparing the instances for training...
2024-01-08 13:27:32 Downloading - Downloading input data...
2024-01-08 13:27:57 Downloading - Downloading the training image.....................
2024-01-08 13:31:13 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-01-08 13:31:29,682 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-01-08 13:31:29,706 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-01-08 13:31:29,720 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-01-08 13:31:29,723 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-01-08 13:31:30,

# 模型部署

In [45]:
# Environment variables for the model container.
# NOTE(review): HF_TASK presumably selects the inference pipeline used by the
# serving container - confirm against the SageMaker Hugging Face docs.
env={'HF_TASK':'image-classification'}

In [58]:
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
# NOTE(review): model_data is hard-coded to the artifact of one specific
# training job; update it (or read it from the estimator) after retraining.
huggingface_model = HuggingFaceModel(
   model_data="s3://sagemaker-us-west-2-051995725733/huggingface-pytorch-training-2024-01-08-13-26-03-732/output/model.tar.gz",  # path to your trained SageMaker model
   role=role,                                            # IAM role with permissions to create an endpoint
   transformers_version="4.26",                           # Transformers version used
   pytorch_version="1.13",                                # PyTorch version used
   py_version='py39',                                    # Python version used
   env=env,                                              # container environment (HF_TASK)
)

In [59]:
# Deploy the model to an endpoint backed by one ml.m5.xlarge instance; the
# returned predictor is used below to send requests.
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.m5.xlarge"
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-inference-2024-01-09-02-39-21-968
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-inference-2024-01-09-02-39-22-610
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-inference-2024-01-09-02-39-22-610


----!

# 模型效果测试

In [61]:
from sagemaker.serializers import DataSerializer
	
# Send the request body as raw image bytes with the 'image/x-image' content type
predictor.serializer = DataSerializer(content_type='image/x-image')

# Make sure the input file "./test/1702467048348.jpg" exists
with open("./test/1702467048348.jpg", "rb") as f:
	data = f.read()
# Last expression of the cell: its value is displayed as the cell output
predictor.predict(data)

[{'score': 0.8575611114501953, 'label': 'full'},
 {'score': 0.04050292819738388, 'label': 'upper'},
 {'score': 0.03603997081518173, 'label': 'lower'},
 {'score': 0.0340271033346653, 'label': 'nohead'},
 {'score': 0.03186880424618721, 'label': 'noperson'}]

In [55]:
# Tear down the endpoint (and its endpoint configuration, per the log output)
# once testing is finished.
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-inference-2024-01-08-13-50-14-779
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-inference-2024-01-08-13-50-14-779
