# 环境准备

In [2]:
!pip install "sagemaker>=2.140.0" "transformers[torch]==4.26.1" "datasets[s3]==2.10.1" --upgrade

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import sagemaker
import boto3
# Bootstrap session: only used to look up the default bucket when none is configured below.
sess = sagemaker.Session()

# Bucket that receives uploaded data, models and logs. SageMaker creates it
# automatically when it does not exist yet. Keep None to fall back to the default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up in IAM by its fixed name instead.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Final session, explicitly bound to the bucket selected above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::051995725733:role/service-role/AmazonSageMaker-ExecutionRole-20240204T100428
sagemaker bucket: sagemaker-us-west-2-051995725733
sagemaker session region: us-west-2


In [55]:
from transformers import AutoProcessor
from datasets import load_dataset
import numpy as np
from PIL import Image
from random import randint

# S3 bucket the prepared datasets are written to
bucket_name = 'celianih-urbanic'

# Bucket prefix under which the datasets are stored
s3_prefix = 'datasets/goods'

# Base (pre-trained) model used for training
model_name = 'google/vit-base-patch16-224-in21k'

# Processor loaded for the base model; later cells call it on images to obtain `pixel_values`.
image_processor = AutoProcessor.from_pretrained(model_name)

# 训练数据准备

In [5]:
!pip install --upgrade pillow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


###### 开始对数据集进行处理：vit-base-patch16-224-in21k 要求训练的 input features 是 3 * 224 * 224 维度的数据，因此需要对图片进行预处理。处理有以下两种方式：
- 1. 将图片等比例缩放到 224*224 以内
- 2. 将图片等比例缩放到 224*224 以内后进行填充，使训练图片保持为 224*224

In [None]:
import os
import shutil
import tempfile
from zipfile import ZipFile

from PIL import Image

def is_image_file(file_path):
    """Return True when Pillow can open and verify ``file_path`` as an image.

    Args:
        file_path: Path of the file to probe.

    Returns:
        True if the file opens and passes ``Image.verify()``; False if either
        step raises ``IOError`` or ``SyntaxError``.
    """
    try:
        # Image.open() is lazy and keeps the file handle open; the context
        # manager closes it again so probing many files does not leak handles.
        with Image.open(file_path) as img:
            # verify() checks the file for corruption without decoding it fully,
            # so broken files are rejected here instead of failing during resize.
            img.verify()
        return True
    except (IOError, SyntaxError):
        return False


def resize_image(input_path, output_path, target_size=(224, 224), mode="L", pad=False):
    """Shrink an image to fit ``target_size`` and save it to ``output_path``.

    The image is scaled down in place with ``Image.thumbnail`` (aspect ratio is
    kept, so one side may end up smaller than ``target_size``), converted to
    ``mode`` and written to ``output_path``.

    Args:
        input_path: Path of the source image.
        output_path: Path the processed image is written to. Missing parent
            directories are created.
        target_size: Maximum ``(width, height)`` of the result.
        mode: Pillow mode the image is converted to before saving. Defaults to
            ``"L"`` (grayscale), which is the conversion this function has
            always applied.
            NOTE(review): the notebook targets (3, 224, 224) model input, so
            ``"RGB"`` may be the intended value — confirm.
        pad: When True, centre the shrunken image on a white canvas of exactly
            ``target_size`` so every output has the same dimensions.
    """
    with Image.open(input_path) as img:
        img.thumbnail(target_size)
        # convert() returns an independent, fully loaded copy, so it stays
        # usable after the source file has been closed.
        image = img.convert(mode)

    if pad:
        canvas = Image.new(mode, target_size, color="white")
        left_margin = (target_size[0] - image.width) // 2
        top_margin = (target_size[1] - image.height) // 2
        canvas.paste(image, (left_margin, top_margin))
        image = canvas

    # os.makedirs('') raises, so only create the folder when the path has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    image.save(output_path)

def process_zip(zip_file_path, output_zip_path, target_size=(224, 224)):
    """Resize every image inside a ZIP archive and write them to a new archive.

    The archive is extracted into a temporary directory, the macOS
    ``__MACOSX`` metadata folder is dropped, every file accepted by
    ``is_image_file`` is passed through ``resize_image``, and the results are
    packed into ``output_zip_path`` with the same relative folder layout.

    Both working directories are temporary and removed on exit (also when an
    error occurs), so no files from a previous run can end up in the new
    archive and nothing is left behind in the working directory.

    Args:
        zip_file_path: Path of the source ZIP archive.
        output_zip_path: Path of the ZIP archive to create (overwritten if present).
        target_size: ``(width, height)`` forwarded to ``resize_image``.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        with tempfile.TemporaryDirectory() as resized_images_dir:
            # Extract all files and folders from the source archive
            with ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Remove the macOS metadata directory, if the archive contains one
            macosx_dir = os.path.join(temp_dir, '__MACOSX')
            shutil.rmtree(macosx_dir, ignore_errors=True)

            # Resize every extracted image, mirroring its relative path
            for root, _dirs, files in os.walk(temp_dir):
                for file_name in files:
                    input_path = os.path.join(root, file_name)
                    if is_image_file(input_path):
                        relative_path = os.path.relpath(input_path, temp_dir)
                        output_path = os.path.join(resized_images_dir, relative_path)
                        resize_image(input_path, output_path, target_size)

            # Pack the resized images into the new archive
            with ZipFile(output_zip_path, 'w') as new_zip:
                for root, _dirs, files in os.walk(resized_images_dir):
                    for file_name in files:
                        file_path = os.path.join(root, file_name)
                        arcname = os.path.relpath(file_path, resized_images_dir)
                        new_zip.write(file_path, arcname=arcname)
# Example usage: read the source archive, resize its images and write them to a second archive.
zip_file_path = './datasets/urbanic.zip'
output_zip_path = './datasets/urbanic_small.zip'
# Maximum (width, height) passed through to the resize step
target_size = (224, 224)

process_zip(zip_file_path, output_zip_path, target_size)

In [121]:
# Load the resized archive with the generic "imagefolder" builder.
# NOTE(review): presumably the `label` column used later is inferred from the folder names inside the archive — confirm.
dataset_result = load_dataset("imagefolder", data_files="./datasets/urbanic_small.zip")

Downloading and preparing dataset imagefolder/default to /root/.cache/huggingface/datasets/imagefolder/default-5cf9e7d044407545/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files: 0it [00:00, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /root/.cache/huggingface/datasets/imagefolder/default-5cf9e7d044407545/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [122]:
from datasets import Features, Array3D
import pyarrow

# NOTE(review): presumably required so pyarrow can load the extension type behind Array3D — confirm against the installed pyarrow/datasets versions.
pyarrow.PyExtensionType.set_auto_load(True)
dataset = dataset_result['train']
# Extend the existing features with the `pixel_values` column that preprocessing adds.
features = Features({
    **dataset.features,
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
})

# extractor helper function
def preprocess_images(examples):
    """Add a ``pixel_values`` column to a batch of examples.

    Args:
        examples: Batch mapping (as passed by ``Dataset.map(batched=True)``)
            whose ``'image'`` entry is a list of PIL images.

    Returns:
        The same mapping with ``'pixel_values'`` set to the output of
        ``image_processor`` for the batch.
    """
    # Force 3-channel RGB first: the images on disk may be grayscale or
    # palette-based, while `pixel_values` is declared with 3 channels.
    images = [image.convert("RGB") for image in examples['image']]
    inputs = image_processor(images=images)
    examples['pixel_values'] = inputs['pixel_values']

    return examples

# Run the preprocessing over the whole dataset in batches, using the extended feature schema
dataset = dataset.map(preprocess_images, batched=True,features=features)

# Return `pixel_values` and `label` as torch tensors for training
dataset.set_format('torch', columns=['pixel_values', 'label'])

# Drop the `image` column, which is not part of the torch-formatted columns
dataset = dataset.remove_columns("image")

Map:   0%|          | 0/644 [00:00<?, ? examples/s]

  pa.PyExtensionType.__init__(self, self.storage_dtype)


In [123]:
# Split the dataset into training + test sets. The dataset is small, so a 7:3 ratio is used.
# The split is stratified on `label` so every class keeps its proportion, and it is still shuffled;
# the fixed seed makes that shuffle reproducible across kernel restarts.
splits = dataset.train_test_split(test_size=0.3, stratify_by_column='label', seed=42)
train_dataset = splits['train']
test_dataset = splits['test']

In [124]:
# Inspect how the labels are distributed in the test split
import pandas as pd

# Only the labels are needed for counting, so drop `pixel_values` before
# converting: this avoids materialising every image tensor in pandas.
df_test = test_dataset.remove_columns("pixel_values").to_pandas()

# Number of test samples per label
label_counts = df_test['label'].value_counts()
print(label_counts)

  pa.PyExtensionType.__init__(self, self.storage_dtype)


label
6    60
1    51
0    33
2    30
5     7
3     7
4     6
Name: count, dtype: int64


In [125]:
import botocore
from datasets.filesystems import S3FileSystem

# S3 destinations for the two splits
training_input_path = f's3://{bucket_name}/{s3_prefix}/train_fill'
test_input_path = f's3://{bucket_name}/{s3_prefix}/test_fill'

# Write each split to S3 as a single shard (train first, then test)
train_dataset.save_to_disk(training_input_path, num_shards=1)
test_dataset.save_to_disk(test_input_path, num_shards=1)

print(f"train dataset is uploaded to {training_input_path}")
print(f"test dataset is uploaded to {test_input_path}")

Flattening the indices:   0%|          | 0/450 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/450 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/194 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/194 [00:00<?, ? examples/s]

train dataset is uploaded to s3://celianih-urbanic/datasets/goods/train_fill
test dataset is uploaded to s3://celianih-urbanic/datasets/goods/test_fill


# 模型训练

In [141]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job
hyperparameters = dict(
    num_train_epochs=9,             # number of training epochs
    per_device_train_batch_size=3,  # batch size
    model_name=model_name,          # model the training starts from
)

In [142]:
# Estimator describing the training job: script location, hardware, role and framework versions
huggingface_estimator = HuggingFace(
    # training script and the folder that contains it
    entry_point='train_vit.py',
    source_dir='./scripts',
    # hardware
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # IAM role resolved earlier in the notebook
    role=role,
    # framework versions
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    hyperparameters=hyperparameters,
)

In [143]:
# Start the training job; the 'train' and 'test' channels point at the datasets saved to S3 earlier.
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-02-06-16-24-02-627


2024-02-06 16:24:03 Starting - Starting the training job...
2024-02-06 16:24:23 Pending - Training job waiting for capacity...
2024-02-06 16:24:47 Pending - Preparing the instances for training......
2024-02-06 16:25:59 Downloading - Downloading input data...
2024-02-06 16:26:34 Downloading - Downloading the training image.....................
2024-02-06 16:30:05 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-02-06 16:30:23,708 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-02-06 16:30:23,729 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-02-06 16:30:23,742 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-02-06 16:30:23,745 sagemaker_pytorch_container.training 

# 模型部署

In [3]:
# Environment variables passed to the deployed model; HF_TASK is set to image classification.
env={'HF_TASK':'image-classification'}

#### 从S3部署

In [6]:
from sagemaker.huggingface.model import HuggingFaceModel

# Build the Hugging Face model object from a trained model artifact stored on S3
huggingface_model = HuggingFaceModel(
    # path to your trained SageMaker model
    model_data="s3://sagemaker-us-west-2-051995725733/huggingface-pytorch-training-2024-02-04-15-56-07-719/output/model.tar.gz",
    # IAM role with permissions to create an endpoint
    role=role,
    # Transformers / PyTorch / Python versions used
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version='py39',
    # environment variables for the container
    env=env,
)

In [7]:
# Deploy the model object to an endpoint backed by a single ml.m5.xlarge instance.
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.m5.xlarge"
)

----!

#### 训练后立刻部署

In [133]:
# Deploy straight from the estimator that was just trained, on a single ml.m5.xlarge instance, with the same `env` settings.
predictor = huggingface_estimator.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge",env=env)

INFO:sagemaker:Creating model with name: huggingface-pytorch-training-2024-02-06-15-55-40-776
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-training-2024-02-06-15-55-40-776
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-training-2024-02-06-15-55-40-776


----!

# 模型效果测试

In [None]:
#### 测试整个验证集合

In [136]:
import io
import os

from PIL import Image
from sagemaker.serializers import DataSerializer

# Configure the serializer used for the image payload sent to the endpoint
predictor.serializer = DataSerializer(content_type='image/x-image')

# Directory holding the images to evaluate
directory_path = "./test/profile_train/"

# Visit the files in sorted order so the printed results are in a stable order
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith(".jpg"):
        # Full path of the current file
        file_path = os.path.join(directory_path, filename)

        # Re-encode the image as JPEG in memory; the context managers release
        # both the file handle and the buffer as soon as the bytes are copied.
        with Image.open(file_path) as image, io.BytesIO() as output:
            image.save(output, format="JPEG")
            data = output.getvalue()

        # Run inference and keep only the highest-scoring prediction
        prediction_result = predictor.predict(data)
        max_score_label = max(prediction_result, key=lambda x: x['score'])
        print(max_score_label)

{'score': 0.818080723285675, 'label': 'fullprofile'}
{'score': 0.7842152118682861, 'label': 'fullprofile'}
{'score': 0.8390139937400818, 'label': 'fullprofile'}
{'score': 0.8065673112869263, 'label': 'fullprofile'}
{'score': 0.5374179482460022, 'label': 'fullprofile'}
{'score': 0.839031457901001, 'label': 'fullprofile'}
{'score': 0.7631337642669678, 'label': 'fullprofile'}
{'score': 0.8350762128829956, 'label': 'fullprofile'}
{'score': 0.5555331110954285, 'label': 'fullprofile'}
{'score': 0.8246079683303833, 'label': 'fullprofile'}
{'score': 0.8048135638237, 'label': 'fullprofile'}
{'score': 0.8111885786056519, 'label': 'fullprofile'}
{'score': 0.42862579226493835, 'label': 'fullprofile'}
{'score': 0.823401927947998, 'label': 'fullprofile'}
{'score': 0.7019846439361572, 'label': 'fullprofile'}
{'score': 0.7781262397766113, 'label': 'fullfront'}
{'score': 0.7975015044212341, 'label': 'fullprofile'}
{'score': 0.8518428206443787, 'label': 'fullprofile'}
{'score': 0.7967789769172668, 'labe

### 测试单张图片

In [42]:
from sagemaker.serializers import DataSerializer

# Configure the serializer used for the image payload sent to the endpoint
predictor.serializer = DataSerializer(content_type='image/x-image')

# Make sure the input file below exists before running this cell
with open("./test/profile/Admin Portal-63.jpg", "rb") as f:
	data = f.read()
# Bare last expression: the prediction is shown as the cell output
predictor.predict(data)

[{'score': 0.8839594125747681, 'label': 'fullfront'},
 {'score': 0.03828425705432892, 'label': 'fullprofile'},
 {'score': 0.01733628287911415, 'label': 'nohead'},
 {'score': 0.01655077561736107, 'label': 'lower'},
 {'score': 0.016185248270630836, 'label': 'noperson'}]

#### 删除endpoint

In [137]:
# Delete the endpoint behind `predictor` once testing is finished.
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-training-2024-02-06-15-55-40-776
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-training-2024-02-06-15-55-40-776
