使用 Conda_python3 环境即可。

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# One SageMaker session is created here and reused for every lookup below.
sess = sagemaker.Session()
# Execution role of this notebook environment.
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

# Account id and region are needed to build the ECR image URI later on.
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name
# Reuse the bucket resolved above instead of constructing a second Session
# just to ask for the same default bucket again.
s3_bkt = sagemaker_default_bucket
s3_bkt

## Prepare a docker image

In [None]:
%%writefile Dockerfile
## Change the region in the registry host below to the region you use
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.1-gpu-py310-cu118-ubuntu20.04-sagemaker

## Install the Python packages needed by this example.
## --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir transformers==4.31.0 wandb==0.15.8 peft==0.4.0 \
    braceexpand==0.1.7 einops_exts==0.0.4 webdataset==0.2.48 \
    orjson==3.9.5 ijson==3.2.3 yajl==0.3.5 sentencepiece==0.1.99

## System YAJL library (presumably the native backend for the ijson/yajl
## packages installed above -- verify). apt-get is used because `apt` has no
## stable CLI for scripts; the package lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y libyajl2 && rm -rf /var/lib/apt/lists/*

## Install s5cmd from its release .deb and delete the package file afterwards
RUN wget https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_linux_amd64.deb && \
    dpkg -i s5cmd_2.2.1_linux_amd64.deb && rm s5cmd_2.2.1_linux_amd64.deb

ENV LANG=C.UTF-8
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE

## Make all local GPUs visible
ENV NVIDIA_VISIBLE_DEVICES="all"

## Enable EFA (uncomment to use)
# ENV FI_PROVIDER="efa"
# ENV NCCL_PROTO=simple
# ENV FI_EFA_USE_DEVICE_RDMA=1

# ENV NCCL_LAUNCH_MODE="PARALLEL"
# ENV NCCL_NET_SHARED_COMMS="0"

### ECR Login (Must run before docker build)

In [None]:
## Log docker in to the registry that hosts the base image used in the Dockerfile.
## Change the region below (in both --region and the registry host) to the region
## you use; it must match the region in the Dockerfile's FROM line.
!aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

### Build image and push to ECR.

In [None]:
## Define the ECR repository name; it should contain *sagemaker* in the name.
## The build cell below receives it as an environment variable, and the
## training cell uses it to build the image URI.
repo_name = "sagemaker-hf-accelerate-otter"

In [None]:
%%script env repo_name=$repo_name bash

#!/usr/bin/env bash

# Build the Docker image from the Dockerfile in the current directory and push
# it to ECR so that it is ready for use by SageMaker.
#
# Input: the `repo_name` environment variable (set by the cell magic on the
# first line). It is used as the local image name and, combined with the
# account and region, forms the repository name in ECR.

# Stop at the first failing command so a failed build is never tagged or pushed.
set -euo pipefail

# The name of our algorithm; abort with a clear message if it was not provided.
algorithm_name=${repo_name:?repo_name must be set to the ECR repository name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined).
# `|| true` keeps `set -e` from aborting when no region is configured.
region=$(aws configure get region || true)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it. The region is passed
# explicitly so the repository lives in the same region the image is pushed to.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Log docker in to the ECR registry host (not to an individual image name).
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.

docker build -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

In [None]:
# Directory inside the training container that holds the base models.
# Must stay equal to BASE_MODEL_DIR in sm-train.py; it is used further down to
# build the `pretrained_model_name_or_path` hyperparameter.
BASE_MODEL_DIR = '/opt/ml/base_model'

In [None]:
%%writefile sm-train.py
#!/usr/bin/env python3

import os
import sys
import json
import socket
import yaml

# import sagemaker_ssh_helper
# sagemaker_ssh_helper.setup_and_start_ssh()

BASE_MODEL_DIR = '/opt/ml/base_model'

def download_s5cmd():
    """Download the s5cmd v2.2.1 .deb with wget, install it with dpkg, then delete it.

    The three steps run as one `&&`-chained shell command through os.system;
    the command's exit status is not inspected.
    """
    install_cmd = (
        'wget -q https://github.com/peak/s5cmd/releases/download/v2.2.1/s5cmd_2.2.1_linux_amd64.deb && '
        '    dpkg -i s5cmd_2.2.1_linux_amd64.deb && rm s5cmd_2.2.1_linux_amd64.deb'
    )
    os.system(install_cmd)


def update_text_config():
    """Rewrite the text_config section of the OTTER-LLaMA7B-INIT config.json in place.

    Points `text_config._name_or_path` at the llama-7b-hf directory under
    BASE_MODEL_DIR and sets `text_config.architectures` to
    ["LlamaForCausalLM"], then writes the whole config back to the same file.
    """
    config_path = f'{BASE_MODEL_DIR}/OTTER-LLaMA7B-INIT/config.json'
    with open(config_path) as src:
        config = json.load(src)

    print("updating text config for model OTTER-LLaMA7B-INIT")
    text_config = config['text_config']
    text_config['_name_or_path'] = f'{BASE_MODEL_DIR}/llama-7b-hf'
    text_config['architectures'] = ["LlamaForCausalLM"]

    with open(config_path, 'w') as dst:
        json.dump(config, dst)


def download_model():
    """Sync the base model files from S3 into BASE_MODEL_DIR with s5cmd.

    The S3 location is read from the MODEL_S3_BASE environment variable.
    If the variable is unset or empty there is nothing to download and the
    function returns immediately. A value without the 's3://' scheme gets it
    prepended. The exit status of the shell commands is not inspected.
    """
    # .get() yields None when the variable is unset; the truthiness test
    # covers both None and '' (len(None) would raise TypeError).
    model_s3_url = os.environ.get('MODEL_S3_BASE')
    if not model_s3_url:
        return

    if not model_s3_url.startswith('s3://'):
        model_s3_url = 's3://' + model_s3_url

    print(f'downloading model from {model_s3_url}')
    os.system(f"mkdir -p {BASE_MODEL_DIR}")
    os.system(f"s5cmd sync {model_s3_url}/* {BASE_MODEL_DIR}/")


def download_data():
    """Sync the training data from S3 into /opt/ml/input/data/ with s5cmd.

    The S3 location is read from the DATA_S3_BASE environment variable.
    If the variable is unset or empty there is nothing to download and the
    function returns immediately. A value without the 's3://' scheme gets it
    prepended. The exit status of the shell command is not inspected.
    """
    # .get() yields None when the variable is unset; the truthiness test
    # covers both None and '' (len(None) would raise TypeError).
    data_s3_url = os.environ.get('DATA_S3_BASE')
    if not data_s3_url:
        return

    if not data_s3_url.startswith('s3://'):
        # Prefix the data URL itself; the previous code referenced
        # `model_s3_url`, which is not defined in this function.
        data_s3_url = 's3://' + data_s3_url

    print(f'downloading trainig data from {data_s3_url}')
    os.system(f"s5cmd sync {data_s3_url}/* /opt/ml/input/data/")


if __name__ == "__main__":
    # Stage tools, model and data. This is deliberately best-effort: any
    # failure is printed and the script carries on to the launch step.
    try:
        download_s5cmd()
        download_model()
        update_text_config()
        download_data()
    except Exception as e:
        print(e)

    # List what ended up under /opt/ml/ (for debugging); the printed number is
    # the exit status of the `ls` command.
    print(os.system('ls -Rlh /opt/ml/'))

    # Cluster layout, read from the environment of the training container.
    hosts = json.loads(os.environ['SM_HOSTS'])
    current_host = os.environ['SM_CURRENT_HOST']
    num_gpus = int(os.environ['SM_NUM_GPUS'])  # number of GPUs in the current container
    host_rank = hosts.index(current_host)

    master = json.loads(os.environ['SM_TRAINING_ENV'])['master_hostname']
    master_addr = socket.gethostbyname(master)

    ########################
    # Export the resolved topology for child processes.
    os.environ['NODE_INDEX'] = str(host_rank)
    os.environ['SM_MASTER'] = str(master)
    os.environ['SM_MASTER_ADDR'] = str(master_addr)

    os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'
    # Optional EFA / NCCL settings (uncomment to enable):
    # os.environ['FI_PROVIDER'] = "efa"
    # os.environ['NCCL_PROTO'] = "simple"
    # os.environ['FI_EFA_USE_DEVICE_RDMA'] = "1"

    # os.environ['NCCL_LAUNCH_MODE'] = "PARALLEL"
    # os.environ['NCCL_NET_SHARED_COMMS'] = "0"
    #########################

    # Patch the accelerate FSDP config in place with this job's topology.
    accelerate_file_name = './pipeline/accelerate_configs/accelerate_config_fsdp.yaml'
    with open(accelerate_file_name) as f:
        doc = yaml.safe_load(f)
    doc['machine_rank'] = host_rank
    doc['main_process_ip'] = str(master_addr)
    doc['num_machines'] = len(hosts)  # how many instances in this training job
    doc['num_processes'] = len(hosts) * num_gpus  # how many GPU cards in total
    with open(accelerate_file_name, 'w') as f:
        yaml.safe_dump(doc, f)

    # Forward every command-line argument this script received to the training script.
    exit_status = os.system(f"accelerate launch --config_file={accelerate_file_name} pipeline/train/instruction_following.py {' '.join(sys.argv[1:])}")
    # Propagate a failed launch: without this the entry point would exit 0
    # even when training crashed.
    if exit_status != 0:
        sys.exit(1)

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

role = sagemaker.get_execution_role()  # execution role passed to the training job below
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs
bucket = sess.default_bucket()  # bucket to house artifacts

# Read the region from the public boto session attribute (as the first cell
# does) rather than from the private `_region_name` attribute.
region = sess.boto_session.region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

### 把模型拷贝到S3

In [None]:
%pip install huggingface-hub -Uqq

from huggingface_hub import snapshot_download
from pathlib import Path

def download_and_push_model_to_s3(model_id, commit_hash, s3_prefix):
    local_model_path = Path(f'/home/ec2-user/SageMaker/model/{model_id}')
    local_model_path.mkdir(exist_ok=True, parents=True)
    model_name = model_id.split('/')[1]
    snapshot_download(repo_id=model_id, revision=commit_hash, cache_dir=local_model_path)

    model_snapshot_path = list(local_model_path.glob("**/snapshots/*"))[0]

    !aws s3 cp --recursive {model_snapshot_path} s3://{s3_bkt}/{s3_prefix}/


In [None]:
# Mirror the two base models used for training to S3, each pinned to a fixed commit.
download_and_push_model_to_s3(
    model_id="luodian/llama-7b-hf",
    commit_hash="27f2d847cf30ab1cdbd4a2ad82fc38309cd57257",
    s3_prefix="viture/training/models/luodian/llama-7b-hf",
)
download_and_push_model_to_s3(
    model_id="luodian/OTTER-LLaMA7B-INIT",
    commit_hash="cc075926603ab1ffdef5f0a7809f84201ec31346",
    s3_prefix="viture/training/models/luodian/OTTER-LLaMA7B-INIT",
)

In [None]:
# Copy the training data to S3

! aws s3 sync ~/SageMaker/Viture/Data/ s3://{s3_bkt}/otter/data/

### Notice
Before running the code below, make sure you have:

- Configured a VPC endpoint for S3 and added the related route to the subnet you use below
- Configured a VPC NAT Gateway (if you need to `pip install` during training or download from the internet)
- Added a route (0.0.0.0/0 through the NAT gateway) to the route table used by the subnet you use below
- Configured the security group (REQUIRED if you use p4d/p4de instances)
- Added an inbound rule that allows all traffic in from the security group itself
- Added an outbound rule that allows all traffic out to the security group itself

In [None]:
import time
from sagemaker.estimator import Estimator

# URI of the training image in ECR, built from the account, region and repository name.
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:latest".format(account, region, repo_name)


# Environment variables read by sm-train.py inside the container.
environment = {
    # no trailing /
    # Must be the S3 prefix that directly contains the `llama-7b-hf` and
    # `OTTER-LLaMA7B-INIT` folders, i.e. the prefix the models were uploaded
    # under by the download_and_push_model_to_s3 calls earlier in this notebook.
    'MODEL_S3_BASE': f's3://{s3_bkt}/viture/training/models/luodian',
    'DATA_S3_BASE': f's3://{s3_bkt}/otter/data',
    "PYTHONPATH": ".",  # required for training
}

# Hyperparameters of the training job.
hp = {
    "pretrained_model_name_or_path": f"{BASE_MODEL_DIR}/OTTER-LLaMA7B-INIT",
    "mimicit_path": "/opt/ml/input/data/MIMIC-IT-Release/VST/VST_instructions.json",
    "images_path": "/opt/ml/input/data/MIMIC-IT-Release/VST/VST.json",
    "train_config_path": "/opt/ml/input/data/MIMIC-IT-Release/VST/VST_train.json",
    "batch_size": 1,
    "num_epochs": 6,
    "run_name": 'OTTER-LLaMA7B-densecaption',
    "wandb_project": 'OTTER-LLaMA7B',
    "workers": 1,
    "lr_scheduler": "cosine",
    "learning_rate": '1e-5',
    "warmup_steps_ratio": '0.01',
    "model_name": "flamingo",
    "save_hf_model": True,
    "offline": True,
}

base_job_name = 'viture-otter'

instance_type = 'ml.g5.12xlarge'

estimator = Estimator(role=role,
                      entry_point='sm-train.py',
                      source_dir='./',
                      base_job_name=base_job_name,
                      instance_count=1,
                      instance_type=instance_type,
                      image_uri=image_uri,
                      environment=environment,
                      hyperparameters=hp,
                    #   subnets=['subnet-56d99b20'], # Should be same vpc with FSx, best to use same subnet with FSx
                    #   security_group_ids=['sg-e6c3059f'], # Needed when use FSx
                      keep_alive_period_in_seconds=60*30, # Optional. Recommended while debugging: relaunches skip instance provisioning and image download. Requires a warm pool instance limit increase first
                      volume_size=50,
                      disable_profiler=True,
                      debugger_hook_config=False)

# estimator.fit({"MIMIC-IT-Release": "s3://xxx"})
# `wait` is an argument of fit(), not of the Estimator constructor; passing it
# here is what actually makes the call return without blocking on the job.
estimator.fit(wait=False)

# SM Serving
----

In [None]:
# Mirror the serving model to S3, pinned to a fixed commit.
download_and_push_model_to_s3(
    model_id='luodian/OTTER-Image-MPT7B',
    commit_hash='6c2012970fcd643c015769a717259597f20e08f6',
    s3_prefix="viture/serving/models/luodian/OTTER-Image-MPT7B",
)