## Notebook环境
Notebook的运行环境可以选择conda_tensorflow_p36，这里所用的sagemaker版本为2.42.0，接下来我们会安装对应的版本。

In [None]:
! pip install --upgrade pip
! pip install sagemaker==2.42.0

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role

region = boto3.session.Session().region_name
role   = get_execution_role()
sess   = sagemaker.Session()
bucket = sess.default_bucket()

In [2]:
bucket

'sagemaker-us-east-1-022346938362'

## 准备Docker image

In [2]:
import boto3

account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.Session().region_name
ecr_repository = 'sagemaker-wenet'

# 登录ECR服务
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com


https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


### 创建注册表

In [4]:
!aws ecr create-repository --repository-name $ecr_repository


An error occurred (RepositoryAlreadyExistsException) when calling the CreateRepository operation: The repository with name 'sagemaker-wenet' already exists in the registry with id '022346938362'


### 构建训练镜像

In [3]:
training_docker_file_path = '/fsx/wenet_smddp'

!cat $training_docker_file_path/Dockerfile-py36-pt1.8.1-cu111-sox-ready

FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04

RUN cd / && \
    pip install ninja && \
    apt update && \
    apt-get install sox libsox-dev libsox-fmt-all pkg-config -y && \
    CUDNN_VERSION=8.0.5.39 && \
apt-get install cuda-nvrtc-11-1 cuda-nvrtc-dev-11-1 libcudnn8-dev=$CUDNN_VERSION-1+cuda11.1 -y && \
    TORCHAUDIO_VERSION=v0.8.1 && \
    git clone -b ${TORCHAUDIO_VERSION} https://github.com/pytorch/audio torchaudio && \
    cd torchaudio && \
    git submodule update --init --recursive && \
    pip install .

COPY ./requirements.txt /tmp/

RUN pip install -r /tmp/requirements.txt && \
    pip install sagemaker-training && \
    apt-get clean
    

In [None]:
# 构建训练镜像并推送到ECR
tag = ':training-pt181'
training_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)
print('training_repository_uri: ', training_repository_uri)

!cd $training_docker_file_path && docker build -t "$ecr_repository$tag" . -f Dockerfile-py36-pt1.8.1-cu111-sox-ready
!docker tag {ecr_repository + tag} $training_repository_uri
!docker push $training_repository_uri


# !docker pull $training_repository_uri

### 构建推理镜像

In [33]:
decoding_docker_file_path='/fsx/wenet_smddp/runtime/server/x86'

!cat $decoding_docker_file_path/Dockerfile

FROM ubuntu:latest
MAINTAINER <zhendong.peng@mobvoi.com>
ENV DEBIAN_FRONTEND=noninteractive
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
RUN apt-get update && apt-get install -y git cmake wget build-essential
RUN git clone https://github.com/mobvoi/wenet.git /home/wenet
ARG model=20210327_unified_transformer_exp_server.tar.gz
RUN wget -P /home http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell2/$model
RUN tar -xzf /home/$model -C /home
ARG build=/home/wenet/runtime/server/x86/build
RUN mkdir $build && cmake -S $build/.. -B $build



In [None]:
# 构建推理容器并推送到ECR
tag = ':decoding'
decoding_repository_uri = '{}.dkr.ecr.{}.amazonaws.com/{}'.format(account_id, region, ecr_repository + tag)
print('decoding_repository_uri: ', decoding_repository_uri)


!cd $decoding_docker_file_path && docker build -t "$ecr_repository$tag" .
!docker tag {ecr_repository + tag} $decoding_repository_uri
!docker push $decoding_repository_uri


## 数据准备

### 数据下载

In [None]:
cd /fsx/wenet_smddp/examples/aishell/s0 && \
bash run.sh --stage -1 --stop_stage -1 --data /fsx/asr-data/OpenSLR/33


### 数据预处理

In [8]:
from sagemaker.inputs import FileSystemInput
from sagemaker.pytorch.estimator import PyTorch

# bash run.sh --stage 4 --stop_stage 4 --trail_dir /opt/ml/input/data/train --train_set train --data /opt/ml/input/data/33

# 指定文件系统的id.
file_system_id = 'fs-0f8a3b8eef47b6ff8'
# 提供数据集所在的路径，注意格式
file_system_path = '/yobzhbmv'
# 指定挂载文件系统的访问模式，支持"ro"（只读）或"rw"（读写）两种，注意内置算法只支持 以 ro 的方式挂载
file_system_access_mode = 'rw'
# 指定文件系统的类型, 支持"EFS" 或 "FSxLustre"两种.
file_system_type = 'FSxLustre'
# 以VPC内的方式启动 Amazon SageMaker 训练任务,指定所在子网和安全组，subnet需要为list或者tuple格式
security_group_ids = ['sg-04acfc98f6929ee4e']
# subnets= ['vpc-3c49de46']
subnets= ['subnet-07ce0ab63b4cfeb25']

# 定义数据输入
file_system_input_train = FileSystemInput(file_system_id=file_system_id,
                                  file_system_type=file_system_type,
                                  directory_path=file_system_path,
                                  file_system_access_mode=file_system_access_mode)

data_dir   = '/opt/ml/input/data/train/asr-data/OpenSLR/33'
trail_dir  = '/opt/ml/input/data/train/sm-train/trail0'
shared_dir = '/opt/ml/input/data/train/sm-train/shared'
# shared_dir = '/opt/ml/input/data/train/shared'

## 数据预处理 - SageMaker托管实例

In [None]:


bash run.sh --stage 4 --stop_stage 4 --train_set train  \
    --data /opt/ml/input/data/train/asr-data/OpenSLR/33 \
    --trail_dir /opt/ml/input/data/train/sm-train/trail0 \
    --shared_dir /opt/ml/input/data/train/sm-train/shared 

# /opt/ml/input/data/train  <==> /fsx
# /opt/ml/input/data/train/asr-data/OpenSLR/33  <==> /fsx/asr-data/OpenSLR/33
# /opt/ml/input/data/train/sm-train ==> /fsx/sm-train

hp= {
    'stage': 0, 'stop_stage': 3, 'train_set':'train', 
    'data': data_dir, 'trail_dir': trail_dir, 'shared_dir': shared_dir
}

estimator=PyTorch(
    entry_point='examples/aishell/s0/sm-run.sh',
    image_uri=training_repository_uri,
    instance_type='ml.c5.4xlarge',
    instance_count=1,
    source_dir='.',
    role=role,
    hyperparameters=hp,
    
    subnets=subnets,
    security_group_ids=security_group_ids,
    
    debugger_hook_config=False,
    disable_profiler=True
)

# estimator.fit({'train':'file:///fsx/trail_local_0/', 'wav':'file:///fsx/asr-data/OpenSLR/33/data_aishell/wav/'})

estimator.fit(inputs={'train': file_system_input_train})


2021-06-08 09:49:56 Starting - Starting the training job...
2021-06-08 09:49:58 Starting - Launching requested ML instances......
2021-06-08 09:51:11 Starting - Preparing the instances for training......
2021-06-08 09:52:06 Downloading - Downloading input data
2021-06-08 09:52:06 Training - Downloading the training image...........[34m2021-06-08 09:54:06,765 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-08 09:54:08,241 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-08 09:54:08,251 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-06-08 09:54:08,258 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "train": "/opt/ml/input/data/train"
    },
    "current_host": "algo-1",
    "framework_module": null,
    "hosts": [


## 模型训练 - SageMaker托管实例


In [6]:
!pwd

/fsx/wenet_smddp/examples/aishell/s0


In [None]:
%cd /fsx/wenet_smddp

# bash run.sh --stage 4 --stop_stage 4 --train_set train  \
#     --data /opt/ml/input/data/train/asr-data/OpenSLR/33 \
#     --trail_dir /opt/ml/input/data/train/sm-train/trail0 \
#     --shared_dir /opt/ml/input/data/train/sm-train/shared 


data_dir   = '/opt/ml/input/data/train/asr-data/OpenSLR/33'
trail_dir  = '/opt/ml/input/data/train/sm-train/trail0'
shared_dir = '/opt/ml/input/data/train/sm-train/shared'

# instance_type='ml.g4dn.4xlarge'
instance_type='ml.p4d.24xlarge'
instance_count = 2
# CUDA_VISIBLE_DEVICES='0,1,2,3'
CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'

hp= {
    'stage': 4, 'stop_stage': 4, 'train_set':'train', 
    'data': data_dir, 'trail_dir': trail_dir, 'shared_dir': shared_dir,
    'CUDA_VISIBLE_DEVICES': CUDA_VISIBLE_DEVICES, 
    'ddp_init_path': '/opt/ml',
    'num_nodes': instance_count
}

estimator=PyTorch( 
    entry_point='examples/aishell/s0/sm-run.sh',
    image_uri=training_repository_uri,
#     image_uri='022346938362.dkr.ecr.us-east-1.amazonaws.com/sagemaker-wenet:training-pt181',
    instance_type =instance_type,
    instance_count=instance_count,
    source_dir='.',
    role=role,
    hyperparameters=hp,
    
    subnets=subnets,
    security_group_ids=security_group_ids,
    
    debugger_hook_config=False,
    disable_profiler=True,
    distribution = {
        'smdistributed':{
            'dataparallel':{
                'enabled': True, 
#                 "custom_mpi_options": "-verbose -x NCCL_DEBUG=VERSION"
            }
        }
    }
    # Parameters required to enable checkpointing
#     checkpoint_s3_uri=checkpoint_s3_bucket,
#     checkpoint_local_path=checkpoint_local_path
)


estimator.fit(inputs={'train': file_system_input_train})
