## Using Hugging Face DLC to Host the Whisper Model for Automatic Speech Recognition Tasks

## Common setup
**❗If you run this notebook in SageMaker Studio, please select the Data Science 2.0 image and choose the ml.m5.large instance.**

In [10]:
# # Install required packages
# %pip install openai-whisper # ==20230918 # -q
# %pip install torchaudio # ==2.1.0 # -q
# %pip install datasets # ==2.16.1 # -q
# %pip install sagemaker # ==2.184.0 # -q
# %pip install librosa  # -q
# %pip install soundfile # -q

In [11]:
# !pip install transformers # >=4.28.1 -q
# !pip install accelerate # >=0.20.3 -q

In [8]:
# !conda install -y ffmpeg

In [106]:
#!pip install -U sagemaker

**❗Please restart the kernel before executing the cells below.**

In [43]:
# import required packages 
import torch
import whisper
import torchaudio
import sagemaker
import time
import boto3

In [44]:
# Basic configuration shared by the cells below.
sess = sagemaker.session.Session()
bucket = sess.default_bucket()
# S3 key prefix for everything this notebook uploads or writes.
# (A previous value, 'models/whisper_ckpts', was overwritten immediately and never used.)
prefix = 'whisper_blog_post'
role = sagemaker.get_execution_role()
# Read the region through the public Session attribute rather than the private `_region_name`.
region = sess.boto_region_name

# boto3 runtime client, used further down to invoke the asynchronous endpoint directly
sm_runtime = boto3.client("sagemaker-runtime")

### Create Whisper Hugging Face model artifacts and upload to S3 bucket

In [49]:
# Create the local staging directory for the model artifacts (-p: no error if it already exists)
!mkdir -p model

In [50]:
from transformers import AutoModelForSpeechSeq2Seq, WhisperProcessor, WhisperTokenizer

# Checkpoint to package: a local checkpoint directory here; the trailing comment names a hub alternative
model_name = "/home/ec2-user/SageMaker/efs/Projects/whisper/checkpoint/checkpoint-v6/checkpoint-40" # "openai/whisper-base"
# Local staging directory that receives every artifact saved below
save_directory = "./model"

# Model weights and configuration
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
model.save_pretrained(save_directory)

# Tokenizer files
tokenizer = WhisperTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(save_directory)

# Processor files; kept as the final expression so its return value remains the cell output
processor = WhisperProcessor.from_pretrained(model_name)
processor.save_pretrained(save_directory)

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.83s/it]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[]

In [51]:
# Pack the contents of ./model (not the directory itself) into a gzipped tarball
!tar cvzf model.tar.gz -C model/ .

# Upload the tarball under <prefix>/huggingface/model in the session bucket and keep the returned S3 URI
model_uri = sess.upload_data('model.tar.gz', bucket = bucket, key_prefix=f"{prefix}/huggingface/model")
# Remove the local tarball and the staging directory after the upload
!rm model.tar.gz
!rm -rf model
# Display the S3 URI of the uploaded artifact
model_uri

./
./tokenizer_config.json
./preprocessor_config.json
./model-00002-of-00002.safetensors
./model.safetensors.index.json
./config.json
./merges.txt
./generation_config.json
./special_tokens_map.json
./added_tokens.json
./vocab.json
./model-00001-of-00002.safetensors
./normalizer.json


's3://sagemaker-us-west-2-452145973879/whisper_blog_post/huggingface/model/model.tar.gz'

In [135]:
# Generate a unique model name from the current Unix timestamp and provide the image URI.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept because the
# endpoint names and the batch transform job name in later cells are derived from it.

id = int(time.time())
model_name = f'whisper-hf-model-{id}'

# The active image URI below is pinned to one specific account and to us-west-2.
# Change it to an image that exists in the account/region you are using (e.g. us-east-1).
# The two commented-out lines are alternative images parameterized by `region`.
# image = f"763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"
# image = f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-inference:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
image = "452145973879.dkr.ecr.us-west-2.amazonaws.com/whisper-inference:v0"

In [136]:
# Describe the deployable model: uploaded artifact + serving image + custom inference script
from sagemaker.huggingface.model import HuggingFaceModel

# Environment variables handed to the serving container.
# NOTE(review): presumably `chunk_length_s` is read by code/inference.py and the MMS_* values
# raise the model server's request/response size limits and response timeout - confirm.
whisper_env = {
    "chunk_length_s": "30",
    'MMS_MAX_REQUEST_SIZE': '2000000000',
    'MMS_MAX_RESPONSE_SIZE': '2000000000',
    'MMS_DEFAULT_RESPONSE_TIMEOUT': '900',
}

whisper_hf_model = HuggingFaceModel(
    name=model_name,
    role=role,
    model_data=model_uri,
    image_uri=image,
    source_dir='code',          # directory that holds the entry point script
    entry_point="inference.py",
    env=whisper_env,
)

### Real-time inference 

In [137]:
from sagemaker.serializers import DataSerializer
from sagemaker.deserializers import JSONDeserializer

# Request/response handling for the real-time predictor:
# requests are sent with content type "audio/x-audio", responses are deserialized as JSON.
audio_serializer = DataSerializer(content_type="audio/x-audio")
deserializer = JSONDeserializer()

In [138]:
# Stand up a real-time endpoint whose name carries the shared `id` timestamp
endpoint_name = f'whisper-hf-real-time-endpoint-{id}'

real_time_predictor = whisper_hf_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.g4dn.xlarge",  # alternative: "ml.g5.xlarge"
    initial_instance_count=1,
    serializer=audio_serializer,
    deserializer=deserializer,
)

--------------!

#### If the endpoint is already deployed

In [149]:
# Attach a Predictor to an endpoint that already exists instead of deploying a new one.
# NOTE(review): Model, image_uris, serializers and deserializers are imported here but are not
# used by any live code in this cell.
from sagemaker import Model, image_uris, serializers, deserializers

# Endpoint names are hard-coded from earlier runs, and this assignment overwrites the
# `endpoint_name` set by the deploy cell. Replace it with the endpoint you want to reuse.
# endpoint_name = 'whisper-hf-real-time-endpoint-1718259402'  # g4dn
endpoint_name = 'whisper-hf-real-time-endpoint-1718257905'  # g5

# Reuse the serializer/deserializer pair defined for real-time inference
real_time_predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name, 
    sagemaker_session=sess,
    serializer=audio_serializer, # serializers.JSONSerializer()
    deserializer = deserializer
)

# Display the predictor object to confirm it was created
real_time_predictor

<sagemaker.base_predictor.Predictor at 0x7fc7ad18d180>

In [140]:
# # Download a test data sample from huggingface dataset
# import soundfile as sf
# from datasets import load_dataset
# dataset = load_dataset('MLCommons/peoples_speech', split='train', streaming = True)
# sample = next(iter(dataset))
# audio_data = sample['audio']['array']
# output_path = 'sample_audio.wav'
# sf.write(output_path, audio_data, sample['audio']['sampling_rate'])

# print(f"Audio sample saved to '{output_path}'.")

In [159]:
import json
import time

# Perform real-time inference on one local audio file.
# Only one `audio_path` assignment is live; the other clips that were tried are kept as
# commented-out alternatives instead of being assigned and immediately overwritten.
# audio_path = "/home/ec2-user/SageMaker/efs/Projects/whisper/data/midea_data_500/wavs/bdaca8e0-4eab-44d0-b632-91dbb78e02eb.wav"
# audio_path = "/home/ec2-user/SageMaker/efs/Projects/whisper/data/midea_data_500/wavs/89c871c5-516c-4bc0-90a5-0d7e60fbc374.wav"
# audio_path = "/home/ec2-user/SageMaker/efs/Projects/whisper/data/midea_0612/wavs/d5f2afaa-53af-4dcb-ac24-3827a99c748e.wav"
# audio_path = "/home/ec2-user/SageMaker/efs/Projects/whisper/data/midea_0612/fb0b8693-208a-408c-9f56-1979aa9b5421.wav"
audio_path = "/home/ec2-user/SageMaker/efs/Projects/whisper/data/midea_0612/wavs_1channel/d5f2afaa-53af-4dcb-ac24-3827a99c748e_c1.wav"

# audio_path = "sample_audio.wav" 

# Measure the round trip with a monotonic clock: time.perf_counter() is meant for elapsed-time
# measurement and is not affected by system clock adjustments, unlike time.time().
start_time = time.perf_counter()
response = real_time_predictor.predict(data=audio_path)
time_cost = time.perf_counter() - start_time

print(f"time cost: {time_cost}s")
print(f"response: {response}")
# print(response[0])

time cost: 3.873155355453491s
response: 放落嚟星期五鬼星小姐你本人電話號碼幾多羅小姐你本人電話號碼係嗨你嗰部大眼雞嚟㗎嘛係咪?羅士在新聞問但係如果係話譬如我哋師傅上門安裝嗰個時間你都係與佢都係當日鐘五十二點至下集五點鐘呢段時間都係大約呢段時間上門嘅係呀好啊冇問題你府上係住喺鑽石山啟鑽院嗰邊嚟㗎嘛劉少正係咪好咁我呢邊就照落單我哋安排師傅今個星期五上門即係四月二十六號嘅好咁師�


In [83]:
# optional: Delete real-time inference endpoint, this is not required for below steps
# real_time_predictor.delete_endpoint()

### Batch Transform Inference

In [160]:
# Create a batch transformer from the same model definition.
# Results are written under s3://<bucket>/<prefix>/batch-transform/.
whisper_transformer = whisper_hf_model.transformer(
    instance_count = 1,
    instance_type = "ml.g4dn.xlarge",
    output_path="s3://{}/{}/batch-transform/".format(bucket, prefix),
    max_payload = 100  # maximum payload size (MB) of a single request sent to the container
)

Using already existing model: whisper-hf-model-1718259402


In [161]:
# S3 prefix holding one or more audio files to process with batch transform.
# The value below points at a specific account's bucket; replace it with your own location.
data = "s3://sagemaker-us-west-2-452145973879/data/midea_data_500/wavs/"

In [162]:
# Name the batch transform job after the shared `id` timestamp
job_name = f"whisper-hf-batch-transform-{id}"

# Start the batch transform job over the objects under `data`.
# wait = False: return immediately instead of blocking the notebook until the job finishes.
whisper_transformer.transform(data = data, job_name= job_name, wait = False)

INFO:sagemaker:Creating transform job with name: whisper-hf-batch-transform-1718259402


### Asynchronous Inference 

In [163]:
%%time
from sagemaker.async_inference import AsyncInferenceConfig

# Create an AsyncInferenceConfig object
async_config = AsyncInferenceConfig(
    output_path=f"s3://{bucket}/{prefix}/output", 
    max_concurrent_invocations_per_instance = 4,
    # notification_config = {
            #   "SuccessTopic": "arn:aws:sns:us-east-2:123456789012:MyTopic",
            #   "ErrorTopic": "arn:aws:sns:us-east-2:123456789012:MyTopic",
    # }, #  Notification configuration 
)

# Deploy the model for async inference
endpoint_name = f'whisper-hf-async-endpoint-{id}'
async_predictor = whisper_hf_model.deploy(
    async_inference_config=async_config,
    initial_instance_count=1, # number of instances
    instance_type ='ml.g4dn.xlarge', # instance type
    endpoint_name = endpoint_name
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-west-2-452145973879/whisper_blog_post/huggingface/model/model.tar.gz), script artifact (code), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-west-2-452145973879/whisper-hf-model-1718259402/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: whisper-hf-model-1718259402
INFO:sagemaker:Creating endpoint-config with name whisper-hf-async-endpoint-1718259402
INFO:sagemaker:Creating endpoint with name whisper-hf-async-endpoint-1718259402


------------!CPU times: user 5min 44s, sys: 1min 2s, total: 6min 46s
Wall time: 12min 31s


In [164]:
# Provide the S3 path of the audio file you want to process.
# The first assignment is a placeholder showing the expected form; it is immediately
# overridden by the concrete object on the following line.
input_path = "s3://xxx/audio-files/xxx.mp3"
input_path = "s3://sagemaker-us-west-2-452145973879/data/midea_data_500/wavs/005888b3-897d-4112-8232-0cefda76aa3f.wav"

In [165]:
# Perform async inference on the S3 object at `input_path`.
# The content type matches the one configured on the real-time serializer.
initial_args = {'ContentType':"audio/x-audio"}
response = async_predictor.predict_async(initial_args = initial_args, input_path=input_path)
# S3 location of the output object for this request
response.output_path

's3://sagemaker-us-west-2-452145973879/whisper_blog_post/output/ad9ec9de-b144-4a96-af09-cfdfc6b910e7.out'

### Optional: Test autoscaling configurations for Async inference 

In [166]:
autoscale = boto3.client('application-autoscaling')

# The scalable resource is the 'AllTraffic' variant of the endpoint named by `endpoint_name`
resource_id='endpoint/' + endpoint_name + '/variant/' + 'AllTraffic'
# SageMaker supports only Instance Count as the scalable dimension
scalable_dimension = 'sagemaker:variant:DesiredInstanceCount'

# Register scalable target: the instance count may move between 0 and 3
register_response = autoscale.register_scalable_target(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
    MinCapacity=0,
    MaxCapacity=3,  # check how many instances are available in your account
)

# Metric the policy tracks: average backlog size per instance for this endpoint
backlog_metric = {
    'MetricName': 'ApproximateBacklogSizePerInstance',
    'Namespace': 'AWS/SageMaker',
    'Dimensions': [
        {'Name': 'EndpointName', 'Value': endpoint_name}
    ],
    'Statistic': 'Average',
}

# Cooldowns stop further scaling before the effect of the previous activity is visible;
# tune them to your instance startup time or other application needs.
target_tracking_config = {
    'TargetValue': 3.0,   # the target value for the metric
    'CustomizedMetricSpecification': backlog_metric,
    'ScaleInCooldown': 60,   # seconds after a scale-in completes before another scale-in can start
    'ScaleOutCooldown': 60,  # seconds after a scale-out completes before another scale-out can start
    # 'DisableScaleIn': True|False - if true, the target tracking policy won't remove
    # capacity from the scalable resource.
}

# Define scaling policy
scalingPolicy_response = autoscale.put_scaling_policy(
    PolicyName='Invocations-ScalingPolicy',
    ServiceNamespace='sagemaker',  # namespace of the AWS service that provides the resource
    ResourceId=resource_id,
    ScalableDimension=scalable_dimension,
    PolicyType='TargetTrackingScaling',  # 'StepScaling'|'TargetTrackingScaling'
    TargetTrackingScalingPolicyConfiguration=target_tracking_config,
)

scalingPolicy_response

{'PolicyARN': 'arn:aws:autoscaling:us-west-2:452145973879:scalingPolicy:2a239217-1727-49eb-b629-8ea1870a3308:resource/sagemaker/endpoint/whisper-hf-async-endpoint-1718259402/variant/AllTraffic:policyName/Invocations-ScalingPolicy',
 'Alarms': [{'AlarmName': 'TargetTracking-endpoint/whisper-hf-async-endpoint-1718259402/variant/AllTraffic-AlarmHigh-0c02d78f-d7fa-4525-97df-ed99a5a4b874',
   'AlarmARN': 'arn:aws:cloudwatch:us-west-2:452145973879:alarm:TargetTracking-endpoint/whisper-hf-async-endpoint-1718259402/variant/AllTraffic-AlarmHigh-0c02d78f-d7fa-4525-97df-ed99a5a4b874'},
  {'AlarmName': 'TargetTracking-endpoint/whisper-hf-async-endpoint-1718259402/variant/AllTraffic-AlarmLow-19a89b75-bdab-42be-9e10-f1e2f3251de5',
   'AlarmARN': 'arn:aws:cloudwatch:us-west-2:452145973879:alarm:TargetTracking-endpoint/whisper-hf-async-endpoint-1718259402/variant/AllTraffic-AlarmLow-19a89b75-bdab-42be-9e10-f1e2f3251de5'}],
 'ResponseMetadata': {'RequestId': '90633e50-91b9-4aa5-a66e-2c724e38ba40',
  'H

In [167]:
# Trigger 1000 asynchronous invocations to build up a backlog for the autoscaling policy
# (intended behaviour: scale out from 1 towards 3 instances, then back down to 0 on completion).

print(endpoint_name)
for _ in range(1000):  # range(1000) yields exactly 1000 requests
    response = sm_runtime.invoke_endpoint_async(
        EndpointName=endpoint_name,
        InputLocation=input_path,
        # Same content type the predict_async call and the real-time serializer use
        ContentType="audio/x-audio",
    )

print("\nAsync invocations for Hugging Face model serving with autoscaling\n")

whisper-hf-async-endpoint-1718259402

Async invocations for Hugging Face model serving with autoscaling



### Clean up

In [None]:
# Delete the asynchronous inference endpoint.
# The real-time endpoint is not removed here; its delete call is the commented-out line in the
# optional cell at the end of the real-time inference section.
async_predictor.delete_endpoint()