# Sentence Transformers on Inf1

This notebook demonstrates how to deploy a sentence-transformers model on Inf1 and use all 4 Neuron cores of an inf1.xlarge instance. The model was traced with a separate script on a larger EC2 instance; it can also be traced within a Studio notebook.

## Package installation

In [None]:
# Register the AWS Neuron pip repository as an extra package index for subsequent pip installs.
!pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com

In [None]:
# Install or upgrade the Neuron PyTorch package and compiler together with the SageMaker SDK and Hugging Face libraries.
!pip install torch-neuron neuron-cc[tensorflow] sagemaker transformers accelerate datasets --upgrade

In [None]:
# Upgrade transformers to the latest release.
# NOTE(review): the previous install cell already lists transformers with --upgrade, so this looks redundant — confirm before removing.
!pip install transformers -U 

## Imports

In [None]:
import torch
import os
import torch.neuron
from transformers import AutoModel, AutoTokenizer

In [None]:
# The model artifacts are kept in a per-batch-size directory named "tmp<batch_size>".
batch_size = 50
model_folder_name = "tmp" + str(batch_size)

In [None]:
%%writefile code/inference.py

print("Importing everything")
import os
from transformers import AutoConfig, AutoTokenizer
import torch
import torch.neuron
import torch.nn.functional as F
import pathlib
import json

print("Imported everything")

# To use one neuron core per worker
os.environ["NEURON_RT_NUM_CORES"] = "1"

# Environment variables are always strings, so both settings are cast to int.
# max_length in particular is passed to the tokenizer, which needs an integer
# padding/truncation length.
batch_size = int(os.environ.get('ST_BATCH_SIZE', 10))
max_length = int(os.environ.get('MAX_LENGTH', 256))

# File name of the traced model inside the model directory; it is keyed by
# batch size, so ST_BATCH_SIZE must match the batch size used when tracing.
AWS_NEURON_TRACED_WEIGHTS_NAME = f"neuron_model_{batch_size}.pt"
print(f"batch_size is {batch_size}")

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast to the
            embedding shape and used as the averaging weight.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1,
        divided by the mask sum along the same dimension.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

def model_fn(model_dir):
    """Load the tokenizer, the traced model and the model config from ``model_dir``.

    Args:
        model_dir: Directory containing the tokenizer/config files and the
            traced model file named by ``AWS_NEURON_TRACED_WEIGHTS_NAME``.

    Returns:
        A ``(model, tokenizer, model_config)`` tuple, in the order that
        ``predict_fn`` unpacks it.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    traced_model_path = os.path.join(model_dir, AWS_NEURON_TRACED_WEIGHTS_NAME)
    print("JIT Loading")
    model = torch.jit.load(traced_model_path)
    print("JIT Loading done")

    model_config = AutoConfig.from_pretrained(model_dir)
    return model, tokenizer, model_config

def predict_fn(data, model_tokenizer_model_config):
    """Compute L2-normalized sentence embeddings for the request payload.

    Args:
        data: Request payload. The value under ``"inputs"`` is removed from
            the dict and tokenized; if the key is absent, ``data`` itself is
            passed to the tokenizer.
        model_tokenizer_model_config: The ``(model, tokenizer, model_config)``
            tuple returned by ``model_fn``. ``model_config`` is unpacked but
            not used here.

    Returns:
        A one-element list holding a dict whose ``"embeddings"`` value is a
        JSON-encoded string of the nested embedding list.
    """
    model, tokenizer, model_config = model_tokenizer_model_config

    print("Model input received")
    sentences = data.pop("inputs", data)

    # Every input is padded/truncated to exactly max_length tokens so the
    # tensor shape is the same for every request.
    encoded_input = tokenizer(
        sentences,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    # Only the token ids are fed to the model; the attention mask is used
    # solely for pooling below.
    # NOTE(review): assumes the model was traced with input_ids as its only input — confirm.
    input_ids = encoded_input['input_ids']
    print("Neuron inputs created")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(input_ids)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])
        unit_embeddings = F.normalize(pooled, p=2, dim=1)

    # The embeddings are serialized to a JSON string inside the returned dict.
    return [{"embeddings": json.dumps(unit_embeddings.tolist())}]

## Sample inputs to trace the model

In [None]:
# Matches the default MAX_LENGTH (256) that inference.py pads/truncates to.
max_length = 256
# One request payload: the same sample sentence repeated batch_size times.
batches = [
    "The movie had stunning visuals and a unique storyline, but the pacing felt off and some scenes were confusing"
    for _ in range(batch_size)
]

## Create a SageMaker session

In [None]:
import sagemaker
import boto3
# Temporary session, used here only to look up the default bucket.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role: use the notebook's execution role when available,
# otherwise look up a role named 'sagemaker_execution_role' via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the chosen bucket; this is the session used by the cells below.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Show the model folder name; IPython expands $model_folder_name from the notebook namespace.
!echo $model_folder_name

In [None]:
!cp -r ./code $model_folder_name

# create a model.tar.gz archive with all the model artifacts and the inference.py script.
%cd tmp50
!tar zcvf model.tar.gz *
%cd ..

In [None]:
# Key prefix used to build the S3 destination path for model.tar.gz.
# Within this notebook it is only used for the S3 upload location.
model_id = "sentence-transformer"

In [None]:
from sagemaker.s3 import S3Uploader

# create s3 uri
s3_model_path = f"s3://{sess.default_bucket()}/{model_id}"

# upload model.tar.gz
# Read the archive from model_folder_name, the directory the artifacts are
# packaged in, rather than rebuilding the folder name from batch_size here.
s3_model_uri = S3Uploader.upload(
    local_path=f"{model_folder_name}/model.tar.gz",
    desired_s3_uri=s3_model_path,
)
print(f"model artifcats uploaded to {s3_model_uri}")

## Model deployment

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

In [None]:
# Candidate inference container images (both hosted in us-west-2).
# image_uri_py37 is the one passed to HuggingFaceModel below.
image_uri_py37 = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference-neuron:1.10.2-transformers4.20.1-neuron-py37-sdk1.19.1-ubuntu18.04"
# NOTE(review): image_uri_py310 is not referenced in the cells shown here — presumably kept as an alternative; confirm.
image_uri_py310 = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuron:1.13.1-neuron-py310-sdk2.14.1-ubuntu20.04"

In [None]:
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,      # path to your model and script
    role=role,                    # iam role with permissions to create an Endpoint
    image_uri=image_uri_py37,
    env={
        # One model-server worker per Neuron core (4 on inf1.xlarge);
        # inference.py restricts each worker to a single core.
        "SAGEMAKER_MODEL_SERVER_WORKERS": "4",
        # inference.py loads neuron_model_{ST_BATCH_SIZE}.pt, so this must
        # follow the notebook's batch_size rather than a hard-coded value.
        "ST_BATCH_SIZE": str(batch_size),
        "NEURON_MONITOR_CW_REGION": "us-west-2",
        "NEURON_MONITOR_CW_NAMESPACE": "/aws/sagemaker",
        "NEURON_RT_LOG_LEVEL": "5",
    },
)

In [None]:
# Sets a private SDK attribute on the model object.
# NOTE(review): presumably marks the artifact as already compiled for Neuron so the SDK does not expect a compilation step — verify against the installed sagemaker SDK version.
huggingface_model._is_compiled_model = True

In [None]:
# Deploy the model to a single ml.inf1.xlarge instance; the returned predictor is used to invoke the endpoint below.
predictor = huggingface_model.deploy(
    initial_instance_count=1,      # number of instances
    instance_type="ml.inf1.xlarge" # AWS Inferentia Instance
)

In [None]:
# Simple load generator: 3000 rounds of 4 sequential requests, each sending
# the full sample batch. Distinct throwaway loop names avoid the inner loop
# shadowing the outer index.
for _round in range(3000):
    for _request in range(4):
        result = predictor.predict({"inputs": batches})