# TODO
 
* ## https://medium.com/analytics-vidhya/deploy-huggingface-s-bert-to-production-with-pytorch-serve-27b068026d18 

* ## https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers

# Deploying our BERT PyTorch Model as a REST Endpoint

In [None]:
!pip install -q transformers==2.8.0
!pip install -q torch==1.5.0 --upgrade --ignore-installed

In [None]:
!pip install torchserve torch-model-archiver

In [None]:
# AWS / SageMaker setup; the names defined here are used by later cells.
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, the notebook's execution role,
# and the region of the default boto3 session.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Low-level SageMaker API client bound to that region.
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# Clone the TorchServe repository and install torch-model-archiver

You'll use `torch-model-archiver` to create a model archive file (.mar). The .mar model archive file contains model checkpoints along with its `state_dict` (dictionary object that maps each layer to its parameter tensor).

In [None]:
!git clone https://github.com/pytorch/serve.git
!pip install serve/model-archiver/

# Retrieve PyTorch Models

In [None]:
%store -r s3_pytorch_model_path

In [None]:
print(s3_pytorch_model_path)

In [None]:
%store -r s3_transformer_pytorch_model_path

In [None]:
print(s3_transformer_pytorch_model_path)

In [None]:
!aws s3 cp --recursive $s3_transformer_pytorch_model_path ./Transformer_model/

# Create TorchServe Model Archive File

Once setup_config.json, sample_text.txt and index_to_name.json are set up properly, we can go ahead and package the model and start serving it. The artifacts related to each operation mode (such as sample_text.txt, index_to_name.json) can be placed in their respective folders.

In [None]:
#!torch-model-archiver 
#    --model-name "bert" \
#    --version 1.0 \
#    --serialized-file ./bert_model/pytorch_model.bin \
#    --extra-files "./bert_model/config.json" \
#    --handler "./transformers_classifier_torchserve_handler.py"

In [None]:
!torch-model-archiver \
    --model-name DistilBertForSequenceClassification \
    --version 1.0 \
    --serialized-file Transformer_model/pytorch_model.bin \
    --handler ./src_torchserve/Transformer_handler_generalized.py \
    --extra-files "./Transformer_model/config.json,./src_torchserve/setup_config.json,./src_torchserve/Seq_classification_artifacts/index_to_name.json"

In [None]:
!ls ./*.mar

# Registering the Model on TorchServe and Running Inference

To register the model on TorchServe using the above model archive file, we run the following commands:

In [None]:
!mkdir ./model_store
!mv ./DistilBertForSequenceClassification.mar ./model_store/

# Note:  This requires Java 11 which is not currently the default in our SageMaker Notebook Instances

In [None]:
# %%bash

# torchserve \
# --start \
# --model-store model_store \
# --models distilbert-pytorch=DistilBertForSequenceClassification.mar &

In [None]:
# !ps -aef | grep torch

## To run the inference using our registered model, open a new terminal and run: 

In [None]:
# !curl -X POST http://127.0.0.1:8080/predictions/distilbert-pytorch -T ./src_torchserve/Seq_classification_artifacts/sample_text.txt

# Prepare the Model for SageMaker Deployment

## Upload .mar to S3

In [None]:
torchserve_model_name = 'DistilBertForSequenceClassification.mar'

In [None]:
s3_torchserve_mar = 's3://{}/models/torchserve/{}'.format(bucket, torchserve_model_name)
print(s3_torchserve_mar)

In [None]:
!aws s3 cp $s3_torchserve_mar $s3_torchserve_mar

# Store Endpoint Name for Next Notebook(s)

In [None]:
# NOTE(review): `endpoint_name` is not assigned in any cell visible in this
# notebook; it must already exist in the kernel for this %store to succeed.
# Confirm where it is meant to be set.
%store endpoint_name

# _Wait Until the ^^ Endpoint ^^ is Deployed_

In [None]:
# Block until boto3's 'endpoint_in_service' waiter completes for this endpoint.
client = boto3.client('sagemaker')
waiter = client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

# Simulate a Prediction from an Application

In [None]:
class RequestHandler(object):
    """Serializer that turns raw text instances into a JSON request body.

    Each instance is tokenized with the tokenizer supplied at construction
    time and emitted as a dict with the keys ``input_ids``, ``input_mask``
    and ``segment_ids``; the dicts are wrapped as ``{"instances": [...]}``.

    Args:
        tokenizer: Object exposing ``encode_plus(text, pad_to_max_length=...,
            max_length=...)`` that returns a mapping with ``input_ids`` and
            ``attention_mask`` entries.
        max_seq_length: Length passed to the tokenizer as ``max_length`` and
            used for the all-zero ``segment_ids`` list.
    """

    def __init__(self, tokenizer, max_seq_length):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        """Return the JSON string for ``instances`` (an iterable of texts)."""
        # Imported here because a class-body import only creates a class
        # attribute; it does not make `json` visible inside methods.
        import json

        transformed_instances = []

        for instance in instances:
            # Use the tokenizer given to this handler rather than whatever
            # global happens to be named `tokenizer` in the notebook.
            encode_plus_tokens = self.tokenizer.encode_plus(
                instance,
                pad_to_max_length=True,
                max_length=self.max_seq_length,
            )

            transformed_instances.append({
                "input_ids": encode_plus_tokens['input_ids'],
                "input_mask": encode_plus_tokens['attention_mask'],
                # Single-sequence input: every position is segment 0.
                "segment_ids": [0] * self.max_seq_length,
            })

        return json.dumps({"instances": transformed_instances})

In [None]:
class ResponseHandler(object):
    """Deserializer that maps an endpoint response to predicted class labels.

    Args:
        classes: Sequence of labels; the label at the index of the highest
            score in each prediction is returned.
    """

    def __init__(self, classes):
        self.classes = classes

    def __call__(self, response, accept_header):
        """Return one label from ``self.classes`` per prediction.

        Args:
            response: Object with a ``read()`` method returning UTF-8 encoded
                JSON bytes containing a ``"predictions"`` list of score lists.
            accept_header: Unused; kept for the deserializer call signature.

        Returns:
            List of predicted class labels, in response order.
        """
        # Imported here because a class-body import only creates a class
        # attribute; it does not make `json` visible inside methods.
        import json

        response_body = response.read().decode('utf-8')
        response_json = json.loads(response_body)
        log_probabilities = response_json["predictions"]

        predicted_classes = []
        for log_probability in log_probabilities:
            # Softmax is monotonic, so the argmax of the raw scores equals the
            # argmax of the softmax; no tensor library is needed, and the
            # result is a plain int that can index `self.classes` directly.
            predicted_class_idx = max(range(len(log_probability)),
                                      key=log_probability.__getitem__)
            predicted_classes.append(self.classes[predicted_class_idx])

        return predicted_classes

In [None]:
import json
from sagemaker.tensorflow.serving import Predictor
from transformers import DistilBertTokenizer

# Tokenizer handed to the request serializer below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Serializer: tokenizes each input text to a fixed length of 128.
request_handler = RequestHandler(tokenizer=tokenizer,
                                 max_seq_length=128)

# Deserializer: maps each prediction to one of these labels
# (presumably 1-5 star ratings, given the print format used later; confirm).
response_handler = ResponseHandler(classes=[1, 2, 3, 4, 5])

# NOTE(review): this is the TensorFlow Serving predictor with
# model_name='saved_model', while the notebook packages a TorchServe .mar.
# Confirm that this matches the endpoint actually deployed as `endpoint_name`.
predictor = Predictor(endpoint_name=endpoint_name,
                      sagemaker_session=sess,
                      serializer=request_handler,
                      deserializer=response_handler,
                      content_type='application/json',
                      model_name='saved_model',
                      model_version=0)

In [None]:
import tensorflow as tf
import json
    
# Two sample inputs sent to the endpoint in a single request.
reviews = ["This is great!", 
           "This is terrible."]

predicted_classes = predictor.predict(reviews)

# Print each prediction next to the review it belongs to.
for predicted_class, review in zip(predicted_classes, reviews):
    print('[Predicted Star Rating: {}]'.format(predicted_class), review)

# Simulate a (Mini-)Load Test

In [None]:
def predict(idx):
    """Send the two fixed sample reviews to the endpoint and return the result.

    `idx` is accepted so the function can be mapped over a range of indices;
    its value is not used.
    """
    sample_reviews = ["This is great!", "This is terrible."]
    return predictor.predict(sample_reviews)

In [None]:
import functools
import multiprocessing

# One worker process per CPU reported for this machine.
num_cpus = multiprocessing.cpu_count()

# Worker pool used by the load-test cell below.
p = multiprocessing.Pool(num_cpus)

In [None]:
%%time

# range(1, 100) yields 99 indices, so predict() is called 99 times across the pool.
results = p.map(predict, range(1,100))

# Verify that Elastic Inference is working
https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html

_Note:  This may take 10-15 minutes for the metrics to appear._

In [None]:
!aws cloudwatch list-metrics --namespace " AWS/ElasticInference "

# Optimize Cost with TensorFlow and Elastic Inference
https://aws.amazon.com/blogs/machine-learning/optimizing-costs-in-amazon-elastic-inference-with-amazon-tensorflow/

# Using API Gateway with SageMaker Endpoints

https://aws.amazon.com/blogs/machine-learning/creating-a-machine-learning-powered-rest-api-with-amazon-api-gateway-mapping-templates-and-amazon-sagemaker/