In [3]:
pip install torch

Collecting torch
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m445.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8

In [1]:
import os
import requests
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import boto3

# Define the Hugging Face model and S3 bucket information
model_name = "google/flan-t5-base"
s3_bucket_name = "huggingface-flan5-base"
s3_prefix = "flan-t5-model"  # The prefix within the S3 bucket where you want to store the model

# Create a directory to store the model files
model_dir = "flan-t5-model"
os.makedirs(model_dir, exist_ok=True)

# Download the model from Hugging Face
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Create an S3 client
s3 = boto3.client("s3")

# Upload the model to S3
for root, _, files in os.walk(model_dir):
    for file in files:
        local_path = os.path.join(root, file)
        s3_path = os.path.join(s3_prefix, os.path.relpath(local_path, model_dir))
        s3.upload_file(local_path, s3_bucket_name, s3_path)

# Clean up the local model files if needed
# os.rmdir(model_dir)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [4]:
import tarfile
local_model_dir = "flan-t5-model" 
# Create a tar.gz archive of the model files
archive_name = "flan-t5-model.tar.gz"
with tarfile.open(archive_name, "w:gz") as tar:
    tar.add(local_model_dir, arcname=os.path.basename(local_model_dir))

# Create an S3 client
s3 = boto3.client("s3")

# Upload the archive to S3
s3.upload_file(archive_name, s3_bucket_name, os.path.join(s3_prefix, archive_name))

# Clean up the local archive file if needed
os.remove(archive_name)

In [7]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.model import Model
from sagemaker.predictor import Predictor

# Define your SageMaker execution role
role = "arn:aws:iam::077086922411:role/service-role/AmazonSageMaker-ExecutionRole-20230824T205541"

# Define the S3 location of your model archive
model_s3_uri = "s3://huggingface-flan5-base/flan-t5-model/flan-t5-model.tar.gz"

# Create a SageMaker Model object
sagemaker_model = Model(
    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.8.1-cpu-py36-ubuntu18.04",  # SageMaker PyTorch inference image
    model_data=model_s3_uri,
    role=role,
)

# Deploy the model as an endpoint with instance type and instance count specified
predictor = sagemaker_model.deploy(
    endpoint_name="flan-t5-endpoint",  # Choose a name for your endpoint
    initial_instance_count=1,  # Set the initial instance count
    instance_type="ml.m5.large",  # Choose an appropriate instance type
    wait=True,
)

-------!

In [11]:
endpoint_name = "flan-t5-endpoint"

# Create a SageMaker Predictor object using the endpoint name
predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker.Session())

# Example input data (prompt)
prompt = "Translate the following English text to French: 'Hello, how are you?'"

# Make a prediction using the SageMaker predictor
response = predictor.predict(prompt)

# The response is usually in JSON format, so you can parse it
import json
result = json.loads(response)

# Extract the prediction from the result
prediction = result["predictions"][0]

# Print the prediction
print("Translated text:", prediction)

# Delete the SageMaker endpoint when done
predictor.delete_endpoint()

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (0) from primary with message "Your invocation timed out while waiting for a response from container primary. Review the latency metrics for each container in Amazon CloudWatch, resolve the issue, and try again.". See https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logEventViewer:group=/aws/sagemaker/Endpoints/flan-t5-endpoint in account 077086922411 for more information.