In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# llama2 deployment to GKE using vLLM on GPU

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fmodel_garden%2Fmodel_garden_gemma_deployment_on_gke.ipynb">
      Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates downloading and deploying llama2 from Meta using the vLLM inference server. In this notebook, we will deploy and serve llama2 on GPUs in a GKE cluster.


### Objective

Deploy and run inference for serving llama2 with vLLM on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

- Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)
- Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)


### vLLM

vLLM is a fast and easy-to-use library for LLM inference and serving.

vLLM is fast with:

- State-of-the-art serving throughput
- Efficient management of attention key and value memory with PagedAttention
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: GPTQ, AWQ, SqueezeLLM, FP8 KV Cache
- Optimized CUDA kernels

To learn more, refer to the [vLLM documentation](https://docs.vllm.ai/en/latest/)


## Run the notebook

In [None]:
# @title Setup Google Cloud project

# @markdown 1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

# @markdown 2. Set Hugging Face access token in `HF_TOKEN` field. If you don't already have a "read" access token, follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create an access token with "read" permission. You can find your existing access tokens in the Hugging Face [Access Token](https://huggingface.co/settings/tokens) page.

# @markdown 3. **[Optional]** Set `CLUSTER_NAME` if you want to use your own GKE cluster. If not set, this example will create a standard cluster with 2 NVIDIA L4 GPU accelerators.

import os
from datetime import datetime

# The HuggingFace token used to download models.
HF_TOKEN = ""  # @param {type:"string"}
assert HF_TOKEN, "Please set Hugging Face access token in `HF_TOKEN`."

# Get the default cloud project id.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Set up gcloud.
! gcloud config set project "$PROJECT_ID"
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

# The cluster name to create
CLUSTER_NAME = "autopilot-cluster-1"  # @param {type:"string"}

# Use existing GKE cluster or create a new cluster.
if CLUSTER_NAME:
    ! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}
else:
    now = datetime.now().strftime("%Y%m%d%H%M%S")
    CLUSTER_NAME=f"gke-gemma-cluster-{now}"
    ! gcloud container clusters create {CLUSTER_NAME} \
        --project={PROJECT_ID} \
        --region={REGION} \
        --workload-pool={PROJECT_ID}.svc.id.goog \
        --release-channel=rapid \
        --num-nodes=4
    ! gcloud container node-pools create gpupool \
        --accelerator=type=nvidia-l4,count=2,gpu-driver-version=latest \
        --project={PROJECT_ID} \
        --location={REGION} \
        --node-locations={REGION}-a \
        --cluster={CLUSTER_NAME} \
        --machine-type=g2-standard-24 \
        --num-nodes=1

# Create Kubernetes secret for Hugging Face credentials
! kubectl create secret generic hf-secret \
    --from-literal=hf_api_token={HF_TOKEN} \
    --dry-run=client -o yaml > hf-secret.yaml

! kubectl apply -f hf-secret.yaml

Updated property [core/project].


Your current Google Cloud CLI version is: 483.0.0
Installing components from version: 483.0.0

┌─────────────────────────────────────────────┐
│     These components will be installed.     │
├────────────────────────┬─────────┬──────────┤
│          Name          │ Version │   Size   │
├────────────────────────┼─────────┼──────────┤
│ gke-gcloud-auth-plugin │   0.5.9 │  4.0 MiB │
│ kubectl                │ 1.27.15 │  < 1 MiB │
│ kubectl                │ 1.27.15 │ 73.4 MiB │
└────────────────────────┴─────────┴──────────┘

For the latest full release notes, please visit:
  https://cloud.google.com/sdk/release_notes

Performing in place update...

╔════════════════════════════════════════════════════════════╗
╠═ Downloading: gke-gcloud-auth-plugin                      ═╣
╠════════════════════════════════════════════════════════════╣
╠═ Downloading: gke-gcloud-auth-plugin                      ═╣
╠══════════════════════════════════════════════════════════

In [None]:
# @title Deploy llama2

# @markdown This section deploys llama2 on vLLM.

# @markdown Select one of the following model version and size options:

K8S_YAML = f"""
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-server
  template:
    metadata:
      labels:
        app: llama-server
        ai.gke.io/model: LLaMA2_7B_Chat
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: model-garden
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240313_0916_RC00
        resources:
          requests:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu : 1
          limits:
            cpu: 5
            memory: 20Gi
            ephemeral-storage: 40Gi
            nvidia.com/gpu : 1
        command:
        - python
        - -m
        - vllm.entrypoints.api_server
        args:
        - --host=0.0.0.0
        - --port=7080
        - --tensor-parallel-size=1
        - --swap-space=16
        - --gpu-memory-utilization=0.95
        - --max-model-len=2048
        - --max-num-batched-tokens=4096
        - --disable-log-stats
        env:
        - name: DEPLOY_SOURCE
          value: UI_NATIVE_MODEL
        - name: MODEL_ID
          value: "Llama2-7B-chat-001"
        - name: AIP_STORAGE_URI
          value: "gs://vertex-model-garden-public-us/llama2/llama2-7b-chat-hf"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
---
apiVersion: v1
kind: Service
metadata:
  name: llama-service
spec:
  selector:
    app: llama-server
  type: ClusterIP
  ports:
  - protocol: TCP
    port: 8000
    targetPort: 7080
"""

with open("llama2.yaml", "w") as f:
    f.write(K8S_YAML)

! kubectl apply -f llama2.yaml

# Wait for container to be created.
import time

print("Waiting for container to be created...\n")
while True:
    shell_output = ! kubectl get pod
    container_status = "\n".join(shell_output)
    if "1/1" in container_status:
        break
    time.sleep(5)

print(container_status)

# Wait for downloading artifacts.
print("\nDownloading artifacts...")
while True:
    shell_output = ! kubectl logs -l app=llama-server
    logs = "\n".join(shell_output)
    if "Connected" in logs:
        break
    time.sleep(5)

print("Server is up and running.")

deployment.apps/tgi-gemma-deployment1 created
service/llm-service1 created
Waiting for container to be created...

NAME                                     READY   STATUS    RESTARTS   AGE
tgi-gemma-deployment-68659f584-xm4cc     1/1     Running   0          6h41m
tgi-gemma-deployment1-56d56ccb6d-d9p7q   0/1     Pending   0          1s

Downloading artifacts...


In [None]:
# @title Prediction

# @markdown Once the server is up and running, you may send prompts to local server for prediction.

import json

prompt = "What are the top 5 most popular programming languages? Please be brief."  # @param {type: "string"}
temperature = 0.40  # @param {type: "number"}
top_p = 0.1  # @param {type: "number"}
max_tokens = 250  # @param {type: "number"}

request = {
    "inputs": prompt,
    "temperature": temperature,
    "top_p": top_p,
    "max_tokens": max_tokens,
}

command = f"""kubectl exec -t $( kubectl get pod -l app=gemma-server1 -o jsonpath="{{.items[0].metadata.name}}" ) -c inference-server -- curl -X POST http://localhost:8000/generate \
   -H "Content-Type: application/json" \
   -d '{json.dumps(request)}' \
   2> /dev/null"""

output = !{command}
print("Output:")
print(json.loads(output[0])["generated_text"])

Output:


IndexError: list index out of range

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.

! kubectl delete deployments tgi-gemma-deployment
! kubectl delete services llm-service
! kubectl delete secrets hf-secret

DELETE_CLUSTER = False # @param {type: "boolean"}

if DELETE_CLUSTER:
  ! gcloud container clusters delete {CLUSTER_NAME} \
    --region={REGION} \
    --quiet