In [None]:
# Copyright 2023 Nils Knieling
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Building AI-powered applications using LocalAI and Llama 2 in the Google Cloud

[![Open in Colab](https://img.shields.io/badge/Open%20in%20Colab-%23F9AB00.svg?logo=googlecolab&logoColor=white)](https://colab.research.google.com/github/Cyclenerd/toolbox/blob/master/notebooks/LocalAI_Llama2.ipynb)
[![Open in Vertex AI Workbench](https://img.shields.io/badge/Open%20in%20Vertex%20AI%20Workbench-%234285F4.svg?logo=googlecloud&logoColor=white)](https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/Cyclenerd/toolbox/master/notebooks/LocalAI_Llama2.ipynb)
[![View on GitHub](https://img.shields.io/badge/View%20on%20GitHub-181717.svg?logo=github&logoColor=white)](https://github.com/Cyclenerd/toolbox/blob/master/notebooks/LocalAI_Llama2.ipynb)

![Screenshot](https://raw.githubusercontent.com/Cyclenerd/toolbox/master/notebooks/chatbot-ui-llama2.png)

## Setup Google Cloud environment

>

In [None]:
# @markdown ✏️ Replace the placeholder text below:

# Please fill in these values.
project_id = "test-nils-ai"  # @param {type:"string"}
region = "us-central1"  # @param {type:"string"}
llm_models_bucket = "nils-llm-models"  # @param {type:"string"}
llm_models_disk = "disk-nils-llm-models"  # @param {type:"string"}
vpc_network_name = "vpc-test-nils-ai" # @param {type:"string"}
docker_registry_name = "docker-test-nils-ai" # @param {type:"string"}

# Quick input validations.
assert project_id, "⚠️ Please provide a Google Cloud project ID"
assert region, "⚠️ Please provide a Google Cloud region"
assert llm_models_bucket, "⚠️ Please provide a Google Cloud storage bucket to store LLM models"
assert llm_models_disk, "⚠️ Please provide a Google Cloud storage disk to store LLM models"
assert vpc_network_name, "⚠️ Please provide a VPC network name"
assert docker_registry_name, "⚠️ Please provide a Artifact Registry repository name"

# Configure gcloud.
!gcloud config set project "{project_id}"
!gcloud config set storage/parallel_composite_upload_enabled "True"

print("☑️ Done")

In [3]:
#@markdown ### (Colab only!) Authenticate your Google Cloud Account

# Authenticate gcloud.
# The google.colab package is only importable inside the Colab runtime. Guard
# the import so that "Run all" also works elsewhere (e.g. Vertex AI Workbench)
# instead of stopping with an ImportError in this cell.
try:
  from google.colab import auth
except ImportError:
  print("ℹ️ Not running in Colab, skipping Colab authentication")
else:
  auth.authenticate_user()

In [None]:
#@markdown ###  Check authenticated user
current_user = !gcloud auth list \
  --filter="status:ACTIVE" \
  --format="value(account)" \
  --quiet

current_user = current_user[0]
print(f"Current user: {current_user}")

In [None]:
#@markdown ### Enable APIs

# Enable APIs
my_google_apis = [
    "storage.googleapis.com",
    "compute.googleapis.com",
    "artifactregistry.googleapis.com",
    "container.googleapis.com",
    "containerscanning.googleapis.com",
    "cloudbuild.googleapis.com",
    "notebooks.googleapis.com",
    "aiplatform.googleapis.com",
]

for api in my_google_apis :
  print(f"Enable API: {api}")
  !gcloud services enable "{api}" \
    --project="{project_id}" \
    --quiet

print("☑️ OK")

### Storage

In [None]:
#@markdown #### Create storage bucket for data

!gcloud storage buckets create 'gs://{llm_models_bucket}' \
  --location='{region}' \
  --uniform-bucket-level-access \
  --quiet

print("☑️ Done")
print(f"Open in console: https://console.cloud.google.com/storage/browser/{llm_models_bucket}")

### Disk

In [None]:
#@markdown #### Create disk for LLM models

!gcloud compute disks create "{llm_models_disk}" \
  --type="pd-ssd" \
  --size="75GB" \
  --zone="{region}-b" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

### Network

In [None]:
#@markdown #### Create a regional VPC network

!gcloud compute networks create "{vpc_network_name}" \
  --subnet-mode="custom" \
  --bgp-routing-mode="regional" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
#@markdown #### Create a subnet in the region

!gcloud compute networks subnets create "{vpc_network_name}-{region}" \
  --network="{vpc_network_name}" \
  --region="{region}" \
  --range="10.128.1.0/24" \
  --enable-private-ip-google-access \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
#@markdown #### Create firewall rules

!gcloud compute firewall-rules create "{vpc_network_name}-allow-default" \
  --allow="tcp:22,tcp:3389,icmp" \
  --network="{vpc_network_name}" \
  --project="{project_id}" \
  --quiet
!gcloud compute firewall-rules create "{vpc_network_name}-allow-http" \
  --allow="tcp:80,tcp:443,tcp:3000,tcp:8080" \
  --network="{vpc_network_name}" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
#@markdown #### Create router in region

!gcloud compute routers create "router-{vpc_network_name}-{region}" \
  --network="{vpc_network_name}" \
  --region="{region}" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
#@markdown #### Add NAT to router in region

!gcloud compute routers nats create "nat-{vpc_network_name}-{region}" \
  --router="router-{vpc_network_name}-{region}"  \
  --auto-allocate-nat-external-ips \
  --nat-all-subnet-ip-ranges \
  --enable-logging \
  --log-filter=ERRORS_ONLY \
  --min-ports-per-vm=256 \
  --region="{region}" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

### Registry

In [None]:
#@markdown #### Create Artifact Registry for Docker cointainer images

!gcloud artifacts repositories create "{docker_registry_name}" \
  --repository-format="docker"\
  --description="Docker contrainer registry" \
  --location="{region}" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

## Container

Build Docker container images.

### LocalAI

* GitHub: <https://github.com/go-skynet/LocalAI#readme>
* Website: <https://localai.io/>

In [None]:
#@markdown #### Build LocalAI container image from GitHub source

# Please fill in these values.
localai_git_repo = "https://github.com/go-skynet/LocalAI.git"  # @param {type:"string"}
localai_git_revision = "master"  # @param {type:"string"}

# Quick input validations.
assert localai_git_repo, "⚠️ Please provide the LocalAI Git source repository"
assert localai_git_revision, "⚠️ Please provide the LocalAI Git source revision"

!gcloud builds submit "{localai_git_repo}" \
  --git-source-revision="{localai_git_revision}" \
  --tag "{region}-docker.pkg.dev/{project_id}/{docker_registry_name}/localai:latest" \
  --machine-type="e2-highcpu-8" \
  --timeout="1h" \
  --region="{region}" \
  --default-buckets-behavior="regional-user-owned-bucket" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

### Chatbot UI

* GitHub: <https://github.com/mckaywrigley/chatbot-ui#readme>
* Website: <https://www.chatbotui.com/>

In [None]:
#@markdown #### Build Chatbot UI container image from GitHub source

# Please fill in these values.
chatbot_ui_git_repo = "https://github.com/mckaywrigley/chatbot-ui.git"  # @param {type:"string"}
chatbot_ui_git_revision = "main"  # @param {type:"string"}

# Quick input validations.
assert chatbot_ui_git_repo, "⚠️ Please provide the LocalAI Git source repository"
assert chatbot_ui_git_revision, "⚠️ Please provide the LocalAI Git source revision"

!gcloud builds submit "{chatbot_ui_git_repo}" \
  --git-source-revision="{chatbot_ui_git_revision}" \
  --tag "{region}-docker.pkg.dev/{project_id}/{docker_registry_name}/chatbot-ui:latest" \
  --machine-type="e2-highcpu-8" \
  --timeout="1h" \
  --region="{region}" \
  --default-buckets-behavior="regional-user-owned-bucket" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

## Workbench

Deploy Vertex user-managed notebooks instance in VPC network.

Machine type recommendation: [`n1-standard-2` in `us-central1`](https://gcloud-compute.com/us-central1/n1-standard-2.html)

In [None]:
#@markdown #### Create user-managed instance for workbench/notebooks

# This code snippet may run for a few (>5) minutes.

# Please fill in these values.
workbench_machine_type = "n1-standard-2"  # @param {type:"string"}
workbench_data_disk_gb = "250"  # @param {type:"string"}

# Quick input validations.
assert workbench_machine_type, "⚠️ Please provide a Google Compute Engine machine type"
assert workbench_data_disk_gb, "⚠️ Please provide a data disk size in GiB"

print("Please wait...")

# OS images: https://gcloud-compute.com/images.html
!gcloud notebooks instances create "workbench-{vpc_network_name}" \
  --machine-type="{workbench_machine_type}" \
  --vm-image-project="deeplearning-platform-release"\
  --vm-image-family="tf-latest-gpu-ubuntu-2004-py310" \
  --boot-disk-size="50" \
  --boot-disk-type="PD_SSD" \
  --data-disk-size="{workbench_data_disk_gb}"\
  --data-disk-type="PD_SSD" \
  --network="{vpc_network_name}" \
  --subnet="{vpc_network_name}-{region}" \
  --subnet-region="{region}" \
  --no-public-ip \
  --project="{project_id}" \
  --location="{region}-b" \
  --quiet

print("☑️ Done")
print(f"Open Workbench in console: https://console.cloud.google.com/vertex-ai/workbench/user-managed?project={project_id}")
print(f"Open Compute Engine in console: https://console.cloud.google.com/compute/instancesDetail/zones/{region}-b/instances/workbench-{vpc_network_name}?project={project_id}")

### Prepare LLM disk

Stop the workbench instance and attach the disk that was created for the LLM models.

Start workbench instance and open JupyterLab:

1. In the terminal,
use the `lsblk` command to list the disks that are attached to your instance and find the disk that you want to format and mount.

  ```text
  (base) jupyter@workbench-vpc-test-nils-ai:~$ sudo lsblk
  NAME    MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
  loop0     7:0    0  55.7M  1 loop /snap/core18/2785
  loop1     7:1    0  63.5M  1 loop /snap/core20/1891
  loop2     7:2    0  63.5M  1 loop /snap/core20/1974
  loop3     7:3    0 344.1M  1 loop /snap/google-cloud-cli/143
  loop4     7:4    0  91.9M  1 loop /snap/lxd/24061
  loop5     7:5    0  53.3M  1 loop /snap/snapd/19361
  loop6     7:6    0  53.3M  1 loop /snap/snapd/19457
  sda       8:0    0    50G  0 disk
  ├─sda1    8:1    0  49.9G  0 part /
  ├─sda14   8:14   0     4M  0 part
  └─sda15   8:15   0   106M  0 part /boot/efi
  sdb       8:16   0   750G  0 disk /home/jupyter
  sdc       8:32   0    75G  0 disk
  ```

  In this example it is disk `sdc`.

1. Format the disk using the `mkfs` tool.
This command deletes all data from the specified disk, so make sure that you specify the disk device correctly.

  ```bash
  sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdc
  ```

1. Create a directory that serves as the mount point for the new disk on the VM.

  ```bash
  sudo mkdir -p /mnt/disks/models
  ```

1. Use the `mount` tool to mount the disk to the instance, and enable the discard option:

  ```bash
  sudo mount -o discard,defaults /dev/sdc /mnt/disks/models
  ```

1. Grant write access to the disk for all users.

  ```bash
  sudo chmod a+w /mnt/disks/models
  ```

Run the following steps in the terminal of your new workbench instance:

```bash
# Install git lfs
curl -s "https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh" \
  | sudo bash
sudo apt-get install git-lfs
git lfs install
```

Download models... (from [Hugging Face](https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGML/tree/main))

```bash
mkdir -p models
cd models
# curl -OLp https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/llama-2-7b.ggmlv3.q4_0.bin
# curl -OLp https://huggingface.co/TheBloke/Llama-2-13B-GGML/resolve/main/llama-2-13b.ggmlv3.q4_0.bin
# curl -OLp https://huggingface.co/TheBloke/Llama-2-70B-GGML/resolve/main/llama-2-70b.ggmlv3.q4_0.bin
# Chat
# curl -OLp https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
curl -OLp https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin
# curl -OLp https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGML/resolve/main/llama-2-70b-chat.ggmlv3.q4_0.bin
# German
# https://huggingface.co/flozi00/Llama-2-13B-german-assistant-v2
# curl -OLp https://huggingface.co/TheBloke/llama-2-13B-German-Assistant-v2-GGML/resolve/main/llama-2-13b-german-assistant-v2.ggmlv3.q4_0.bin
```

Copy to bucket:

```bash
gcloud storage cp -r models gs://nils-llm-models/
```

Copy the LLM model to the disk:

```bash
# From disk
cp models/llama-2-13b-chat.ggmlv3.q4_0.bin "/mnt/disks/models/"
# or from bucket
#gcloud storage cp gs://nils-llm-models/models/llama-2-13b-chat.ggmlv3.q4_0.bin "/mnt/disks/models/"
```

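Create the config for a "fake" `gpt-3.5-turbo` model. The config and template files below have to be stored next to the model file on the LLM disk, because that disk is what gets mounted as the models directory of the LocalAI container; change into the directory first (`cd /mnt/disks/models`). Create the `gpt-3.5-turbo.yaml` file: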

```bash
nano gpt-3.5-turbo.yaml
```

Add text to file:

```text
name: gpt-3.5-turbo
parameters:
  model: llama-2-13b-chat.ggmlv3.q4_0.bin
  top_k: 80
  temperature: 0.2
  top_p: 0.7
context_size: 1024
stopwords:
- "HUMAN:"
- "GPT:"
roles:
  user: " "
  system: " "
template:
  completion: completion
  chat: gpt4all
backend: llama
```

Create `completion.tmpl` file:

```bash
nano completion.tmpl
```

Add:

```text
{{.Input}}
```

Create `gpt4all.tmpl` file:

```bash
nano gpt4all.tmpl
```

Add:

```text
The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
### Prompt:
{{.Input}}
### Response:
```


Stop the workbench instance and detach the disk with the LLM model!

## Server

### LocalAI

Machine type recommendation: [`c2d-highcpu-32` in `us-central1`](https://gcloud-compute.com/us-central1/c2d-highcpu-32.html)

In [None]:
#@markdown #### Deploy VM instance with LocalAI container image

#@markdown ⚠️ Warning: `localai_cpu_cores` must be the number of physical cores!
#@markdown Overbooking the CPU degrades performance notably.

# Please fill in these values.
localai_machine_type = "c2d-highcpu-32"  # @param {type:"string"}
localai_cpu_cores = "16"  # @param {type:"string"}

# Quick input validations.
assert localai_machine_type, "⚠️ Please provide a Google Compute Engine machine type"
assert localai_cpu_cores, "⚠️ Please provide Google Compute Engine machine type CPU core count"

!gcloud compute instances create-with-container "localai-{vpc_network_name}" \
  --machine-type="{localai_machine_type}" \
  --network-interface="subnet={vpc_network_name}-{region},no-address" \
  --image-project="cos-cloud" \
  --image-family="cos-stable" \
  --boot-disk-size="25GB" \
  --boot-disk-type="pd-ssd" \
  --container-image="{region}-docker.pkg.dev/{project_id}/{docker_registry_name}/localai:latest" \
  --container-mount-disk="mode=ro,mount-path=/build/models,name={llm_models_disk},partition=0" \
  --disk="boot=no,device-name={llm_models_disk},mode=ro,name={llm_models_disk}" \
  --container-env="THREADS={localai_cpu_cores}" \
  --zone="{region}-b" \
  --project="{project_id}" \
  --quiet


In [None]:
#@markdown #### Generate hostname for later use

# Internal DNS name of the LocalAI VM, built as <instance>.c.<project>.internal.
localai_global_dns_name = "localai-{}.c.{}.internal".format(vpc_network_name, project_id)
# Base URL of the LocalAI HTTP API on port 8080; the Chatbot UI deployment
# cell reads this variable.
localai_base_url = "http://{}:8080".format(localai_global_dns_name)

# Show the name together with ready-to-copy curl targets.
print(f"Global internal DNS name: {localai_global_dns_name}\n")
print(f"Models: curl '{localai_base_url}/v1/models'")
print(f"Completions: curl '{localai_base_url}/v1/chat/completions'")

print("\n☑️ OK")

Test from workbench instance:

```bash
time curl "[LOCALAI_BASE_URL]/v1/chat/completions" -H "Content-Type: application/json" -d '{
     "model": "gpt-3.5-turbo",
     "messages": [{"role": "user", "content": "How are you?"}],
     "temperature": 0.9
   }'
```

Example:

```bash
time curl 'http://localai-vpc-test-nils-ai.c.test-nils-ai.internal:8080/v1/chat/completions' -H "Content-Type: application/json" -d '{
     "model": "gpt-3.5-turbo",
     "messages": [{"role": "user", "content": "How are you?"}],
     "temperature": 0.9
   }'
```

### Chatbot UI

Machine type recommendation: [`e2-medium` in `us-central1`](https://gcloud-compute.com/us-central1/e2-medium.html)

In [None]:
#@markdown #### Deploy VM instance with Chatbot UI container image

#@markdown ⚠️ Warning: This instance gets a public IPv4 address!

# Please fill in these values.
chatbot_ui_machine_type = "e2-medium"  # @param {type:"string"}

# Quick input validations.
assert chatbot_ui_machine_type, "⚠️ Please provide a Google Compute Engine machine type"

!gcloud compute instances create-with-container "chatbot-ui-{vpc_network_name}" \
  --machine-type="{chatbot_ui_machine_type}" \
  --network-interface="subnet={vpc_network_name}-{region}" \
  --image-project="cos-cloud" \
  --image-family="cos-stable" \
  --boot-disk-size="10GB" \
  --boot-disk-type="pd-ssd" \
  --container-image="{region}-docker.pkg.dev/{project_id}/{docker_registry_name}/chatbot-ui:latest" \
  --container-env="OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX,OPENAI_API_HOST={localai_base_url}" \
  --zone="{region}-b" \
  --project="{project_id}" \
  --quiet

You can now connect with your browser to the external IP on port 3000 and chat with Llama 2 🦙💬

URL:

```text
http://[EXTERNAL_IP]:3000/
```

## Helper

In [12]:
# Lists GCE instances
!gcloud compute instances list \
  --project="{project_id}"


NAME                         ZONE           MACHINE_TYPE    PREEMPTIBLE  INTERNAL_IP  EXTERNAL_IP  STATUS
chatbot-ui-vpc-test-nils-ai  us-central1-b  e2-medium                    10.128.1.16               TERMINATED
localai-vpc-test-nils-ai     us-central1-b  c2d-highcpu-32               10.128.1.15               RUNNING
workbench-vpc-test-nils-ai   us-central1-b  n1-standard-2                10.128.1.8                RUNNING


In [None]:
# List notebooks instances in a region/location
!gcloud notebooks instances list \
  --location="{region}-b" \
  --project="{project_id}" \
  --quiet

## Clean up

### Workbench

In [None]:
# Delete user-managed notebooks/workbench instance
print("Please wait...")
!gcloud notebooks instances delete "workbench-{vpc_network_name}" \
  --project="{project_id}" \
  --location="{region}-b" \
  --quiet
print("☑️ Done")

### Server

In [None]:
# Delete LocalAI instance
print("Please wait...")
!gcloud compute instances delete "localai-{vpc_network_name}" \
  --project="{project_id}" \
  --zone="{region}-b" \
  --quiet
print("☑️ Done")

In [None]:
# Delete Chatbot UI instance
print("Please wait...")
!gcloud compute instances delete "chatbot-ui-{vpc_network_name}" \
  --project="{project_id}" \
  --zone="{region}-b" \
  --quiet
print("☑️ Done")

### Storage

In [None]:
# Delete data bucket
!gcloud storage rm -r 'gs://{llm_models_bucket}' \
  --project="{project_id}" \
  --quiet
print("☑️ Done")

### Disk


In [None]:
# Delete disk for LLM models
!gcloud compute disks delete "{llm_models_disk}" \
  --zone="{region}-b" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

### Network

In [None]:
# Delete NAT router in region
print("Please wait...")
!gcloud compute routers nats delete "nat-{vpc_network_name}-{region}" \
  --router="router-{vpc_network_name}-{region}"  \
  --region="{region}" \
  --project="{project_id}" \
  --quiet
print("☑️ Done")

In [None]:
# Delete router
print("Please wait...")
!gcloud compute routers delete "router-{vpc_network_name}-{region}" \
  --region="{region}" \
  --project="{project_id}" \
  --quiet
print("☑️ Done")

In [None]:
# Delete subnet
print("Please wait...")
!gcloud compute networks subnets delete "{vpc_network_name}-{region}" \
  --region="{region}" \
  --project="{project_id}" \
  --quiet
print("☑️ Done")

In [None]:
# Delete firewall rules

!gcloud compute firewall-rules delete "{vpc_network_name}-allow-default" \
  --project="{project_id}" \
  --quiet
!gcloud compute firewall-rules delete "{vpc_network_name}-allow-http" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
# Delete VPC
print("Please wait...")
!gcloud compute networks delete "{vpc_network_name}" \
  --project="{project_id}" \
  --quiet
print("☑️ Done")

### Registry

In [None]:
# Delete old container images in Artifact Registry

gcr_cleaner_cloud_build_config=f"""# Delete old container images
steps:
  - name: 'cyclenerd/google-cloud-gcp-tools-container:latest'
    entrypoint: 'gcr-cleaner-cli'
    args:
      - '-repo'
      - '{region}-docker.pkg.dev/{project_id}/{docker_registry_name}'
      - '-recursive'
"""

with open("gcr-cleaner.yaml", "w") as text_file:
    print(gcr_cleaner_cloud_build_config, file=text_file)

!gcloud builds submit --no-source \
  --config="./gcr-cleaner.yaml" \
  --timeout="10m" \
  --region="{region}" \
  --default-buckets-behavior="regional-user-owned-bucket" \
  --project="{project_id}" \
  --quiet

print("☑️ Done")

In [None]:
# Delete Artifact Registry for Docker cointainer images
!gcloud artifacts repositories delete "{docker_registry_name}" \
  --location="{region}" \
  --project="{project_id}" \
  --quiet