diff --git a/assets/css/demo.css b/assets/css/demo.css new file mode 100644 index 0000000000..3cbeb0e9dd --- /dev/null +++ b/assets/css/demo.css @@ -0,0 +1,366 @@ +#demo-container { + background-color: #516170; + border: 5px solid #6B747D; + + margin-top: 16px; + + min-width: 200px; + min-height: 200px; + text-align: center; +} + +#modal-diagram-expansion { + --ads-modal-content-height: 3000px; + --ads-modal-content-max-height: 2000px; +} + + + +/******************************************************/ +/* Configuration section */ + + +.eq-width-cols { + max-width: calc(100% / var(--column-count)); /* Calculate the max-width based on the number of columns */ +} +@media (min-width: 992px) { /* lg */ + .eq-width-cols { + max-width: calc(100% / var(--column-count)); /* Calculate the max-width based on the number of columns */ + } +} + +@media (max-width: 576px) { /* xs */ + .eq-width-cols { + max-width: 100%; + flex-basis: 100%; /* Stacks the columns vertically on smaller screens */ + } +} + + +.spcs-div { + min-height: 50px; +} +.config-specs { + margin-left: 8px; + margin-top: 8px; +} + + + + +/******************************************************/ +/* Config - File & Param section */ +.config-param-div, .config-file-div { + flex: 1; /* Make both columns take equal width */ + display: flex; + flex-direction: column; +} +.config-file-div { + margin-top: 16px; +} +.config-file-display { + border-radius: 5px; + background-color: #C6C6C6; + border: 1px solid #ffffff; + + overflow: hidden; + font-family: monospace; +} + +.config-file-header { + background-color: #C6C6C6; + padding: 10px; + font-weight: bold; + border-bottom: 1px solid #ffffff; + text-align: left; + color: black; +} + +.config-file-content { + flex-grow: 1; + overflow-y: auto; + padding: 10px; + background-color: #C6C6C6; +} +.config-file-content.small { + max-height: 250px; /* Match the length of content about */ +} +.config-file-content.large { + max-height: 80vh; /* Match the length of content about */ +} + +.config-file-content pre { + margin: 0; + white-space: pre-wrap; /* Wrap long lines */ + word-wrap: break-word; + + /* Text style */ + color: black; +} + +/******************************************************/ + + + + + + + + + + + +/******************************************************/ +/* Demo section */ + +#all-messages-div { + height: 300px; + display: flex; + flex-direction: column-reverse; + overflow-y: auto; + padding-top: 16px; +} + + +#input-and-submit { + display: flex; + position: relative; + align-items: center; + width: 100%; + margin-top: 24px; +} + + + +#user-input-for-demo { + width: 100%; + flex-grow: 1; + padding: 10px; + + color: black; + background-color: white; + border: 1px solid #ccc; + border-radius: 4px; + resize: none; + overflow: hidden; + + font-family: (--ads-font-family); +} +#user-input-for-demo::placeholder { + color: darkgrey; + + font-family: (--ads-font-family); +} + + +#submit-button { + margin-left: 10px; + flex-shrink: 0; +} + + + +#reset-demo-txt { + margin-top: 8px; + float: left; + color: var(--arm-light-blue); + text-decoration: underline; + cursor: pointer; +} +#reset-demo-txt:hover { + color: var(--arm-green); +} +#ping-info { + margin-top: 8px; + margin-left: 24px; + text-align: center; + float: left; + color: var(--arm-light-grey); + font-style: italic; + +} + +.user-message { + background-color: #545454; + border-radius: 20px; + width: fit-content; + max-width: 70%; + align-self: flex-end; /* float right */ + + margin-right: 16px; + margin-bottom: 16px; + + + padding: 8px; 
+ padding-left: 24px; + padding-right: 24px; + + /* text alignment */ + text-align: left; + font-size: 18px!important; + overflow-wrap: break-word; /* First attempt to break the word naturally */ + word-break: break-word; /* Then break at any character if needed */ + + /* Enable fade in */ + opacity: 0; + animation: fadeIn 1s forwards; +} +@keyframes fadeIn { + to { + opacity: 1; + } +} + + +.chatbot-message { + background-color: transparent; + border-radius: 20px; + width: 90%; + /*align-self: flex-end; */ + + display: flex; + align-items: flex-start; + + margin-left: 16px; + margin-bottom: 16px; + + padding: 8px; + padding-left: 16px; + + /* text alignment */ + text-align: left; + font-size: 18px!important; + overflow-wrap: break-word; /* First attempt to break the word naturally */ + word-break: break-word; /* Then break at any character if needed */ +} + + + +#notification-popup { + position: absolute; + top: -34px; /* Adjust this value to position the message as needed */ + left: 0; + width: calc(100% - 32px); /* Subtract gutter space from width */ + margin-left: 16px; + + padding: 4px; + border-radius: 4px; + font-size: 14px; + text-align: center; + box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2); + z-index: 1000; + + + background-color: black; + color: white; + + white-space: nowrap; /* Prevent the message from wrapping */ + box-sizing: border-box; /* Ensure padding doesn't affect the width */ + + opacity: 0; + transition: opacity 1s ease-in-out; /* Fade-in effect */ + visibility: hidden; +} +#notification-popup.hide-popup { + opacity: 0; + visibility: hidden; + transition: opacity 1s ease-in-out, visibility 0s 1s; +} +#notification-popup.show-popup { + opacity: 1; + visibility: visible; +} + + + + + + + + + + + + + + + + + + + + + +/**************************************************/ +/* Graphing */ +.chart-container { + position: relative; + width: 100%; + max-width: 500px; + height: 30px; + background-color: #f0f0f0; + border: 1px solid #ccc; + } + + .bar { + height: 100%; + background-color: var(--arm-green); + width: 0; + text-align: right; + padding-right: 5px; + box-sizing: border-box; + color: black; + line-height: 30px; + font-weight: bold; + } + + .context-line { + position: absolute; + top: 0; + bottom: 0; + width: 1px; + background-color: var(--arm-color-footing); + text-align: center; + font-size: 12px; + color: blue; + transform: translateX(-30%); + } + + + + + .pie-chart { + width: 125px; + height: 125px; + border-radius: 50%; + background: conic-gradient( + var(--arm-green) 0% 81.81%, /* 9/11 segments in green */ + var(--arm-yellow) 81.81% 90.91%, /* 1/11 segment in yellow */ + var(--arm-orange) 90.91% 100% /* 1/11 segment in red */ + ); +} + + + + +.clickable-blowup { + cursor: pointer; +} + +.content-blowup { + display: none; + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 100%; + background: rgba(0, 0, 0, 0.8); + justify-content: center; + align-items: center; + z-index: 1000; +} + +.blown-up-image { + max-width: 90%; + max-height: 90%; +} diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_demo.md b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_demo.md new file mode 100644 index 0000000000..6cc4c33393 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_demo.md @@ -0,0 +1,36 @@ +--- +title: Demo - Run kubectl on Arm CPU +overview: Kubernetes supports many types of apps - web/game servers, data storage, AI training/inference, and so much more. 
It also supports many architectures, including Arm64. Using this Arm Kubernetes migration tool, you can identify if your cluster already just works on arm64. This demo illustrates how easy it is to run this tool yourself by using pre-loaded Kubernetes cluster config files and a console notebook to run the tool within. + +demo_steps: + - Select a Kubernetes cluster file to analyze. + - Run the Kubernetes migration tool in the demo space. + - Review the output and stats to understand it. + +diagram: kubernetes_diagram.png + +configuration_popup_details: Super long list of configuration information to provide to the user. Should be context and all that to be crystal clear what the setup is. + +configuration_dropdown_options: + - parameters: + param_name: Cluster .yaml file + options: + - name: Hosted LLM chatbot + specs: Hosted LLM chatbot cluster includes Rancher packages, the web server NGINX, Redis, and Mongo-DB as a simple database. ML capabilities come from TensorFlow and Rocket.chat modules. + - name: other file + specs: Other information. + selectable: true + explanation: Helpbox info that isn't used yet. + + +### Specific details to this demo +# ================================================================================ +prismjs: true + +### FIXED, DO NOT MODIFY +# ================================================================================ +demo_template_name: kubectl_demo # allows the 'demo.html' partial to route to the correct Configuration and Demo/Stats sub partials for page render. +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_index.md b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_index.md new file mode 100644 index 0000000000..3bd00bbb57 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_index.md @@ -0,0 +1,38 @@ +--- +title: Kubernetes Arm quicktool aaaa + +minutes_to_complete: 60 + +who_is_this_for: This is for developers who want to use the Kubernetes migration tool. + +learning_objectives: + - One + - Two + - Three + +prerequisites: + - Viewing the demo! + +author_primary: Zach Lasiuk + +draft: true +cascade: + draft: true + +### Tags +skilllevels: Advanced +subjects: Containers and Virtualization +armips: + - Neoverse +operatingsystems: + - Linux +tools_software_languages: + - Kubernetes + + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 2 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_next-steps.md new file mode 100644 index 0000000000..2d5f8417ae --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_next-steps.md @@ -0,0 +1,23 @@ +--- +next_step_guidance: > + Thank you for completing this Learning path. 
+ +recommended_path: "/learning-paths/servers-and-cloud-computing/nlp-hugging-face/" + +further_reading: + - resource: + title: Getting started with Llama + link: https://llama.meta.com/get-started + type: documentation + - resource: + title: Hugging Face Documentation + link: https://huggingface.co/docs + type: documentation + +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +weight: 21 # set to always be larger than the content in this path, and one more than 'review' +title: "Next Steps" # Always the same +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_review.md b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_review.md new file mode 100644 index 0000000000..abdb0ce111 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/_review.md @@ -0,0 +1,39 @@ +--- +review: + - questions: + question: > + Can you run LLMs on Arm CPUs? + answers: + - "Yes" + - "No" + correct_answer: 1 + explanation: > + Yes. The advancements made in the Generative AI space with new model formats like GGUF and smaller parameter models make LLM inference on CPUs very efficient. + + - questions: + question: > + Can llama.cpp be built and run on CPU only? + answers: + - "Yes" + - "No" + correct_answer: 1 + explanation: > + Yes. By default llama.cpp is built for CPU only on Linux and Windows. + + - questions: + question: > + Can you profile the time taken by the model to generate the output until the end of text? + answers: + - "Yes" + - "No" + correct_answer: 1 + explanation: > + llama.cpp prints a few timing parameters at the end of the execution of the LLM. One of these timing parameters is the eval time which is the time taken by the model to generate the output. 
+ +# ================================================================================ +# FIXED, DO NOT MODIFY +# ================================================================================ +title: "Review" # Always the same title +weight: 20 # Set to always be larger than the content in this path +layout: "learningpathall" # All files under learning paths have this same wrapper +--- diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/hosted-llm-chatbot.yaml b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/hosted-llm-chatbot.yaml new file mode 100644 index 0000000000..9a68fbb3d0 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/hosted-llm-chatbot.yaml @@ -0,0 +1,235 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: hosted-llm-chatbot + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: coredns + namespace: hosted-llm-chatbot +spec: + replicas: 2 + selector: + matchLabels: + app: coredns + template: + metadata: + labels: + app: coredns + spec: + containers: + - name: coredns + image: rancher/mirrored-coredns-coredns:1.10.1 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: local-path-provisioner + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: local-path-provisioner + template: + metadata: + labels: + app: local-path-provisioner + spec: + containers: + - name: local-path-provisioner + image: rancher/local-path-provisioner:v0.0.24 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: klipper-helm + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: klipper-helm + template: + metadata: + labels: + app: klipper-helm + spec: + containers: + - name: klipper-helm + image: rancher/klipper-helm:v0.8.0-build20230510 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: klipper-lb + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: klipper-lb + template: + metadata: + labels: + app: klipper-lb + spec: + containers: + - name: klipper-lb + image: rancher/klipper-lb:0.4.4 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: traefik + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: traefik + template: + metadata: + labels: + app: traefik + spec: + containers: + - name: traefik + image: rancher/mirrored-library-traefik:2.9.10 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: metrics-server + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: metrics-server + template: + metadata: + labels: + app: metrics-server + spec: + containers: + - name: metrics-server + image: rancher/mirrored-metrics-server:v0.6.3 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:5.0.5 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: nginx + template: + metadata: + labels: + app: nginx + spec: + containers: + - name: nginx + image: nginx:1.14.2 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mongo + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: mongo + template: + metadata: + labels: + app: 
mongo + spec: + containers: + - name: mongo + image: docker.io/mongo:4.0.10 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tensorflow + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: tensorflow + template: + metadata: + labels: + app: tensorflow + spec: + containers: + - name: tensorflow + image: tensorflow/tensorflow:2.16.1 + +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rocketchat + namespace: hosted-llm-chatbot +spec: + replicas: 1 + selector: + matchLabels: + app: rocketchat + template: + metadata: + labels: + app: rocketchat + spec: + containers: + - name: rocketchat + image: rocket.chat:6.7.0 diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes-instructions.md b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes-instructions.md new file mode 100644 index 0000000000..9a35db9441 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes-instructions.md @@ -0,0 +1,266 @@ +--- +title: Tool installation and running +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Before you begin +The instructions in this Learning Path are for any Arm server running Ubuntu 22.04 LTS. You need an Arm server instance with at least four cores and 8GB of RAM to run this example. The instructions have been tested on an AWS Graviton3 c7g.2xlarge instance. + +## Overview + +Arm CPUs are widely used in traditional ML and AI use cases. In this Learning Path, you learn how to run generative AI inference-based use cases like a LLM chatbot on Arm-based CPUs. You do this by deploying the [Llama-2-7B-Chat model](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) on your Arm-based CPU using `llama.cpp`. + +[llama.cpp](https://github.com/ggerganov/llama.cpp) is an open source C/C++ project developed by Georgi Gerganov that enables efficient LLM inference on a variety of hardware - both locally, and in the cloud. + +## About the Llama 2 model and GGUF model format + +The [Llama-2-7B-Chat model](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) from Meta belongs to the Llama 2 model family and is free to use for research and commercial purposes. Before you use the model, visit the Llama [website](https://llama.meta.com/llama-downloads/) and fill in the form to request access. + + +Llama 2 collection of models perform general natural language processing (NLP) tasks such as text generation. You can access the base foundation Llama 2 model or select the specialized chat Llama 2 version that is already optimized for back-and-forth dialogue. In this Learning Path, you run the specialized chat model. +The Llama 2 family of models range in size from 7 billion to 70 billion parameters. The greater the number of parameters, the more information the model can store. This directly affects how well the model understands language and the model's general capabilities. LLMs that run efficiently on CPUs typically have lower numbers of parameters. For this example, the 7 billion (7b) model is ideal for retaining quality chatbot capability while also running efficiently on your Arm-based CPU. + +Traditionally, the training and inference of LLMs has been done on GPUs using full-precision 32-bit (FP32) or half-precision 16-bit (FP16) data type formats for the model parameter and weights. Recently, a new binary model format called GGUF was introduced by the `llama.cpp` team. 
This new GGUF model format uses compression and quantization techniques that remove the dependency on using FP32 and FP16 data type formats. For example, GGUF supports quantization where model weights that are generally stored as FP16 data types are scaled down to 4-bit integers. This significantly reduces the need for computational resources and the amount of RAM required. These advancements made in the model format and the data types used make Arm CPUs a great fit for running LLM inferences. + +## Install dependencies + +Install the following packages on your Arm based server instance: + +```bash +sudo apt update +sudo apt install make cmake -y +``` + +You also need to install `gcc` on your machine: + +```bash +sudo apt install gcc g++ -y +sudo apt install build-essential -y +``` + +## Download and build llama.cpp + +You are now ready to start building `llama.cpp`. + +Clone the source repository for llama.cpp: + +```bash +git clone https://github.com/ggerganov/llama.cpp +``` + +By default, `llama.cpp` builds for CPU only on Linux and Windows. You don't need to provide any extra switches to build it for the Arm CPU that you run it on. + +Run `make` to build it: + +```bash +cd llama.cpp +make GGML_NO_LLAMAFILE=1 -j$(nproc) +``` + +Check that `llama.cpp` has built correctly by running the help command: + +```bash +./llama-cli -h +``` + +If `llama.cpp` has built correctly on your machine, you will see the help options being displayed. A snippet of the output is shown below: + +```output +usage: ./llama-cli [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 4) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) +``` + + +## Install Hugging Face Hub + +There are a few different ways you can download the Llama-2-7B Chat model. In this Learning Path, you download the model from Hugging Face. + +{{% notice Note %}} Use of Llama-2-7B-Chat model is governed by the Meta license. Before you proceed to download the model, please visit the Llama [website](https://llama.meta.com/llama-downloads/) and fill in the form. 
{{% /notice %}} + +[Hugging Face](https://huggingface.co/) is an open source AI community where you can host your own AI models, train them, and collaborate with others in the community. You can browse through the thousands of models that are available for a variety of use cases like NLP, audio, and computer vision. + +The `huggingface_hub` library provides APIs and tools that let you easily download and fine-tune pre-trained models. You will use `huggingface-cli` to download the [Llama-2-7B-Chat model](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF). + +Install the required Python packages: + +```bash +sudo apt install python-is-python3 python3-pip python3-venv -y +``` + +Create and activate a Python virtual environment: + +```bash +python -m venv venv +source venv/bin/activate +``` + +Your terminal prompt now has the `(venv)` prefix indicating the virtual environment is active. Use this virtual environment for the remaining commands. + +Install the `huggingface_hub` Python library using `pip`: + +```bash +pip install huggingface_hub +``` + +You can now download the model using the Hugging Face CLI: + +```bash +huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q4_0.gguf --local-dir . --local-dir-use-symlinks False +``` +Before you proceed and run this model, take a quick look at what `Q4_0` in the model name denotes. + +## Quantization format + +`Q4_0` in the model name refers to the quantization method the model uses. The goal of quantization is to make the model smaller (to reduce the memory space required) and faster to run (to reduce memory bandwidth bottlenecks when transferring large amounts of data from memory to a processor). The primary trade-off to keep in mind when reducing a model's size is maintaining model quality. Ideally, a model is quantized to meet size and speed requirements while not having a negative impact on quality. + +Llama 2 was originally trained and published using the bfloat16 data type, meaning that each of the 7 billion model parameters takes up 16 bits of memory to store. Putting that into real terms, multiplying 16 bits per parameter by 7 billion parameters, the base foundation llama-2-7b model is just over 13 GB in size. + +This model is `llama-2-7b-chat.Q4_0.gguf`, so what does each component mean in relation to the quantization level? The main thing to note is the number of bits per parameter, which is denoted by 'Q4' in this case, meaning a 4-bit integer. As a result, by only using 4 bits per parameter for 7 billion parameters, the model drops to 3.6 GB in size. + +Here is a quick lookup for the rest of the quantization parts for the Llama-2 model family as it exists today: + +| quantization-method | # of bits per parameter | quantization format (does not apply to quantization method 'IQ') | quantization method specifics | +| ------------------- | ----------------------- | ---------------------------------------------------------------- | ------------------ | +| Q, IQ, F, FP | 2,3,4,5,6,7,8,16,32 | _0, _1, _K | _XXS, _XS, _S, _M, _L | + +Some examples: + +* Q8_0 --> Straightforward quantization method (indicated with _0 or _1), with an 8-bit integer per parameter. +* Q4_K_M --> K-quant method (indicated with _K), with a 4-bit integer per parameter, with the _M quantization mix type used. +* IQ2_XXS --> I-quant method (indicated with IQ), with the _XXS quantization mix type used.
+* F16 --> Using a 16-bit floating point number per parameter (no other quantization method used, only rounding a number if starting from a 32-bit floating point number). + +Each quantization method has a unique approach to quantizing parameters. The deeper technical details of different quantization methodologies are outside the scope of this guide. The main takeaway is that selecting the right model quantization is critical to running an LLM effectively on your hardware, and the most impactful quantization decision is the number of bits per parameter. You will also need to check that you have enough system memory before deploying larger models or models with higher precision/quantization. + +In this guide, you will not use any other quantization methods, because Arm has not made kernel optimizations for other quantization types. + +## Re-quantize the model weights + +To see improvements from the Arm-optimized kernels, you need to generate a new weights file with rearranged Q4_0 weights. As of [llama.cpp commit 0f1a39f3](https://github.com/ggerganov/llama.cpp/commit/0f1a39f3), Arm has contributed code for three types of GEMV/GEMM kernels corresponding to three processor types: + +* AWS Graviton2, where you only have NEON support (you will see less improvement for these GEMV/GEMM kernels), +* AWS Graviton3, where the GEMV/GEMM kernels exploit both SVE 256 and MATMUL_INT8 support, and +* AWS Graviton4, where the GEMV/GEMM kernels exploit NEON/SVE 128 and MATMUL_INT8 support. + +To re-quantize optimally for Graviton3, run: + +```bash +./llama-quantize --allow-requantize llama-2-7b-chat.Q4_0.gguf llama-2-7b-chat.Q4_0_8_8.gguf Q4_0_8_8 +``` + +This will output a new file, `llama-2-7b-chat.Q4_0_8_8.gguf`, which contains reconfigured weights that allow `llama-cli` to use SVE 256 and MATMUL_INT8 support. + +{{% notice Note %}} +This requantization is optimal only for Graviton3. For Graviton2, requantization should optimally be done in `Q4_0_4_4` format, and for Graviton4, `Q4_0_4_8` is the optimal requantization format. +{{% /notice %}} + +## Compare the pre-quantized Llama-2-7B-Chat LLM model weights to the optimized weights + +First, run the pre-quantized llama-2-7b-chat model exactly as the weights were downloaded from Hugging Face: + +```bash +./llama-cli -m llama-2-7b-chat.Q4_0.gguf -p "Building a visually appealing website can be done in ten simple steps:" -n 64 -t 2 +``` + +This command will use the downloaded model (`-m` flag), with the specified prompt (`-p` flag), and target a 64-token completion (`-n` flag), using two threads (`-t` flag). + +You will see lots of interesting statistics being printed from llama.cpp about the model and the system, followed by the prompt and completion. The tail of the output from running this model on an AWS Graviton3 c7g.2xlarge instance is shown below: + +```output +llm_load_tensors: ggml ctx size = 0.14 MiB +llm_load_tensors: CPU buffer size = 3647.87 MiB +..................................................................................................
+llama_new_context_with_model: n_ctx = 4096 +llama_new_context_with_model: n_batch = 2048 +llama_new_context_with_model: n_ubatch = 512 +llama_new_context_with_model: flash_attn = 0 +llama_new_context_with_model: freq_base = 10000.0 +llama_new_context_with_model: freq_scale = 1 +llama_kv_cache_init: CPU KV buffer size = 2048.00 MiB +llama_new_context_with_model: KV self size = 2048.00 MiB, K (f16): 1024.00 MiB, V (f16): 1024.00 MiB +llama_new_context_with_model: CPU output buffer size = 0.12 MiB +llama_new_context_with_model: CPU compute buffer size = 296.01 MiB +llama_new_context_with_model: graph nodes = 1030 +llama_new_context_with_model: graph splits = 1 + +system_info: n_threads = 2 / 8 | AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 1 | SVE = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 1 | LLAMAFILE = 0 | +sampling: + repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000 + top_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800 + mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000 +sampling order: +CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature +generate: n_ctx = 4096, n_batch = 2048, n_predict = 64, n_keep = 1 + + + Building a visually appealing website can be done in ten simple steps: + Einzeln, a UX/UI Designer at Designhill, provides a list of ten simple steps to create a visually appealing website. These steps include using high-quality images, choosing a consistent color scheme, and incorporating negative space. Additionally, Using a clean and simple layout, creating a clear hierarchy +llama_print_timings: load time = 1120.85 ms +llama_print_timings: sample time = 2.11 ms / 64 runs ( 0.03 ms per token, 30303.03 tokens per second) +llama_print_timings: prompt eval time = 1998.79 ms / 16 tokens ( 124.92 ms per token, 8.00 tokens per second) +llama_print_timings: eval time = 15991.48 ms / 63 runs ( 253.83 ms per token, 3.94 tokens per second) +llama_print_timings: total time = 17996.97 ms / 79 tokens +``` + +The `system_info` printed from llama.cpp highlights important architectural features present on your hardware that improve the performance of the model execution. In the output shown above from running on an AWS Graviton3 instance, you will see: + + * NEON = 1 This flag indicates support for Arm's Neon technology which is an implementation of the Advanced SIMD instructions + * ARM_FMA = 1 This flag indicates support for Arm Floating-point Multiply and Accumulate instructions + * MATMUL_INT8 = 1 This flag indicates support for Arm int8 matrix multiplication instructions + * SVE = 1 This flag indicates support for the Arm Scalable Vector Extension + + +The end of the output shows several model timings: + +* load time refers to the time taken to load the model. +* prompt eval time refers to the time taken to process the prompt before generating the new text. In this example, it shows that it evaluated 16 tokens in 1998.79 ms. +* eval time refers to the time taken to generate the output. Generally anything above 10 tokens per second is faster than what humans can read. 
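As a quick sanity check, you can reproduce the per-second figures from the raw timings above. The sketch below is simply arithmetic on the numbers llama.cpp printed, not an additional tool:

```bash
# Recompute tokens/second from the timings printed above (unoptimized run):
#   eval time:        15991.48 ms for 63 generated tokens
#   prompt eval time:  1998.79 ms for 16 prompt tokens
awk 'BEGIN {
  printf "eval:        %.2f tokens/s\n", 63 / (15991.48 / 1000)
  printf "prompt eval: %.2f tokens/s\n", 16 / (1998.79 / 1000)
}'
```

The results (about 3.94 and 8.00 tokens per second) match the figures llama.cpp reports in parentheses on each timing line.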
+ +You can compare these timings to the optimized model weights by running: + +```bash +./llama-cli -m llama-2-7b-chat.Q4_0_8_8.gguf -p "Building a visually appealing website can be done in ten simple steps:" -n 64 -t 2 +``` + +This is the same command as before, but with the model file swapped out for the re-quantized file. + +The timings on this one look like: + +``` +llama_print_timings: load time = 984.78 ms +llama_print_timings: sample time = 2.12 ms / 64 runs ( 0.03 ms per token, 30245.75 tokens per second) +llama_print_timings: prompt eval time = 463.98 ms / 16 tokens ( 29.00 ms per token, 34.48 tokens per second) +llama_print_timings: eval time = 6890.95 ms / 63 runs ( 109.38 ms per token, 9.14 tokens per second) +llama_print_timings: total time = 7362.13 ms / 79 tokens +``` + +As you can see, load time improves, but the biggest improvement is in the eval times. The number of tokens per second for prompt eval quadruples, while the speed of inference more than doubles. + +You have successfully run an LLM chatbot with Arm optimizations, all running on your Arm AArch64 CPU on your server. You can continue experimenting and trying out the model with different prompts. + diff --git a/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes_diagram.png b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes_diagram.png new file mode 100644 index 0000000000..aa465ab34a Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/aaademo-kubectl-arm/kubernetes_diagram.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/_demo.md b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_demo.md new file mode 100644 index 0000000000..8787c8f984 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/llama-cpu/_demo.md @@ -0,0 +1,73 @@ +--- +title: Demo - Run a chatbot on an Arm CPU +overview: Running a chatbot can be expensive at scale, but quantized (4-bit or 8-bit) small LLMs (~7 billion parameters) can deliver surprising performance on Arm CPUs. CPUs are more accessible than GPUs and easier to program for ML if you are new to the space, and cost is the largest benefit. Chat with an LLM here to see the price-performance for yourself. This demo is running on AWS Graviton4, on r8g.4xlarge instances via Lambdas. + +demo_steps: + - Type & send a message to the chatbot. + - Get the chatbot's reply. + - View stats showing how Graviton runs LLMs. + +diagram: config-diagram.png + +configuration_popup_details: Super long list of configuration information to provide to the user. Should be context and all that to be crystal clear what the setup is. + +configuration_dropdown_options: + - parameters: + param_name: LLM + options: + - name: llama-3-8b-instruct + specs: The newest Llama model, with 8 billion parameters. + - name: llama-2-7b + specs: Llama 2 has 7 billion parameters. + selectable: true + explanation: The LLM selected affects the speed and quality of the chatbot's responses. + + - parameters: + param_name: Instance Type + options: + - name: C7g.2xlarge + specs: This instance has 8 CPUs with 16 GB RAM. + - name: C7g.4xlarge + specs: This instance has 16 CPUs with 16 GB RAM. + - name: C7g.8xlarge + specs: This instance has 32 CPUs with 32 GB RAM. + selectable: true + explanation: The specific hardware specs you will be using.
+ + - parameters: + param_name: Compute Platform + options: + - name: AWS Graviton3 + specs: Details here as well + - name: AWS Graviton2 + specs: Details here + selectable: false + explanation: The compute hardware series to select between. + +### Specific details to this demo +# ================================================================================ +tps_max: 50 # sets stat visuals for tps +tps_ranges: + - name: Low + context: TPS is low right now. The reasons why are x, y, and z. + color: red + min: 0 + max: 10 + - name: Mid + context: Average TPS throughput. This is considered good enough for human readability. + color: yellow + min: 10 + max: 25 + - name: High + context: Excellent TPS - a high quality UX is being delivered. Due to x, y, and z reasons. + color: green + min: 25 + max: 1000 + +### FIXED, DO NOT MODIFY +# ================================================================================ +demo_template_name: llm_chatbot_first_demo # allows the 'demo.html' partial to route to the correct Configuration and Demo/Stats sub partials for page render. +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/chatbot-icon.png b/content/learning-paths/servers-and-cloud-computing/llama-cpu/chatbot-icon.png new file mode 100644 index 0000000000..e9d80e4af6 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama-cpu/chatbot-icon.png differ diff --git a/content/learning-paths/servers-and-cloud-computing/llama-cpu/config-diagram.png b/content/learning-paths/servers-and-cloud-computing/llama-cpu/config-diagram.png new file mode 100644 index 0000000000..713df46858 Binary files /dev/null and b/content/learning-paths/servers-and-cloud-computing/llama-cpu/config-diagram.png differ diff --git a/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html b/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html index a72311bedb..8d0db8fd44 100644 --- a/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html +++ b/themes/arm-design-system-hugo-theme/layouts/learning-paths/learningpathall.html @@ -70,6 +70,8 @@
View the full configuration details on this page.
+ {{- range .Params.configuration_dropdown_options -}} ++ {{ .specs }} +
+ {{- $first = false -}} + {{- end -}} +{{ readFile $file_path }}
+ {{ readFile $file_path }}
+ View the full configuration details on this page.
++ {{ .specs }} +
+ {{- $first = false -}} + {{- end -}} +
+
+ # Initialization commands
+ aws configure
+ aws eks --region _region_ update-kubeconfig --name _cluster_name_
+
+ # Run the Arm KubeCTL migration diagnostic tool
+ ./armer.sh
+
+
+
+
+
+
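Before running the diagnostic tool, it can help to confirm that `kubectl` is pointed at the intended cluster and to see which CPU architecture each node reports. This is a hedged sketch: the region and cluster name below are placeholders, and `armer.sh` is assumed to be the script shown in the initialization commands above.

```bash
# Placeholder region and cluster name - substitute your own values.
aws eks --region us-east-1 update-kubeconfig --name my-cluster

# Show each node's CPU architecture via the standard kubernetes.io/arch label
# (values are typically amd64 or arm64).
kubectl get nodes -L kubernetes.io/arch

# Then run the Arm migration diagnostic tool from the demo.
./armer.sh
```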
+ Reset chat
+Type a message to the chatbot to view metrics.
+11 Total containers.
+ 9 - Support Arm.
+ 1 - Not supported.
+ 1 - Unknown error.
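The counts above come from the container images declared in the selected cluster file. As a hedged illustration (not part of the demo tool itself), you could enumerate those images yourself, either from the manifest or from a live cluster:

```bash
# List the unique container images declared in the sample cluster manifest.
grep -E '^[[:space:]]*image:' hosted-llm-chatbot.yaml | awk '{print $2}' | sort -u

# Or list the images running in a live cluster.
kubectl get pods --all-namespaces -o jsonpath='{..image}' | tr ' ' '\n' | sort -u
```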
+
Given these results, you can continue migrating your Kubernetes cluster to Arm by following these steps:
+1) Replace non-Arm compatible containers
+The official TensorFlow container does not support Arm. However, Arm has created an image that does support Arm Linux servers, available on Docker Hub here (one way to check an image's supported architectures is sketched after these steps).
+ +2) Mix your cluster architectures
+You don't have to deploy all your containers to one architecture - deploy what already works on Arm to Arm nodes, and keep your other containers where they are (a node-selector sketch follows these steps). For more information, read here.
+ +3) Follow the Arm Migration checklist
+Read Arm's full tips & tricks on how to migrate your apps to Arm servers here.
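For step 1, one way to check whether a given image publishes an arm64 variant is to inspect its manifest list. The sketch below uses the `nginx` image from the sample cluster file as an example; treat it as an illustration rather than demo output.

```bash
# Look for an arm64 entry in the image's manifest list.
docker manifest inspect nginx:1.14.2 | grep -B1 -A1 '"architecture"'

# docker buildx prints a more readable per-platform summary.
docker buildx imagetools inspect nginx:1.14.2
```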
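For step 2, mixed-architecture clusters usually rely on the standard `kubernetes.io/arch` node label to steer workloads. Below is a minimal, hypothetical sketch that pins one of the sample cluster's deployments to arm64 nodes with a `nodeSelector`; the deployment name is made up for illustration.

```bash
# Hypothetical example: schedule a Redis deployment onto arm64 nodes only.
cat <<'EOF' | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-arm64              # illustrative name
  namespace: hosted-llm-chatbot
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis-arm64
  template:
    metadata:
      labels:
        app: redis-arm64
    spec:
      nodeSelector:
        kubernetes.io/arch: arm64   # standard node label set by the kubelet
      containers:
      - name: redis
        image: redis:5.0.5          # image taken from the sample cluster file
EOF
```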
+ + + +Reset chat
+ +Type a message to the chatbot to view metrics.
++
# seconds to display # tokens.
+Tokens per second: #
+Time to first token: #
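These two placeholders correspond to the timing lines llama.cpp prints at the end of a run. A hedged sketch of pulling them out of a saved log follows; the log filename is assumed, and treating load time plus prompt eval time as time to first token is an approximation.

```bash
# Assume the llama.cpp output was saved to run.log (hypothetical filename).
grep -E 'load time|prompt eval time| eval time' run.log

# From the optimized run shown earlier:
#   tokens per second   ~ 63 tokens / 6.89 s = ~9.14
#   time to first token ~ 0.98 s load + 0.46 s prompt eval = ~1.45 s
```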
+{{.Params.overview}}
+ +- {{ if eq .File.TranslationBaseName "_index" }} Introduction {{else}} {{.Title}} {{ end }} + {{ if eq .File.TranslationBaseName "_demo" }} + Demo + {{ else if eq .File.TranslationBaseName "_index" }} + Introduction + {{else}} + {{.Title}} + {{ end }}