From eebf70c5aa88daca138d40a787774e51b6d740b4 Mon Sep 17 00:00:00 2001 From: Julien Simon Date: Thu, 26 Jun 2025 13:06:25 +0200 Subject: [PATCH 1/4] New learning path: Deploy Arcee AFM-4.5B on AWS Graviton4 --- .../01_launching_a_graviton4_instance.md | 168 ++++++++++++++++++ .../02_setting_up_the_instance.md | 51 ++++++ .../03_building_llama_cpp.md | 81 +++++++++ ...stall_python_dependencies_for_llama_cpp.md | 68 +++++++ .../05_downloading_and_optimizing_afm45b.md | 91 ++++++++++ .../06_running_inference.md | 156 ++++++++++++++++ .../07_evaluating_the_quantized_models.md | 104 +++++++++++ .../08_conclusion.md | 64 +++++++ .../arcee-foundation-model-on-aws/_index.md | 63 +++++++ .../_next-steps.md | 8 + 10 files changed, 854 insertions(+) create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md new file mode 100644 index 0000000000..d3170daf3e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md @@ -0,0 +1,168 @@ +--- +title: Launching a Graviton4 instance +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## System Requirements + + - An AWS account + + - A Linux or MacOS host + + - A c8g or r8g instance (4xlarge or larger) + + - At least 128GB of storage + +## AWS Console Steps + +Follow these steps to launch your EC2 instance using the AWS Management Console: + +### Step 1: Create an SSH Key Pair + +1. **Navigate to EC2 Console** + + - Go to the [AWS Management Console](https://console.aws.amazon.com) + + - Search for "EC2" and click on "EC2" service + +2. **Create Key Pair** + + - In the left navigation pane, click "Key Pairs" under "Network & Security" + + - Click "Create key pair" + + - Enter name: `arcee-graviton4-key` + + - Select "RSA" as the key pair type + + - Select ".pem" as the private key file format + + - Click "Create key pair" + + - The private key file will automatically download to your computer + +3. 
**Secure the Key File**

   - Move the downloaded `.pem` file to the SSH configuration directory:
   ```bash
   mkdir -p ~/.ssh
   mv arcee-graviton4-key.pem ~/.ssh
   ```

   - Set proper permissions (on macOS/Linux):
   ```bash
   chmod 400 ~/.ssh/arcee-graviton4-key.pem
   ```

### Step 2: Launch EC2 Instance

1. **Start Instance Launch**

   - In the left navigation pane, click "Instances" under "Instances"

   - Click the "Launch instances" button

2. **Configure Instance Details**

   - **Name and tags**: Enter `Arcee-Graviton4-Instance` as the instance name

   - **Application and OS Images**:
     - Click the "Quick Start" tab

     - Select "Ubuntu"

     - Choose "Ubuntu Server 24.04 LTS (HVM), SSD Volume Type"

     - **Important**: Ensure the architecture shows "64-bit (ARM)" for Graviton compatibility

   - **Instance type**:
     - Click on "Select instance type"

     - Select `c8g.4xlarge` or larger

3. **Configure Key Pair**

   In "Key pair name", select the SSH key pair you created earlier (`arcee-graviton4-key`)

4. **Configure Network Settings**

   - **Network**: Select a VPC with at least one public subnet

   - **Subnet**: Select a public subnet in the VPC

   - **Auto-assign Public IP**: Enable

   - **Firewall (security groups)**

     - Click on "Create security group"

     - Click on "Allow SSH traffic from"

     - In the dropdown list, select "My IP"

     Note 1: you will only be able to connect to the instance from your current host, which is the safest setting. We don't recommend selecting "Anywhere", which would allow anyone on the Internet to attempt to connect. Use at your own risk.

     Note 2: although this demonstration only requires SSH access, feel free to use one of your existing security groups as long as it allows SSH traffic.

5. **Configure Storage**

   - **Root volume**:
     - Size: `128` GB

     - Volume type: `gp3`

6. **Review and Launch**

   - Review all settings in the "Summary" section

   - Click "Launch instance"

### Step 3: Monitor Instance Launch

1. **View Launch Status**

   After a few seconds, you should see a message similar to this one:

   `Successfully initiated launch of instance (i-...)`

   If the instance launch fails, please review your settings and try again.

2. **Get Connection Information**

   - Click on the instance id, or look for the instance in the Instances list in the EC2 console

   - In the "Details" tab of the instance, note the "Public DNS" host name

   - This is the host name you'll use to connect via SSH, referred to below as `PUBLIC_DNS_HOSTNAME`

### Step 4: Connect to Your Instance

1. **Open Terminal/Command Prompt**

2. **Connect via SSH**
   ```bash
   ssh -i ~/.ssh/arcee-graviton4-key.pem ubuntu@PUBLIC_DNS_HOSTNAME
   ```

3. **Accept Security Warning**

   - When prompted about the authenticity of the host, type `yes`

   - You should now be connected to your Ubuntu instance

### Important Notes

- **Region Selection**: Ensure you're in your preferred AWS region before launching

- **AMI Selection**: The Ubuntu 24.04 LTS AMI must be ARM64 compatible for Graviton processors

- **Security**: please think twice about allowing SSH from anywhere (0.0.0.0/0).
We strongly recommend restricting access to your IP address + +- **Storage**: The 128GB EBS volume is sufficient for the Arcee model and dependencies + +- **Backup**: Consider creating AMIs or snapshots for backup purposes + + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md new file mode 100644 index 0000000000..f8e09c292e --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/02_setting_up_the_instance.md @@ -0,0 +1,51 @@ +--- +title: Setting up the instance +weight: 3 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll set up the Graviton4 instance with all the necessary tools and dependencies required to build and run the Arcee Foundation Model. This includes installing the build tools and Python environment. + +## Step 1: Update Package List + +```bash +sudo apt-get update +``` + +This command updates the local package index from the repositories: + +- Downloads the latest package lists from all configured APT repositories +- Ensures you have the most recent information about available packages and their versions +- This is a best practice before installing new packages to avoid potential conflicts +- The package index contains metadata about available packages, their dependencies, and version information + +## Step 2: Install System Dependencies + +```bash +sudo apt-get install cmake gcc g++ git python3 python3-pip python3-virtualenv libcurl4-openssl-dev unzip -y +``` + +This command installs all the essential development tools and dependencies: + +- **cmake**: Cross-platform build system generator that we'll use to compile Llama.cpp +- **gcc & g++**: GNU C and C++ compilers for building native code +- **git**: Version control system for cloning repositories +- **python3**: Python interpreter for running Python-based tools and scripts +- **python3-pip**: Python package installer for managing Python dependencies +- **python3-virtualenv**: Tool for creating isolated Python environments +- **libcurl4-openssl-dev**: client-side URL transfer library + +The `-y` flag automatically answers "yes" to prompts, making the installation non-interactive. + +## What's Ready Now + +After completing these steps, your Graviton4 instance will have: + +- A complete C/C++ development environment for building Llama.cpp +- Python 3 with pip for managing Python packages +- Git for cloning repositories +- All necessary build tools for compiling optimized ARM64 binaries + +The system is now prepared for the next steps: building Llama.cpp and downloading the Arcee Foundation Model. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md new file mode 100644 index 0000000000..3c86cfb4b5 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md @@ -0,0 +1,81 @@ +--- +title: Building Llama.cpp +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model that's optimized for inference on various hardware platforms, including ARM-based processors like Graviton4. + +Here are all the steps. 
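Before you start, you can quickly confirm that the toolchain installed in the previous step is available; the exact version numbers will vary:

```bash
# Verify the build tools installed earlier are on the PATH
cmake --version
gcc --version
git --version
```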
+ +## Step 1: Clone the Repository + +```bash +git clone https://github.com/ggerganov/llama.cpp +``` + +This command clones the Llama.cpp repository from GitHub to your local machine. The repository contains the source code, build scripts, and documentation needed to compile the inference engine. + +## Step 2: Navigate to the Project Directory + +```bash +cd llama.cpp +``` + +Change into the llama.cpp directory where we'll perform the build process. This directory contains the CMakeLists.txt file and source code structure. + +## Step 3: Configure the Build with CMake + +```bash +cmake -B . +``` + +This command uses CMake to configure the build system: +- `-B .` specifies that the build files should be generated in the current directory +- CMake will detect your system's compiler, libraries, and hardware capabilities +- It will generate the appropriate build files (Makefiles on Linux) based on your system configuration +- This step also enables optimizations for ARM processors like Graviton4 + +Note: The cmake output should include the information below, indicating that the build process will leverage the Neoverse V2 architecture's specialized instruction sets designed for AI/ML workloads. These optimizations are crucial for achieving optimal performance on Graviton4: + +```bash +-- ARM feature DOTPROD enabled +-- ARM feature SVE enabled +-- ARM feature MATMUL_INT8 enabled +-- ARM feature FMA enabled +-- ARM feature FP16_VECTOR_ARITHMETIC enabled +-- Adding CPU backend variant ggml-cpu: -mcpu=neoverse-v2+crc+sve2-aes+sve2-sha3+dotprod+i8mm+sve +``` + +- **DOTPROD: Dot Product** - Hardware-accelerated dot product operations for neural network computations +- **SVE: Scalable Vector Extension** - Advanced vector processing capabilities that can handle variable-length vectors up to 2048 bits, providing significant performance improvements for matrix operations +- **MATMUL_INT8: Matrix multiplication units** - Dedicated hardware for efficient matrix operations common in transformer models, accelerating the core computations of large language models +- **FMA: Fused Multiply-Add - Optimized floating-point operations that combine multiplication and addition in a single instruction +- **FP16 Vector Arithmetic - Hardware support for 16-bit floating-point vector operations, reducing memory usage while maintaining good numerical precision + +## Step 4: Compile the Project + +```bash +cmake --build . --config Release -j16 +``` + +This command compiles the Llama.cpp project: +- `--build .` tells CMake to build the project using the files in the current directory +- `--config Release` specifies a Release build configuration, which enables optimizations and removes debug symbols +- `-j16` runs the build with 16 parallel jobs, which speeds up compilation on multi-core systems like Graviton4 + +The build process will compile the C++ source code into executable binaries optimized for your ARM64 architecture. This should only take a minute. + +## What Gets Built + +After successful compilation, you'll have several key command-line executables in the `bin` directory: +- `llama-cli` - The main inference executable for running LLaMA models +- `llama-server` - A web server for serving model inference over HTTP +- `llama-quantize` - a tool for model quantization to reduce memory usage +- Various utility programs for model conversion and optimization + +You can find more information in the llama.cpp [GitHub repository](https://github.com/ggml-org/llama.cpp/tree/master/tools). 
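As a quick sanity check, you can list the newly built executables and print the version string; the exact output depends on the commit you built:

```bash
# List the main executables produced by the build
ls -lh bin/llama-cli bin/llama-server bin/llama-quantize

# Print build and version information
bin/llama-cli --version
```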
+ +These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Graviton4 instance. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md new file mode 100644 index 0000000000..d3f9ebcac3 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/04_install_python_dependencies_for_llama_cpp.md @@ -0,0 +1,68 @@ +--- +title: Installing Python dependencies for llama.cpp +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll set up a Python virtual environment and install the required dependencies for working with Llama.cpp. This ensures we have a clean, isolated Python environment with all the necessary packages for model optimization. + +Here are all the steps. + +## Step 1: Create a Python Virtual Environment + +```bash +virtualenv env-llama-cpp +``` + +This command creates a new Python virtual environment named `env-llama-cpp`: +- Virtual environments provide isolated Python environments that prevent conflicts between different projects +- The `env-llama-cpp` directory will contain its own Python interpreter and package installation space +- This isolation ensures that the Llama.cpp dependencies won't interfere with other Python projects on your system +- Virtual environments are essential for reproducible development environments + +## Step 2: Activate the Virtual Environment + +```bash +source env-llama-cpp/bin/activate +``` + +This command activates the virtual environment: +- The `source` command executes the activation script, which modifies your current shell environment +- Depending on you sheel, your command prompt may change to show `(env-llama-cpp)` at the beginning, indicating the active environment. We will reflect this in the following commands. 
+- All subsequent `pip` commands will install packages into this isolated environment +- The `PATH` environment variable is updated to prioritize the virtual environment's Python interpreter + +## Step 3: Upgrade pip to the Latest Version + +```bash +(env-llama-cpp) pip install --upgrade pip +``` + +This command ensures you have the latest version of pip: +- Upgrading pip helps avoid compatibility issues with newer packages +- The `--upgrade` flag tells pip to install the newest available version +- This is a best practice before installing project dependencies +- Newer pip versions often include security fixes and improved package resolution + +## Step 4: Install Project Dependencies + +```bash +(env-llama-cpp) pip install -r requirements.txt +``` + +This command installs all the Python packages specified in the requirements.txt file: +- The `-r` flag tells pip to read the package list from the specified file +- `requirements.txt` contains a list of Python packages and their version specifications +- This ensures everyone working on the project uses the same package versions +- The installation will include packages needed for model loading, inference, and any Python bindings for Llama.cpp + +## What Gets Installed + +After successful installation, your virtual environment will contain: +- **NumPy**: For numerical computations and array operations +- **Requests**: For HTTP operations and API calls +- **Other dependencies**: Specific packages needed for Llama.cpp Python integration + +The virtual environment is now ready for running Python scripts that interact with the compiled Llama.cpp binaries. Remember to always activate the virtual environment (`source env-llama-cpp/bin/activate`) before running any Python code related to this project. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md new file mode 100644 index 0000000000..fffb81a79a --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/05_downloading_and_optimizing_afm45b.md @@ -0,0 +1,91 @@ +--- +title: Downloading and optimizing AFM-4.5B +weight: 6 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll download the AFM-4.5B model from Hugging Face, convert it to the GGUF format for use with Llama.cpp, and create quantized versions to optimize memory usage and inference speed. + +The first release of the [Arcee Foundation Model](https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family) family, [AFM-4.5B](https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model) is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish + +Here are all the steps to download and optimize the model for AWS Graviton4. Make sure to run them in the virtual environment you created at the previous step. 
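If you've opened a new terminal since the previous step, reactivate the virtual environment first:

```bash
# Re-enter the isolated Python environment created earlier
source env-llama-cpp/bin/activate
```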
## Step 1: Install the Hugging Face libraries

```bash
(env-llama-cpp) pip install huggingface_hub hf_xet
```

This command installs the Hugging Face Hub Python library, which provides tools for downloading models and datasets from the Hugging Face platform. The library includes the `huggingface-cli` command-line interface that we'll use to download the AFM-4.5B model. The `hf_xet` library provides additional functionality for efficient data transfer and caching when downloading large models from Hugging Face Hub.

## Step 2: Download the AFM-4.5B Model

```bash
(env-llama-cpp) huggingface-cli download arcee-ai/afm-4.5B --local-dir models/afm-4-5b
```

This command downloads the AFM-4.5B model from the Hugging Face Hub:
- `arcee-ai/afm-4.5B` is the model identifier on Hugging Face Hub
- `--local-dir models/afm-4-5b` specifies the local directory where the model files will be stored
- The download includes the model weights, configuration files, and tokenizer data
- This is a 4.5-billion-parameter model, so the download may take several minutes depending on your internet connection

## Step 3: Convert to GGUF Format

```bash
(env-llama-cpp) python3 convert_hf_to_gguf.py models/afm-4-5b
(env-llama-cpp) deactivate
```

The first command converts the downloaded Hugging Face model to GGUF (GGML Universal Format):
- `convert_hf_to_gguf.py` is a conversion script that comes with Llama.cpp
- `models/afm-4-5b` is the input directory containing the Hugging Face model files
- The script reads the model architecture, weights, and configuration from the Hugging Face format
- It outputs a single `afm-4-5B-F16.gguf` file (~15GB) in the `models/afm-4-5b/` directory
- GGUF is the native format used by Llama.cpp and provides efficient loading and inference

Then, we deactivate the Python virtual environment, as the following commands won't require it.

## Step 4: Create Q4_0 Quantized Version

```bash
bin/llama-quantize models/afm-4-5b/afm-4-5B-F16.gguf models/afm-4-5b/afm-4-5B-Q4_0.gguf Q4_0
```

This command creates a 4-bit quantized version of the model:
- `llama-quantize` is the quantization tool from Llama.cpp
- `afm-4-5B-F16.gguf` is the input GGUF model file in 16-bit precision
- `Q4_0` selects llama.cpp's basic 4-bit quantization scheme
- This reduces the model size by approximately 70% (from ~15GB to ~4.4GB)
- The quantized model will use less memory and run faster, though with a small reduction in accuracy
- The output file will be named `afm-4-5B-Q4_0.gguf`

**ARM Optimization**: ARM has contributed highly optimized kernels for Q4_0 quantization that leverage the Neoverse V2 instruction sets. These low-level math routines accelerate typical deep learning operations, providing significant performance improvements on ARM-based processors like Graviton4.

These instruction sets enable Llama.cpp to perform quantized operations much faster than generic implementations, making ARM processors highly competitive for inference workloads.
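At this point, you can check the files produced so far; the sizes you see should be close to the figures above:

```bash
# List the original and quantized GGUF files with human-readable sizes
ls -lh models/afm-4-5b/*.gguf
```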
## Step 5: Create Q8_0 Quantized Version

```bash
bin/llama-quantize models/afm-4-5b/afm-4-5B-F16.gguf models/afm-4-5b/afm-4-5B-Q8_0.gguf Q8_0
```

This command creates an 8-bit quantized version of the model:
- `Q8_0` selects llama.cpp's basic 8-bit quantization scheme
- This reduces the model size by approximately 45% (from ~15GB to ~8GB)
- The 8-bit version provides a better balance between memory usage and accuracy compared to 4-bit
- The output file will be named `afm-4-5B-Q8_0.gguf`
- This version is often preferred for production use when memory constraints allow

**ARM Optimization**: Similar to Q4_0, ARM has contributed optimized kernels for Q8_0 quantization that take advantage of the Neoverse V2 instruction sets. These optimizations provide excellent performance for 8-bit operations while maintaining higher accuracy compared to 4-bit quantization.

## What You'll Have

After completing these steps, you'll have three versions of the AFM-4.5B model:
- `afm-4-5B-F16.gguf` - The original full-precision model (~15GB)
- `afm-4-5B-Q4_0.gguf` - 4-bit quantized version (~4.4GB) for memory-constrained environments
- `afm-4-5B-Q8_0.gguf` - 8-bit quantized version (~8GB) for balanced performance and memory usage

These models are now ready to be used with the Llama.cpp inference engine for text generation and other language model tasks.
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md
new file mode 100644
index 0000000000..7898ab02a5
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/06_running_inference.md
@@ -0,0 +1,156 @@
---
title: Running inference with AFM-4.5B
weight: 7

### FIXED, DO NOT MODIFY
layout: learningpathall
---

Now that we have our AFM-4.5B models in GGUF format, we can run inference using various Llama.cpp tools. In this step, we'll explore different ways to interact with the model for text generation, benchmarking, and evaluation.

## Using llama-cli for Interactive Text Generation

The `llama-cli` tool provides an interactive command-line interface for text generation. This is perfect for testing the model's capabilities and having conversations with it.

### Basic Usage

```bash
bin/llama-cli -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -n 256 --color
```

This command starts an interactive session with the model:
- `-m models/afm-4-5b/afm-4-5B-Q8_0.gguf` specifies the model file to load
- `-n 256` sets the maximum number of tokens to generate per response
- The tool will prompt you to enter text, and the model will generate a response

In this example, `llama-cli` uses 16 vCPUs. You can try different values with the `-t` option.

### Example Interactive Session

Once you start the interactive session, you can have conversations like this:

```
> Give me a brief explanation of the attention mechanism in transformer models.
In transformer models, the attention mechanism allows the model to focus on specific parts of the input sequence when computing the output. Here's a simplified explanation:

1. **Key-Query-Value (K-Q-V) computation**: For each input element, the model computes three vectors:
   - **Key (K)**: This represents the input element in a way that's useful for computing attention weights.
   - **Query (Q)**: This represents the current input element being processed and is used to compute attention weights.
   - **Value (V)**: This represents the input element in its original form, which is used to compute the output based on attention weights.

2. **Attention scores computation**: The attention mechanism computes the similarity between the Query (Q) and each Key (K) element using dot product and softmax normalization. This produces a set of attention scores, which represent how relevant each Key (K) element is to the Query (Q).

3. **Weighted sum**: The attention scores are used to compute a weighted sum of the Value (V) elements. The output is a weighted sum of the Values (V) based on the attention scores.

4. **Output**: The final output is a vector that represents the context of the input sequence, taking into account the attention scores. This output is used in the decoder to generate the next word in the output sequence.

The attention mechanism allows transformer models to selectively focus on specific parts of the input sequence, enabling them to better understand context and relationships between input elements. This is particularly useful for tasks like machine translation, where the model needs to capture long-range dependencies between input words.
```

To exit the interactive session, type `Ctrl+C` or `/bye`.

This will display performance statistics:
```bash
llama_perf_sampler_print: sampling time = 26.66 ms / 356 runs ( 0.07 ms per token, 13352.84 tokens per second)
llama_perf_context_print: load time = 782.72 ms
llama_perf_context_print: prompt eval time = 392.40 ms / 24 tokens ( 16.35 ms per token, 61.16 tokens per second)
llama_perf_context_print: eval time = 13173.66 ms / 331 runs ( 39.80 ms per token, 25.13 tokens per second)
llama_perf_context_print: total time = 129945.08 ms / 355 tokens
```

In this example, our 8-bit model running on 16 threads generated 355 tokens, at over 25 tokens per second (`eval time`).

### Example Non-Interactive Session

Now, let's try the 4-bit model in non-interactive mode:

```bash
bin/llama-cli -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -n 256 --color -no-cnv -p "Give me a brief explanation of the attention mechanism in transformer models."
```

This command starts a non-interactive session with the model:
- `-m models/afm-4-5b/afm-4-5B-Q4_0.gguf` specifies the model file to load
- `-no-cnv` disables the conversation mode
- `-p` sets the prompt sent to the model
- The model generates a single response to the prompt and then exits

Here, you should see the model generating at about 40 tokens per second. This shows how a more aggressive quantization recipe helps deliver faster performance.

## Using llama-server for API Access

The `llama-server` tool runs the model as a web server, allowing you to make HTTP requests for text generation. This is useful for integrating the model into applications or for batch processing.

### Starting the Server

```bash
bin/llama-server -m models/afm-4-5b/afm-4-5B-Q4_0.gguf \
    --host 0.0.0.0 \
    --port 8080 \
    --ctx-size 4096
```

This starts a server that:
- Loads the specified model
- Listens on all network interfaces (`0.0.0.0`)
- Accepts connections on port 8080
- Uses a 4096-token context window

### Making API Requests

Once the server is running, you can make requests using curl or any HTTP client. As `llama-server` is compatible with the popular OpenAI API, we'll use it in the following examples.
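Model loading can take a few seconds. From a second terminal on the instance, you can first check that the server is ready; this sketch assumes the standard llama-server health endpoint:

```bash
# Returns an OK status once the model is loaded and the server accepts requests
curl http://localhost:8080/health
```

Once this returns an OK status, you can send chat requests.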
+ +Open a new terminal on the AWS instance and run: + +```bash +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "afm-4-5b", + "messages": [ + { + "role": "user", + "content": "Explain quantum computing in less than 100 words." + } + ], + "max_tokens": 256, + "temperature": 0.9 + }' +``` + +You should get an answer similar to this one: + +```json +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "role": "assistant", + "content": "Quantum computing uses quantum-mechanical phenomena, such as superposition and entanglement, to perform calculations. It allows for multiple possibilities to exist simultaneously, which can speed up certain processes. Unlike classical computers, quantum computers can solve complex problems and simulate systems more efficiently. Quantum bits (qubits) store information, and quantum gates perform operations. Quantum computing has potential applications in fields like cryptography, optimization, and materials science. Its development is an active area of research, with companies like IBM, Google, and Microsoft investing in quantum computing technology." + } + } + ], + "created": 1750929895, + "model": "afm-4-5b", + "system_fingerprint": "b5757-716301d1", + "object": "chat.completion", + "usage": { + "completion_tokens": 111, + "prompt_tokens": 20, + "total_tokens": 131 + }, + "id": "chatcmpl-tb93ww9iYCErwLJmsV0YLrIadVvpBk4m", + "timings": { + "prompt_n": 11, + "prompt_ms": 105.651, + "prompt_per_token_ms": 9.604636363636363, + "prompt_per_second": 104.11638318615064, + "predicted_n": 111, + "predicted_ms": 2725.982, + "predicted_per_token_ms": 24.558396396396397, + "predicted_per_second": 40.719271073690145 + } +} +``` + +You could also interact with the server using Python with the [OpenAI client library](https://github.com/openai/openai-python), enabling streaming responses, and other features. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md new file mode 100644 index 0000000000..34ad11cb23 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md @@ -0,0 +1,104 @@ +--- +title: Evaluating the quantized models +weight: 8 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Using llama-bench for Performance Benchmarking + +The [`llama-bench`](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) tool allows you to measure the performance characteristics of your model, including inference speed and memory usage. + +### Basic Benchmarking + +You can benchmark multiple model versions to compare their performance: + +```bash +# Benchmark the full precision model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-F16.gguf + +# Benchmark the 8-bit quantized model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q8_0.gguf + +# Benchmark the 4-bit quantized model +bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf +``` + +Running each model on 16 vCPUs, you should see results like: +- **F16 model**: ~15-16 tokens/second, ~15GB memory usage +- **Q8_0 model**: ~25 tokens/second, ~8GB memory usage +- **Q4_0 model**: ~40 tokens/second, ~4.4GB memory usage + +The exact performance will depend on your specific instance configuration and load. 
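To run the three comparisons in one go, you can loop over the model files with a small shell snippet:

```bash
# Benchmark each precision level in sequence
for MODEL in afm-4-5B-F16.gguf afm-4-5B-Q8_0.gguf afm-4-5B-Q4_0.gguf; do
  bin/llama-bench -m models/afm-4-5b/${MODEL}
done
```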
### Advanced Benchmarking

```bash
bin/llama-bench -m models/afm-4-5b/afm-4-5B-Q4_0.gguf \
    -p 128,512 \
    -n 128 \
    -t 4,8,16
```

This command:
- Loads the model and runs inference benchmarks
- `-p`: Evaluates prompt processing with random prompts of 128 and 512 tokens
- `-n`: Generates 128 tokens
- `-t`: Runs the model on 4, 8, and 16 threads

The results should look like this:

| model | size | params | backend | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp128 | 62.90 ± 0.08 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | pp512 | 57.63 ± 0.06 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 4 | tg128 | 15.18 ± 0.02 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp128 | 116.23 ± 0.04 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | pp512 | 106.39 ± 0.03 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 8 | tg128 | 25.29 ± 0.05 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp128 | 206.67 ± 0.10 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp512 | 190.18 ± 0.03 |
| llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | tg128 | 40.99 ± 0.36 |

It's pretty amazing to see that with only 4 threads, the 4-bit model can still generate at the very comfortable speed of 15 tokens per second.

You could also try [`llama-batched-bench`](https://github.com/ggml-org/llama.cpp/tree/master/tools/batched-bench) to benchmark performance on batch sizes larger than 1.

## Using llama-perplexity for Model Evaluation

Perplexity is a measure of how well a language model predicts text. It represents the average number of possible next tokens the model considers when predicting each word. A lower perplexity score indicates the model is more confident in its predictions and generally performs better on the given text. For example, a perplexity of 2.0 means the model typically considers 2 possible tokens when making each prediction, while a perplexity of 10.0 means it considers 10 possible tokens on average.

The `llama-perplexity` tool evaluates the model's quality on text datasets by calculating perplexity scores. Lower perplexity indicates better quality.

### Downloading a Test Dataset

First, let's download the WikiText-2 test dataset:

```bash
sh scripts/get-wikitext-2.sh
```

### Running Perplexity Evaluation

Now, let's measure perplexity on the test dataset:
```bash
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-F16.gguf -f wikitext-2-raw/wiki.test.raw
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wiki.test.raw
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw
```

These commands will run for about 4 hours. You should run them from a shell script with `nohup` to avoid SSH timeouts. For example:
```bash
nohup sh ppl.sh >& ppl.sh.log &
tail -f ppl.sh.log
```

If you want to speed things up, you can add the `--chunks` option to use only a fraction of the 564 chunks contained in the test dataset.
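For reference, a minimal `ppl.sh` wrapping the three evaluations above could look like this:

```bash
#!/bin/sh
# Evaluate perplexity for each model version on the WikiText-2 test set
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-F16.gguf -f wikitext-2-raw/wiki.test.raw
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wiki.test.raw
bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw
```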
+ +Here are the full results: + +TODO + + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md new file mode 100644 index 0000000000..04ff3dd0b7 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md @@ -0,0 +1,64 @@ +--- +title: Conclusion +weight: 9 + + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## Conclusion + +Congratulations! You have successfully completed the journey of deploying the Arcee AFM-4.5B foundation model on AWS Graviton4. Let's recap what we accomplished: + +### What We Built + +Throughout this learning path, you've: + +1. **Launched a Graviton4-powered EC2 instance** - Set up a c8g.4xlarge instance running Ubuntu 24.04 LTS, leveraging AWS's latest Arm-based processors for optimal performance and cost efficiency. + +2. **Configured the development environment** - Installed essential tools and dependencies, including Git, build tools, and Python packages needed for machine learning workloads. + +3. **Built llama.cpp from source** - Compiled the optimized inference engine specifically for Arm64 architecture, ensuring maximum performance on Graviton4 processors. + +4. **Downloaded and optimized AFM-4.5B** - Retrieved the 4.5-billion parameter Arcee Foundation Model and converted it to the efficient GGUF format, then created quantized versions (8-bit and 4-bit) to balance performance and memory usage. + +5. **Ran inference and evaluation** - Tested the model's capabilities through interactive conversations, API endpoints, and comprehensive benchmarking to measure speed, memory usage, and model quality. 
+ +### Key Performance Insights + +The benchmarking results demonstrate the power of quantization and Arm-based computing: + +- **Memory efficiency**: The 4-bit quantized model uses only ~4.4GB of RAM compared to ~15GB for the full precision model +- **Speed improvements**: Quantization delivers 2-3x faster inference speeds (40+ tokens/second vs 15-16 tokens/second) +- **Cost optimization**: Lower memory requirements enable running on smaller, more cost-effective instances +- **Quality preservation**: The quantized models maintain excellent perplexity scores, showing minimal quality degradation + +### The Graviton4 Advantage + +AWS Graviton4 processors, built on Arm Neoverse-V2 architecture, provide: +- Superior performance per watt compared to x86 alternatives +- Cost savings of 20-40% for compute-intensive workloads +- Optimized memory bandwidth and cache hierarchy for AI/ML workloads +- Native Arm64 support for modern machine learning frameworks + +### Next Steps and Call to Action + +Now that you have a fully functional AFM-4.5B deployment, here are some exciting ways to extend your learning: + +**Production Deployment** +- Set up auto-scaling groups for high availability +- Implement load balancing for multiple model instances +- Add monitoring and logging with CloudWatch +- Secure your API endpoints with proper authentication + +**Application Development** +- Build a web application using the llama-server API +- Create a chatbot or virtual assistant +- Develop content generation tools +- Integrate with existing applications via REST APIs + +The combination of Arcee AI's efficient foundation models, llama.cpp's optimized inference engine, and AWS Graviton4's powerful Arm processors creates a compelling platform for deploying production-ready AI applications. Whether you're building chatbots, content generators, or research tools, this stack provides the performance, cost efficiency, and flexibility needed for modern AI workloads. + +For more information on Arcee AI and how we can help you build high-quality, secure, and cost-efficient AI, solution, please visit [www.arcee.ai](https://www.arcee.ai). + diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md new file mode 100644 index 0000000000..55fbf739de --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -0,0 +1,63 @@ +--- +title: Deploy Arcee AFM-4.5B on AWS Graviton4 + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for developers and engineers who want to deploy the Arcee AFM-4.5B small language model on an AWS Arm-based instance. AFM-4.5B is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. 
It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish + +learning_objectives: + - Deploy an Arm-based Graviton4 virtual machine on Amazon Web Services + - Connect to the virtual machine using SSH + - Download the AFM-4.5B model from Hugging Face + - Quantize the model with llama.cpp + - Deploy the model and run inference with llama.cpp + +prerequisites: + - An Amazon Web Services account, with quota for c8g instances + - Basic familiarity with SSH + +author: Julien Simon + +### Tags +# Tagging metadata, see the Learning Path guide for the allowed values +skilllevels: Introductory +subjects: ML +arm_ips: + - Neoverse +tools_software_languages: + - Amazon Web Services + - Linux + - Python + - Llama.cpp +operatingsystems: + - Linux + + +further_reading: + - resource: + title: Arcee AI + link: https://www.arcee.ai + type: Website + - resource: + title: Announcing Arcee Foundation Models + link: https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family + type: Blog + - resource: + title: AFM-4.5B, the First Arcee Foundation Model + link: https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model + type: Blog + - resource: + title: Amazon EC2 Graviton Instances + link: https://aws.amazon.com/ec2/graviton/ + type: Documentation + - resource: + title: Amazon EC2 Documentation + link: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ + type: Documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. +--- diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. 
+--- From 7cae6e41aa25b8a704f987e0dfb5672ce7eeec82 Mon Sep 17 00:00:00 2001 From: Julien Simon Date: Thu, 26 Jun 2025 14:01:28 +0200 Subject: [PATCH 2/4] A couple of clarifications --- .../03_building_llama_cpp.md | 2 ++ .../arcee-foundation-model-on-aws/_index.md | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md index 3c86cfb4b5..b3fb6016ae 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md @@ -8,6 +8,8 @@ layout: learningpathall In this step, we'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model that's optimized for inference on various hardware platforms, including ARM-based processors like Graviton4. +Even though AFM-4.5B has a custom model architecture, we're able to use the vanilla version of llama.cpp as the Arcee AI team has contributed the appropriate modeling code. + Here are all the steps. ## Step 1: Clone the Repository diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md index 55fbf739de..827712a256 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/_index.md @@ -6,11 +6,12 @@ minutes_to_complete: 30 who_is_this_for: This is an introductory topic for developers and engineers who want to deploy the Arcee AFM-4.5B small language model on an AWS Arm-based instance. AFM-4.5B is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. 
It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish learning_objectives: - - Deploy an Arm-based Graviton4 virtual machine on Amazon Web Services - - Connect to the virtual machine using SSH - - Download the AFM-4.5B model from Hugging Face - - Quantize the model with llama.cpp + - Launch and set up an Arm-based Graviton4 virtual machine on Amazon Web Services + - Build llama.cpp from source + - Download AFM-4.5B from Hugging Face + - Quantize AFM-4.5B with llama.cpp - Deploy the model and run inference with llama.cpp + - Evaluate the quality of quantized models by measuring perplexity prerequisites: - An Amazon Web Services account, with quota for c8g instances From d123006311a49b433fa68722bd5c4051b789a053 Mon Sep 17 00:00:00 2001 From: Julien Simon Date: Thu, 26 Jun 2025 16:54:31 +0200 Subject: [PATCH 3/4] New learning path: Deploy Arcee AFM-4.5B on Google Axion --- .../01_launching_an axion_instance.md | 102 ++++++++++++++++++ .../02_setting_up_the_instance.md | 51 +++++++++ .../03_building_llama_cpp.md | 82 ++++++++++++++ ...stall_python_dependencies_for_llama_cpp.md | 68 ++++++++++++ .../arcee-foundation-model-on-gcp/_index.md | 78 ++++++++++++++ .../_next-steps.md | 8 ++ 6 files changed, 389 insertions(+) create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md create mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md new file mode 100644 index 0000000000..10b904a28b --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md @@ -0,0 +1,102 @@ +--- +title: Launching an Axion c4a instance +weight: 2 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +## System Requirements + + - A Google Cloud account with billing enabled + + - Quota for c4a instances in your preferred region + + - A Linux or MacOS host + + - A c4a-standard-4 or larger instance + + - At least 128GB of storage + +## Google Cloud Console Steps + +Follow these steps to launch your Compute Engine instance using the Google Cloud Console: + +### Step 1: Launch Compute Engine Instance + +1. **Navigate to Google Cloud Console** + + - Go to the [Google Cloud Console](https://console.cloud.google.com) + + - Make sure you're in the correct project + + - In the left navigation menu, click "Compute Engine" > "VM instances" + +2. **Create Instance** + + Click "CREATE INSTANCE" button + +3. 
**Configure Instance Details**

   - **Name**: Enter `arcee-axion-instance`

   - **Region**: Select a region where c4a instances are available (e.g., us-central1, us-east1, europe-west1)

   - **Zone**: Select any zone in the chosen region

   - **Machine family**: Select "General purpose"

   - **Series**: Select "C4A"

   - **Machine type**: Select `c4a-standard-32` or larger
     - This provides 32 vCPUs and 128 GB memory

4. **Configure OS and Storage**

   In the left menu, click on "OS and storage"

   - Click "Change".

   - **Size (GB)**: Set to `128`

   - Click "Select"

5. **Configure Networking**

   In the left menu, click on "Networking"

   - Click

   - **Important**: We'll configure SSH access through IAP (Identity-Aware Proxy) for security

6. **Create Instance**

   - Review all settings

   - Click "Create" at the bottom of the screen.

### Step 2: Connect to Your Instance

   After a minute or so, the instance should be available.

   - In the VM instances list, locate the instance name (`arcee-axion-instance`) and click on "SSH"

   - This opens a browser-based SSH terminal. You may need to accept a security prompt

   - No additional configuration is needed

   - You should now be connected to your Ubuntu instance

### Important Notes

- **Region Selection**: Ensure you're in a region where c4a instances are available

- **Quota**: Make sure you have sufficient quota for c4a instances in your selected region

- **Security**: The browser-based SSH connection is more secure as it uses Google's Identity-Aware Proxy

- **Storage**: The 128GB boot disk is sufficient for the Arcee model and dependencies

- **Cost**: Monitor your usage in the Google Cloud Console billing section

- **Backup**: Consider creating snapshots for backup purposes
\ No newline at end of file
diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md
new file mode 100644
index 0000000000..4cc666910e
--- /dev/null
+++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md
@@ -0,0 +1,51 @@
---
title: Setting up the instance
weight: 3

### FIXED, DO NOT MODIFY
layout: learningpathall
---

In this step, we'll set up the Axion c4a instance with all the necessary tools and dependencies required to build and run the Arcee Foundation Model. This includes installing the build tools and Python environment.
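Before installing anything, you can verify that the instance is indeed running on an Arm64 (aarch64) architecture:

```bash
# Print the machine architecture; an Axion instance reports aarch64
uname -m
```

This should print `aarch64`.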
+ +## Step 1: Update Package List + +```bash +sudo apt-get update +``` + +This command updates the local package index from the repositories: + +- Downloads the latest package lists from all configured APT repositories +- Ensures you have the most recent information about available packages and their versions +- This is a best practice before installing new packages to avoid potential conflicts +- The package index contains metadata about available packages, their dependencies, and version information + +## Step 2: Install System Dependencies + +```bash +sudo apt-get install cmake gcc g++ git python3 python3-pip python3-virtualenv libcurl4-openssl-dev unzip -y +``` + +This command installs all the essential development tools and dependencies: + +- **cmake**: Cross-platform build system generator that we'll use to compile Llama.cpp +- **gcc & g++**: GNU C and C++ compilers for building native code +- **git**: Version control system for cloning repositories +- **python3**: Python interpreter for running Python-based tools and scripts +- **python3-pip**: Python package installer for managing Python dependencies +- **python3-virtualenv**: Tool for creating isolated Python environments +- **libcurl4-openssl-dev**: client-side URL transfer library + +The `-y` flag automatically answers "yes" to prompts, making the installation non-interactive. + +## What's Ready Now + +After completing these steps, your Axion c4a instance will have: + +- A complete C/C++ development environment for building Llama.cpp +- Python 3 with pip for managing Python packages +- Git for cloning repositories +- All necessary build tools for compiling optimized ARM64 binaries + +The system is now prepared for the next steps: building Llama.cpp and downloading the Arcee Foundation Model. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md new file mode 100644 index 0000000000..4f50050d72 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md @@ -0,0 +1,82 @@ +--- +title: Building Llama.cpp +weight: 4 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model that's optimized for inference on various hardware platforms, including ARM-based processors like Google Axion. + +Even though AFM-4.5B has a custom model architecture, we're able to use the vanilla version of llama.cpp as the Arcee AI team has contributed the appropriate modeling code. + +Here are all the steps. + +## Step 1: Clone the Repository + +```bash +git clone https://github.com/ggerganov/llama.cpp +``` + +This command clones the Llama.cpp repository from GitHub to your local machine. The repository contains the source code, build scripts, and documentation needed to compile the inference engine. + +## Step 2: Navigate to the Project Directory + +```bash +cd llama.cpp +``` + +Change into the llama.cpp directory where we'll perform the build process. This directory contains the CMakeLists.txt file and source code structure. + +## Step 3: Configure the Build with CMake + +```bash +cmake -B . 
+``` + +This command uses CMake to configure the build system: +- `-B .` specifies that the build files should be generated in the current directory +- CMake will detect your system's compiler, libraries, and hardware capabilities +- It will generate the appropriate build files (Makefiles on Linux) based on your system configuration + +Note: The cmake output should include the information below, indicating that the build process will leverage the Neoverse V2 architecture's specialized instruction sets designed for AI/ML workloads. These optimizations are crucial for achieving optimal performance on Axion: + +```bash +-- ARM feature DOTPROD enabled +-- ARM feature SVE enabled +-- ARM feature MATMUL_INT8 enabled +-- ARM feature FMA enabled +-- ARM feature FP16_VECTOR_ARITHMETIC enabled +-- Adding CPU backend variant ggml-cpu: -mcpu=neoverse-v2+crc+sve2-aes+sve2-sha3+dotprod+i8mm+sve +``` + +- **DOTPROD: Dot Product** - Hardware-accelerated dot product operations for neural network computations +- **SVE: Scalable Vector Extension** - Advanced vector processing capabilities that can handle variable-length vectors up to 2048 bits, providing significant performance improvements for matrix operations +- **MATMUL_INT8: Matrix multiplication units** - Dedicated hardware for efficient matrix operations common in transformer models, accelerating the core computations of large language models +- **FMA: Fused Multiply-Add - Optimized floating-point operations that combine multiplication and addition in a single instruction +- **FP16 Vector Arithmetic - Hardware support for 16-bit floating-point vector operations, reducing memory usage while maintaining good numerical precision + +## Step 4: Compile the Project + +```bash +cmake --build . --config Release -j16 +``` + +This command compiles the Llama.cpp project: +- `--build .` tells CMake to build the project using the files in the current directory +- `--config Release` specifies a Release build configuration, which enables optimizations and removes debug symbols +- `-j16` runs the build with 16 parallel jobs, which speeds up compilation on multi-core systems like Axion. + +The build process will compile the C++ source code into executable binaries optimized for your ARM64 architecture. This should only take a minute. + +## What Gets Built + +After successful compilation, you'll have several key command-line executables in the `bin` directory: +- `llama-cli` - The main inference executable for running LLaMA models +- `llama-server` - A web server for serving model inference over HTTP +- `llama-quantize` - a tool for model quantization to reduce memory usage +- Various utility programs for model conversion and optimization + +You can find more information in the llama.cpp [GitHub repository](https://github.com/ggml-org/llama.cpp/tree/master/tools). + +These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Google Axion instance. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md new file mode 100644 index 0000000000..d3f9ebcac3 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md @@ -0,0 +1,68 @@ +--- +title: Installing Python dependencies for llama.cpp +weight: 5 + +### FIXED, DO NOT MODIFY +layout: learningpathall +--- + +In this step, we'll set up a Python virtual environment and install the required dependencies for working with Llama.cpp. This ensures we have a clean, isolated Python environment with all the necessary packages for model optimization. + +Here are all the steps. + +## Step 1: Create a Python Virtual Environment + +```bash +virtualenv env-llama-cpp +``` + +This command creates a new Python virtual environment named `env-llama-cpp`: +- Virtual environments provide isolated Python environments that prevent conflicts between different projects +- The `env-llama-cpp` directory will contain its own Python interpreter and package installation space +- This isolation ensures that the Llama.cpp dependencies won't interfere with other Python projects on your system +- Virtual environments are essential for reproducible development environments + +## Step 2: Activate the Virtual Environment + +```bash +source env-llama-cpp/bin/activate +``` + +This command activates the virtual environment: +- The `source` command executes the activation script, which modifies your current shell environment +- Depending on you sheel, your command prompt may change to show `(env-llama-cpp)` at the beginning, indicating the active environment. We will reflect this in the following commands. +- All subsequent `pip` commands will install packages into this isolated environment +- The `PATH` environment variable is updated to prioritize the virtual environment's Python interpreter + +## Step 3: Upgrade pip to the Latest Version + +```bash +(env-llama-cpp) pip install --upgrade pip +``` + +This command ensures you have the latest version of pip: +- Upgrading pip helps avoid compatibility issues with newer packages +- The `--upgrade` flag tells pip to install the newest available version +- This is a best practice before installing project dependencies +- Newer pip versions often include security fixes and improved package resolution + +## Step 4: Install Project Dependencies + +```bash +(env-llama-cpp) pip install -r requirements.txt +``` + +This command installs all the Python packages specified in the requirements.txt file: +- The `-r` flag tells pip to read the package list from the specified file +- `requirements.txt` contains a list of Python packages and their version specifications +- This ensures everyone working on the project uses the same package versions +- The installation will include packages needed for model loading, inference, and any Python bindings for Llama.cpp + +## What Gets Installed + +After successful installation, your virtual environment will contain: +- **NumPy**: For numerical computations and array operations +- **Requests**: For HTTP operations and API calls +- **Other dependencies**: Specific packages needed for Llama.cpp Python integration + +The virtual environment is now ready for running Python scripts that interact with the compiled Llama.cpp binaries. 
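
As a quick sanity check, you can confirm that the interpreter and packages resolve from inside the virtual environment. This is a minimal example; the exact version you see will vary:

```bash
# Should print a path inside env-llama-cpp, followed by the installed NumPy version
(env-llama-cpp) python -c "import sys, numpy; print(sys.executable); print('NumPy', numpy.__version__)"
```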
Remember to always activate the virtual environment (`source env-llama-cpp/bin/activate`) before running any Python code related to this project. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md new file mode 100644 index 0000000000..4245bc0d32 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md @@ -0,0 +1,78 @@ +--- +title: Deploy Arcee AFM-4.5B on Google Axion + +minutes_to_complete: 30 + +who_is_this_for: This is an introductory topic for developers and engineers who want to deploy the Arcee AFM-4.5B small language model on a Google Cloud Axion c4a instance. AFM-4.5B is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish + +learning_objectives: + - Launch and set up an Arm-based Axion c4a virtual machine on Google Cloud + + - Build llama.cpp from source + + - Download AFM-4.5B from Hugging Face + + - Quantize AFM-4.5B with llama.cpp + + - Deploy the model and run inference with llama.cpp + + - Evaluate the quality of quantized models by measuring perplexity + +prerequisites: + - A Google Cloud account, with quota for c4a instances + + - Basic familiarity with SSH + +author: Julien Simon + +### Tags +# Tagging metadata, see the Learning Path guide for the allowed values +skilllevels: Introductory +subjects: ML +arm_ips: + - Neoverse + +tools_software_languages: + - Google Cloud + + - Linux + + - Python + + - Llama.cpp + +operatingsystems: + - Linux + +further_reading: + - resource: + title: Arcee AI + link: https://www.arcee.ai + type: Website + + - resource: + title: Announcing Arcee Foundation Models + link: https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family + type: Blog + + - resource: + title: AFM-4.5B, the First Arcee Foundation Model + link: https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model + type: Blog + + - resource: + title: Google Cloud c4a Instances + link: https://cloud.google.com/blog/products/compute/try-c4a-the-first-google-axion-processor + type: Documentation + + - resource: + title: Google Cloud Compute Engine + link: https://cloud.google.com/compute/docs + type: Documentation + +### FIXED, DO NOT MODIFY +# ================================================================================ +weight: 1 # _index.md always has weight of 1 to order correctly +layout: "learningpathall" # All files under learning paths have this same wrapper +learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. 
+--- diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md new file mode 100644 index 0000000000..c3db0de5a2 --- /dev/null +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md @@ -0,0 +1,8 @@ +--- +# ================================================================================ +# FIXED, DO NOT MODIFY THIS FILE +# ================================================================================ +weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. +title: "Next Steps" # Always the same, html page title. +layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. +--- From ec8cc7703d916243ec40fbfdf78f776307d5b0b4 Mon Sep 17 00:00:00 2001 From: Julien Simon Date: Fri, 27 Jun 2025 11:50:20 +0200 Subject: [PATCH 4/4] - Clean up - Remove GCP learning for now --- .../01_launching_a_graviton4_instance.md | 4 +- .../03_building_llama_cpp.md | 3 +- .../07_evaluating_the_quantized_models.md | 27 ++++- .../08_conclusion.md | 6 +- .../01_launching_an axion_instance.md | 102 ------------------ .../02_setting_up_the_instance.md | 51 --------- .../03_building_llama_cpp.md | 82 -------------- ...stall_python_dependencies_for_llama_cpp.md | 68 ------------ .../arcee-foundation-model-on-gcp/_index.md | 78 -------------- .../_next-steps.md | 8 -- 10 files changed, 31 insertions(+), 398 deletions(-) delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md delete mode 100644 content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md index d3170daf3e..36a784d664 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/01_launching_a_graviton4_instance.md @@ -10,9 +10,11 @@ layout: learningpathall - An AWS account + - Quota for c8g instances in your preferred region + - A Linux or MacOS host - - A c8g or r8g instance (4xlarge or larger) + - A c8g instance (4xlarge or larger) - At least 128GB of storage diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md index b3fb6016ae..713fff1696 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md +++ 
b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/03_building_llama_cpp.md @@ -38,7 +38,6 @@ This command uses CMake to configure the build system: - `-B .` specifies that the build files should be generated in the current directory - CMake will detect your system's compiler, libraries, and hardware capabilities - It will generate the appropriate build files (Makefiles on Linux) based on your system configuration -- This step also enables optimizations for ARM processors like Graviton4 Note: The cmake output should include the information below, indicating that the build process will leverage the Neoverse V2 architecture's specialized instruction sets designed for AI/ML workloads. These optimizations are crucial for achieving optimal performance on Graviton4: @@ -80,4 +79,4 @@ After successful compilation, you'll have several key command-line executables i You can find more information in the llama.cpp [GitHub repository](https://github.com/ggml-org/llama.cpp/tree/master/tools). -These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Graviton4 instance. \ No newline at end of file +These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Graviton4 instance. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md index 34ad11cb23..7788ddde5f 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/07_evaluating_the_quantized_models.md @@ -61,7 +61,7 @@ The results should look like this: | llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | pp512 | 190.18 ± 0.03 | | llama 8B Q4_0 | 4.33 GiB | 8.03 B | CPU | 16 | tg128 | 40.99 ± 0.36 | -It's pretty amazing to see that with only 4 threads, the 4-bit model can still generate at the very comfortable speed of 15 tokens per second. +It's pretty amazing to see that with only 4 threads, the 4-bit model can still generate at the very comfortable speed of 15 tokens per second. We could definitely run several copies of the model on the same instance to serve concurrent users or applications. You could also try [`llama-batched-bench`](https://github.com/ggml-org/llama.cpp/tree/master/tools/batched-bench) to benchmark performance on batch sizes larger than 1. @@ -89,16 +89,33 @@ bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wik bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw ``` -These commands will run for about 4 hours. You should run them in a shell script to avoid SSH timeouts. For example: +If you want to speed things up, you can add the `--chunks` option to use a fraction of 564 chunks contained in the test dataset. + +On the full dataset, these three commands will take about 5 hours. You should run them in a shell script to avoid SSH timeouts. 
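+As an illustration of the `--chunks` option, the following command would evaluate only the first 100 chunks (the value 100 is arbitrary and only meant to shorten the run):
+
+```bash
+bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw --chunks 100
+```
+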
+ +For example: +```bash +#!/bin/bash +# ppl.sh +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-F16.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q8_0.gguf -f wikitext-2-raw/wiki.test.raw +bin/llama-perplexity -m models/afm-4-5b/afm-4-5B-Q4_0.gguf -f wikitext-2-raw/wiki.test.raw +``` ```bash nohup sh ppl.sh >& ppl.sh.log & tail -f ppl.sh.log ``` -If you want to speed things up, you can add the `--chunks` option to use a fraction of 564 chunks contained in the test dataset. -Here are the full results: +Here are the full results. + + +| Model | Generation Speed (tokens/s, 16 vCPUs) | Memory Usage | Perplexity (Wikitext-2) | +|:-------:|:----------------------:|:------------:|:----------:| +| F16 | ~15–16 | ~15 GB | TODO | +| Q8_0 | ~25 | ~8 GB | TODO | +| Q4_0 | ~40 | ~4.4 GB | TODO | -TODO +*Please remember to terminate the instance in the AWS console when you're done testing* diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md index 04ff3dd0b7..73a859ffce 100644 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md +++ b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-aws/08_conclusion.md @@ -9,7 +9,11 @@ layout: learningpathall ## Conclusion -Congratulations! You have successfully completed the journey of deploying the Arcee AFM-4.5B foundation model on AWS Graviton4. Let's recap what we accomplished: +Congratulations! You have successfully completed the journey of deploying the Arcee AFM-4.5B foundation model on AWS Graviton4. + +*Please remember to terminate the instance in the AWS console when you're done testing* + +Let's recap what we accomplished. ### What We Built diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md deleted file mode 100644 index 10b904a28b..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/01_launching_an axion_instance.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: Launching an Axion c4a instance -weight: 2 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -## System Requirements - - - A Google Cloud account with billing enabled - - - Quota for c4a instances in your preferred region - - - A Linux or MacOS host - - - A c4a-standard-4 or larger instance - - - At least 128GB of storage - -## Google Cloud Console Steps - -Follow these steps to launch your Compute Engine instance using the Google Cloud Console: - -### Step 1: Launch Compute Engine Instance - -1. **Navigate to Google Cloud Console** - - - Go to the [Google Cloud Console](https://console.cloud.google.com) - - - Make sure you're in the correct project - - - In the left navigation menu, click "Compute Engine" > "VM instances" - -2. **Create Instance** - - Click "CREATE INSTANCE" button - -3. 
**Configure Instance Details** - - - **Name**: Enter `arcee-axion-instance` - - - **Region**: Select a region where c4a instances are available (e.g., us-central1, us-east1, europe-west1) - - - **Zone**: Select any zone in the chosen region - - - **Machine family**: Select "General urpose" - - - **Series**: Select "C4A" - - - **Machine type**: Select `c4a-standard-32` or larger - - This provides 32 vCPUs and 128 GB memory - -4. **Configure OS and Storage** - - In the left menu, click on "OS and storage" - - - Click "Change". - - - **Size (GB)**: Set to `128` - - - Click "Select" - -5. **Configure Networking** - - In the left menu, click on "Networking" - - - Click - - - **Important**: We'll configure SSH access through IAP (Identity-Aware Proxy) for security - -7. **Create Instance** - - - Review all settings - - - Click "Create" at the bottom of the screen. - -### Step 3: Connect to Your Instance - - After a minute or so, the instance should be available. - - - In the VM instances list, locate the instance name (`arcee-axion-instance`) and click on "SSH" - - - This opens a browser-based SSH terminal. You may need to accept some security message - - - No additional configuration is needed - - - You should now be connected to your Ubuntu instance - -### Important Notes - -- **Region Selection**: Ensure you're in a region where c4a instances are available - -- **Quota**: Make sure you have sufficient quota for c4a instances in your selected region - -- **Security**: The browser-based SSH connection is more secure as it uses Google's Identity-Aware Proxy - -- **Storage**: The 128GB boot disk is sufficient for the Arcee model and dependencies - -- **Cost**: Monitor your usage in the Google Cloud Console billing section - -- **Backup**: Consider creating snapshots for backup purposes \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md deleted file mode 100644 index 4cc666910e..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/02_setting_up_the_instance.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: Setting up the instance -weight: 3 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -In this step, we'll set up the Axion c4a instance with all the necessary tools and dependencies required to build and run the Arcee Foundation Model. This includes installing the build tools and Python environment. 
- -## Step 1: Update Package List - -```bash -sudo apt-get update -``` - -This command updates the local package index from the repositories: - -- Downloads the latest package lists from all configured APT repositories -- Ensures you have the most recent information about available packages and their versions -- This is a best practice before installing new packages to avoid potential conflicts -- The package index contains metadata about available packages, their dependencies, and version information - -## Step 2: Install System Dependencies - -```bash -sudo apt-get install cmake gcc g++ git python3 python3-pip python3-virtualenv libcurl4-openssl-dev unzip -y -``` - -This command installs all the essential development tools and dependencies: - -- **cmake**: Cross-platform build system generator that we'll use to compile Llama.cpp -- **gcc & g++**: GNU C and C++ compilers for building native code -- **git**: Version control system for cloning repositories -- **python3**: Python interpreter for running Python-based tools and scripts -- **python3-pip**: Python package installer for managing Python dependencies -- **python3-virtualenv**: Tool for creating isolated Python environments -- **libcurl4-openssl-dev**: client-side URL transfer library - -The `-y` flag automatically answers "yes" to prompts, making the installation non-interactive. - -## What's Ready Now - -After completing these steps, your Axion c4a instance will have: - -- A complete C/C++ development environment for building Llama.cpp -- Python 3 with pip for managing Python packages -- Git for cloning repositories -- All necessary build tools for compiling optimized ARM64 binaries - -The system is now prepared for the next steps: building Llama.cpp and downloading the Arcee Foundation Model. diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md deleted file mode 100644 index 4f50050d72..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/03_building_llama_cpp.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -title: Building Llama.cpp -weight: 4 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -In this step, we'll build Llama.cpp from source. Llama.cpp is a high-performance C++ implementation of the LLaMA model that's optimized for inference on various hardware platforms, including ARM-based processors like Google Axion. - -Even though AFM-4.5B has a custom model architecture, we're able to use the vanilla version of llama.cpp as the Arcee AI team has contributed the appropriate modeling code. - -Here are all the steps. - -## Step 1: Clone the Repository - -```bash -git clone https://github.com/ggerganov/llama.cpp -``` - -This command clones the Llama.cpp repository from GitHub to your local machine. The repository contains the source code, build scripts, and documentation needed to compile the inference engine. - -## Step 2: Navigate to the Project Directory - -```bash -cd llama.cpp -``` - -Change into the llama.cpp directory where we'll perform the build process. This directory contains the CMakeLists.txt file and source code structure. - -## Step 3: Configure the Build with CMake - -```bash -cmake -B . 
-``` - -This command uses CMake to configure the build system: -- `-B .` specifies that the build files should be generated in the current directory -- CMake will detect your system's compiler, libraries, and hardware capabilities -- It will generate the appropriate build files (Makefiles on Linux) based on your system configuration - -Note: The cmake output should include the information below, indicating that the build process will leverage the Neoverse V2 architecture's specialized instruction sets designed for AI/ML workloads. These optimizations are crucial for achieving optimal performance on Axion: - -```bash --- ARM feature DOTPROD enabled --- ARM feature SVE enabled --- ARM feature MATMUL_INT8 enabled --- ARM feature FMA enabled --- ARM feature FP16_VECTOR_ARITHMETIC enabled --- Adding CPU backend variant ggml-cpu: -mcpu=neoverse-v2+crc+sve2-aes+sve2-sha3+dotprod+i8mm+sve -``` - -- **DOTPROD: Dot Product** - Hardware-accelerated dot product operations for neural network computations -- **SVE: Scalable Vector Extension** - Advanced vector processing capabilities that can handle variable-length vectors up to 2048 bits, providing significant performance improvements for matrix operations -- **MATMUL_INT8: Matrix multiplication units** - Dedicated hardware for efficient matrix operations common in transformer models, accelerating the core computations of large language models -- **FMA: Fused Multiply-Add - Optimized floating-point operations that combine multiplication and addition in a single instruction -- **FP16 Vector Arithmetic - Hardware support for 16-bit floating-point vector operations, reducing memory usage while maintaining good numerical precision - -## Step 4: Compile the Project - -```bash -cmake --build . --config Release -j16 -``` - -This command compiles the Llama.cpp project: -- `--build .` tells CMake to build the project using the files in the current directory -- `--config Release` specifies a Release build configuration, which enables optimizations and removes debug symbols -- `-j16` runs the build with 16 parallel jobs, which speeds up compilation on multi-core systems like Axion. - -The build process will compile the C++ source code into executable binaries optimized for your ARM64 architecture. This should only take a minute. - -## What Gets Built - -After successful compilation, you'll have several key command-line executables in the `bin` directory: -- `llama-cli` - The main inference executable for running LLaMA models -- `llama-server` - A web server for serving model inference over HTTP -- `llama-quantize` - a tool for model quantization to reduce memory usage -- Various utility programs for model conversion and optimization - -You can find more information in the llama.cpp [GitHub repository](https://github.com/ggml-org/llama.cpp/tree/master/tools). - -These binaries are specifically optimized for ARM64 architecture and will provide excellent performance on your Google Axion instance. 
\ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md deleted file mode 100644 index d3f9ebcac3..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/04_install_python_dependencies_for_llama_cpp.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: Installing Python dependencies for llama.cpp -weight: 5 - -### FIXED, DO NOT MODIFY -layout: learningpathall ---- - -In this step, we'll set up a Python virtual environment and install the required dependencies for working with Llama.cpp. This ensures we have a clean, isolated Python environment with all the necessary packages for model optimization. - -Here are all the steps. - -## Step 1: Create a Python Virtual Environment - -```bash -virtualenv env-llama-cpp -``` - -This command creates a new Python virtual environment named `env-llama-cpp`: -- Virtual environments provide isolated Python environments that prevent conflicts between different projects -- The `env-llama-cpp` directory will contain its own Python interpreter and package installation space -- This isolation ensures that the Llama.cpp dependencies won't interfere with other Python projects on your system -- Virtual environments are essential for reproducible development environments - -## Step 2: Activate the Virtual Environment - -```bash -source env-llama-cpp/bin/activate -``` - -This command activates the virtual environment: -- The `source` command executes the activation script, which modifies your current shell environment -- Depending on you sheel, your command prompt may change to show `(env-llama-cpp)` at the beginning, indicating the active environment. We will reflect this in the following commands. 
-- All subsequent `pip` commands will install packages into this isolated environment -- The `PATH` environment variable is updated to prioritize the virtual environment's Python interpreter - -## Step 3: Upgrade pip to the Latest Version - -```bash -(env-llama-cpp) pip install --upgrade pip -``` - -This command ensures you have the latest version of pip: -- Upgrading pip helps avoid compatibility issues with newer packages -- The `--upgrade` flag tells pip to install the newest available version -- This is a best practice before installing project dependencies -- Newer pip versions often include security fixes and improved package resolution - -## Step 4: Install Project Dependencies - -```bash -(env-llama-cpp) pip install -r requirements.txt -``` - -This command installs all the Python packages specified in the requirements.txt file: -- The `-r` flag tells pip to read the package list from the specified file -- `requirements.txt` contains a list of Python packages and their version specifications -- This ensures everyone working on the project uses the same package versions -- The installation will include packages needed for model loading, inference, and any Python bindings for Llama.cpp - -## What Gets Installed - -After successful installation, your virtual environment will contain: -- **NumPy**: For numerical computations and array operations -- **Requests**: For HTTP operations and API calls -- **Other dependencies**: Specific packages needed for Llama.cpp Python integration - -The virtual environment is now ready for running Python scripts that interact with the compiled Llama.cpp binaries. Remember to always activate the virtual environment (`source env-llama-cpp/bin/activate`) before running any Python code related to this project. \ No newline at end of file diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md deleted file mode 100644 index 4245bc0d32..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_index.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: Deploy Arcee AFM-4.5B on Google Axion - -minutes_to_complete: 30 - -who_is_this_for: This is an introductory topic for developers and engineers who want to deploy the Arcee AFM-4.5B small language model on a Google Cloud Axion c4a instance. AFM-4.5B is a 4.5-billion-parameter frontier model that delivers excellent accuracy, strict compliance, and very high cost-efficiency. 
It was trained on almost 7 trillion tokens of clean, rigorously filtered data, and has been tested across a wide range of languages, including Arabic, English, French, German, Hindi, Italian, Korean, Mandarin, Portuguese, Russian, and Spanish - -learning_objectives: - - Launch and set up an Arm-based Axion c4a virtual machine on Google Cloud - - - Build llama.cpp from source - - - Download AFM-4.5B from Hugging Face - - - Quantize AFM-4.5B with llama.cpp - - - Deploy the model and run inference with llama.cpp - - - Evaluate the quality of quantized models by measuring perplexity - -prerequisites: - - A Google Cloud account, with quota for c4a instances - - - Basic familiarity with SSH - -author: Julien Simon - -### Tags -# Tagging metadata, see the Learning Path guide for the allowed values -skilllevels: Introductory -subjects: ML -arm_ips: - - Neoverse - -tools_software_languages: - - Google Cloud - - - Linux - - - Python - - - Llama.cpp - -operatingsystems: - - Linux - -further_reading: - - resource: - title: Arcee AI - link: https://www.arcee.ai - type: Website - - - resource: - title: Announcing Arcee Foundation Models - link: https://www.arcee.ai/blog/announcing-the-arcee-foundation-model-family - type: Blog - - - resource: - title: AFM-4.5B, the First Arcee Foundation Model - link: https://www.arcee.ai/blog/deep-dive-afm-4-5b-the-first-arcee-foundational-model - type: Blog - - - resource: - title: Google Cloud c4a Instances - link: https://cloud.google.com/blog/products/compute/try-c4a-the-first-google-axion-processor - type: Documentation - - - resource: - title: Google Cloud Compute Engine - link: https://cloud.google.com/compute/docs - type: Documentation - -### FIXED, DO NOT MODIFY -# ================================================================================ -weight: 1 # _index.md always has weight of 1 to order correctly -layout: "learningpathall" # All files under learning paths have this same wrapper -learning_path_main_page: "yes" # This should be surfaced when looking for related content. Only set for _index.md of learning path content. ---- diff --git a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md b/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md deleted file mode 100644 index c3db0de5a2..0000000000 --- a/content/learning-paths/servers-and-cloud-computing/arcee-foundation-model-on-gcp/_next-steps.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -# ================================================================================ -# FIXED, DO NOT MODIFY THIS FILE -# ================================================================================ -weight: 21 # Set to always be larger than the content in this path to be at the end of the navigation. -title: "Next Steps" # Always the same, html page title. -layout: "learningpathall" # All files under learning paths have this same wrapper for Hugo processing. ----