From 6e3eba9e063dbe366661649b4ffc47c6a71f2da2 Mon Sep 17 00:00:00 2001 From: ajaykrish2303 Date: Wed, 19 Nov 2025 14:43:55 -0500 Subject: [PATCH 1/3] added code files --- .../Dockerfile | 39 +++++ .../cerebrium.toml | 28 ++++ .../config.pbtxt | 44 +++++ .../download_model.py | 38 +++++ .../model.py | 151 ++++++++++++++++++ .../start_triton.sh | 14 ++ 6 files changed, 314 insertions(+) create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile new file mode 100644 index 00000000..10cd6fef --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile @@ -0,0 +1,39 @@ +FROM nvcr.io/nvidia/tritonserver:25.10-trtllm-python-py3 + +# Environment variables +ENV PYTHONPATH=/usr/local/lib/python3.12/dist-packages:$PYTHONPATH +ENV PYTHONDONTWRITEBYTECODE=1 +ENV DEBIAN_FRONTEND=noninteractive +ENV HF_HOME=/persistent-storage/models +ENV TORCH_CUDA_ARCH_LIST=8.6 + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git \ + git-lfs \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install Python dependencies +RUN pip install --break-system-packages \ + huggingface_hub \ + transformers \ + || true + +# Create required directories +RUN mkdir -p \ + /app/model_repository/llama3_2/1 \ + /persistent-storage/models \ + /persistent-storage/engines + +# Copy application files +COPY --chmod=755 download_model.py start_triton.sh /app/ +COPY model.py /app/model_repository/llama3_2/1/ +COPY config.pbtxt /app/model_repository/llama3_2/ + +# Expose Triton ports +EXPOSE 8000 8001 8002 + +# Start Triton server +CMD ["/app/start_triton.sh"] \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml new file mode 100644 index 00000000..358aa16d --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml @@ -0,0 +1,28 @@ +[cerebrium.deployment] +name = "tensorrt-triton-demo" +python_version = "3.12" +disable_auth = true +include = ['./*', 'cerebrium.toml'] +exclude = ['.*'] +deployment_initialization_timeout = 830 + +[cerebrium.hardware] +cpu = 4.0 +memory = 40.0 +compute = "AMPERE_A10" +gpu_count = 1 +provider = "aws" +region = "us-east-1" + +[cerebrium.scaling] +min_replicas = 0 +max_replicas = 2 +cooldown = 60 +replica_concurrency = 5 +scaling_metric = "concurrency_utilization" + +[cerebrium.runtime.custom] +port = 8000 +healthcheck_endpoint = "/v2/health/live" +readycheck_endpoint = "/v2/health/ready" +dockerfile_path = "./Dockerfile" \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt new file mode 100644 index 00000000..bdec6a4a --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt @@ -0,0 +1,44 @@ +name: "llama3_2" +backend: "python" +max_batch_size: 0 + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] + +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ 1 ] + }, + { + name: "max_tokens" + data_type: TYPE_INT32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + } +] + +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ 1 ] + } +] diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py new file mode 100644 index 00000000..2aded4a4 --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +""" +Download HuggingFace model to persistent storage. +Only downloads if model doesn't already exist. +""" + +import os +from pathlib import Path +from huggingface_hub import snapshot_download, login + +MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" +MODEL_DIR = Path("/persistent-storage/models") / MODEL_ID + + +def download_model(): + """Download model from HuggingFace if not already present.""" + hf_token = os.environ.get("HF_AUTH_TOKEN") + + if not hf_token: + print("WARNING: HF_AUTH_TOKEN not set, model download may fail") + return + + if MODEL_DIR.exists() and any(MODEL_DIR.iterdir()): + print("✓ Model already exists") + return + + print("Downloading model from HuggingFace...") + login(token=hf_token) + snapshot_download( + MODEL_ID, + local_dir=str(MODEL_DIR), + token=hf_token + ) + print("✓ Model downloaded successfully") + + +if __name__ == "__main__": + download_model() \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py new file mode 100644 index 00000000..3b7e3473 --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py @@ -0,0 +1,151 @@ +""" +Triton Python Backend for TensorRT-LLM. + +This module implements a Triton Inference Server Python backend that uses +TensorRT-LLM's PyTorch backend for optimized LLM inference. +""" + +import numpy as np +import triton_python_backend_utils as pb_utils +import torch +from tensorrt_llm import LLM, SamplingParams, BuildConfig +from tensorrt_llm.plugin.plugin import PluginConfig +from transformers import AutoTokenizer + +# Model configuration +MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" +MODEL_DIR = f"/persistent-storage/models/{MODEL_ID}" + + +class TritonPythonModel: + """ + Triton Python Backend model for TensorRT-LLM inference. + + This class handles model initialization, inference requests, and cleanup. + """ + + def initialize(self, args): + """ + Initialize the model using TensorRT-LLM's PyTorch backend. + + This method is called once when the model is loaded. It: + 1. Loads the tokenizer from HuggingFace + 2. Initializes TensorRT-LLM with PyTorch backend (loads model directly) + + Args: + args: Dictionary containing model configuration from Triton + """ + print("Loading tokenizer...") + self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR) + + print("Initializing TensorRT-LLM with PyTorch backend...") + + + plugin_config = PluginConfig.from_dict({ + "paged_kv_cache": True, # Efficient memory usage for KV cache + }) + + # Configure build parameters + build_config = BuildConfig( + plugin_config=plugin_config, + max_input_len=4096, # Maximum input sequence length + max_batch_size=1, # Batch size per request + ) + + self.llm = LLM( + model=MODEL_DIR, # HuggingFace model path + build_config=build_config, + tensor_parallel_size=torch.cuda.device_count(), + ) + print("✓ Model ready") + + def execute(self, requests): + """ + Execute inference requests. + + Processes one or more inference requests, generating text responses + using the TensorRT-LLM model. + + Args: + requests: List of InferenceRequest objects from Triton + + Returns: + List of InferenceResponse objects with generated text + """ + responses = [] + + for request in requests: + try: + # Extract input text + input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input") + text = input_tensor.as_numpy()[0].decode('utf-8') + + # Extract optional parameters (with defaults) + max_tokens = 1024 + temperature = 0.8 + top_p = 0.95 + + max_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "max_tokens") + if max_tokens_tensor is not None: + max_tokens = int(max_tokens_tensor.as_numpy()[0]) + + temp_tensor = pb_utils.get_input_tensor_by_name(request, "temperature") + if temp_tensor is not None: + temperature = float(temp_tensor.as_numpy()[0]) + + top_p_tensor = pb_utils.get_input_tensor_by_name(request, "top_p") + if top_p_tensor is not None: + top_p = float(top_p_tensor.as_numpy()[0]) + + # Format prompt using Llama chat template + messages = [{"role": "user", "content": text}] + prompt = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True + ) + + # Configure sampling parameters + sampling_params = SamplingParams( + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + + # Generate text + output = self.llm.generate(prompt, sampling_params) + generated_text = output.outputs[0].text + + # Create response tensor + output_tensor = pb_utils.Tensor( + "text_output", + np.array([generated_text.encode('utf-8')], dtype=object) + ) + + # Create inference response + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor] + ) + responses.append(inference_response) + + except Exception as e: + # Handle errors gracefully + print(f"Error processing request: {e}") + error_response = pb_utils.InferenceResponse( + output_tensors=[], + error=pb_utils.TritonError(f"Error: {str(e)}") + ) + responses.append(error_response) + + return responses + + def finalize(self): + """ + Cleanup when model is being unloaded. + + Shuts down the TensorRT-LLM engine and clears GPU memory. + """ + if hasattr(self, 'llm'): + self.llm.shutdown() + torch.cuda.empty_cache() + diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh new file mode 100644 index 00000000..ae388189 --- /dev/null +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +# Download model if not already present +echo "Checking for model..." +python3 /app/download_model.py + +# Start Triton Inference Server +echo "Starting Triton Inference Server..." +exec tritonserver \ + --model-repository=/app/model_repository \ + --http-port=8000 \ + --grpc-port=8001 \ + --metrics-port=8002 \ No newline at end of file From a99d36be649848438948fd8325f75889e0a02688 Mon Sep 17 00:00:00 2001 From: ajaykrish2303 Date: Wed, 19 Nov 2025 18:18:13 -0500 Subject: [PATCH 2/3] Enabled Batching, Updated execute logic() --- .../cerebrium.toml | 10 +- .../config.pbtxt | 2 +- .../model.py | 212 ++++++++++-------- 3 files changed, 130 insertions(+), 94 deletions(-) diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml index 358aa16d..46d175ca 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml @@ -4,7 +4,7 @@ python_version = "3.12" disable_auth = true include = ['./*', 'cerebrium.toml'] exclude = ['.*'] -deployment_initialization_timeout = 830 +deployment_initialization_timeout = 830 [cerebrium.hardware] cpu = 4.0 @@ -15,10 +15,10 @@ provider = "aws" region = "us-east-1" [cerebrium.scaling] -min_replicas = 0 -max_replicas = 2 -cooldown = 60 -replica_concurrency = 5 +min_replicas = 2 +max_replicas = 5 +cooldown = 300 +replica_concurrency = 10 scaling_metric = "concurrency_utilization" [cerebrium.runtime.custom] diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt index bdec6a4a..bc3d5864 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt @@ -1,6 +1,6 @@ name: "llama3_2" backend: "python" -max_batch_size: 0 +max_batch_size: 32 instance_group [ { diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py index 3b7e3473..e9c97bc1 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py +++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py @@ -26,34 +26,27 @@ class TritonPythonModel: def initialize(self, args): """ - Initialize the model using TensorRT-LLM's PyTorch backend. + Initialize the model - called once when Triton loads the model. - This method is called once when the model is loaded. It: - 1. Loads the tokenizer from HuggingFace - 2. Initializes TensorRT-LLM with PyTorch backend (loads model directly) - - Args: - args: Dictionary containing model configuration from Triton + Loads tokenizer and initializes TensorRT-LLM with PyTorch backend. """ print("Loading tokenizer...") self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR) - print("Initializing TensorRT-LLM with PyTorch backend...") - + print("Initializing TensorRT-LLM...") plugin_config = PluginConfig.from_dict({ "paged_kv_cache": True, # Efficient memory usage for KV cache }) - # Configure build parameters build_config = BuildConfig( plugin_config=plugin_config, - max_input_len=4096, # Maximum input sequence length - max_batch_size=1, # Batch size per request + max_input_len=4096, + max_batch_size=32, # Matches Triton max_batch_size in config.pbtxt ) self.llm = LLM( - model=MODEL_DIR, # HuggingFace model path + model=MODEL_DIR, build_config=build_config, tensor_parallel_size=torch.cuda.device_count(), ) @@ -61,83 +54,127 @@ def initialize(self, args): def execute(self, requests): """ - Execute inference requests. - - Processes one or more inference requests, generating text responses - using the TensorRT-LLM model. + Execute inference on batched requests. - Args: - requests: List of InferenceRequest objects from Triton - - Returns: - List of InferenceResponse objects with generated text + Triton automatically batches requests (up to max_batch_size: 32). + This function processes the batch that Triton provides. """ - responses = [] - - for request in requests: - try: - # Extract input text - input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input") - text = input_tensor.as_numpy()[0].decode('utf-8') - - # Extract optional parameters (with defaults) - max_tokens = 1024 - temperature = 0.8 - top_p = 0.95 - - max_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "max_tokens") - if max_tokens_tensor is not None: - max_tokens = int(max_tokens_tensor.as_numpy()[0]) - - temp_tensor = pb_utils.get_input_tensor_by_name(request, "temperature") - if temp_tensor is not None: - temperature = float(temp_tensor.as_numpy()[0]) - - top_p_tensor = pb_utils.get_input_tensor_by_name(request, "top_p") - if top_p_tensor is not None: - top_p = float(top_p_tensor.as_numpy()[0]) - - # Format prompt using Llama chat template - messages = [{"role": "user", "content": text}] - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True - ) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=temperature, - top_p=top_p, - max_tokens=max_tokens, - ) - - # Generate text - output = self.llm.generate(prompt, sampling_params) - generated_text = output.outputs[0].text - - # Create response tensor - output_tensor = pb_utils.Tensor( - "text_output", - np.array([generated_text.encode('utf-8')], dtype=object) - ) - - # Create inference response - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor] - ) - responses.append(inference_response) - - except Exception as e: - # Handle errors gracefully - print(f"Error processing request: {e}") - error_response = pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError(f"Error: {str(e)}") + try: + prompts = [] + sampling_params_list = [] + original_prompts = [] # Store original prompts to strip from output if needed + + # Extract data from each request in the batch + for request in requests: + try: + # Get input text - handle batched tensor structures + input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input") + text_array = input_tensor.as_numpy() + + # Extract text handling different array structures (batched vs non-batched) + if text_array.ndim == 0: + # Scalar + text = text_array.item() + elif text_array.dtype == object: + # Object dtype array (common for BYTES/STRING with batching) + text = text_array.flat[0] if text_array.size > 0 else text_array.item() + else: + # Regular array - get first element + text = text_array.flat[0] if text_array.size > 0 else text_array.item() + + # Decode if bytes, otherwise use as string + if isinstance(text, bytes): + text = text.decode('utf-8') + elif isinstance(text, np.str_): + text = str(text) + + # Get optional parameters with defaults + max_tokens = 1024 + if pb_utils.get_input_tensor_by_name(request, "max_tokens") is not None: + max_tokens_array = pb_utils.get_input_tensor_by_name(request, "max_tokens").as_numpy() + max_tokens = int(max_tokens_array.item() if max_tokens_array.ndim == 0 else max_tokens_array.flat[0]) + + temperature = 0.8 + if pb_utils.get_input_tensor_by_name(request, "temperature") is not None: + temp_array = pb_utils.get_input_tensor_by_name(request, "temperature").as_numpy() + temperature = float(temp_array.item() if temp_array.ndim == 0 else temp_array.flat[0]) + + top_p = 0.95 + if pb_utils.get_input_tensor_by_name(request, "top_p") is not None: + top_p_array = pb_utils.get_input_tensor_by_name(request, "top_p").as_numpy() + top_p = float(top_p_array.item() if top_p_array.ndim == 0 else top_p_array.flat[0]) + + # Format prompt using chat template + prompt = self.tokenizer.apply_chat_template( + [{"role": "user", "content": text}], + tokenize=False, + add_generation_prompt=True + ) + + prompts.append(prompt) + original_prompts.append(prompt) # Store for potential stripping + sampling_params_list.append(SamplingParams( + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + )) + except Exception as e: + print(f"Error processing request: {e}", flush=True) + import traceback + traceback.print_exc() + # Use default max_tokens instead of 1 to avoid single token output + prompts.append("") + original_prompts.append("") + sampling_params_list.append(SamplingParams(max_tokens=1024)) + + # Batch inference + if not prompts: + return [] + + outputs = self.llm.generate(prompts, sampling_params_list) + + # Create responses + responses = [] + for i, output in enumerate(outputs): + try: + # Extract generated text + generated_text = output.outputs[0].text + + # Remove the prompt from generated text if it's included + if original_prompts[i] and original_prompts[i] in generated_text: + generated_text = generated_text.replace(original_prompts[i], "").strip() + + responses.append(pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor( + "text_output", + np.array([generated_text.encode('utf-8')], dtype=object) + )] + )) + except Exception as e: + print(f"Error creating response {i}: {e}", flush=True) + responses.append(pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor( + "text_output", + np.array([f"Error: {str(e)}".encode('utf-8')], dtype=object) + )] + )) + + return responses + + except Exception as e: + print(f"Error in execute: {e}", flush=True) + import traceback + traceback.print_exc() + # Return error responses + return [ + pb_utils.InferenceResponse( + output_tensors=[pb_utils.Tensor( + "text_output", + np.array([f"Batch error: {str(e)}".encode('utf-8')], dtype=object) + )] ) - responses.append(error_response) - - return responses + for _ in requests + ] def finalize(self): """ @@ -148,4 +185,3 @@ def finalize(self): if hasattr(self, 'llm'): self.llm.shutdown() torch.cuda.empty_cache() - From 40ebd90993d729fd895dc1169893f84ef18d0e2e Mon Sep 17 00:00:00 2001 From: ajaykrish2303 Date: Sun, 23 Nov 2025 23:45:22 -0500 Subject: [PATCH 3/3] moved to llm folder + updated code --- .../start_triton.sh | 14 ---------- .../Dockerfile | 6 ++--- .../cerebrium.toml | 10 ++++--- .../config.pbtxt | 8 ++++-- .../download_model.py | 0 .../model.py | 26 +++++++++++++++++-- 6 files changed, 40 insertions(+), 24 deletions(-) delete mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/Dockerfile (81%) rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/cerebrium.toml (77%) rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/config.pbtxt (87%) rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/download_model.py (100%) rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/model.py (90%) diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh deleted file mode 100644 index ae388189..00000000 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e - -# Download model if not already present -echo "Checking for model..." -python3 /app/download_model.py - -# Start Triton Inference Server -echo "Starting Triton Inference Server..." -exec tritonserver \ - --model-repository=/app/model_repository \ - --http-port=8000 \ - --grpc-port=8001 \ - --metrics-port=8002 \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile b/5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile similarity index 81% rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile index 10cd6fef..9365a425 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile +++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile @@ -28,12 +28,12 @@ RUN mkdir -p \ /persistent-storage/engines # Copy application files -COPY --chmod=755 download_model.py start_triton.sh /app/ +COPY --chmod=755 download_model.py /app/ COPY model.py /app/model_repository/llama3_2/1/ COPY config.pbtxt /app/model_repository/llama3_2/ # Expose Triton ports EXPOSE 8000 8001 8002 -# Start Triton server -CMD ["/app/start_triton.sh"] \ No newline at end of file +# Start Triton server directly +CMD ["tritonserver", "--model-repository=/app/model_repository", "--http-port=8000", "--grpc-port=8001", "--metrics-port=8002"] \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml similarity index 77% rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml index 46d175ca..58155a8e 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml +++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml @@ -15,12 +15,16 @@ provider = "aws" region = "us-east-1" [cerebrium.scaling] -min_replicas = 2 -max_replicas = 5 +min_replicas = 1 +max_replicas = 5 cooldown = 300 -replica_concurrency = 10 +replica_concurrency = 128 scaling_metric = "concurrency_utilization" +[cerebrium.dependencies.pip] +huggingface_hub = "latest" +transformers = "latest" + [cerebrium.runtime.custom] port = 8000 healthcheck_endpoint = "/v2/health/live" diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt similarity index 87% rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt index bc3d5864..0de64ed4 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt +++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt @@ -1,6 +1,10 @@ name: "llama3_2" backend: "python" -max_batch_size: 32 +max_batch_size: 128 + +dynamic_batching { + max_queue_delay_microseconds: 800 +} instance_group [ { @@ -41,4 +45,4 @@ output [ data_type: TYPE_STRING dims: [ 1 ] } -] +] \ No newline at end of file diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py b/5-large-language-models/8-faster-inference-with-triton-tensorrt/download_model.py similarity index 100% rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/download_model.py diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py similarity index 90% rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py index e9c97bc1..baba21c9 100644 --- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py +++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py @@ -11,12 +11,31 @@ from tensorrt_llm import LLM, SamplingParams, BuildConfig from tensorrt_llm.plugin.plugin import PluginConfig from transformers import AutoTokenizer +from pathlib import Path # Model configuration MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct" MODEL_DIR = f"/persistent-storage/models/{MODEL_ID}" +def ensure_model_downloaded(): + """Check if model exists, download if not available.""" + model_path = Path(MODEL_DIR) + + # Check if model directory exists and has content + if not model_path.exists() or not any(model_path.iterdir()): + print("Model not found, downloading...") + try: + # Import download function from download_model + from download_model import download_model + download_model() + except Exception as e: + print(f"Error downloading model: {e}") + raise + else: + print("✓ Model already exists") + + class TritonPythonModel: """ Triton Python Backend model for TensorRT-LLM inference. @@ -30,6 +49,9 @@ def initialize(self, args): Loads tokenizer and initializes TensorRT-LLM with PyTorch backend. """ + # Ensure model is downloaded before loading + ensure_model_downloaded() + print("Loading tokenizer...") self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR) @@ -42,7 +64,7 @@ def initialize(self, args): build_config = BuildConfig( plugin_config=plugin_config, max_input_len=4096, - max_batch_size=32, # Matches Triton max_batch_size in config.pbtxt + max_batch_size=128, # Matches Triton max_batch_size in config.pbtxt ) self.llm = LLM( @@ -184,4 +206,4 @@ def finalize(self): """ if hasattr(self, 'llm'): self.llm.shutdown() - torch.cuda.empty_cache() + torch.cuda.empty_cache() \ No newline at end of file