From 6e3eba9e063dbe366661649b4ffc47c6a71f2da2 Mon Sep 17 00:00:00 2001
From: ajaykrish2303 <ag8172@nyu.edu>
Date: Wed, 19 Nov 2025 14:43:55 -0500
Subject: [PATCH 1/3] Add Triton + TensorRT-LLM inference example files

---
 .../Dockerfile                                |  39 +++++
 .../cerebrium.toml                            |  28 ++++
 .../config.pbtxt                              |  44 +++++
 .../download_model.py                         |  38 +++++
 .../model.py                                  | 151 ++++++++++++++++++
 .../start_triton.sh                           |  14 ++
 6 files changed, 314 insertions(+)
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
 create mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh

diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile
new file mode 100644
index 00000000..10cd6fef
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile
@@ -0,0 +1,39 @@
+FROM nvcr.io/nvidia/tritonserver:25.10-trtllm-python-py3
+
+# Environment variables
+ENV PYTHONPATH=/usr/local/lib/python3.12/dist-packages:$PYTHONPATH
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV DEBIAN_FRONTEND=noninteractive
+ENV HF_HOME=/persistent-storage/models
+ENV TORCH_CUDA_ARCH_LIST=8.6
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    git-lfs \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies
+RUN pip install --break-system-packages \
+    --no-cache-dir \
+    huggingface_hub \
+    transformers
+
+# Create required directories
+RUN mkdir -p \
+    /app/model_repository/llama3_2/1 \
+    /persistent-storage/models \
+    /persistent-storage/engines
+
+# Copy application files
+COPY --chmod=755 download_model.py start_triton.sh /app/
+COPY model.py /app/model_repository/llama3_2/1/
+COPY config.pbtxt /app/model_repository/llama3_2/
+
+# Expose Triton ports
+EXPOSE 8000 8001 8002
+
+# Start Triton server
+CMD ["/app/start_triton.sh"]
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
new file mode 100644
index 00000000..358aa16d
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
@@ -0,0 +1,28 @@
+[cerebrium.deployment]
+name = "tensorrt-triton-demo"
+python_version = "3.12"
+disable_auth = true
+include = ['./*', 'cerebrium.toml']
+exclude = ['.*']
+deployment_initialization_timeout = 830
+
+[cerebrium.hardware]
+cpu = 4.0
+memory = 40.0
+compute = "AMPERE_A10"
+gpu_count = 1
+provider = "aws"
+region = "us-east-1"
+
+[cerebrium.scaling]
+min_replicas = 0
+max_replicas = 2
+cooldown = 60
+replica_concurrency = 5
+scaling_metric = "concurrency_utilization"
+
+[cerebrium.runtime.custom]
+port = 8000
+healthcheck_endpoint = "/v2/health/live"
+readycheck_endpoint = "/v2/health/ready"
+dockerfile_path = "./Dockerfile"
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
new file mode 100644
index 00000000..bdec6a4a
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
@@ -0,0 +1,44 @@
+name: "llama3_2"
+backend: "python"
+max_batch_size: 0
+
+instance_group [
+  {
+    count: 1
+    kind: KIND_GPU
+  }
+]
+
+input [
+  {
+    name: "text_input"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  },
+  {
+    name: "max_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    optional: true
+  }
+]
+
+output [
+  {
+    name: "text_output"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py
new file mode 100644
index 00000000..2aded4a4
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+"""
+Download HuggingFace model to persistent storage.
+Only downloads if model doesn't already exist.
+"""
+
+import os
+from pathlib import Path
+from huggingface_hub import snapshot_download, login
+
+MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
+MODEL_DIR = Path("/persistent-storage/models") / MODEL_ID
+
+
+def download_model():
+    """Download the model snapshot from HuggingFace unless it is already on disk.
+
+    The on-disk check runs first so a populated MODEL_DIR needs no token.
+    Without HF_AUTH_TOKEN the download is still attempted (it may fail for
+    gated repositories); any huggingface_hub error propagates to the caller.
+    """
+    if MODEL_DIR.exists() and any(MODEL_DIR.iterdir()):
+        print("✓ Model already exists")
+        return
+
+    hf_token = os.environ.get("HF_AUTH_TOKEN")
+    if hf_token:
+        login(token=hf_token)
+    else:
+        print("WARNING: HF_AUTH_TOKEN not set, model download may fail")
+
+    print("Downloading model from HuggingFace...")
+    snapshot_download(MODEL_ID, local_dir=str(MODEL_DIR), token=hf_token or None)
+    print("✓ Model downloaded successfully")
+
+
+if __name__ == "__main__":
+    download_model()
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
new file mode 100644
index 00000000..3b7e3473
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
@@ -0,0 +1,151 @@
+"""
+Triton Python Backend for TensorRT-LLM.
+
+This module implements a Triton Inference Server Python backend that uses
+TensorRT-LLM's PyTorch backend for optimized LLM inference. 
+"""
+
+import numpy as np
+import triton_python_backend_utils as pb_utils
+import torch
+from tensorrt_llm import LLM, SamplingParams, BuildConfig
+from tensorrt_llm.plugin.plugin import PluginConfig
+from transformers import AutoTokenizer
+
+# Model configuration
+MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
+MODEL_DIR = f"/persistent-storage/models/{MODEL_ID}"
+
+
+class TritonPythonModel:
+    """
+    Triton Python Backend model for TensorRT-LLM inference.
+    
+    This class handles model initialization, inference requests, and cleanup.
+    """
+    
+    def initialize(self, args):
+        """
+        Initialize the model using TensorRT-LLM's PyTorch backend.
+        
+        This method is called once when the model is loaded. It:
+        1. Loads the tokenizer from HuggingFace
+        2. Initializes TensorRT-LLM with PyTorch backend (loads model directly)
+        
+        Args:
+            args: Dictionary containing model configuration from Triton
+        """
+        print("Loading tokenizer...")
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+        
+        print("Initializing TensorRT-LLM with PyTorch backend...")
+
+        
+        plugin_config = PluginConfig.from_dict({
+            "paged_kv_cache": True,  # Efficient memory usage for KV cache
+        })
+        
+        # Configure build parameters
+        build_config = BuildConfig(
+            plugin_config=plugin_config,
+            max_input_len=4096,      # Maximum input sequence length
+            max_batch_size=1,         # Batch size per request
+        )
+        
+        self.llm = LLM(
+            model=MODEL_DIR,  # HuggingFace model path
+            build_config=build_config,
+            tensor_parallel_size=torch.cuda.device_count(),
+        )
+        print("✓ Model ready")
+    
+    def execute(self, requests):
+        """
+        Execute inference requests.
+        
+        Processes one or more inference requests, generating text responses
+        using the TensorRT-LLM model.
+        
+        Args:
+            requests: List of InferenceRequest objects from Triton
+            
+        Returns:
+            List of InferenceResponse objects with generated text
+        """
+        responses = []
+        
+        for request in requests:
+            try:
+                # Extract input text
+                input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+                text = input_tensor.as_numpy()[0].decode('utf-8')
+                
+                # Extract optional parameters (with defaults)
+                max_tokens = 1024
+                temperature = 0.8
+                top_p = 0.95
+                
+                max_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "max_tokens")
+                if max_tokens_tensor is not None:
+                    max_tokens = int(max_tokens_tensor.as_numpy()[0])
+                
+                temp_tensor = pb_utils.get_input_tensor_by_name(request, "temperature")
+                if temp_tensor is not None:
+                    temperature = float(temp_tensor.as_numpy()[0])
+                
+                top_p_tensor = pb_utils.get_input_tensor_by_name(request, "top_p")
+                if top_p_tensor is not None:
+                    top_p = float(top_p_tensor.as_numpy()[0])
+                
+                # Format prompt using Llama chat template
+                messages = [{"role": "user", "content": text}]
+                prompt = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True
+                )
+                
+                # Configure sampling parameters
+                sampling_params = SamplingParams(
+                    temperature=temperature,
+                    top_p=top_p,
+                    max_tokens=max_tokens,
+                )
+                
+                # Generate text
+                output = self.llm.generate(prompt, sampling_params)
+                generated_text = output.outputs[0].text
+                
+                # Create response tensor
+                output_tensor = pb_utils.Tensor(
+                    "text_output",
+                    np.array([generated_text.encode('utf-8')], dtype=object)
+                )
+                
+                # Create inference response
+                inference_response = pb_utils.InferenceResponse(
+                    output_tensors=[output_tensor]
+                )
+                responses.append(inference_response)
+                
+            except Exception as e:
+                # Handle errors gracefully
+                print(f"Error processing request: {e}")
+                error_response = pb_utils.InferenceResponse(
+                    output_tensors=[],
+                    error=pb_utils.TritonError(f"Error: {str(e)}")
+                )
+                responses.append(error_response)
+        
+        return responses
+    
+    def finalize(self):
+        """
+        Cleanup when model is being unloaded.
+        
+        Shuts down the TensorRT-LLM engine and clears GPU memory.
+        """
+        if hasattr(self, 'llm'):
+            self.llm.shutdown()
+            torch.cuda.empty_cache()
+
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh
new file mode 100644
index 00000000..ae388189
--- /dev/null
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+# Download model if not already present
+echo "Checking for model..."
+python3 /app/download_model.py
+
+# Start Triton Inference Server
+echo "Starting Triton Inference Server..."
+exec tritonserver \
+    --model-repository=/app/model_repository \
+    --http-port=8000 \
+    --grpc-port=8001 \
+    --metrics-port=8002
\ No newline at end of file

From a99d36be649848438948fd8325f75889e0a02688 Mon Sep 17 00:00:00 2001
From: ajaykrish2303 <ag8172@nyu.edu>
Date: Wed, 19 Nov 2025 18:18:13 -0500
Subject: [PATCH 2/3] Enable batching and update execute() logic

---
 .../cerebrium.toml                            |  10 +-
 .../config.pbtxt                              |   2 +-
 .../model.py                                  | 212 ++++++++++--------
 3 files changed, 130 insertions(+), 94 deletions(-)

diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
index 358aa16d..46d175ca 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
@@ -4,7 +4,7 @@ python_version = "3.12"
 disable_auth = true
 include = ['./*', 'cerebrium.toml']
 exclude = ['.*']
-deployment_initialization_timeout = 830
+deployment_initialization_timeout = 830 
 
 [cerebrium.hardware]
 cpu = 4.0
@@ -15,10 +15,10 @@ provider = "aws"
 region = "us-east-1"
 
 [cerebrium.scaling]
-min_replicas = 0
-max_replicas = 2
-cooldown = 60
-replica_concurrency = 5
+min_replicas = 2          
+max_replicas = 5        
+cooldown = 300            
+replica_concurrency = 10  
 scaling_metric = "concurrency_utilization"
 
 [cerebrium.runtime.custom]
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
index bdec6a4a..bc3d5864 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
@@ -1,6 +1,6 @@
 name: "llama3_2"
 backend: "python"
-max_batch_size: 0
+max_batch_size: 32
 
 instance_group [
   {
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
index 3b7e3473..e9c97bc1 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
+++ b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
@@ -26,34 +26,27 @@ class TritonPythonModel:
     
     def initialize(self, args):
         """
-        Initialize the model using TensorRT-LLM's PyTorch backend.
+        Initialize the model - called once when Triton loads the model.
         
-        This method is called once when the model is loaded. It:
-        1. Loads the tokenizer from HuggingFace
-        2. Initializes TensorRT-LLM with PyTorch backend (loads model directly)
-        
-        Args:
-            args: Dictionary containing model configuration from Triton
+        Loads tokenizer and initializes TensorRT-LLM with PyTorch backend.
         """
         print("Loading tokenizer...")
         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
         
-        print("Initializing TensorRT-LLM with PyTorch backend...")
-
+        print("Initializing TensorRT-LLM...")
         
         plugin_config = PluginConfig.from_dict({
             "paged_kv_cache": True,  # Efficient memory usage for KV cache
         })
         
-        # Configure build parameters
         build_config = BuildConfig(
             plugin_config=plugin_config,
-            max_input_len=4096,      # Maximum input sequence length
-            max_batch_size=1,         # Batch size per request
+            max_input_len=4096,
+            max_batch_size=32,  # Matches Triton max_batch_size in config.pbtxt
         )
         
         self.llm = LLM(
-            model=MODEL_DIR,  # HuggingFace model path
+            model=MODEL_DIR,
             build_config=build_config,
             tensor_parallel_size=torch.cuda.device_count(),
         )
@@ -61,83 +54,127 @@ def initialize(self, args):
     
     def execute(self, requests):
         """
-        Execute inference requests.
-        
-        Processes one or more inference requests, generating text responses
-        using the TensorRT-LLM model.
+        Execute inference on batched requests.
         
-        Args:
-            requests: List of InferenceRequest objects from Triton
-            
-        Returns:
-            List of InferenceResponse objects with generated text
+        Triton automatically batches requests (up to max_batch_size: 32).
+        This function processes the batch that Triton provides.
         """
-        responses = []
-        
-        for request in requests:
-            try:
-                # Extract input text
-                input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
-                text = input_tensor.as_numpy()[0].decode('utf-8')
-                
-                # Extract optional parameters (with defaults)
-                max_tokens = 1024
-                temperature = 0.8
-                top_p = 0.95
-                
-                max_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "max_tokens")
-                if max_tokens_tensor is not None:
-                    max_tokens = int(max_tokens_tensor.as_numpy()[0])
-                
-                temp_tensor = pb_utils.get_input_tensor_by_name(request, "temperature")
-                if temp_tensor is not None:
-                    temperature = float(temp_tensor.as_numpy()[0])
-                
-                top_p_tensor = pb_utils.get_input_tensor_by_name(request, "top_p")
-                if top_p_tensor is not None:
-                    top_p = float(top_p_tensor.as_numpy()[0])
-                
-                # Format prompt using Llama chat template
-                messages = [{"role": "user", "content": text}]
-                prompt = self.tokenizer.apply_chat_template(
-                    messages,
-                    tokenize=False,
-                    add_generation_prompt=True
-                )
-                
-                # Configure sampling parameters
-                sampling_params = SamplingParams(
-                    temperature=temperature,
-                    top_p=top_p,
-                    max_tokens=max_tokens,
-                )
-                
-                # Generate text
-                output = self.llm.generate(prompt, sampling_params)
-                generated_text = output.outputs[0].text
-                
-                # Create response tensor
-                output_tensor = pb_utils.Tensor(
-                    "text_output",
-                    np.array([generated_text.encode('utf-8')], dtype=object)
-                )
-                
-                # Create inference response
-                inference_response = pb_utils.InferenceResponse(
-                    output_tensors=[output_tensor]
-                )
-                responses.append(inference_response)
-                
-            except Exception as e:
-                # Handle errors gracefully
-                print(f"Error processing request: {e}")
-                error_response = pb_utils.InferenceResponse(
-                    output_tensors=[],
-                    error=pb_utils.TritonError(f"Error: {str(e)}")
+        # Flow: validate each request, run one generate() call over the valid
+        # prompts, then map the outputs back to their requests. Failures are
+        # reported as TritonError responses instead of text in "text_output".
+        import traceback
+
+        def _optional_scalar(request, name, cast, default):
+            """Return optional input `name` as a cast scalar, or `default` if absent."""
+            tensor = pb_utils.get_input_tensor_by_name(request, name)
+            if tensor is None:
+                return default
+            values = tensor.as_numpy()
+            return cast(values.item() if values.ndim == 0 else values.flat[0])
+
+        def _error_response(message):
+            """Build a failed response carrying `message` as a TritonError."""
+            return pb_utils.InferenceResponse(
+                output_tensors=[],
+                error=pb_utils.TritonError(message),
+            )
+
+        try:
+            # One slot per request keeps responses aligned with `requests`.
+            responses = [None] * len(requests)
+            prompts = []
+            sampling_params_list = []
+            # owners[k] is the index in `requests` that prompts[k] belongs to.
+            owners = []
+
+            # Extract data from each request in the batch
+            for idx, request in enumerate(requests):
+                try:
+                    input_tensor = pb_utils.get_input_tensor_by_name(request, "text_input")
+                    text_array = input_tensor.as_numpy()
+
+                    # NOTE(review): only the first element is read, so extra rows
+                    # inside a single request are ignored - confirm that clients
+                    # always send exactly one prompt per request.
+                    text = text_array.item() if text_array.ndim == 0 else text_array.flat[0]
+
+                    # Decode if bytes, otherwise use as string
+                    if isinstance(text, bytes):
+                        text = text.decode('utf-8')
+                    elif isinstance(text, np.str_):
+                        text = str(text)
+
+                    # Optional parameters keep their previous defaults
+                    max_tokens = _optional_scalar(request, "max_tokens", int, 1024)
+                    temperature = _optional_scalar(request, "temperature", float, 0.8)
+                    top_p = _optional_scalar(request, "top_p", float, 0.95)
+
+                    # Format prompt using chat template
+                    prompt = self.tokenizer.apply_chat_template(
+                        [{"role": "user", "content": text}],
+                        tokenize=False,
+                        add_generation_prompt=True
+                    )
+                    params = SamplingParams(
+                        temperature=temperature,
+                        top_p=top_p,
+                        max_tokens=max_tokens,
+                    )
+                except Exception as e:
+                    # A bad request fails on its own and is never sent to the
+                    # model, so no tokens are generated for it.
+                    print(f"Error processing request: {e}", flush=True)
+                    traceback.print_exc()
+                    responses[idx] = _error_response(f"Error: {e}")
+                    continue
+
+                prompts.append(prompt)
+                sampling_params_list.append(params)
+                owners.append(idx)
+
+            # Batch inference over the valid prompts only
+            outputs = []
+            if prompts:
+                outputs = self.llm.generate(prompts, sampling_params_list)
+
+            # Create responses
+            for idx, prompt, output in zip(owners, prompts, outputs):
+                try:
+                    generated_text = output.outputs[0].text
+
+                    # Remove the prompt from generated text if it's included
+                    if prompt and prompt in generated_text:
+                        generated_text = generated_text.replace(prompt, "").strip()
+
+                    responses[idx] = pb_utils.InferenceResponse(
+                        output_tensors=[pb_utils.Tensor(
+                            "text_output",
+                            np.array([generated_text.encode('utf-8')], dtype=object)
+                        )]
+                    )
+                except Exception as e:
+                    print(f"Error creating response {idx}: {e}", flush=True)
+                    responses[idx] = _error_response(f"Error: {e}")
+
+            # Defensive: a slot left empty means generate() returned fewer outputs
+            # than prompts; report that instead of handing Triton a None.
+            for idx, response in enumerate(responses):
+                if response is None:
+                    responses[idx] = _error_response("Error: no output generated")
+
+            return responses
+
+        except Exception as e:
+            print(f"Error in execute: {e}", flush=True)
+            traceback.print_exc()
+            # Fail every request in the batch with a real Triton error
+            return [
+                pb_utils.InferenceResponse(
+                    output_tensors=[],
+                    error=pb_utils.TritonError(f"Batch error: {e}"),
                 )
-                responses.append(error_response)
-        
-        return responses
+                for _ in requests
+            ]
     
     def finalize(self):
         """
@@ -148,4 +185,3 @@ def finalize(self):
         if hasattr(self, 'llm'):
             self.llm.shutdown()
             torch.cuda.empty_cache()
-

From 40ebd90993d729fd895dc1169893f84ef18d0e2e Mon Sep 17 00:00:00 2001
From: ajaykrish2303 <ag8172@nyu.edu>
Date: Sun, 23 Nov 2025 23:45:22 -0500
Subject: [PATCH 3/3] Move example to 5-large-language-models; download model in initialize()

---
 .../start_triton.sh                           | 14 ----------
 .../Dockerfile                                |  6 ++---
 .../cerebrium.toml                            | 10 ++++---
 .../config.pbtxt                              |  8 ++++--
 .../download_model.py                         |  0
 .../model.py                                  | 26 +++++++++++++++++--
 6 files changed, 40 insertions(+), 24 deletions(-)
 delete mode 100644 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh
 rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/Dockerfile (81%)
 rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/cerebrium.toml (77%)
 rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/config.pbtxt (87%)
 rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/download_model.py (100%)
 rename {2-advanced-concepts/6-faster-inference-with-triton-tensorrt => 5-large-language-models/8-faster-inference-with-triton-tensorrt}/model.py (90%)

diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh b/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh
deleted file mode 100644
index ae388189..00000000
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/start_triton.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-
-# Download model if not already present
-echo "Checking for model..."
-python3 /app/download_model.py
-
-# Start Triton Inference Server
-echo "Starting Triton Inference Server..."
-exec tritonserver \
-    --model-repository=/app/model_repository \
-    --http-port=8000 \
-    --grpc-port=8001 \
-    --metrics-port=8002
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile b/5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile
similarity index 81%
rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile
rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile
index 10cd6fef..9365a425 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/Dockerfile
+++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/Dockerfile
@@ -28,12 +28,12 @@ RUN mkdir -p \
     /persistent-storage/engines
 
 # Copy application files
-COPY --chmod=755 download_model.py start_triton.sh /app/
+COPY --chmod=755 download_model.py /app/
 COPY model.py /app/model_repository/llama3_2/1/
 COPY config.pbtxt /app/model_repository/llama3_2/
 
 # Expose Triton ports
 EXPOSE 8000 8001 8002
 
-# Start Triton server
-CMD ["/app/start_triton.sh"]
\ No newline at end of file
+# Start Triton server directly
+CMD ["tritonserver", "--model-repository=/app/model_repository", "--http-port=8000", "--grpc-port=8001", "--metrics-port=8002"]
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml b/5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml
similarity index 77%
rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml
index 46d175ca..58155a8e 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/cerebrium.toml
+++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/cerebrium.toml
@@ -15,12 +15,16 @@ provider = "aws"
 region = "us-east-1"
 
 [cerebrium.scaling]
-min_replicas = 2          
-max_replicas = 5        
+min_replicas = 1        
+max_replicas = 5         
 cooldown = 300            
-replica_concurrency = 10  
+replica_concurrency = 128  
 scaling_metric = "concurrency_utilization"
 
+[cerebrium.dependencies.pip]
+huggingface_hub = "latest"
+transformers = "latest"
+
 [cerebrium.runtime.custom]
 port = 8000
 healthcheck_endpoint = "/v2/health/live"
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt b/5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt
similarity index 87%
rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt
index bc3d5864..0de64ed4 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/config.pbtxt
+++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/config.pbtxt
@@ -1,6 +1,10 @@
 name: "llama3_2"
 backend: "python"
-max_batch_size: 32
+max_batch_size: 128
+
+dynamic_batching {
+  max_queue_delay_microseconds: 800
+}
 
 instance_group [
   {
@@ -41,4 +45,4 @@ output [
     data_type: TYPE_STRING
     dims: [ 1 ]
   }
-]
+]
\ No newline at end of file
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py b/5-large-language-models/8-faster-inference-with-triton-tensorrt/download_model.py
similarity index 100%
rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/download_model.py
rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/download_model.py
diff --git a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py b/5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py
similarity index 90%
rename from 2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
rename to 5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py
index e9c97bc1..baba21c9 100644
--- a/2-advanced-concepts/6-faster-inference-with-triton-tensorrt/model.py
+++ b/5-large-language-models/8-faster-inference-with-triton-tensorrt/model.py
@@ -11,12 +11,31 @@
 from tensorrt_llm import LLM, SamplingParams, BuildConfig
 from tensorrt_llm.plugin.plugin import PluginConfig
 from transformers import AutoTokenizer
+from pathlib import Path
 
 # Model configuration
 MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
 MODEL_DIR = f"/persistent-storage/models/{MODEL_ID}"
 
 
+def ensure_model_downloaded():
+    """Download the model into MODEL_DIR if that directory is missing or empty.
+
+    Raises RuntimeError if MODEL_DIR is still empty afterwards, so a download
+    that returned without fetching anything is reported here.
+    """
+    model_path = Path(MODEL_DIR)
+    if model_path.exists() and any(model_path.iterdir()):
+        print("✓ Model already exists")
+        return
+
+    print("Model not found, downloading...")
+    from download_model import download_model  # imported only when a download is needed
+    download_model()
+    if not (model_path.exists() and any(model_path.iterdir())):
+        raise RuntimeError(f"Model download left {MODEL_DIR} empty; is HF_AUTH_TOKEN set?")
+
+
 class TritonPythonModel:
     """
     Triton Python Backend model for TensorRT-LLM inference.
@@ -30,6 +49,9 @@ def initialize(self, args):
         
         Loads tokenizer and initializes TensorRT-LLM with PyTorch backend.
         """
+        # Ensure model is downloaded before loading
+        ensure_model_downloaded()
+        
         print("Loading tokenizer...")
         self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
         
@@ -42,7 +64,7 @@ def initialize(self, args):
         build_config = BuildConfig(
             plugin_config=plugin_config,
             max_input_len=4096,
-            max_batch_size=32,  # Matches Triton max_batch_size in config.pbtxt
+            max_batch_size=128,  # Matches Triton max_batch_size in config.pbtxt
         )
         
         self.llm = LLM(
@@ -184,4 +206,4 @@ def finalize(self):
         """
         if hasattr(self, 'llm'):
             self.llm.shutdown()
-            torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
\ No newline at end of file