<a href="https://colab.research.google.com/github/weedge/doraemon-nb/blob/main/Mistral_Engine_TensorRT_LLM_Builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

💡

https://pypi.org/project/tensorrt-llm/

`tensorrt_llm` does not support Python 3.11; it supports Python 3.10 and 3.12, so the Python version has to be switched to 3.10 or 3.12.

# use python3.10

In [None]:
# List every /usr/bin entry whose name starts with "python3." to see which interpreters are installed.
!ls /usr/bin/python3.*

/usr/bin/python3.10  /usr/bin/python3.10-config  /usr/bin/python3.11  /usr/bin/python3.11-config


In [None]:
# Register both interpreters as alternatives for /usr/bin/python3.
# python3.10 is given the higher priority (2 vs. 1), so it is the one selected in auto mode.
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2


update-alternatives: using /usr/bin/python3.10 to provide /usr/bin/python3 (python3) in auto mode


In [None]:
# Print the version of the interpreter that `python` resolves to on PATH.
# NOTE(review): the alternatives were registered for /usr/bin/python3, not `python` — confirm this reports 3.10.
!python --version

In [None]:
# Install the build-essential, python3-dev and python3-pip packages through apt (-y answers prompts automatically).
!apt-get install -y build-essential python3-dev python3-pip

# run

In [None]:
# Clone the NVIDIA TensorRT-LLM repository, then move into its llama example directory.
# The following cells use paths relative to this directory (requirements.txt, convert_checkpoint.py, ./tmp/...).

!git clone https://github.com/NVIDIA/TensorRT-LLM.git
%cd TensorRT-LLM/examples/llama

In [None]:
# Print the `python` version again before installing packages with pip.
!python --version

In [None]:
# Install the Python dependencies needed by the remaining cells.

# tensorrt_llm: pre-release versions are allowed (--pre) and https://pypi.nvidia.com is added as an extra index.
!pip install -q tensorrt_llm --pre --extra-index-url https://pypi.nvidia.com
# Additional packages; none of them is version-pinned.
!pip install huggingface_hub pynvml mpi4py
# requirements.txt is resolved relative to the current working directory.
!pip install -q -r requirements.txt

In [None]:
# Fetch the Mistral 7B Instruct v0.2 snapshot from the Hugging Face Hub into a local folder.

from huggingface_hub import snapshot_download
from google.colab import userdata  # not used in this cell; the upload cell reads the write token through it


# The target folder is relative to the current working directory; up to 4 workers download in parallel.
snapshot_download(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    max_workers=4,
    local_dir="tmp/hf_models/mistral-7b-instruct-v0.2",
)

In [None]:
# Convert the raw model weights into the TensorRT-LLM checkpoint format.
# Input:  ./tmp/hf_models/mistral-7b-instruct-v0.2 (the folder the snapshot was downloaded to)
# Output: ./tmp/trt_engines/1-gpu/, requested with --dtype float16
# convert_checkpoint.py is resolved relative to the current working directory.

!python convert_checkpoint.py --model_dir ./tmp/hf_models/mistral-7b-instruct-v0.2 \
                             --output_dir ./tmp/trt_engines/1-gpu/ \
                             --dtype float16

In [None]:
# Compile the model: build an engine from the converted checkpoint.
# Input:  ./tmp/trt_engines/1-gpu/ (the output directory of convert_checkpoint.py)
# Output: ./tmp/trt_engines/compiled-model/
# Both plugin flags are given float16, the same dtype passed to convert_checkpoint.py.
# NOTE(review): 32256 for --max_input_len is a hard-coded limit — confirm it is the intended maximum prompt length.

!trtllm-build --checkpoint_dir ./tmp/trt_engines/1-gpu/ \
            --output_dir ./tmp/trt_engines/compiled-model/ \
            --gpt_attention_plugin float16 \
            --gemm_plugin float16 \
            --max_input_len 32256


In [None]:
# Upload the compiled model to the Hugging Face Hub.
#
# Walks tmp/trt_engines/compiled-model and uploads every file it finds to the target repo.
# Relies on `userdata` (google.colab), which is imported in the weight-download cell,
# and on a Colab secret named HF_WRITE_TOKEN.

import os
from huggingface_hub import HfApi

# Create the client once: neither the token lookup nor the client depends on the file being uploaded.
api = HfApi(token=userdata.get('HF_WRITE_TOKEN'))

for root, dirs, files in os.walk("tmp/trt_engines/compiled-model", topdown=False):
    for name in files:
        filepath = os.path.join(root, name)
        # Keep only the last two path components (parent folder + file name) as the path inside the repo.
        filename = "/".join(filepath.split("/")[-2:])
        print("uploading file: ", filename)
        api.upload_file(
            path_or_fileobj=filepath,
            path_in_repo=filename,
            repo_id="htrivedi99/mistral-7b-v0.2-trtllm"
        )

In [None]:
# Move up one directory and print the working directory to confirm the new location.
# The run.py invocation that follows uses paths relative to this directory (./llama/...).
%cd ..
!pwd

/content/TensorRT-LLM/examples
/content/TensorRT-LLM/examples


In [None]:
# Test the compiled model by running run.py against the built engine.
# The tokenizer is loaded from the downloaded Hugging Face snapshot; the engine comes from the
# trtllm-build output directory. Both paths are relative to the current working directory.
# NOTE(review): 256 (--max_output_len) and 4096 (--max_attention_window_size) are hard-coded — confirm they are the intended limits.

!python3 run.py --max_output_len=256 \
               --tokenizer_dir ./llama/tmp/hf_models/mistral-7b-instruct-v0.2/ \
               --engine_dir=./llama/tmp/trt_engines/compiled-model \
               --max_attention_window_size=4096