In [None]:
# Clear GPU memory and prevent fragmentation issues in one cell

import torch
import gc
import os

# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear any existing CUDA cache
torch.cuda.empty_cache()

# Perform garbage collection to free up unreferenced memory
gc.collect()

print("GPU memory cache cleared and memory fragmentation settings applied.")



This cell will:

1. Set the environment variable to help prevent memory fragmentation. 
2. Clear the PyTorch CUDA cache. 
3. Run garbage collection to free up additional memory. 

You can run this cell anytime you need to clear GPU memory before running your model export or inference tasks. 

## Step-by-Step Process to Export and Save the ONNX Model with External Data to a Specified Folder
### Step 1: Create Output Directory
Define the output directory where all generated files will be saved.

In [1]:

import os

# Define the directory to save ONNX and related files
output_dir = "model_export"
os.makedirs(output_dir, exist_ok=True)


### Step 2: Load the Model and Prepare for Export
Here, you’ll load the model and make any required configurations, such as disabling FlashAttention and converting to FP16.

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model with device_map="auto" for automatic model parallelism
model_name = "microsoft/Phi-3.5-vision-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    trust_remote_code=True, 
    device_map="auto"  # Automatically distribute layers across GPUs
)

# Convert model to half-precision (FP16)
model = model.half()

# Example inference
dummy_input = tokenizer("Hello, world!", return_tensors="pt").input_ids.to(model.hf_device_map[0])  # Send to the first device in map
output = model(dummy_input)
print("Output:", output)


  from .autonotebook import tqdm as notebook_tqdm
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.72s/it]


KeyError: 0

### Step 3: Export the Model to ONNX with External Data to the Output Directory
The ONNX export will automatically generate external data files if the model is large. Specify the path for the main ONNX file in the output_dir so all related files are saved in that directory.

In [None]:
import os

# Define the path for the ONNX file in the output directory
onnx_model_path = os.path.join(output_dir, "phi_3.5_vision.onnx")

# Export to ONNX format with potential for external data files
torch.onnx.export(
    model,
    dummy_input,
    onnx_model_path,
    input_names=["input_ids"],
    output_names=["output"],
    dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence"}},
    opset_version=13,
    export_params=True,  # Saves parameters in the model file if possible
    keep_initializers_as_inputs=True  # Keeps initializers within the ONNX model graph
)

print(f"Model exported to ONNX format in {output_dir} successfully!")


### Step 4: Quantize the ONNX Model to INT8 and Save to the Same Folder
Using ONNX Runtime, quantize the model to INT8 precision. Save the quantized model in the same directory

In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Define the path for the quantized ONNX model
quantized_model_path = os.path.join(output_dir, "phi_3.5_vision_quantized.onnx")

# Quantize to INT8
quantize_dynamic(
    onnx_model_path,
    quantized_model_path,
    op_types_to_quantize=["MatMul"],
    weight_type=QuantType.QInt8
)

print(f"Quantized model saved to {quantized_model_path}")


### Step 5: Convert the Quantized ONNX Model to TensorRT and Save to the Output Directory
For TensorRT conversion, use trtexec to optimize and convert the quantized ONNX model to a TensorRT engine. This step requires running a shell command, so here’s how to do it in Jupyter or a Python script.

In [None]:
# Define the path for the TensorRT engine file
tensorrt_model_path = os.path.join(output_dir, "phi_3.5_vision_quantized.trt")

# Run trtexec to convert the model to TensorRT
!trtexec --onnx={quantized_model_path} --saveEngine={tensorrt_model_path} --int8

print(f"TensorRT engine file saved to {tensorrt_model_path}")


### Step 6: Verify All Files Are in the Specified Output Directory
After running the notebook, the model_export directory should contain:

phi_3.5_vision.onnx: The exported ONNX model.
phi_3.5_vision.onnx.data_0 (and other .data files, if applicable): External data files for model weights and biases.
phi_3.5_vision_quantized.onnx: The INT8 quantized ONNX model.
phi_3.5_vision_quantized.trt: The TensorRT engine file for deployment on NVIDIA devices.
### Step 7: Transfer and Run on Jetson Orin
Refer to the instructions for transferring and running the TensorRT model on Jetson Orin provided in the previous messages. Ensure the phi_3.5_vision_quantized.trt file is transferred to the Jetson device, as it contains the fully optimized and quantized model for efficient inference.

## Summary
**Export Directory**: All files are saved in the `model_export` directory.  
**ONNX Export**: The original ONNX model and external data files are saved.  
**Quantization**: The INT8 model is saved in the same directory.  
**TensorRT Engine**: The final optimized TensorRT engine file is saved for deployment on Jetson Orin.
