# AIMET Quantization workflow for Gemma3-4B Vision Encoder

This notebook shows a working code example of how to use AIMET to quantize the Gemma3-4B Vision Encoder.

---
### Required packages
The notebook assumes AIMET and Gemma3 related packages are already installed.

In [None]:
# Abort right away when this code runs under any module name other than '__main__'.
# Per the message below, this targets the multiprocessing spawns started by the
# Converter during model preparation, which would otherwise re-run the notebook body.
if __name__ != '__main__':
    raise Exception("Killing multiprocessing spawn started by Converter during model preparation.")

In [None]:
# Install packages only if running in jupyter notebook mode
# (detected through the `__IPYTHON__` attribute on `__builtins__`).
if hasattr(__builtins__,'__IPYTHON__'):
    # System package: libc++ development files.
    !sudo -H apt-get -qq update
    !sudo -H apt-get -qq install libc++-dev
    # Hugging Face packages, pinned to exact versions.
    !sudo -H pip install --quiet --upgrade --root-user-action=ignore --no-cache-dir transformers==4.50.0
    !sudo -H pip install --quiet --upgrade --root-user-action=ignore --no-cache-dir tokenizers==0.21.4

### Overall flow
This notebook covers the following
1. Setting QNN SDK and NSP target
2. Instantiate HuggingFace model and Dataloader
3. Model adaptation
4. Prepare model using QAIRT model preparer pro
5. Quantization 
6. Export

### 1.1 Setting QNN SDK

In [None]:
import sys
import os
import copy

# Root directory of the QNN SDK installation; edit this to match your environment.
QNN_SDK_ROOT = "/tmp/qnn"
assert QNN_SDK_ROOT is not None, 'Please point the QNN_SDK_ROOT variable to your QNN SDK'
assert os.path.exists(QNN_SDK_ROOT), "QNN_SDK_ROOT doesn't exist!"

# Put the SDK's Python packages at the front of the import path.
sys.path.insert(0, QNN_SDK_ROOT + '/lib/python')

# Prepend the SDK's x86_64 clang library directory to LD_LIBRARY_PATH,
# keeping whatever the variable already contained.
lib_clang_path = os.path.join(QNN_SDK_ROOT, 'lib', 'x86_64-linux-clang')
LD_LIBRARY_PATH = os.getenv('LD_LIBRARY_PATH', None)
if LD_LIBRARY_PATH is None:
    os.environ['LD_LIBRARY_PATH'] = lib_clang_path
else:
    os.environ['LD_LIBRARY_PATH'] = ':'.join((lib_clang_path, LD_LIBRARY_PATH))


### 1.2 Setting NSP Target

In [None]:
# Make modules located in the current working directory importable.
sys.path.append('./')
# Select quantsim config based on target
htp_config_file = 'htp_quantsim_config_v81.json'

### 2. Instantiate HuggingFace model and Dataloader

In [None]:
import torch
from genai_lib.common.debug.recipe_logger import recipe_dump_init
from genai_lib.common.debug.recipe_logger import llm_lib_log_env_info
from transformers import AutoConfig, AutoTokenizer, AutoProcessor, AutoImageProcessor

#======================Configurable setting by users================================
# When True, the notebook compares outputs between pipeline stages (SQNR) after each step.
run_sqnr_eval = True

model_name = 'siglip'
model_id="google/gemma-3-4b-it"

cache_dir='/tmp/cache_dir'
output_dir = '/tmp/output_dir'  # point to where the export artifacts of this notebook to be saved

llm_config = AutoConfig.from_pretrained(model_id, cache_dir=cache_dir, trust_remote_code=True)

# To help with debugging num_hidden_layers could be set to 2 to quickly verify the pipeline and export a two layer model for verification purposes
# A value of 0 (the default when NUM_HIDDEN_LAYERS is unset) keeps the checkpoint's own depth.
num_hidden_layers = int(os.getenv("NUM_HIDDEN_LAYERS", 0))

# Remember the checkpoint's depth BEFORE overriding it; comparing against the
# already-overridden value would make the warning below impossible to trigger.
original_num_hidden_layers = llm_config.vision_config.num_hidden_layers
if num_hidden_layers > 0:
    llm_config.vision_config.num_hidden_layers = num_hidden_layers

if num_hidden_layers > original_num_hidden_layers:
    print("Setting num_hidden_layer greater than original model weight will result in randomized weight in the model!")

# Run the whole flow in float32, for both the top-level config and the vision config.
dtype = torch.float32
llm_config.torch_dtype = llm_config.vision_config.torch_dtype = dtype
llm_config.vision_config.vision_use_head = False

print('num_layer: {}'.format(llm_config.vision_config.num_hidden_layers))

In [None]:
# Recipe_logger: Initialize the logger and log environment details 
# The output directory is created first because it is handed to the logger.
os.makedirs(output_dir, exist_ok=True)
recipe_dump_init(output_dir)

# Record details of the current environment (see `llm_lib_log_env_info` in genai_lib).
llm_lib_log_env_info()

#### 2.1 Adapt Gemma3 multi-modal projector

The Gemma3 projector in Hugging Face uses MatMul and nn.Parameter for the projection layer; here we replace them with a Conv2d layer.

In [None]:
from transformers.models.gemma3 import modeling_gemma3
from gemma3.adaptation import Gemma3MultiModalProjector

# Monkey-patch: rebind the projector class on the transformers module to the adapted
# implementation, so later lookups of `modeling_gemma3.Gemma3MultiModalProjector`
# resolve to it. NOTE(review): presumably this must run before the model is
# instantiated in the next section for the replacement to take effect - confirm.
modeling_gemma3.Gemma3MultiModalProjector = Gemma3MultiModalProjector

#### 2.2 Instantiate the HuggingFace model

In [None]:
from transformers.models.gemma3 import modeling_gemma3
from genai_lib.common.debug.profiler import event_marker

# Load the model, tokenizer and image processor inside one profiled event.
with event_marker('HuggingFace FP model creation'):
    # `llm_config` carries the overrides made earlier (layer count, dtype, vision_use_head).
    model = modeling_gemma3.Gemma3ForConditionalGeneration.from_pretrained(model_id, config=llm_config, cache_dir=cache_dir, trust_remote_code=True)

    # Set before the tokenizer is created.
    os.environ['TOKENIZERS_PARALLELISM'] = '0'
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir, use_fast=True, trust_remote_code=True)
    # Image-only processor; it is used by the calibration dataset to produce pixel values.
    processor = AutoImageProcessor.from_pretrained(model_id, cache_dir=cache_dir, trust_remote_code=True)

#### 2.3 Preprocess Calibration Dataset

In [None]:
from datasets import load_dataset
from PIL import Image as PILImage

# Placeholder: replace with the directory against which the dataset's relative
# image paths are resolved.
dataset_path = "<path to folder containing the coco dataset root folder>"
# JSON file listing the calibration samples; each record's 'image' field is joined to `dataset_path`.
data_files = "llm_utils/llava_dataset/llava_v1_5_mix665k_300.json"

class VisionDatasetLoader:
    """
    Calibration dataset that returns preprocessed image tensors.

    Each item is produced by opening the image referenced by the wrapped dataset's
    'image' column (resolved against the module-level `dataset_path`), running it
    through the image processor, and returning the first `pixel_values` entry with a
    leading batch dimension of 1.

    Args:
        dataset: indexable dataset exposing an 'image' column of relative image paths.
        processor: callable image processor whose result has a `pixel_values` attribute.
    """
    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Load and preprocess sample `idx`, returning a tensor of shape (1, C, H, W)."""
        image_path = os.path.join(dataset_path, self.dataset['image'][idx])
        # Use a context manager so the underlying file handle is released as soon as
        # preprocessing finishes, instead of leaking one open file per sample.
        with PILImage.open(fp=image_path) as image:
            pixel_values = self.processor(image).pixel_values[0]
        num_channels, height, width = pixel_values.shape
        return torch.tensor(pixel_values.reshape(1, num_channels, height, width))

# Build the calibration dataset from the JSON sample list (its 'train' split).
with event_marker("Load and Preprocess Calibration Data"):
    train_dataset = VisionDatasetLoader(load_dataset("json", data_files=data_files, cache_dir=cache_dir, split='train'), processor)

# The first calibration sample is reused throughout the notebook as the example
# input for SQNR checks, model preparation, QuantSim creation and export.
sample_inputs = train_dataset[0]

#### 2.4 Module for VEG (Vision Tower + Projector)

In [None]:
from aimet_torch.utils import place_model

class VisualEmbeddingGenerator(torch.nn.Module):
    """Wrap the vision tower and the multi-modal projector as one module.

    Args:
        vision_tower: module called with `pixel_values=...` whose result exposes
            `last_hidden_state`; must also expose a `device` attribute.
        multi_modal_projector: module applied to the tower's last hidden state.
    """

    def __init__(self, vision_tower, multi_modal_projector):
        super().__init__()
        # Submodules are registered projector first, then tower.
        self.multi_modal_projector = multi_modal_projector
        self.vision_tower = vision_tower
        # Device of the vision tower at construction time (not refreshed afterwards).
        self.device = vision_tower.device

    def forward(self, pixel_values):
        """Turn image pixel values into projected image features.

        `pixel_values` are the values produced by the processor for an image. The
        original notes give the input shape as [1,3,896,896] and the output shape as
        [1,256,2560] for this model.
        """
        hidden_states = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        return self.multi_modal_projector(hidden_states)

In [None]:
# Capture the reference output of the model BEFORE the adaptation step below
# modifies `model.vision_tower` and `model.multi_modal_projector`.
if run_sqnr_eval:
    veg_orig = VisualEmbeddingGenerator(model.vision_tower, model.multi_modal_projector)
    # Run on CUDA inside `place_model`, then bring the result back to CPU.
    with torch.no_grad(), place_model(veg_orig, torch.device('cuda')):
        fp_output = veg_orig(sample_inputs.to(device='cuda')).cpu()  
    # Only the wrapper is dropped; the wrapped submodules still belong to `model`.
    del veg_orig

### 3. Model Adaptation

In [None]:
from genai_lib.lvm.dev.transformer.siglip_adaptation import align_siglip_vision_model_tensor_dimensionlity

# Both adaptation calls operate on the submodules of `model` directly.
with event_marker('FP model adaptation for NSP backend completion'):
    # adaptation for siglip
    align_siglip_vision_model_tensor_dimensionlity(model.vision_tower)

    # adaptation for projector
    # (method provided by the adapted Gemma3MultiModalProjector patched in earlier)
    model.multi_modal_projector.replace_matmul_with_conv()

# Wrapper around the adapted tower and projector; this is the module that gets
# prepared and quantized in the following sections.
veg_adapted = VisualEmbeddingGenerator(model.vision_tower, model.multi_modal_projector)

#### 3.1 SQNR evaluation for adapted FP model

In [None]:
# SQNR of adapted model
from genai_lib.lvm.eval_utils import get_sqnr

# Compare the adapted model's output with the reference output captured before adaptation.
if run_sqnr_eval:
    with torch.no_grad(), place_model(veg_adapted, torch.device('cuda')):
        output_adapted = veg_adapted(sample_inputs.to(device='cuda')).cpu()

    fp_adapted_sqnr = get_sqnr(output_adapted, fp_output)
    # The label now names the two models actually compared here (original FP vs adapted);
    # it previously duplicated the "Adapted to prepared" label used in section 4.1.
    print(f'FP to adapted model SQNR: {fp_adapted_sqnr}')

### 4. Prepare model using QAIRT model preparer pro

In [None]:
import time
from qti.aisw.preparer_api import prepare_model
from qti.aisw.emitter.utils.torch_utils import load_torch_model_using_safetensors

from genai_lib.llm.model_preparation_utils import llm_build_preparer_converter_args
from genai_lib.llm.utils import llm_model_input_output_names


# Graph input/output names. They are defined before the skip/prepare branch because
# the test-vector and ONNX export cells use them in both cases.
input_names = ['pixel_values']
output_names = ['image_features']

# Set to True to load previously prepared artifacts instead of running the preparer again.
skip_prepare = False
if skip_prepare:
    prepare_path = "<path to prepared model dir>"
else:
    prepare_path = os.path.join(output_dir, 'prepare')
os.makedirs(prepare_path, exist_ok=True)
prepare_filename = f'{model_name}'

if skip_prepare:
    with event_marker(f"KVCache load pre-prepared {prepare_filename}", flush_ram=True):
        prepared_model_path = os.path.join(prepare_path, f'{prepare_filename}.py')
        if not os.path.exists(prepared_model_path):
            raise ValueError(f"prepared artifacts not found in {prepare_path}")
        print(f'WARNING: preparation skipped for model={prepare_filename}, prepared at {time.ctime(os.path.getmtime(prepared_model_path))}')
        # Bind the loaded model to `prepared_veg`, the name every later cell uses
        # (it was previously bound to `prepared_model`, causing a NameError downstream).
        prepared_veg = load_torch_model_using_safetensors(path=prepare_path, filename=prepare_filename, model_name=prepare_filename)

else:
    with event_marker("Prepare Model", flush_ram=True):
        # We use the main guard to prevent child processes from re-running the top-level code
        if __name__ != '__main__':
            raise Exception("Killing multiprocessing spawn started by Converter during model preparation.")
        prepared_veg = prepare_model(veg_adapted,
                                    sample_inputs.to(device=model.device),
                                    model_name=prepare_filename,
                                    filename=prepare_filename,
                                    path=prepare_path,
                                    input_names=input_names,
                                    output_names=output_names,
                                    onnx_export_args={"opset_version":17},
                                    # Flatten the model to enable weight-sharing by setting `keep_original_model_structure = False`
                                    keep_original_model_structure=False,
                                    order_inputs=True,
                                    order_outputs=True,
                                    skipped_optimizers=['eliminate_common_subexpression',
                                                        'eliminate_nop_with_unit',
                                                        'eliminate_duplicate_initializer'
                                                        ],
                                    return_prepare_model=True,
                                    )

#### 4.1 SQNR evaluation for prepared FP model

In [None]:
# Compare the prepared model's output with the adapted model's output.
# `output_adapted` comes from the section 3.1 cell, which runs under the same flag.
if run_sqnr_eval:
    with torch.no_grad(), place_model(prepared_veg, torch.device('cuda')):
        output_prepared = prepared_veg(sample_inputs.to(device='cuda')).cpu()  
    
    adapted_prepared_sqnr = get_sqnr(output_adapted, output_prepared)
    print(f'Adapted to prepared model SQNR: {adapted_prepared_sqnr}')

### 5. Quantization

In [None]:
from aimet_common.defs import QuantScheme
from aimet_torch.v2.quantsim import QuantizationSimModel
from aimet_torch.v2.experimental import set_matmul_second_input_producer_to_8bit_symmetric, propagate_output_encodings
from aimet_torch.nn.modules import custom as elementwise_ops
from aimet_torch.v2.experimental.quantsim_utils import clip_weights_to_7f7f
from tqdm import tqdm
from copy import deepcopy
import functools



def copy_model_with_shared_weights(source_model):
    """Deep-copy a module, then rebind the copy's parameters to the source's.

    The returned module has its own module objects, but every parameter reported by
    `source_model.named_parameters()` is the same object in both models.

    Args:
        source_model: the module to copy.

    Returns:
        The copied module, whose parameters are shared with `source_model`.
    """
    clone = deepcopy(source_model)
    for qualified_name, shared_param in source_model.named_parameters():
        # "a.b.weight" -> owner path ["a", "b"], attribute "weight";
        # a top-level parameter has an empty owner path.
        *owner_path, attr_name = qualified_name.split('.')
        owner = clone
        for child_name in owner_path:
            owner = getattr(owner, child_name)
        setattr(owner, attr_name, shared_param)
    return clone

# Create copy of fp model definition for SeqMSE and LoRA Flow
# The copy is made before QuantSim is created below with `in_place=True`.
fp_prepared_veg = copy_model_with_shared_weights(prepared_veg)

with event_marker('Create QuantSim'):
    # 16-bit default outputs and 8-bit default parameters, with the target-specific
    # quantsim config file selected in section 1.2.
    veg_sim = QuantizationSimModel(model=prepared_veg,
                                      quant_scheme='tf',
                                      dummy_input=sample_inputs.to(device=next(prepared_veg.parameters()).device),
                                      default_output_bw=16,
                                      default_param_bw=8,
                                      in_place=True,
                                      config_file=htp_config_file)

# Experimental AIMET helpers applied to the sim after creation.
# NOTE(review): going by their names, the first adjusts the quantizer feeding each
# MatMul's second input and the second propagates output encodings across Concat ops -
# confirm against the AIMET documentation.
set_matmul_second_input_producer_to_8bit_symmetric(veg_sim)
propagate_output_encodings(veg_sim, elementwise_ops.Concat)

#### 5.1 Calibration

In [None]:
# Number of calibration samples fed through the model when computing encodings.
num_calibration_batches = 50

def _forward_pass(_model, kwargs):
    """Calibration callback: run the first N dataset samples through `_model` on CUDA.

    Args:
        _model: the module to run.
        kwargs: dict with
            'dataset': iterable yielding input tensors, and
            'num_calibration_batches': number of samples to run (N).
    """
    from itertools import islice

    dataset = kwargs['dataset']
    num_calibration_batches = kwargs['num_calibration_batches']
    with torch.no_grad(), place_model(_model, torch.device('cuda')):
        # islice stops after exactly N samples. The previous `idx > N` check ran
        # N + 1 forward passes, and loaded one further sample before breaking.
        for inputs in tqdm(islice(dataset, num_calibration_batches), total=num_calibration_batches):
            _ = _model(inputs.to(device='cuda'))
            
# Arguments handed to `_forward_pass` through `compute_encodings`.
kwargs = { 
            'dataset': train_dataset,
            'num_calibration_batches': num_calibration_batches
         }
    
with event_marker(f"Compute Encodings"):
    # The sim model is placed on CUDA for the duration of calibration.
    with place_model(veg_sim.model, torch.device('cuda')):
        veg_sim.compute_encodings(_forward_pass, kwargs)

# Applied after the encodings have been computed.
# NOTE(review): presumably this clips weights to the 0x7f7f range, as the helper's
# name suggests - confirm against the AIMET documentation.
clip_weights_to_7f7f(veg_sim)

#### 5.2 SQNR evaluation for prepared QuantSim model

In [None]:
# Compare the QuantSim model's output with the prepared FP model's output.
# `output_prepared` comes from the section 4.1 cell, which runs under the same flag.
if run_sqnr_eval:
    with torch.no_grad(), place_model(veg_sim.model, torch.device("cuda")):
        sim_output = veg_sim.model(sample_inputs.to(device='cuda')).cpu()
    
    prepared_sim_sqnr = get_sqnr(output_prepared, sim_output)
    print(f'Prepared to sim model SQNR: {prepared_sim_sqnr}')

### 6. Export
The cells below export the ONNX model, the encodings, and the test vectors for the vision encoder.

#### 6.1 Generate Test Vectors for QNN SDK

In [None]:
from genai_lib.lvm.eval_utils import generate_vectors

# All export artifacts are written beneath <output_dir>/export.
base_output_dir = os.path.join(output_dir, 'export')
os.makedirs(base_output_dir, exist_ok=True)

with event_marker(f"Vector generation"):
    # The `.to(...)` call moves the sim model to CUDA with the notebook's dtype;
    # the export cell moves it back to CPU before exporting.
    generate_vectors(output_path = os.path.join(base_output_dir, 'test_vectors'),
                     sim_model = veg_sim.model.to(device='cuda',dtype=dtype),
                     sample_inputs = [sample_inputs.to(device='cuda')],
                     input_names = input_names,
                     output_names = output_names,
                     num_samples = 1)

#### 6.2 Model Export

In [None]:
from aimet_torch import onnx_utils
from aimet_torch.onnx_utils import OnnxExportApiArgs


# Setting this flag to False means that the prepared model will be flattened
# NOTE(review): the flag is set to True here, while the comment above describes the
# False case - confirm which value is intended.
onnx_utils.EXPORT_TO_ONNX_DIRECT = True

base_onnx_dir = os.path.join(base_output_dir, 'onnx')
os.makedirs(base_onnx_dir, exist_ok=True)
with event_marker('Export ONNX and Encodings'):
    # Same opset version (17) as the one passed to the preparer in section 4.
    onnx_api_args = OnnxExportApiArgs(opset_version=17, input_names=input_names, output_names=output_names)
    # onnx_utils.RESTORE_ONNX_MODEL_INITIALIZERS = True
    # Move the model to CPU first; `sample_inputs` is passed to the export as-is.
    veg_sim.model.cpu()
    veg_sim.export(path=base_onnx_dir, filename_prefix=model_name, dummy_input=sample_inputs, onnx_export_args=onnx_api_args)

#### 6.3 Save Quantsim Model

In [None]:
import pickle as pkl

# Increase recursion depth limit to save full model
sys.setrecursionlimit(100000)

# base_dir = os.path.join(output_dir, 'quantsim')
with event_marker("save quantsim model"), open(f"{output_dir}/{prepare_filename}.pkl", 'wb') as file:
    pkl.dump(veg_sim, file)

### Summary

In [None]:
from genai_lib.common.debug.profiler import EventProfiler
from genai_lib.common.debug.recipe_logger import dump_logs_to_json

# Report the profiler results, and also write them as JSON into the output directory.
EventProfiler().report()
EventProfiler().json_dump(os.path.join(output_dir, 'profiling_stats.json'))
# Write out the recipe logger's logs as JSON (the logger was initialised with `output_dir`).
dump_logs_to_json()

Copyright (c) 2024 Qualcomm Technologies, Inc. and/or its subsidiaries.