# Import Required Libraries
Import necessary libraries including numpy, onnxruntime, and transformers for tokenization.

In [22]:
# Import Required Libraries
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load Model and Tokenizer
Load the ONNX model using QNN execution provider and initialize the BERT tokenizer.

In [23]:
# Load Model and Tokenizer

# Load the model using QNN execution provider
session = ort.InferenceSession("model.onnx", providers=['QNNExecutionProvider'])

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Prepare Input Data
Tokenize input text and prepare input tensors in the correct format for the model.

In [24]:
# Prepare Input Data

# Input data
input_data = "hello, my dog is cute"

# Tokenize input data
inputs = tokenizer(input_data, return_tensors="np")

# Setup dimensions
batch_size = 1
sequence_length = inputs["input_ids"].shape[1]  # Length of your input sequence
num_heads = 8   # Number of attention heads
head_dim = 128  # Dimension per head
num_layers = 28 # Number of transformer layers
past_sequence_length = 0  # For first inference

# Prepare all required inputs
input_feed = {
    "input_ids": inputs["input_ids"],  # Your actual input data
    "attention_mask": inputs["attention_mask"],
}

# Add past key values for each layer
for i in range(num_layers):
    input_feed[f"past_key_values.{i}.key"] = np.zeros(
        (batch_size, num_heads, past_sequence_length, head_dim), 
        dtype=np.float32
    )
    input_feed[f"past_key_values.{i}.value"] = np.zeros(
        (batch_size, num_heads, past_sequence_length, head_dim), 
        dtype=np.float32
    )

# Run inference
outputs = session.run(None, input_feed)

# Make the output human readable and print it
for output in outputs:
    print(output)

[[[ 0.3228024  0.4603106  4.0372653 ... -2.9745815 -2.9745815 -2.9746618]
  [ 1.847092   3.8387372  0.943582  ... -2.0506527 -2.0506527 -2.050631 ]
  [ 3.4689922  5.4430666  3.4331334 ... -2.5087562 -2.5087562 -2.5087974]
  ...
  [ 2.7443786  4.7720385  3.5915008 ... -3.5692616 -3.5692616 -3.5692048]
  [ 2.6249418  4.804499   3.2207792 ... -3.5266652 -3.5266652 -3.5268598]
  [ 1.3153296  3.2112432  2.1495159 ... -4.469041  -4.469041  -4.4688897]]]
[[[[ 6.8922815   1.3648775   2.1178865  ...  0.88829756 -1.0494108
    -1.3997861 ]
   [ 2.2399905   1.1979821  -0.17310703 ...  1.1191653  -1.0379423
    -1.1593561 ]
   [-4.121564   -1.9221435  -1.815557   ...  1.0300308  -1.0724905
    -1.2763766 ]
   ...
   [ 3.855073    0.8876053  -0.33351257 ...  0.9209229  -1.1247166
    -1.3711717 ]
   [ 5.7193522   1.4098068   0.7186619  ...  1.1051456  -0.86902326
    -1.234222  ]
   [ 4.11571     2.908988    2.9472194  ...  0.8436419  -1.1788815
    -1.3909605 ]]

  [[ 0.31731564 -0.38888586  0.788

# Setup QNN Inference Parameters
Configure batch size, sequence length, attention heads and other model parameters.

In [25]:
# Setup QNN Inference Parameters
batch_size = 1
sequence_length = inputs["input_ids"].shape[1]  # Length of your input sequence
num_heads = 8   # Number of attention heads
head_dim = 128  # Dimension per head
num_layers = 28 # Number of transformer layers
past_sequence_length = 0  # For first inference

# Prepare all required inputs
input_feed = {
    "input_ids": inputs["input_ids"],  # Your actual input data
    "attention_mask": inputs["attention_mask"],
}

# Add past key values for each layer
for i in range(num_layers):
    input_feed[f"past_key_values.{i}.key"] = np.zeros(
        (batch_size, num_heads, past_sequence_length, head_dim), 
        dtype=np.float32
    )
    input_feed[f"past_key_values.{i}.value"] = np.zeros(
        (batch_size, num_heads, past_sequence_length, head_dim), 
        dtype=np.float32
    )

# Run inference
outputs = session.run(None, input_feed)

# Make the output human readable and print it
for output in outputs:
    print(output)

[[[ 0.3228024  0.4603106  4.0372653 ... -2.9745815 -2.9745815 -2.9746618]
  [ 1.847092   3.8387372  0.943582  ... -2.0506527 -2.0506527 -2.050631 ]
  [ 3.4689922  5.4430666  3.4331334 ... -2.5087562 -2.5087562 -2.5087974]
  ...
  [ 2.7443786  4.7720385  3.5915008 ... -3.5692616 -3.5692616 -3.5692048]
  [ 2.6249418  4.804499   3.2207792 ... -3.5266652 -3.5266652 -3.5268598]
  [ 1.3153296  3.2112432  2.1495159 ... -4.469041  -4.469041  -4.4688897]]]
[[[[ 6.8922815   1.3648775   2.1178865  ...  0.88829756 -1.0494108
    -1.3997861 ]
   [ 2.2399905   1.1979821  -0.17310703 ...  1.1191653  -1.0379423
    -1.1593561 ]
   [-4.121564   -1.9221435  -1.815557   ...  1.0300308  -1.0724905
    -1.2763766 ]
   ...
   [ 3.855073    0.8876053  -0.33351257 ...  0.9209229  -1.1247166
    -1.3711717 ]
   [ 5.7193522   1.4098068   0.7186619  ...  1.1051456  -0.86902326
    -1.234222  ]
   [ 4.11571     2.908988    2.9472194  ...  0.8436419  -1.1788815
    -1.3909605 ]]

  [[ 0.31731564 -0.38888586  0.788

# Run Model Inference
Execute inference using the QNN provider and obtain model outputs.

In [26]:
# Run Model Inference

# Run inference
outputs = session.run(None, input_feed)

# Make the output human readable and print it
for idx, output in enumerate(outputs):
    print(f"Output {idx}:")
    print(output)

Output 0:
[[[ 0.3228024  0.4603106  4.0372653 ... -2.9745815 -2.9745815 -2.9746618]
  [ 1.847092   3.8387372  0.943582  ... -2.0506527 -2.0506527 -2.050631 ]
  [ 3.4689922  5.4430666  3.4331334 ... -2.5087562 -2.5087562 -2.5087974]
  ...
  [ 2.7443786  4.7720385  3.5915008 ... -3.5692616 -3.5692616 -3.5692048]
  [ 2.6249418  4.804499   3.2207792 ... -3.5266652 -3.5266652 -3.5268598]
  [ 1.3153296  3.2112432  2.1495159 ... -4.469041  -4.469041  -4.4688897]]]
Output 1:
[[[[ 6.8922815   1.3648775   2.1178865  ...  0.88829756 -1.0494108
    -1.3997861 ]
   [ 2.2399905   1.1979821  -0.17310703 ...  1.1191653  -1.0379423
    -1.1593561 ]
   [-4.121564   -1.9221435  -1.815557   ...  1.0300308  -1.0724905
    -1.2763766 ]
   ...
   [ 3.855073    0.8876053  -0.33351257 ...  0.9209229  -1.1247166
    -1.3711717 ]
   [ 5.7193522   1.4098068   0.7186619  ...  1.1051456  -0.86902326
    -1.234222  ]
   [ 4.11571     2.908988    2.9472194  ...  0.8436419  -1.1788815
    -1.3909605 ]]

  [[ 0.3173156

# Process and Display Results
Process the model outputs and display results in a human-readable format.

In [27]:
# Process and Display Results

# Process the model outputs and display results in a human-readable format
for idx, output in enumerate(outputs):
    print(f"Output {idx}:")
    print(output)

Output 0:
[[[ 0.3228024  0.4603106  4.0372653 ... -2.9745815 -2.9745815 -2.9746618]
  [ 1.847092   3.8387372  0.943582  ... -2.0506527 -2.0506527 -2.050631 ]
  [ 3.4689922  5.4430666  3.4331334 ... -2.5087562 -2.5087562 -2.5087974]
  ...
  [ 2.7443786  4.7720385  3.5915008 ... -3.5692616 -3.5692616 -3.5692048]
  [ 2.6249418  4.804499   3.2207792 ... -3.5266652 -3.5266652 -3.5268598]
  [ 1.3153296  3.2112432  2.1495159 ... -4.469041  -4.469041  -4.4688897]]]
Output 1:
[[[[ 6.8922815   1.3648775   2.1178865  ...  0.88829756 -1.0494108
    -1.3997861 ]
   [ 2.2399905   1.1979821  -0.17310703 ...  1.1191653  -1.0379423
    -1.1593561 ]
   [-4.121564   -1.9221435  -1.815557   ...  1.0300308  -1.0724905
    -1.2763766 ]
   ...
   [ 3.855073    0.8876053  -0.33351257 ...  0.9209229  -1.1247166
    -1.3711717 ]
   [ 5.7193522   1.4098068   0.7186619  ...  1.1051456  -0.86902326
    -1.234222  ]
   [ 4.11571     2.908988    2.9472194  ...  0.8436419  -1.1788815
    -1.3909605 ]]

  [[ 0.3173156

In [28]:
# Process and Display Results
for idx, output in enumerate(outputs):
    # Get the predicted token IDs (take argmax along the last dimension)
    predicted_ids = np.argmax(output[0], axis=-1).flatten().tolist()
    
    # Decode the predicted tokens back to text
    predicted_text = tokenizer.decode(predicted_ids, skip_special_tokens=True)
    
    print(f"Output {idx}: {predicted_text}")

Output 0: tutor [unused741] [unused193] [unused891] [unused193] [unused193] [unused193] [unused193]
Output 1: [unused63] [unused63] [unused59] [unused107] [unused63] [unused63] [unused2] [unused83] [unused27] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused29] [unused63] [unused63] [unused96] [unused0] [unused0] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused37] [unused101] [unused1] [unused1] [unused65] [unused63] [unused63] [unused64] [unused101] [unused68] [unused63] [unused63] [unused0] [unused64] [unused64]
Output 2: [unused75] [unused38] [unused49] [unused32] [unused46] [unused75] [unused88] [unused74] [unused76] [unused48] [unused16] [unused37] [unused28] [unused48] [unused48] [unused112] [unused84] [unused110] [unused110] [unused85] [unused92] [unused6] [unused82] [unused55] [unused57] [unused52] [unused6] [unused69] [unused73] [unused80] [unused28] [unused12] [unused72] [unused24] [unused104] [