# Custom Kernel Tutorial

## Env Setup in Colab

Check if Colab is connected to a NViDIA Tesla T4 GPU, if not, change Colab runtime to this GPU

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('❌ Not connected to a GPU')
else:
  print(gpu_info)

Mon Jan 20 22:26:51 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Install dependencies

In [2]:
! pip install tox ninja torch numpy scipy rust-just pytest transformers
! pip install -U build

git_token = "YOUR_GIT_TOKEN"
! git clone --recurse-submodules https://{git_token}@github.com/DeepWok/mase-cuda.git
%cd mase-cuda

Collecting tox
  Downloading tox-4.23.2-py3-none-any.whl.metadata (3.7 kB)
Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting rust-just
  Downloading rust_just-1.38.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting colorama>=0.4.6 (from tox)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting pyproject-api>=1.8 (from tox)
  Downloading pyproject_api-1.8.0-py3-none-any.whl.metadata (2.7 kB)
Collecting virtualenv>=20.26.6 (from tox)
  Downloading virtualenv-20.29.1-py3-none-any.whl.metadata (4.5 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv>=20.26.6->tox)
  Downloading distlib-0.3.9-py2.py3-none-any.whl.metadata (5.2 kB)
Downloading tox-4.23.2-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

## Build & Run C++ Test

In [3]:
!just build-cu-test

[1m# python[0m
[1mif [ -d /content/mase-cuda/dist ]; then rm -r /content/mase-cuda/dist; fi[0m
[1mif [ -d /content/mase-cuda/src/mase_cuda.egg-info ]; then rm -r /content/mase-cuda/src/mase_cuda.egg-info; fi[0m
[1m# all[0m
[1mif [ -d /content/mase-cuda/build ]; then rm -r /content/mase-cuda/build; fi[0m
[1mecho $(which cmake)[0m
/usr/local/bin/cmake
[1mcmake -D BUILD_TESTING=ON -D CUDA_ARCHITECTURES=native -B build -S .[0m
-- The CUDA compiler identification is NVIDIA 12.2.140 with host compiler GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile feat

Run test executable

In [4]:
! ./build/test/cu/mxint/dequantize/test_mxint8_dequantize1d

Usage: ./build/test/cu/mxint/dequantize/test_mxint8_dequantize1d [m] [group_size] [is_random]
m=4096, group_size=128, num_groups=32, is_random=0
PASSED


## Build & Try mase-cuda Package

The building process can be slow

In [5]:
! TORCH_CUDA_ARCH_LIST="7.5" MAX_JOBS=$(nproc --all) python -m build --wheel .

[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - numpy
  - setuptools
  - torch
[1m* Getting build dependencies for wheel...[0m
running egg_info
creating src/mase_cuda.egg-info
writing src/mase_cuda.egg-info/PKG-INFO
writing dependency_links to src/mase_cuda.egg-info/dependency_links.txt
writing requirements to src/mase_cuda.egg-info/requires.txt
writing top-level names to src/mase_cuda.egg-info/top_level.txt
writing manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
reading manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
writing manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
[1m* Building wheel...[0m
running bdist_wheel
running build
running build_py
creating build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/__init__.py -> build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/constants.py -> build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/utils.py -> build/lib.lin

Install the mase-cuda wheel

In [6]:
! pip install ./dist/mase_cuda-0.0.1-cp311-cp311-linux_x86_64.whl

Processing ./dist/mase_cuda-0.0.1-cp311-cp311-linux_x86_64.whl
Installing collected packages: mase-cuda
Successfully installed mase-cuda-0.0.1


Profile dequantization latency (CPU vs GPU). This is slow.

In [7]:
! pytest -v --log-cli-level INFO test/py/mxint8/dequantize/test_dequantize1d.py::test_ext_dequantize1d_latency

platform linux -- Python 3.11.11, pytest-8.3.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /content/mase-cuda
configfile: tox.ini
plugins: typeguard-4.4.1, anyio-3.7.1
collected 1 item                                                                                   [0m

test/py/mxint8/dequantize/test_dequantize1d.py::test_ext_dequantize1d_latency 
[1m------------------------------------------ live log call -------------------------------------------[0m
[32mINFO    [0m test_dequantize1d:test_dequantize1d.py:203 
+-----------+------------+------------------------+------------------------+---------------------+
|     m     | group_size |      latency_cpu       |      latency_gpu       |     GPU speedup     |
+-----------+------------+------------------------+------------------------+---------------------+
|   1024    |     8      | 0.00033932924270629883 | 7.334719887003302e-05  |  4.626342218024857  |
|   1024    |     16     | 1.4746189117431641e-05 | 2.7062

### FP32 Deberta Demo
Create the Deberta demo file and run it

In [8]:
%%bash
cat > demo.py <<- EOF
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "AnkitAI/deberta-xlarge-base-emotions-classifier"
# if you meet OOM error, try this smaller model, but the quantization effect may not be obvious later
# model_name = "AnkitAI/deberta-v3-small-base-emotions-classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_name).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
label2emotion = {idx: emotion for emotion, idx in model.config.label2id.items()}


# Example usage
@torch.no_grad()
def predict_emotion(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(dim=1)
    predictions = label2emotion[predictions.item()]
    top3_values, top3_indices = torch.topk(logits, 3)
    top3_values = top3_values.cpu().tolist()
    top3_indices = top3_indices.cpu().tolist()
    return predictions, (top3_values, top3_indices)


text = "I'm so happy with the results!"
emotion, top3 = predict_emotion(model, tokenizer, text)

print("Index to Emotion Mapping:", label2emotion)
print("Input text:", text)
print("Detected Emotion:", emotion)
print(f"top3 logits: {top3[0]}, top3 indices: {top3[1]}")
EOF

In [9]:
! python demo.py

config.json:   0% 0.00/1.12k [00:00<?, ?B/s]config.json: 100% 1.12k/1.12k [00:00<00:00, 8.49MB/s]
2025-01-20 22:40:44.039772: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-20 22:40:44.060592: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-20 22:40:44.067121: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-20 22:40:44.082384: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild T

### MXINT8 Deberta
Create the quantized Deberta and compare GPU memory usage with FP32 model.

In [10]:
%%bash
cat > demo-q.py <<- EOF
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from mase_cuda.mxint8.linear import QLinearPacked

init_memory = torch.cuda.memory_allocated()  # in bytes
model_name = "AnkitAI/deberta-xlarge-base-emotions-classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float32).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
label2emotion = {idx: emotion for emotion, idx in model.config.label2id.items()}

mxint8_group_size = 32
assert model.config.hidden_size % mxint8_group_size == 0
assert model.config.intermediate_size % mxint8_group_size == 0

text = "I'm so happy with the results!"


# Example usage
@torch.no_grad()
def predict_emotion(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(dim=1)
    predictions = label2emotion[predictions.item()]
    top3_values, top3_indices = torch.topk(logits, 3)
    top3_values = top3_values.cpu().tolist()
    top3_indices = top3_indices.cpu().tolist()
    return predictions, (top3_values, top3_indices)


# check the GPU memory usage of FP32 model
torch.cuda.reset_peak_memory_stats()
emotion_fp32, top3_fp32 = predict_emotion(model, tokenizer, text)
peak_memory_fp32 = torch.cuda.max_memory_allocated() - init_memory  # in bytes


def set_layer_by_name(module: torch.nn.Module, name: str, new_layer: torch.nn.Module):
    """
    Replace a layer (`new_layer`) in a model (`module`) by its `name`.
    """
    levels = name.split(".")
    if len(levels) > 1:
        mod_ = module
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                mod_ = mod_[int(levels[l_idx])]
            else:
                mod_ = getattr(mod_, levels[l_idx])
        setattr(mod_, levels[-1], new_layer)
    else:
        setattr(module, name, new_layer)


for layer_name, layer in model.named_modules():
    if not isinstance(layer, torch.nn.Linear):
        continue
    if "classifier" in layer_name:
        continue
    layer.cuda()
    layer_q = QLinearPacked.build_from_linear(layer, group_size=mxint8_group_size)
    set_layer_by_name(model, layer_name, layer_q)
    del layer
    torch.cuda.empty_cache()

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
emotion_mxint8, top3_mxint8 = predict_emotion(model, tokenizer, text)
peak_memory_mxint8 = torch.cuda.max_memory_allocated() - init_memory  # in bytes

print(f"FP32 model peak memory: {peak_memory_fp32/1024**2:.4f} MB")
print(f"PF32 prediction: {emotion_fp32}")
print(f"FP32 top3 logits: {top3_fp32[0]}, indices: {top3_fp32[1]}")

print(f"MXINT8 model peak memory: {peak_memory_mxint8/1024**2:.4f} MB")
print(f"MXINT8 prediction: {emotion_mxint8}")
print(f"MXINT8 top3 logits: {top3_mxint8[0]}, indices: {top3_mxint8[1]}")

bash: line 1: new_layer: command not found
bash: line 1: module: command not found
bash: line 1: name: command not found


In [11]:
! python demo-q.py

2025-01-20 22:42:28.139444: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-20 22:42:28.160214: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-20 22:42:28.166695: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-20 22:42:28.181280: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
FP32 model peak memory: 2906.1997 MB
PF32 prediction: