# Custom Kernel Tutorial
> Revision
> - Created on 21/01/2025, Cheng Zhang: PyTorch 2.5.0, CUDA 12.3
> - Fixed and Tested on 05/02/2025, Cheng Zhang: PyTorch 2.6.0, CUDA 12.5

## Env Setup in Colab

Check that Colab is connected to an NVIDIA Tesla T4 or Ada L4 GPU (the L4 is faster). If it is not, change the Colab runtime to T4 or L4.

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('❌ Not connected to a GPU')
else:
  print(gpu_info)

Wed Feb  5 00:10:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   62C    P8             14W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Install dependencies

In [2]:
# Tooling and runtime dependencies used by the rest of this notebook.
! pip install tox ninja torch numpy scipy rust-just pytest transformers
# Upgrade `build`, which is invoked later as `python -m build --wheel`.
! pip install -U build

# Replace the placeholder with your own GitHub access token; IPython substitutes
# {git_token} into the clone URL below. Do not share or commit a notebook that
# contains a real token.
git_token = "Your Git Token"
! git clone --recurse-submodules https://{git_token}@github.com/DeepWok/mase-cuda.git
# Work from the repository root for the remaining cells.
%cd mase-cuda

Collecting tox
  Downloading tox-4.24.1-py3-none-any.whl.metadata (3.7 kB)
Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting rust-just
  Downloading rust_just-1.39.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.5/120.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting colorama>=0.4.6 (from tox)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting pyproject-api>=1.8 (from tox)
  Downloading pyproject_api-1.9.0-py3-none-any.whl.metadata (2.7 kB)
Collecting virtualenv>=20.27.1 (from tox)
  Downloading virtualenv-20.29.1-py3-none-any.whl.metadata (4.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvid

## Build & Run C++ Test

In [3]:
# Run the `build-cu-test` recipe; per the log it removes earlier build artifacts
# and configures CMake with BUILD_TESTING=ON for the native GPU architecture.
!just build-cu-test

[1m# python[0m
[1mif [ -d /content/mase-cuda/dist ]; then rm -r /content/mase-cuda/dist; fi[0m
[1mif [ -d /content/mase-cuda/src/mase_cuda.egg-info ]; then rm -r /content/mase-cuda/src/mase_cuda.egg-info; fi[0m
[1m# all[0m
[1mif [ -d /content/mase-cuda/build ]; then rm -r /content/mase-cuda/build; fi[0m
[1mecho $(which cmake)[0m
/usr/local/bin/cmake
[1mcmake -D BUILD_TESTING=ON -D CUDA_ARCHITECTURES=native -B build -S .[0m
-- The CUDA compiler identification is NVIDIA 12.5.82 with host compiler GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile featu

Run test executable

In [4]:
# Optional positional args: [m] [group_size] [is_random]. With none given, the
# run below uses m=4096, group_size=128, is_random=0.
! ./build/test/cu/mxint/dequantize/test_mxint8_dequantize1d

Usage: ./build/test/cu/mxint/dequantize/test_mxint8_dequantize1d [m] [group_size] [is_random]
m=4096, group_size=128, num_groups=32, is_random=0
PASSED


## Build & Try mase-cuda Package

The build process can be slow. The NVIDIA T4's compute capability is 7.5, and the L4's is 8.9.

In [5]:
# Build the wheel for both compute capabilities (7.5 for T4, 8.9 for L4).
# MAX_JOBS is set to the processor count reported by `nproc --all`.
! TORCH_CUDA_ARCH_LIST="7.5 8.9" MAX_JOBS=$(nproc --all) python -m build --wheel

[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - numpy
  - setuptools
  - torch
[1m* Getting build dependencies for wheel...[0m
running egg_info
creating src/mase_cuda.egg-info
writing src/mase_cuda.egg-info/PKG-INFO
writing dependency_links to src/mase_cuda.egg-info/dependency_links.txt
writing requirements to src/mase_cuda.egg-info/requires.txt
writing top-level names to src/mase_cuda.egg-info/top_level.txt
writing manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
reading manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
writing manifest file 'src/mase_cuda.egg-info/SOURCES.txt'
[1m* Building wheel...[0m
running bdist_wheel
running build
running build_py
creating build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/utils.py -> build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/constants.py -> build/lib.linux-x86_64-cpython-311/mase_cuda
copying src/mase_cuda/__init__.py -> build/lib.lin

> 🩹 We create a new env and run the experiments there to avoid errors like `cuda pytorch undefined symbol` raised by Colab's built-in Python

When the wheel is built, install the mase-cuda wheel into **a new dev env**

- Open the **Colab terminal** and run the following commands to create a dev env and install mase-cuda:

  ```bash
  cd mase-cuda # enter the cloned repository
  tox -e dev # create the dev env under .tox/dev
  . .tox/dev/bin/activate # activate the dev env in this shell
  which pip # ensure this is the pip in .tox/dev
  pip install dist/mase_cuda-0.0.1-cp311-cp311-linux_x86_64.whl # install mase-cuda (cp311 = the Python 3.11 wheel built above)
  ```
- Colab terminal: Run the following command to profile dequantization latency (CPU vs GPU). This is slow:

  ```bash
  # --log-cli-level INFO shows the latency table, which the test emits as an INFO log record
  pytest -v --log-cli-level INFO test/py/mxint8/dequantize/test_dequantize1d.py::test_ext_dequantize1d_latency
  ```

The output looks like this

```bash
============================================================== test session starts ===============================================================
platform linux -- Python 3.11.11, pytest-8.3.4, pluggy-1.5.0 -- /content/mase-cuda/.tox/dev/bin/python
cachedir: .pytest_cache
rootdir: /content/mase-cuda
configfile: tox.ini
collected 1 item                                                                                                                                 

test/py/mxint8/dequantize/test_dequantize1d.py::test_ext_dequantize1d_latency
----------------------------------------------------------------- live log call ------------------------------------------------------------------
INFO     test_dequantize1d:test_dequantize1d.py:203
+-----------+------------+------------------------+------------------------+---------------------+
|     m     | group_size |      latency_cpu       |      latency_gpu       |     GPU speedup     |
+-----------+------------+------------------------+------------------------+---------------------+
|   1024    |     8      | 1.8215179443359376e-05 | 5.830879891291261e-05  | 0.31239160783546144 |
|   1024    |     16     | 1.1014938354492188e-05 | 2.411839971318841e-05  | 0.45670270355744236 |
|   1024    |     32     | 1.0752677917480469e-05 | 2.3455999884754422e-05 | 0.45841908127179576 |
|   1024    |     64     | 1.0418891906738282e-05 | 2.366719990968704e-05  | 0.44022495041645404 |
|   1024    |    128     | 1.043081283569336e-05  | 2.370399972423911e-05  | 0.44004442106987857 |
|   1024    |    256     | 9.298324584960938e-06  | 2.200479982420802e-05  | 0.42255892619989316 |
|   1024    |    512     | 9.393692016601562e-06  | 2.2529599815607072e-05 | 0.41694890692617675 |
|  2097152  |     8      |  0.019188427925109865  | 3.4844799526035784e-05 |  550.682689701584   |
|  2097152  |     16     |  0.018679165840148927  | 3.453759923577308e-05  |  540.8356762910598  |
|  2097152  |     32     |  0.018654394149780273  | 3.327519977465272e-05  |  560.6095313059608  |
|  2097152  |     64     |  0.01862926483154297   | 3.344159927219152e-05  |  557.068598302181   |
...
| 234881024 |     32     |   2.2112363457679747   |  0.008763614416122436  | 252.32013194235924  |
| 234881024 |     64     |   2.3624018669128417   |  0.008794196844100953  | 268.63190678947717  |
| 234881024 |    128     |   2.402256155014038    |  0.008851744079589843  |  271.3878907268803  |
| 234881024 |    256     |   2.4408880949020384   |  0.008949855947494508  |  272.7293164517759  |
| 234881024 |    512     |   2.4862973570823668   |  0.004568324756622315  |  544.2470685732645  |
+-----------+------------+------------------------+------------------------+---------------------+
PASSED                                                                                                                                     [100%]

========================================================= 1 passed in 466.97s (0:07:46) ==========================================================
```

### FP32 Deberta Demo

- Colab Terminal: Install transformers in the dev env: `pip install transformers`

- demo.py: Copy the following code into a new file named demo.py

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint used for the FP32 baseline.
model_name = "AnkitAI/deberta-xlarge-base-emotions-classifier"
# If you hit an OOM error, switch to the smaller checkpoint below; note that the
# quantization effect may be less obvious with it later on.
# model_name = "AnkitAI/deberta-v3-small-base-emotions-classifier"

# Load the classifier, then move it to the GPU and put it in eval mode.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Invert the config's label -> id table so class indices map back to emotion names.
label2emotion = {class_id: name for name, class_id in model.config.label2id.items()}

@torch.no_grad()
def predict_emotion(model, tokenizer, text):
    """Classify ``text`` and report the three largest logits.

    The input is tokenized (truncated to at most 128 tokens), moved to the GPU
    and passed through ``model`` with gradients disabled. The predicted class
    index is translated through the module-level ``label2emotion`` table.

    Because ``.item()`` is called on the argmax, this expects a single text.

    Returns:
        ``(emotion, (top3_values, top3_indices))`` where the two ``top3``
        entries are nested Python lists produced by ``torch.topk(logits, 3)``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = {key: tensor.cuda() for key, tensor in encoded.items()}
    logits = model(**encoded).logits

    # Highest-scoring class -> human-readable label.
    best_class = logits.argmax(dim=1).item()
    emotion = label2emotion[best_class]

    top3 = torch.topk(logits, 3)
    return emotion, (top3.values.cpu().tolist(), top3.indices.cpu().tolist())


# Example usage: classify one sentence, then print the index -> emotion table,
# the input, the predicted label, and the top-3 logits with their class indices.
text = "I'm so happy with the results!"
emotion, top3 = predict_emotion(model, tokenizer, text)

print("Index to Emotion Mapping:", label2emotion)
print("Input text:", text)
print("Detected Emotion:", emotion)
# top3 is (values, indices), as returned by predict_emotion
print(f"top3 logits: {top3[0]}, top3 indices: {top3[1]}")
```

- Colab Terminal: run demo.py in the dev env

The output looks like this:

```bash
config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1.12k/1.12k [00:00<00:00, 7.73MB/s]
model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████| 3.04G/3.04G [01:12<00:00, 42.0MB/s]
tokenizer_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████| 1.31k/1.31k [00:00<00:00, 12.6MB/s]
vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 798k/798k [00:00<00:00, 5.79MB/s]
merges.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 3.35MB/s]
tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████| 2.11M/2.11M [00:00<00:00, 10.3MB/s]
special_tokens_map.json: 100%|███████████████████████████████████████████████████████████████████████████████████| 969/969 [00:00<00:00, 8.67MB/s]
Index to Emotion Mapping: {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
Input text: I'm so happy with the results!
Detected Emotion: joy
top3 logits: [[7.345228672027588, -1.4850201606750488, -1.6403964757919312]], top3 indices: [[1, 4, 0]]
```

### MXINT8 Deberta
- demo-q.py: Copy the following code into a new file named demo-q.py. This script creates the quantized Deberta and compares the GPU memory usage of the MXINT8 model with that of the FP32 model

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from mase_cuda.mxint8.linear import QLinearPacked

# Baseline taken before the model is loaded; it is subtracted from both peak
# readings below so they reflect only what this script allocates afterwards.
init_memory = torch.cuda.memory_allocated()  # in bytes
model_name = "AnkitAI/deberta-xlarge-base-emotions-classifier"
# Load explicitly in FP32 so the baseline measurement is the full-precision model.
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float32).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Invert the config's label -> id table so class indices map back to emotion names.
label2emotion = {idx: emotion for emotion, idx in model.config.label2id.items()}

# Group size passed to QLinearPacked.build_from_linear below.
mxint8_group_size = 32
# Both layer widths must be divisible by the group size.
# NOTE(review): presumably because weights are grouped along these dimensions — confirm against QLinearPacked.
assert model.config.hidden_size % mxint8_group_size == 0
assert model.config.intermediate_size % mxint8_group_size == 0

# The same sentence is fed to the FP32 and the MXINT8 model.
text = "I'm so happy with the results!"

@torch.no_grad()
def predict_emotion(model, tokenizer, text):
    """Classify ``text`` and report the three largest logits.

    The input is tokenized (truncated to at most 128 tokens), moved to the GPU
    and passed through ``model`` with gradients disabled. The predicted class
    index is translated through the module-level ``label2emotion`` table.

    Because ``.item()`` is called on the argmax, this expects a single text.

    Returns:
        ``(emotion, (top3_values, top3_indices))`` where the two ``top3``
        entries are nested Python lists produced by ``torch.topk(logits, 3)``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = {key: tensor.cuda() for key, tensor in encoded.items()}
    logits = model(**encoded).logits

    # Highest-scoring class -> human-readable label.
    best_class = logits.argmax(dim=1).item()
    emotion = label2emotion[best_class]

    top3 = torch.topk(logits, 3)
    return emotion, (top3.values.cpu().tolist(), top3.indices.cpu().tolist())

# Check the GPU memory usage of the FP32 model.
# Reset the peak counter first so the reading is not dominated by whatever
# peak was reached earlier (e.g. while loading the model).
torch.cuda.reset_peak_memory_stats()
emotion_fp32, top3_fp32 = predict_emotion(model, tokenizer, text)
# Peak since the reset, relative to the pre-load baseline.
peak_memory_fp32 = torch.cuda.max_memory_allocated() - init_memory  # in bytes


def set_layer_by_name(module: torch.nn.Module, name: str, new_layer: torch.nn.Module) -> None:
    """Replace the submodule at dotted path ``name`` under ``module`` with ``new_layer``.

    ``name`` uses the format produced by ``named_modules()``, e.g.
    ``"encoder.layer.0.attention.dense"``. Every component, including numeric
    ones, is resolved through the registered-submodule lookup, so digit-named
    children work for any container (``Sequential``, ``ModuleList``,
    ``ModuleDict``), not only for those that support integer indexing.

    Args:
        module: Root module that owns the path.
        name: Dotted path of the submodule to replace, relative to ``module``.
        new_layer: Module to install at that path.

    Raises:
        AttributeError: If a parent component of ``name`` does not exist.
    """
    parent_path, _, leaf = name.rpartition(".")
    # get_submodule("") returns ``module`` itself, so a top-level name needs no special case.
    parent = module.get_submodule(parent_path)
    setattr(parent, leaf, new_layer)


# Replace every torch.nn.Linear in the model, except those under a "classifier"
# path, with a QLinearPacked layer built from it.
# NOTE(review): submodules are replaced while named_modules() is still being
# iterated; this relies on replacement keeping the same child keys — confirm
# before restructuring the loop.
for layer_name, layer in model.named_modules():
    if not isinstance(layer, torch.nn.Linear):
        continue
    # Skip the classification head so it is left as a regular Linear.
    if "classifier" in layer_name:
        continue
    layer.cuda()  # NOTE(review): the whole model was already moved with .cuda() above — looks redundant
    # Presumably quantizes/packs the Linear's weights into MXINT8 groups — see QLinearPacked.
    layer_q = QLinearPacked.build_from_linear(layer, group_size=mxint8_group_size)
    set_layer_by_name(model, layer_name, layer_q)
    # Drop this loop's reference to the original layer and ask the caching
    # allocator to release unused blocks, keeping memory low during conversion.
    del layer
    torch.cuda.empty_cache()

# Measure the quantized model the same way as the FP32 one: release cached
# blocks, reset the peak counter, run one prediction, then read the peak
# relative to the pre-load baseline.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
emotion_mxint8, top3_mxint8 = predict_emotion(model, tokenizer, text)
peak_memory_mxint8 = torch.cuda.max_memory_allocated() - init_memory  # in bytes

# Side-by-side report; byte counts are converted to MB (1024**2 bytes).
print(f"FP32 model peak memory: {peak_memory_fp32/1024**2:.4f} MB")
print(f"FP32 prediction: {emotion_fp32}")  # label typo fixed: previously printed "PF32"
print(f"FP32 top3 logits: {top3_fp32[0]}, indices: {top3_fp32[1]}")

print(f"MXINT8 model peak memory: {peak_memory_mxint8/1024**2:.4f} MB")
print(f"MXINT8 prediction: {emotion_mxint8}")
print(f"MXINT8 top3 logits: {top3_mxint8[0]}, indices: {top3_mxint8[1]}")
```

- Colab Terminal: Run demo-q.py in the dev env

The output looks like this:

```bash
FP32 model peak memory: 2906.1997 MB
PF32 prediction: joy
FP32 top3 logits: [[7.345228672027588, -1.4850201606750488, -1.6403964757919312]], indices: [[1, 4, 0]]
MXINT8 model peak memory: 976.1616 MB
MXINT8 prediction: joy
MXINT8 top3 logits: [[7.350157737731934, -1.488325834274292, -1.649757981300354]], indices: [[1, 4, 0]]
```