# Quantization

Links:
- https://huggingface.co/docs/transformers/main/main_classes/quantization
- https://huggingface.co/blog/4bit-transformers-bitsandbytes
- https://developer.nvidia.com/blog/sparsity-in-int8-training-workflow-and-best-practices-for-tensorrt-acceleration/

In [None]:
import os
# Set both switches up front, before torch/transformers are imported in the
# next cell, so the libraries see them when they initialise.
os.environ.update(
    {
        # Turn off the Hugging Face tokenizers' internal thread pool.
        "TOKENIZERS_PARALLELISM": "false",
        # Ask the CUDA runtime to load modules lazily (on first use).
        "CUDA_MODULE_LOADING": "LAZY",
    }
)

In [None]:
# Helpers
import subprocess as sp
import time
import os
import gc
import torch
from transformers import AutoTokenizer, set_seed

# Checkpoint and prompt shared by every precision variant benchmarked below.
model_id = "facebook/opt-1.3b"
input_string = "What is the capital of Canada?"

# Seed the RNGs once so repeated runs of the notebook are comparable.
set_seed(0)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Encode the prompt as a PyTorch tensor and move it to the GPU once; every
# generate() call below reuses these ids.
tokens = tokenizer(input_string, return_tensors="pt")["input_ids"].to("cuda")

# Reading (in MiB) taken by the previous print_used_memory() call; 0 until the
# first call, so the first report is the total memory already in use.
prev_used_mem = 0

def print_used_memory():
    """Print how much GPU memory usage changed since the previous call.

    Runs ``nvidia-smi`` to read the memory currently in use on each GPU,
    prints the difference between the first listed GPU's reading and the
    previous reading, and stores the new reading in the module-level
    ``prev_used_mem``. A negative value means memory was released.

    ``nvidia-smi`` reports ``memory.used`` in MiB, so the delta is divided by
    1024 (not 1000) to express it in GiB.

    Only the first GPU in ``nvidia-smi``'s listing is considered.
    NOTE(review): on multi-GPU hosts that may not be the device the model
    was placed on - confirm before relying on the numbers there.

    Raises:
        FileNotFoundError: if ``nvidia-smi`` is not installed / not on PATH.
        subprocess.CalledProcessError: if ``nvidia-smi`` exits with an error.
    """
    global prev_used_mem
    # noheader/nounits makes nvidia-smi emit one bare integer per GPU, so no
    # header row or " MiB" suffix has to be stripped by hand.
    command = [
        "nvidia-smi",
        "--query-gpu=memory.used",
        "--format=csv,noheader,nounits",
    ]
    raw_output = sp.check_output(command).decode("ascii")
    memory_used_values = [
        int(line) for line in raw_output.splitlines() if line.strip()
    ]
    current_used_mem = memory_used_values[0]
    print(f"Used {(current_used_mem - prev_used_mem) / 1024:.3f} GiB")
    prev_used_mem = current_used_mem

# Establish the baseline before any model is loaded.
print_used_memory()

In [None]:
# FP32
# Baseline run: no dtype is requested, so the weights are loaded in the
# library's default precision (float32 per the transformers docs - hence the
# cell label).
from transformers import AutoModelForCausalLM

# device_map="auto" delegates device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [None]:
# Inference only: disable autograd bookkeeping for everything in this cell.
with torch.no_grad():
    # Generate at most 16 new tokens after the prompt.
    output_32bit_tokens = model.generate(tokens, max_new_tokens=16)
    # Decode the first (only) sequence, dropping special tokens.
    print(tokenizer.decode(output_32bit_tokens[0], skip_special_tokens=True))
    
    # Report the GPU memory change since the previous reading, i.e. the cost
    # of loading the model plus one generation.
    print_used_memory()
    
    # IPython line magic: time generation, 10 loops per timing run.
    %timeit -n 10 model.generate(tokens, max_new_tokens=16)

In [None]:
# cleanup
# Drop the notebook's references to the FP32 model and its output so the
# tensors they hold can be freed.
del model
del output_32bit_tokens
# Force a collection pass so objects kept alive only by reference cycles are
# released too. As the last expression of the cell, its return value (the
# number of unreachable objects found) is displayed.
gc.collect()

In [None]:
# Hand the now-unused blocks cached by PyTorch's CUDA allocator back to the
# driver; without this nvidia-smi would keep counting them as used.
torch.cuda.empty_cache()

# Prints the change since the previous reading; a negative value is expected
# here because the model was just released.
print_used_memory()

In [None]:
# FP16
# Same checkpoint as the baseline, but the weights are requested in half
# precision via torch_dtype.

from transformers import AutoModelForCausalLM

# device_map="auto" delegates device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16,)

In [None]:
# Inference only: disable autograd bookkeeping for everything in this cell.
with torch.no_grad():
    # Generate at most 16 new tokens after the prompt.
    output_16bit_tokens = model.generate(tokens, max_new_tokens=16)
    # Decode the first (only) sequence, dropping special tokens.
    print(tokenizer.decode(output_16bit_tokens[0], skip_special_tokens=True))
    
    # Report the GPU memory change since the previous reading, i.e. the cost
    # of loading the FP16 model plus one generation.
    print_used_memory()
    
    # IPython line magic: time generation, 10 loops per timing run.
    %timeit -n 10 model.generate(tokens, max_new_tokens=16)

In [None]:
# cleanup
# Drop the notebook's references to the FP16 model and its output so the
# tensors they hold can be freed.
del model
del output_16bit_tokens
# Collect objects kept alive only by reference cycles; otherwise their GPU
# tensors would still be allocated when the cache is emptied and the reported
# delta would understate what was released.
gc.collect()
# Hand the now-unused blocks cached by PyTorch's CUDA allocator back to the
# driver so nvidia-smi stops counting them as used.
torch.cuda.empty_cache()

# Prints the change since the previous reading; a negative value is expected
# here because the model was just released.
print_used_memory()

In [None]:
# 4-bit (NF4) quantization via bitsandbytes
# NOTE: the quant type configured below is "nf4", not "fp4". No compute dtype
# is set here, so the library default applies (float32 per the
# BitsAndBytesConfig docs); the later cell repeats this setup with an explicit
# float16 compute dtype.
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
   load_in_4bit=True,  # store the weights in 4 bits
   bnb_4bit_quant_type="nf4",  # NF4 data type rather than plain FP4
   bnb_4bit_use_double_quant=True  # nested quantization of the quantization constants
)

# No device_map is passed here, unlike the FP32/FP16 loads.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=config)

In [None]:
# Inference only: disable autograd bookkeeping for everything in this cell.
with torch.no_grad():
    # Generate at most 16 new tokens after the prompt.
    output_4bit_tokens = model.generate(tokens, max_new_tokens=16)
    # Decode the first (only) sequence, dropping special tokens.
    print(tokenizer.decode(output_4bit_tokens[0], skip_special_tokens=True))
    
    # Report the GPU memory change since the previous reading, i.e. the cost
    # of loading the 4-bit model plus one generation.
    print_used_memory()
    
    # IPython line magic: time generation, 10 loops per timing run.
    %timeit -n 10 model.generate(tokens, max_new_tokens=16)

In [None]:
# cleanup
# Drop the notebook's references to the 4-bit model and its output so the
# tensors they hold can be freed.
del model
del output_4bit_tokens
# Collect objects kept alive only by reference cycles; otherwise their GPU
# tensors would still be allocated when the cache is emptied and the reported
# delta would understate what was released.
gc.collect()
# Hand the now-unused blocks cached by PyTorch's CUDA allocator back to the
# driver so nvidia-smi stops counting them as used.
torch.cuda.empty_cache()

# Prints the change since the previous reading; a negative value is expected
# here because the model was just released.
print_used_memory()

In [None]:
# 4-bit (NF4) quantization with FP16 computation
# NOTE: the quant type configured below is "nf4", not "fp4". Same settings as
# the previous 4-bit run, plus an explicit float16 compute dtype.
from transformers import BitsAndBytesConfig

config = BitsAndBytesConfig(
   load_in_4bit=True,  # store the weights in 4 bits
   bnb_4bit_quant_type="nf4",  # NF4 data type rather than plain FP4
   bnb_4bit_use_double_quant=True,  # nested quantization of the quantization constants
   bnb_4bit_compute_dtype=torch.float16  # run the computation in half precision
)

# No device_map is passed here, unlike the FP32/FP16 loads.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=config)

In [None]:
# Inference only: disable autograd bookkeeping for everything in this cell.
with torch.no_grad():
    # Generate at most 16 new tokens after the prompt. The variable name is
    # the same one the previous 4-bit run used.
    output_4bit_tokens = model.generate(tokens, max_new_tokens=16)
    # Decode the first (only) sequence, dropping special tokens.
    print(tokenizer.decode(output_4bit_tokens[0], skip_special_tokens=True))
    
    # Report the GPU memory change since the previous reading, i.e. the cost
    # of loading the 4-bit model plus one generation.
    print_used_memory()
    
    # IPython line magic: time generation, 10 loops per timing run.
    %timeit -n 10 model.generate(tokens, max_new_tokens=16)

In [None]:
# cleanup
# Drop the notebook's references to the 4-bit model and its output so the
# tensors they hold can be freed.
del model
del output_4bit_tokens
# Collect objects kept alive only by reference cycles; otherwise their GPU
# tensors would still be allocated when the cache is emptied and the reported
# delta would understate what was released.
gc.collect()
# Hand the now-unused blocks cached by PyTorch's CUDA allocator back to the
# driver so nvidia-smi stops counting them as used.
torch.cuda.empty_cache()

# Prints the change since the previous reading; a negative value is expected
# here because the model was just released.
print_used_memory()