In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Seed PyTorch's random number generator with a fixed value (0) for repeatable runs
torch.manual_seed(0)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x704a44baaa70>

In [4]:
model_id = 'gpt2'
# Not passed to the loader below; placement is left to device_map='auto'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the model with 8-bit quantization.
# Passing `load_in_8bit=True` directly to `from_pretrained` is deprecated
# (transformers emits a warning asking for a `BitsAndBytesConfig` in
# `quantization_config`), so the flag is wrapped in a config object instead.
model_int8 = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [5]:
# Report the memory footprint of model_int8, with a fallback message when
# the model object does not expose get_memory_footprint()
try:
    footprint_bytes = model_int8.get_memory_footprint()
except AttributeError:
    print("Model size information is not available for 8-bit models.")
else:
    print(f"Model size: {footprint_bytes:,} bytes")

Model size: 176,527,896 bytes


In [7]:
# Collect the distinct dtypes found among the parameters of model_int8
dtype_set = {weight.dtype for weight in model_int8.parameters()}

In [8]:
# Display the collected set of parameter dtypes as the cell output
dtype_set

{torch.float16, torch.int8}

In [9]:
# Build a list of (parameter name, dtype) tuples, one per model parameter,
# in the order named_parameters() yields them
model_int8_params = [
    (param_name, tensor.dtype)
    for param_name, tensor in model_int8.named_parameters()
]

In [10]:
# Display the (parameter name, dtype) pairs as the cell output
model_int8_params

[('transformer.wte.weight', torch.float16),
 ('transformer.wpe.weight', torch.float16),
 ('transformer.h.0.ln_1.weight', torch.float16),
 ('transformer.h.0.ln_1.bias', torch.float16),
 ('transformer.h.0.attn.c_attn.weight', torch.int8),
 ('transformer.h.0.attn.c_attn.bias', torch.float16),
 ('transformer.h.0.attn.c_proj.weight', torch.int8),
 ('transformer.h.0.attn.c_proj.bias', torch.float16),
 ('transformer.h.0.ln_2.weight', torch.float16),
 ('transformer.h.0.ln_2.bias', torch.float16),
 ('transformer.h.0.mlp.c_fc.weight', torch.int8),
 ('transformer.h.0.mlp.c_fc.bias', torch.float16),
 ('transformer.h.0.mlp.c_proj.weight', torch.int8),
 ('transformer.h.0.mlp.c_proj.bias', torch.float16),
 ('transformer.h.1.ln_1.weight', torch.float16),
 ('transformer.h.1.ln_1.bias', torch.float16),
 ('transformer.h.1.attn.c_attn.weight', torch.int8),
 ('transformer.h.1.attn.c_attn.bias', torch.float16),
 ('transformer.h.1.attn.c_proj.weight', torch.int8),
 ('transformer.h.1.attn.c_proj.bias', torch.

A model loaded with load_in_8bit=True still contains float16 parameters because bitsandbytes applies 8-bit quantization selectively: only some layers are quantized.

1. Selective Quantization
Not all parts of the model are quantized into INT8. Specifically:

- Weights in linear layers (e.g., c_attn.weight, c_proj.weight) are quantized to INT8 because they involve large matrix multiplications, where memory and computation savings are most impactful.
- Biases (e.g., c_attn.bias) and other parameters like layer norms (e.g., ln_1.weight, ln_1.bias) remain in FP16 or FP32:
    - These parameters are not as computationally intensive.
    - Quantizing biases and layer norm weights to INT8 can cause large numerical inaccuracies, negatively impacting model performance