**Day 5 Purpose:**  
Learn model quantization (FP16, 8-bit, 4-bit) to run LLMs with less memory. Compare speed/accuracy trade-offs. Apply to load bigger models on limited hardware.

`use less memory`

Learn to shrink LLMs (quantization) to run them with less memory, while measuring speed/quality trade-offs.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

#### 0 : checking speed

In [None]:
def load_quant(model='distilgpt2'):
    """Load the same checkpoint three ways: FP16, 8-bit and 4-bit.

    Args:
        model: Model identifier or path forwarded to ``from_pretrained``.

    Returns:
        Tuple ``(fp16, m_8bits, m_4bits)`` holding the three loaded models,
        in that order.
    """
    # fp16: half-precision weights, no quantization config
    fp16 = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, device_map='auto')

    # 8 bit: configured through BitsAndBytesConfig like the 4-bit path below;
    # passing load_in_8bit straight to from_pretrained is deprecated.
    bnb_8bit = BitsAndBytesConfig(load_in_8bit=True)
    m_8bits = AutoModelForCausalLM.from_pretrained(
        model, quantization_config=bnb_8bit, device_map='auto')

    # 4 bit: NF4 quantization with double quantization, computing in fp16
    bnb_4bit = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
    )
    m_4bits = AutoModelForCausalLM.from_pretrained(
        model, quantization_config=bnb_4bit, device_map='auto')

    return fp16, m_8bits, m_4bits

In [None]:
# Load the FP16, 8-bit and 4-bit variants of load_quant()'s default checkpoint.
print("Loading quantized models...")
f16, m_8bit, m_4bit = load_quant()

In [None]:
# Same checkpoint name as load_quant()'s default, so one tokenizer serves all
# three model variants.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

In [None]:
from datetime import datetime

In [None]:
# Time one short sampled generation with the FP16 model.
started = datetime.now()
print(started.strftime("%Y-%m-%d %H:%M:%S"))

prompt = "Hello world"
inputs = tokenizer(prompt, return_tensors='pt').to(f16.device)
# Pad with EOS explicitly so generate() does not have to pick a pad token itself.
outputs = f16.generate(
    **inputs, max_new_tokens=30, do_sample=True, temperature=1.0, top_p=1.0,
    pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

finished = datetime.now()
today = finished.strftime("%Y-%m-%d %H:%M:%S")
print(today)
# The timestamps only have one-second resolution, so report the duration too.
print(f"Elapsed: {(finished - started).total_seconds():.3f} s")

In [None]:
# Time one short sampled generation with the 8-bit model.
started = datetime.now()
print(started.strftime("%Y-%m-%d %H:%M:%S"))

prompt = "Hello world"
inputs = tokenizer(prompt, return_tensors='pt').to(m_8bit.device)
# Pad with EOS explicitly so generate() does not have to pick a pad token itself.
outputs = m_8bit.generate(
    **inputs, max_new_tokens=30, do_sample=True, temperature=1.0, top_p=1.0,
    pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

finished = datetime.now()
today = finished.strftime("%Y-%m-%d %H:%M:%S")
print(today)
# The timestamps only have one-second resolution, so report the duration too.
print(f"Elapsed: {(finished - started).total_seconds():.3f} s")

In [None]:
# Time one short sampled generation with the 4-bit model.
started = datetime.now()
print(started.strftime("%Y-%m-%d %H:%M:%S"))

prompt = "Hello world"
inputs = tokenizer(prompt, return_tensors='pt').to(m_4bit.device)
# Pad with EOS explicitly so generate() does not have to pick a pad token itself.
outputs = m_4bit.generate(
    **inputs, max_new_tokens=30, do_sample=True, temperature=1.0, top_p=1.0,
    pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

finished = datetime.now()
today = finished.strftime("%Y-%m-%d %H:%M:%S")
print(today)
# The timestamps only have one-second resolution, so report the duration too.
print(f"Elapsed: {(finished - started).total_seconds():.3f} s")

#### 1 : GPU Memory & Speed Benchmark

**Purpose:** Measure and compare memory usage and generation speed across different quantization levels (FP16, 8-bit, 4-bit) of the same model.

In [None]:
import time

In [None]:
def benchmark(model, tokenizer, prompt, model_name):
    """Run one sampled generation and report GPU memory use and throughput.

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        prompt: Text to continue.
        model_name: Label printed in the report header.

    Returns:
        Tuple ``(memory_used, tokens_per_second)``. ``memory_used`` is the
        peak CUDA allocation reached during generation minus the allocation
        just before it, in units of 1024**3 bytes; it is 0 when CUDA is not
        available.
    """
    use_cuda = torch.cuda.is_available()

    # Memory baseline: drop cached blocks and restart peak tracking so the
    # peak read after generation belongs to this run only.
    # NOTE(review): these calls act on the current CUDA device; confirm the
    # model lives there when several GPUs are in use.
    if use_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        memory_before = torch.cuda.memory_allocated() / 1024**3

    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    # CUDA kernels run asynchronously; synchronize so the timer brackets the
    # actual work rather than just the kernel launches.
    if use_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=100, do_sample=True, temperature=0.7,
            pad_token_id=tokenizer.eos_token_id)

    if use_cuda:
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Memory after: use the peak, because the allocation left over once
    # generate() returns misses everything freed during generation.
    if use_cuda:
        memory_peak = torch.cuda.max_memory_allocated() / 1024**3
        memory_used = memory_peak - memory_before
    else:
        memory_used = 0

    generation_time = end_time - start_time
    tokens_generated = outputs.shape[1] - inputs.input_ids.shape[1]
    # Guard against a zero interval instead of raising ZeroDivisionError.
    if generation_time > 0:
        tokens_per_second = tokens_generated / generation_time
    else:
        tokens_per_second = float("inf")

    print(f"\n=== {model_name} ===")
    print(f"Memory used: {memory_used:.2f} GB")
    print(f"Generation time: {generation_time:.2f} seconds")
    print(f"Tokens/second: {tokens_per_second:.2f}")
    print(f"Text: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

    return memory_used, tokens_per_second

In [None]:
# Prompt passed to benchmark() for every model variant.
prompt = "The future of renewable energy will be"

In [None]:
# Report label -> loaded model, in the order the benchmark loop visits them.
models = {
    "FP16": f16,
    "8-bit": m_8bit,
    "4-bit": m_4bit,
}

In [None]:
# Benchmark every variant on the shared prompt and keep the numbers per label.
results = {}
for label, candidate in models.items():
    mem_gb, tok_rate = benchmark(candidate, tokenizer, prompt, label)
    results[label] = dict(memory_gb=mem_gb, tokens_per_sec=tok_rate)

Results from this run: 8-bit was the fastest (5.03 tokens/sec).

FP16 was slower (2.46 tokens/sec).

4-bit was the slowest (2.34 tokens/sec).

In [None]:
# from transformers import BitsAndBytesConfig

# # 8-bit
# bnb_8bit = BitsAndBytesConfig(load_in_8bit=True)
# model_8bit = AutoModelForCausalLM.from_pretrained(
#     model, quantization_config=bnb_8bit, device_map="auto"
# )

# # 4-bit
# bnb_4bit = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )
# model_4bit = AutoModelForCausalLM.from_pretrained(
#     model, quantization_config=bnb_4bit, device_map="auto"
# )

#### 2 : Load one model (optionally 8-bit) and generate text

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Checkpoint to load, and whether to load it 8-bit quantized or in plain FP16.
name = "distilgpt2"
quantize = True

# Both variants share device_map; only the precision-related argument differs.
load_kwargs = {"device_map": "auto"}
if quantize:
    bnb = BitsAndBytesConfig(load_in_8bit=True)
    load_kwargs["quantization_config"] = bnb
else:
    load_kwargs["torch_dtype"] = torch.float16

model = AutoModelForCausalLM.from_pretrained(name, **load_kwargs)

In [8]:
# Tokenizer for the same checkpoint name used to load `model`.
tokenizer = AutoTokenizer.from_pretrained(name)

In [39]:
def generate(model, tokenizer, prompt, max_new_tokens=50, do_sample=True, temperature=1.0, top_p=1.0):
    """Continue ``prompt`` with ``model`` and return the decoded text.

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable tokenizer providing ``decode`` and ``eos_token_id``.
        prompt: Text to continue.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

    # The EOS id doubles as the pad id for generation.
    generation_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Gradient tracking is switched off for the generation call.
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_options)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [45]:
# Sample one continuation of a single-word prompt using generate()'s defaults.
prompt = 'lawsuit'
print(generate(model, tokenizer, prompt))

lawsuit against Thessaloniki and the New Testament. The New Testament uses a metaphor for the death and resurrection of Jesus Christ – and the resurrection of Jesus Christ. The New Testament uses an allegory of the death and resurrection of Jesus Christ – and the


In [42]:
# Prompts to run through generate() one at a time.
prompts = ["What is AI?", "Diligents."]

In [43]:
# Print one sampled continuation per prompt, each followed by a blank line.
for prompt_text in prompts:
    print(generate(model, tokenizer, prompt_text) + '\n')

What is AI?


How does that sound?
It sounds like I'm on a mission to get to my house when I need a little help!
Why not do you know I've been doing this for a while that I'm working on?


Diligents. These compounds, of course, are so abundant, that the formation of different substances was not thought to have been sufficient. They were then not found to have a fundamental or fundamental function in a brain; it was an organoid organoid. These compounds

