In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules that
# were edited on disk are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import unsloth
import peft
import transformers
from typing import cast
import vllm

class CausallLM(transformers.PreTrainedModel, transformers.GenerationMixin):
    """Empty combination of ``PreTrainedModel`` and ``GenerationMixin``.

    Adds no members of its own. In this notebook it serves as the static type
    handed to ``typing.cast`` for the model object returned by Unsloth's loader.
    """

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-26 03:22:31 __init__.py:207] Automatically detected platform cuda.


In [3]:
# --- Configuration -----------------------------------------------------------
model_name = "meta-llama/Llama-3.1-8B-Instruct"
lora_rank = 32

# --- Base model + tokenizer --------------------------------------------------
# The loader's return value is untyped from the notebook's point of view, so it
# is captured first and narrowed with `cast` on the next statement.
_loaded = unsloth.FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=8192,
    load_in_4bit=True,  # set to False for 16-bit LoRA
    fast_inference=True,  # turns on the vLLM fast-inference path
    # Everything below is forwarded as vLLM arguments.
    disable_log_requests=True,
    disable_log_stats=False,
    enable_prefix_caching=True,
    gpu_memory_utilization=0.62,  # lower this when running out of memory
    max_lora_rank=lora_rank,
    # Optional knobs, currently disabled:
    # max_num_seqs=1024,
    # enforce_eager=True,
    num_scheduler_steps=16,
    use_async=True,
)
model, tokenizer = cast(
    tuple[CausallLM, transformers.PreTrainedTokenizerBase], _loaded
)

# Handle on the async vLLM engine that the loaded model exposes.
vllm_engine = cast(vllm.AsyncLLMEngine, model.vllm_engine)

# --- LoRA adapter ------------------------------------------------------------
# Attention (Q/K/V/O) and MLP projections receive adapters; drop the QKVO
# entries if memory runs out.
_lora_targets = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
_wrapped = unsloth.FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # any value > 0 works; 8, 16, 32, 64, 128 are suggested
    target_modules=_lora_targets,
    lora_alpha=lora_rank,
    # "unsloth" checkpointing mode enables long-context finetuning.
    use_gradient_checkpointing="unsloth",  # type: ignore
    random_state=3407,
)
peft_model = cast(peft.peft_model.PeftModelForCausalLM, _wrapped)
lora_model = cast(peft.tuners.lora.LoraModel, peft_model.base_model)

# Last expression of the cell: show the wrapped module tree.
peft_model

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.1-8b-instruct-unsloth-bnb-4bit with actual GPU utilization = 61.53%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.11 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 8192. Num Sequences = 226.
Unsloth: vLLM's KV Cache can use up to 42.5 GB. Also swap space = 6 GB.
INFO 03-26 03:22:39 config.py:549] This model supports multiple tasks: {'classify', 'generate', 'score', 'reward', 'embed'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes



INFO 03-26 03:22:41 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-26 03:22:43 model_runner.py:1115] Loading model weights took 5.5976 GB
INFO 03-26 03:22:43 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-26 03:22:45 worker.py:267] Memory profiling takes 1.44 seconds
INFO 03-26 03:22:45 worker.py:267] the current vLLM instance can use total_gpu_memory (79.11GiB) x gpu_memory_utilization (0.62) = 48.68GiB
INFO 03-26 03:22:45 worker.py:267] model weights take 5.60GiB; non_torch_memory takes 0.15GiB; PyTorch activation peak memory takes 1.13GiB; the rest of the memory reserved for KV Cache is 41.80GiB.
INFO 03-26 03:22:45 executor_base.py:111] # cuda blocks: 21401, # CPU blocks: 3072
INFO 03-26 03:22:45 executor_base.py:116] Maximum concurrency for 8192 tokens per request: 41.80x
INFO 03-26 03:22:48 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error 

Capturing CUDA graph shapes: 100%|██████████| 32/32 [00:19<00:00,  1.63it/s]

INFO 03-26 03:23:08 model_runner.py:1562] Graph capturing finished in 20 secs, took 4.17 GiB
INFO 03-26 03:23:08 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 24.35 seconds



Unsloth 2025.3.18 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
  

In [4]:
# Show how many sequences the vLLM scheduler is configured to run at once
# (the Unsloth load log above reports the same figure as "Num Sequences").
vllm_engine.engine.scheduler_config.max_num_seqs

226

In [5]:
import asyncio
import torch

prompt = {
    "prompt": torch.tensor(
        [0] * (vllm_engine.engine.scheduler_config.max_model_len - 2)
    )
}


for i in range(vllm_engine.engine.scheduler_config.max_num_seqs):
    await vllm_engine.add_request(
        f"{i}", prompt=prompt, params=vllm.SamplingParams(max_tokens=2)
    )

while await vllm_engine.engine_step(0):
    ...

INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 0.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 1.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 2.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 3.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 4.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 5.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 6.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 7.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 8.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 9.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 10.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 11.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 12.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 13.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 14.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 15.
IN

INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 64.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 65.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 66.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 67.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 68.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 69.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 70.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 71.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 72.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 73.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 74.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 75.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 76.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 77.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added request 78.
INFO 03-26 03:23:15 async_llm_engine.py:211] Added requ

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
# NOTE(review): bare integer literal with no surrounding context — presumably a
# scratch value left over from experimentation; confirm its meaning or remove.
54341

In [None]:
# Check which concrete class sits directly under the PEFT wrapper; this is the
# object the setup cell casts to `peft.tuners.lora.LoraModel`.
type(peft_model.base_model)

peft.tuners.lora.model.LoraModel

In [None]:
from transformers import models
# Look up the Llama causal-LM class through the `transformers.models` namespace;
# the PEFT repr shown earlier lists a module of this class name as the wrapped model.
models.llama.LlamaForCausalLM

from typing import Protocol



