In [1]:
import os
os.chdir(os.getcwd()+"/pratical-llms")

In [2]:
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

## Model Sharding 
Model sharding allow you to split your model in defined dimension shards
### Skip this if your model is already shared

In [3]:
from accelerate import Accelerator

In [4]:
accelerator = Accelerator()

In [5]:
shard_output_path = os.getcwd() + "/sharded_model"

In [9]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_id)


In [10]:
accelerator.save_model(model=model,save_directory=shard_output_path,max_shard_size='250MB')

Now you should find in the 'sharded_model' folder all the shards created with a fixed size."
#### Delete the model and clear the memory

In [11]:
del model

## Let's load models in both GPU and CPU (if needed)

Here I'm going to inizialize an empty weights model. Your CPU or GPU RAM utilization should not increase.

In [12]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [13]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(model_id) 



In [14]:
weights_location = os.getcwd() + "/sharded_model/"

In [15]:
model = load_checkpoint_and_dispatch(
    model, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

In [16]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

![memory_state.PNG](images/VRAM_1.png)

##### Our model is now in the GPU VRAM. Since I'm using a small model like TinyLLama to demonstrate the point, we'll load more until we reach the limit to see what happens."

In [17]:
with init_empty_weights():
    model_1 = AutoModelForCausalLM.from_pretrained(model_id) 
model_1 = load_checkpoint_and_dispatch(model_1, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

with init_empty_weights():
    model_2 = AutoModelForCausalLM.from_pretrained(model_id) 
model_2 = load_checkpoint_and_dispatch(model_2, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

In [18]:
with init_empty_weights():
    model_3 = AutoModelForCausalLM.from_pretrained(model_id) 
model_3 = load_checkpoint_and_dispatch(model_3, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)



  0%|          | 0/1 [00:00<?, ?w/s]

  0%|          | 0/13 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/7 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]



"As you can see, some layers have been offloaded to system RAM."

![memory_2.PNG](images/VRAM_2.png)

For the next one won't be enough space on the GPU VRAM

In [19]:
with init_empty_weights():
    model_4 = AutoModelForCausalLM.from_pretrained(model_id) 
model_4 = load_checkpoint_and_dispatch(model_4, checkpoint=weights_location, device_map="auto", no_split_module_classes=['Block']
)

Let's see the devices

In [20]:
print(model.device)
print(model_1.device) ##This one is all in the GPU VRAM
print(model_2.device) ##This one is all in the GPU VRAM
print(model_3.device) ##This one has some layer on the CPU RAM
print(model_4.device) ##This one is all on the CPU RAM

cuda:0
cuda:0
cuda:0
cuda:0
cpu


### Let's see how many layer have been offloaded to CPU. 

In [21]:
model_3.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9.self_attn': 0,
 'model.layers.9.mlp.gate_proj': 0,
 'model.layers.9.mlp.up_proj': 'cpu',
 'model.layers.9.mlp.down_proj': 'cpu',
 'model.layers.9.mlp.act_fn': 'cpu',
 'model.layers.9.input_layernorm': 'cpu',
 'model.layers.9.post_attention_layernorm': 'cpu',
 'model.layers.10': 'cpu',
 'model.layers.11': 'cpu',
 'model.layers.12': 'cpu',
 'model.layers.13': 'cpu',
 'model.layers.14': 'cpu',
 'model.layers.15': 'cpu',
 'model.layers.16': 'cpu',
 'model.layers.17': 'cpu',
 'model.layers.18': 'cpu',
 'model.layers.19': 'cpu',
 'model.layers.20': 'cpu',
 'model.layers.21': 'cpu',
 'model.norm': 'cpu',
 'lm_head': 'cpu'}

More then half of the network is on the CPU. I think this will be really slow.
The entire activations and inference state will be transfered on the RAM. 

## Let's see the models in action 

In [22]:
import time

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [24]:
prompts = ["Write me a small poem about Naples:"]
inputs = tokenizer(prompts, return_tensors="pt", padding=False)
inputs.to("cuda:0")

{'input_ids': tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

#### Model 0 (GPU)

In [25]:
%%time
start = time.time()
output = model.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

2.1837682723999023
tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901,
            13,    13, 29940,   481,   793, 29892,   278,  4272,   310,   278,
          6575, 29892,    13, 11921,   278,  7205,   289,   929,   911,  1559,
          2722,   278,   885,   296,   310,   278,  4940, 29892,    13, 11921,
           278, 19756,   526,   301,  1312,   411, 12297, 13814, 29892,    13,
          2855,   278, 14744,   338, 23139,   411,   278, 11955,   310,   278]],
       device='cuda:0')
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 2.06 s, sys: 126 ms, total: 2.19 s
Wall time: 2.19 s


#### Model 1 (GPU)

In [26]:
%%time
start = time.time()
output = model_1.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

1.5310943126678467
tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901,
            13,    13, 29940,   481,   793, 29892,   278,  4272,   310,   278,
          6575, 29892,    13, 11921,   278,  7205,   289,   929,   911,  1559,
          2722,   278,   885,   296,   310,   278,  4940, 29892,    13, 11921,
           278, 19756,   526,   301,  1312,   411, 12297, 13814, 29892,    13,
          2855,   278, 14744,   338, 23139,   411,   278, 11955,   310,   278]],
       device='cuda:0')
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 1.53 s, sys: 4.01 ms, total: 1.53 s
Wall time: 1.53 s


### Model 2 (GPU)

In [27]:
%%time
start = time.time()
output=model_2.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

1.4748246669769287
tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901,
            13,    13, 29940,   481,   793, 29892,   278,  4272,   310,   278,
          6575, 29892,    13, 11921,   278,  7205,   289,   929,   911,  1559,
          2722,   278,   885,   296,   310,   278,  4940, 29892,    13, 11921,
           278, 19756,   526,   301,  1312,   411, 12297, 13814, 29892,    13,
          2855,   278, 14744,   338, 23139,   411,   278, 11955,   310,   278]],
       device='cuda:0')
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 1.47 s, sys: 5.35 ms, total: 1.48 s
Wall time: 1.48 s


### Model 3 (GPU and CPU offloading)

In [28]:
%%time
start = time.time()
output=model_3.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

31.25254988670349
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 30.4 s, sys: 819 ms, total: 31.3 s
Wall time: 31.3 s


It is really slow as aspected. Too many layers on the CPU and the transfer between devices is a bottleneck

### Model 4 (CPU)

In [29]:
inputs.to("cpu")

{'input_ids': tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [30]:
%%time
start = time.time()
output=model_4.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(output)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

6.998687267303467
tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901,
            13,    13, 29940,   481,   793, 29892,   278,  4272,   310,   278,
          6575, 29892,    13, 11921,   278,  7205,   289,   929,   911,  1559,
          2722,   278,   885,   296,   310,   278,  4940, 29892,    13, 11921,
           278, 19756,   526,   301,  1312,   411, 12297, 13814, 29892,    13,
          2855,   278, 14744,   338, 23139,   411,   278, 11955,   310,   278]])
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 27.7 s, sys: 206 ms, total: 27.9 s
Wall time: 7 s


From this example seems that CPU offloading is not that great. The inference time is even slower of running the entire model on the CPU. We should controll better how many layers offload to CPU memory.


#### Let's organize better our layer

In [None]:
del model
del model_1
del model_2
del model_3
del model_4

In [33]:
import gc
torch.cuda.empty_cache()
gc.collect()

8859

In [34]:
from accelerate import infer_auto_device_map

## Organize the space of our model by max memory on any device

In [35]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [36]:
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(model_id) 

In [38]:
device_map = infer_auto_device_map(model, max_memory={0: "4GiB", "cpu": "10GiB"})
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(model_id) 
model = load_checkpoint_and_dispatch(model, checkpoint=weights_location, device_map=device_map, no_split_module_classes=['Block']
)

  0%|          | 0/1 [00:00<?, ?w/s]

  0%|          | 0/13 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/11 [00:00<?, ?w/s]

  0%|          | 0/10 [00:00<?, ?w/s]

  0%|          | 0/15 [00:00<?, ?w/s]

  0%|          | 0/7 [00:00<?, ?w/s]

  0%|          | 0/1 [00:00<?, ?w/s]



In [39]:
model.hf_device_map

{'model.embed_tokens': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 0,
 'model.layers.13': 0,
 'model.layers.14': 0,
 'model.layers.15': 0,
 'model.layers.16': 0,
 'model.layers.17': 0,
 'model.layers.18': 0,
 'model.layers.19': 0,
 'model.layers.20': 0,
 'model.layers.21.self_attn': 0,
 'model.layers.21.input_layernorm': 'cpu',
 'model.layers.21.post_attention_layernorm': 'cpu',
 'model.norm': 'cpu',
 'lm_head': 'cpu',
 'model.layers.21.mlp': 'cpu'}

In [40]:
prompts = ["Write me a small poem about Naples:"]
inputs = tokenizer(prompts, return_tensors="pt", padding=False)
inputs.to("cuda:0")

{'input_ids': tensor([[    1, 14350,   592,   263,  2319, 26576,  1048,  8344,   793, 29901]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [41]:
%%time
start = time.time()
output = model.generate(**inputs,max_new_tokens=50)
end = time.time()
print(end-start)
print(tokenizer.batch_decode(output, skip_special_tokens=True))

6.539928436279297
['Write me a small poem about Naples:\n\nNaples, the city of the sun,\nWhere the sea breeze carries the scent of the past,\nWhere the streets are lined with ancient buildings,\nAnd the sky is painted with the colors of the']
CPU times: user 6.44 s, sys: 93.3 ms, total: 6.54 s
Wall time: 6.54 s


Now we have reached a trade-off that allow us to run at the some speed of running just on CPU. The time needed to transfer the parameter and the state of the inference between device does not compensate the gain of running on a GPU (TinyLLama is not that deep of a model). 

On larger model this could be more convenient, expecially if you don't have enough space on your GPU memory to run a specific model.