In [1]:
#Check out the available resources
!nvidia-smi

Mon Nov  6 22:57:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install accelerate
!pip install transformers



In [3]:
import accelerate #library from hugging face (accelerate computations, parallelism...)
import transformers #library to work with pretrained models, tokenizers... from hugging face
import json

REPO_ID = "NousResearch/Llama-2-7b-hf"


In [None]:
!wget https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
config = transformers.AutoConfig.from_pretrained(REPO_ID) #config of the model: details about the model architecture

#Normally, the model needs 25 GB to be loaded, but if we don't have enough resources, we can use the accelerate module in order
#to have the skeleton of the model (empty version of the model)
with accelerate.init_empty_weights():
    fake_model = transformers.AutoModelForCausalLM.from_config(config)

device_map = accelerate.infer_auto_device_map(fake_model, max_memory={0: "10GiB", "cpu": "10GiB"}) #Spare 10 GB memory and 10GB GPU to load the model
print(json.dumps(device_map, indent=4))

#The GPU resource isn't enough to load the model, so the accelerate module spills the remaining onto the CPU and the disk

{
    "model.embed_tokens": 0,
    "model.layers.0": 0,
    "model.layers.1": 0,
    "model.layers.2": 0,
    "model.layers.3": 0,
    "model.layers.4": 0,
    "model.layers.5": 0,
    "model.layers.6": 0,
    "model.layers.7": 0,
    "model.layers.8": 0,
    "model.layers.9": 0,
    "model.layers.10": 0,
    "model.layers.11.self_attn": 0,
    "model.layers.11.mlp.gate_proj": 0,
    "model.layers.11.mlp.up_proj": 0,
    "model.layers.11.mlp.down_proj": "cpu",
    "model.layers.11.mlp.act_fn": "cpu",
    "model.layers.11.input_layernorm": "cpu",
    "model.layers.11.post_attention_layernorm": "cpu",
    "model.layers.12": "cpu",
    "model.layers.13": "cpu",
    "model.layers.14": "cpu",
    "model.layers.15": "cpu",
    "model.layers.16": "cpu",
    "model.layers.17": "cpu",
    "model.layers.18": "cpu",
    "model.layers.19": "cpu",
    "model.layers.20": "cpu",
    "model.layers.21": "cpu",
    "model.layers.22": "cpu",
    "model.layers.23": "cpu",
    "model.layers.24.self_attn.q_

In [8]:
device_map = {
    'model.embed_tokens': 0,
     'model.layers.0': 0,
     'model.layers.1': 0,
     'model.layers.2': 0,
     'model.layers.3': 0,
     'model.layers.4': 0,
     'model.layers.5': 0,
     'model.layers.6': 0,
     'model.layers.7': 0,
     'model.layers.8': 0,
     'model.layers.9': 0,
     'model.layers.10': 0,
     'model.layers.11': 0,
     'model.layers.12': 0,
     'model.layers.13': 0,
     'model.layers.14': 'cpu',
     'model.layers.15': 'cpu',
     'model.layers.16': 'cpu',
     'model.layers.17': 'cpu',
     'model.layers.18': 'cpu',
     'model.layers.19': 'cpu',
     'model.layers.20': 'cpu',
     'model.layers.21': 'cpu',
     'model.layers.22': 'cpu',
     'model.layers.23': 'cpu',
     'model.layers.24': 'cpu',
     'model.layers.25': 'cpu',
     'model.layers.26': 'cpu',
     'model.layers.27': 'cpu',
     'model.layers.28': 'disk',
     'model.layers.29': 'disk',
     'model.layers.30': 'disk',
     'model.layers.31': 'disk',
     'model.norm': 'disk',
     'lm_head': 'disk'
}

In [6]:
#LlamaTokenizer requires the SentencePiece library
!pip install sentencepiece
!pip install bitsandbytes



In [9]:
import transformers
import sentencepiece
import accelerate
import bitsandbytes

tokenizer = transformers.LlamaTokenizer.from_pretrained(REPO_ID)


model = transformers.LlamaForCausalLM.from_pretrained(
    REPO_ID,
    device_map=device_map,
    offload_folder="/tmp/.offload", #A path to a folder where parts of the model might be stored on disk, if necessary to manage memory
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True, #offloading some computations to the CPU with 32-bit precision
)
#balance between memory efficiency on the GPU, where memory is limited, and numerical accuracy on the CPU

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [10]:
import time

batch = tokenizer(
    "Tell me about the Earth planet.",
    return_tensors="pt",
    add_special_tokens=False
)

batch = {k: v for k, v in batch.items()}
n_input_tokens = batch["input_ids"].shape[-1]

t1 = time.perf_counter()
generated = model.generate(batch["input_ids"].to("cuda"), max_length=n_input_tokens+25)
t2 = time.perf_counter()
print(tokenizer.decode(generated[0]))
n_generated = generated.shape[-1]-batch["input_ids"].shape[-1]
print(f"It took {t2-t1}s to generate the sequence of {n_generated} tokens ({n_generated/(t2-t1)} token/s).")



Tell me about the Earth planet.
The Earth planet is a planet that is located in the solar system. It is the third planet from the sun and is
It took 892.913323833s to generate the sequence of 25 tokens (0.02799823827544957 tokens/s).
