# Running Llama2 chat in different sizes: 7B, 13B, 70B

In [None]:
# From: https://huggingface.co/docs/transformers/model_doc/llama2
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import accelerate
import bitsandbytes  # Works with CUDA
import numpy as np
from tqdm import tqdm
import pandas as pd
import time

# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Hugging Face access token, read from the environment.
# Raises KeyError when HF_API_KEY is not set.
access_token = os.environ["HF_API_KEY"]

In [None]:
def print_gpu_usage(n=4):
    """Print the memory currently allocated by PyTorch on each GPU.

    Args:
        n: Maximum number of GPUs to report, starting at index 0. It is
            clamped to the number of CUDA devices PyTorch can see, so asking
            for more GPUs than exist (or running without CUDA) prints fewer
            (possibly zero) lines instead of failing on an invalid device.

    Returns:
        None. One line per reported GPU is written to stdout.
    """
    visible_gpus = min(n, torch.cuda.device_count())
    for idx in range(visible_gpus):
        # True division keeps the fractional part that the '{:.2f}' format is
        # meant to show; floor division would always print whole GB ('x.00').
        allocated_gb = torch.cuda.memory_allocated(device='cuda:{}'.format(idx)) / 10**9
        print('Memory allocated in the GPU-{}: {:.2f} GB'.format(idx, allocated_gb))

## Sizes of downloaded models on disk

In [None]:
# Shell out to the Hugging Face CLI to scan the local cache and print a report
# of the cached repositories, including their size on disk.
!huggingface-cli scan-cache

## 1. Loading Llama2-7B-chat

In [None]:
# Hub id of the 7B chat checkpoint; the model-loading cells of this section reuse `model_name`.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Fast tokenizer for that checkpoint, authenticated with the access token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=access_token,
    use_fast=True,
)

### 1.1 Llama2-7B-chat with quantization

In [None]:
# With quantization: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/quantization
# 4-bit NF4 weights, double quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Place the whole quantized model on the single device chosen in the setup cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    quantization_config=bnb_config,
    device_map=device,
)

# Report where the model landed and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
prompt = "What does the fox say?"
# Tokenize to PyTorch tensors, then move them to the device selected in the setup cell.
model_inputs = tokenizer(prompt, return_tensors="pt")
model_inputs = model_inputs.to(device)
# Generate at most 16 new tokens after the prompt.
output = model.generate(max_new_tokens=16, **model_inputs)

# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(output[0], skip_special_tokens=True), '\n')

### 1.2 Llama2-7B-chat without quantization

In [None]:
# Without quantization
# Drop the reference to any previously loaded model (e.g. the 4-bit one from
# section 1.1) so its GPU memory can be reclaimed before the fp16 weights are
# loaded. Plain assignment is used instead of `del` so this cell also runs when
# no model has been loaded yet.
model = None
# `trust_remote_code` is deliberately not passed: the official Llama 2
# checkpoints use the Llama classes that ship with transformers, so there is
# no reason to allow code from the Hub repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch.float16,
    token=access_token,
)

# Report where the model landed and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
# Single test prompt, encoded as PyTorch tensors on the device from the setup cell.
prompt = "What does the fox say?"
model_inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Cap the continuation at 16 newly generated tokens.
output = model.generate(
    **model_inputs,
    max_new_tokens=16,
)

# Turn the first returned sequence back into text without special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text, '\n')

## 2. Loading Llama2-13B-chat

In [None]:
# Drop the previous section's model and tokenizer before switching checkpoints.
del model
del tokenizer

# Hub id of the 13B chat checkpoint and its fast tokenizer.
model_name = "meta-llama/Llama-2-13b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=access_token,
    use_fast=True,
)

### 2.1 Llama2-13B-chat with quantization

In [None]:
# With quantization: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/quantization
# Same 4-bit recipe as the other sizes: NF4 weights, double quantization, bfloat16 compute.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# device_map="auto" leaves device placement to the library instead of pinning to `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=access_token,
)

# Report the model's main device and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
prompt = "What does the fox say?"
# Encode the prompt as PyTorch tensors on the device chosen in the setup cell.
model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Generation is limited to 16 new tokens.
output = model.generate(max_new_tokens=16, **model_inputs)

# Decode only the first returned sequence, skipping special tokens.
first_sequence = output[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True), '\n')

### 2.2 Llama2-13B-chat without quantization

In [None]:
# Without quantization
# Release the quantized 13B model before loading the fp16 weights.
del model
# `trust_remote_code` is deliberately not passed: the official Llama 2
# checkpoints use the Llama classes that ship with transformers, so there is
# no reason to allow code from the Hub repository to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch.float16,
    token=access_token,
)

# Report where the model landed and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
# Prompt -> PyTorch tensors -> device selected in the setup cell.
prompt = "What does the fox say?"
model_inputs = tokenizer(prompt, return_tensors="pt")
model_inputs = model_inputs.to(device)

# Produce up to 16 new tokens.
output = model.generate(
    max_new_tokens=16,
    **model_inputs,
)

# Print the decoded first sequence with special tokens removed.
print(tokenizer.decode(output[0], skip_special_tokens=True), '\n')

## 3. Loading Llama2-70B-chat

In [None]:
# Drop the previous section's model and tokenizer before switching checkpoints.
del model
del tokenizer

# Hub id of the 70B chat checkpoint and its fast tokenizer.
model_name = "meta-llama/Llama-2-70b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=access_token,
    use_fast=True,
)

### 3.1 Llama2-70B-chat with quantization

In [None]:
# With quantization: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/quantization
# 4-bit NF4 weights with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# device_map="auto" leaves device placement to the library instead of pinning to `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    device_map="auto",
    quantization_config=bnb_config,
)

# Report the model's main device and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
prompt = "What does the fox say?"
# Tokenize into PyTorch tensors and move them to the device from the setup cell.
model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Ask for at most 16 newly generated tokens.
output = model.generate(max_new_tokens=16, **model_inputs)

# Decode the first returned sequence without special tokens and print it.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text, '\n')

### 3.2 Llama2-70B-chat without quantization

In [None]:
# Without quantization
# Release the quantized 70B model before loading the fp16 weights.
del model

# float16 weights; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Report the model's main device and the per-GPU allocated memory.
print('Main device for model: ', model.device)
print_gpu_usage()

In [None]:
%%time
# Encode the test prompt as PyTorch tensors on the device from the setup cell.
prompt = "What does the fox say?"
model_inputs = tokenizer(prompt, return_tensors="pt")
model_inputs = model_inputs.to(device)

# Limit generation to 16 new tokens.
output = model.generate(
    **model_inputs,
    max_new_tokens=16,
)

# Decode the first returned sequence, skipping special tokens.
first_sequence = output[0]
print(tokenizer.decode(first_sequence, skip_special_tokens=True), '\n')