In [1]:
# Importing Dependencies
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

### Sign in to Hugging Face

In [2]:
hf_token = userdata.get('HF_TOKEN')
login(token = hf_token, add_to_git_credential = True)

### Instruct Models

In [4]:
QWEN2 = "Qwen/Qwen2-7B-Instruct"
GEMMA2 = "google/gemma-2-2b-it"

In [5]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
  ]

In [6]:
# Quantization Config = allows us to load the model into the memory and use less memory

quant_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16,
    bnb_4bit_quant_type = "nf4"
)

### Checking for QWEN2 Model

In [7]:
# Tokenizer

tokenizer = AutoTokenizer.from_pretrained(QWEN2) # Loading the tokenizer
tokenizer.pad_token = tokenizer.eos_token # Sets the padding token to be the same as the end-of-sequence (EOS) token
inputs = tokenizer.apply_chat_template(messages, return_tensors = "pt").to("cuda") # Applies a chat-style formatting template

In [None]:
# Loading our Model

model = AutoModelForCausalLM.from_pretrained(QWEN2, device_map = "auto", quantization_config = quant_config)

It basically downloads the model from hugging face and stores it in the local cache folder of this Google Colab.
Once we disconnect from this session, that file will be deleted automatically.

In [9]:
memory = model.get_memory_footprint() / 1e6
print(f"Memory footprint: {memory:,.1f} MB")

Memory footprint: 5,443.3 MB


### **Transformer Model**

Now let's take a look at the layers of the Neural Network that get printed in the next cell. Look out for this:

- It consists of layers
- There's something called "embedding" - this takes tokens and turns them into 4,096 dimensional vectors. We'll learn more about this in Week 5.
- There are then 32 sets of groups of layers called "Decoder layers". Each Decoder layer contains three types of layer: (a) self-attention layers (b) multi-layer perceptron (MLP) layers (c) batch norm layers.
- There is an LM Head layer at the end; this produces the output

In [10]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), 

In [12]:
outputs = model.generate(inputs, max_new_tokens = 80)
print(tokenizer.decode(outputs[0]))

<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke for a room of Data Scientists<|im_end|>
<|im_start|>答案:
Why did the data scientist refuse the job offer at Google?

Because she already had a better offer from the "Data Science in a Box" company!<|im_end|>


In [19]:
# Cleaning up memory

del model, inputs, tokenizer, outputs
gc.collect()
torch.cuda.empty_cache()

In [13]:
def generate(model, messages):
  tokenizer = AutoTokenizer.from_pretrained(model)
  tokenizer.pad_token = tokenizer.eos_token
  inputs = tokenizer.apply_chat_template(messages, return_tensors = "pt", add_generation_prompt = True).to("cuda")
  streamer = TextStreamer(tokenizer)
  model = AutoModelForCausalLM.from_pretrained(model, device_map = "auto", quantization_config = quant_config)
  outputs = model.generate(inputs, max_new_tokens = 80, streamer = streamer)
  del model, inputs, tokenizer, outputs, streamer
  gc.collect()
  torch.cuda.empty_cache()

In [16]:
generate(QWEN2, messages)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke for a room of Data Scientists<|im_end|>
<|im_start|>assistant
Why did the data scientist break up with the statistician?

Because she couldn't handle the variance!<|im_end|>


### Checking for Gemma Model

In [20]:
gemma2_messages = [
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]
generate(GEMMA2, gemma2_messages)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<bos><start_of_turn>user
Tell a light-hearted joke for a room of Data Scientists<end_of_turn>
<start_of_turn>model
Why did the data scientist break up with the statistician? 

Because they had too many differences! 😂 

---

Let me know if you'd like to hear another joke! 😊 
<end_of_turn>
