In [1]:
import torch
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from transformers.generation.configuration_utils import GenerationConfig

In [2]:
config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", config=config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model.model.layers[0]

LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): LlamaDynamicNTKScalingRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
    (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm()
  (post_attention_layernorm): LlamaRMSNorm()
)

In [5]:
model.cuda();

In [6]:
len(model.model.layers)

32

In [7]:
model.training, config.use_cache

(False, True)

In [8]:
x = torch.randint(0,100,(1,16)).cuda()

In [9]:
x.shape

torch.Size([1, 16])

In [10]:
model.config.use_cla = False
model.config.cla_factor = None

In [11]:
output = model(**{"input_ids":x})

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [12]:
output.logits

tensor([[[ 1.1543, -0.4030, -0.9169,  ..., -3.9624, -3.9625, -3.9625],
         [ 4.1748,  8.3917,  4.7381,  ..., -4.6748, -4.6749, -4.6749],
         [ 5.5781,  7.3743,  6.2945,  ..., -4.1477, -4.1479, -4.1478],
         ...,
         [ 5.7303,  3.5076,  4.6866,  ..., -2.4438, -2.4442, -2.4439],
         [ 7.8036,  7.8437,  9.1977,  ..., -2.6672, -2.6676, -2.6673],
         [ 8.0629,  7.7348,  9.0712,  ..., -2.2379, -2.2380, -2.2379]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [13]:
model.config.use_cla = True
model.config.cla_factor = 1

In [14]:
output = model(**{"input_ids":x})

In [15]:
output.logits

tensor([[[ 1.1543, -0.4030, -0.9169,  ..., -3.9624, -3.9625, -3.9625],
         [ 4.1748,  8.3917,  4.7381,  ..., -4.6748, -4.6749, -4.6749],
         [ 5.5781,  7.3743,  6.2945,  ..., -4.1477, -4.1479, -4.1478],
         ...,
         [ 5.7303,  3.5076,  4.6866,  ..., -2.4438, -2.4442, -2.4439],
         [ 7.8036,  7.8437,  9.1977,  ..., -2.6672, -2.6676, -2.6673],
         [ 8.0629,  7.7348,  9.0712,  ..., -2.2379, -2.2380, -2.2379]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

In [16]:
model.config.use_cla = True
model.config.cla_factor = 2

In [17]:
output = model(**{"input_ids":x})

In [19]:
output.logits

tensor([[[ 1.5976,  3.7597,  5.2264,  ..., -3.1546, -3.1546, -3.1545],
         [ 5.1862,  6.1135,  2.3630,  ..., -3.9097, -3.9096, -3.9097],
         [ 6.0368,  6.2814,  3.1696,  ..., -4.4445, -4.4445, -4.4444],
         ...,
         [ 4.0171,  1.7216,  1.8627,  ..., -2.7373, -2.7373, -2.7369],
         [ 4.6751,  5.4727,  4.9282,  ..., -2.3280, -2.3279, -2.3276],
         [ 4.6047,  5.1129,  5.0691,  ..., -2.7998, -2.7996, -2.7993]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)

### Decoding

In [20]:
inp = torch.tensor(tokenizer.encode("Say Hello world")).cuda()

In [21]:
messages = [
    {"role": "system", "content": "You are an AI assistant."},
    {"role": "user", "content": "Say Hello world 10 times as numbered list."}
]
input_tokens = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True, 
    return_tensors="pt"
).cuda()

In [22]:
model.config.use_cache = False

In [23]:
new_tokens = model.generate(input_tokens, max_new_tokens=128, use_cache=False)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [24]:
print(tokenizer.decode(new_tokens[0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an AI assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Say Hello world 10 times as numbered list.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

*? Madison; and; and; and; and;udo; m; and; m; and; and; and; and;#; roadmap;   ;         ;  ;    Musc;   ;      Road    Road               Road  ;   ;  Road   ;  ;  ;  ;  *  ;  *�͊��� ‐�
