In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import numpy as np

import torch
import transformers
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

In [3]:
load_dotenv(".env")
hf_token = os.getenv("hf_token")
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/s448780/.cache/huggingface/token
Login successful


# target model

In [4]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# quantization

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
from transformers import AutoConfig

AutoConfig.from_pretrained(model_id)

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 128256
}

# loading the model

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto"
)

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [8]:
model # instruct

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

# tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          # add_bos_token = True, 
                                          padding_side = "left")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# eos token

In [10]:
tokenizer.eos_token_id, tokenizer.eos_token

(128009, '<|eot_id|>')

In [11]:
tokenizer.bos_token_id, tokenizer.bos_token

(128000, '<|begin_of_text|>')

In [13]:
tokenizer.pad_token = tokenizer.eos_token

In [14]:
tokenizer.pad_token

'<|eot_id|>'

# stop ids

In [157]:
# stop_ids = tokenizer([".", "?", "!"], return_tensors="pt").input_ids.to("cuda:0")
# stop_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to("cuda:0")

stop_ids = tokenizer(["end_of_response"], return_tensors="pt").input_ids.to("cuda:0")

In [158]:
stop_ids[0][1]

tensor(408, device='cuda:0')

In [17]:
# for llama 3.1
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
terminators

[128009, 128009]

# inference

In [16]:
# input_ids = tokenizer('''Always answer the question in a concise manner.
# You must not repeat the question and add eos_token only at the end of your response to the question.
# who invented electricity?''', padding = True, return_tensors="pt").input_ids.to("cuda:0")
# input_ids


# for llama 3.1
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
messages = [
    {"role": "system", "content": "Always answer the question in a concise manner."},
    {"role": "user", "content": "Who invented electricity?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

input_ids

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,  38195,   4320,
            279,   3488,    304,    264,  64694,  11827,     13, 128009, 128006,
            882, 128007,    271,  15546,  36592,  18200,     30, 128009, 128006,
          78191, 128007,    271]], device='cuda:0')

In [178]:
# test = torch.tensor([128000,  15546,  36592,  18200,     30, 128009])

In [179]:
# torch.where(test == 128009, 0, 1)

In [18]:
# model.eval() # dropout modules are deactivated
# with torch.inference_mode():
#     outputs = model.generate(
#         input_ids=input_ids,
#         # attention_mask = torch.where(input_ids == 128009, 0, 1),
#         max_new_tokens=2048,
#         # pad_token_id = tokenizer.eos_token_id,
#         # do_sample=True, 
#         # top_p=0.9,
#         # temperature=0.5,
#         top_p = None,
#         temperature = None,
#         # eos_token_id = stop_ids[0][1]
#     )

# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [32]:
outputs = model.generate(
    input_ids,
    max_new_tokens=2048,
    eos_token_id=terminators[0],
    do_sample=True,
    temperature=0.6,
    top_p=0.9
)
response = outputs[0][input_ids.shape[-1]:]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [33]:
print(tokenizer.decode(response, skip_special_tokens=False))

Electricity is a naturally occurring phenomenon, and it's difficult to attribute its discovery to a single person. However, the understanding and harnessing of electricity as we know it today is often credited to Benjamin Franklin, who demonstrated the connection between lightning and electricity in the 18th century, and Michael Faraday, who made significant contributions to the understanding of electromagnetic induction.<|eot_id|>


In [25]:
print(outputs)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1627,  10263,    220,   2366,     19,    271,  38195,   4320,
            279,   3488,    304,    264,  64694,  11827,     13, 128009, 128006,
            882, 128007,    271,  15546,  36592,  18200,     30, 128009, 128006,
          78191, 128007,    271,   3947,    374,    912,   3254,  72001,    315,
          18200,     13,  74456,    374,    264,   5933,  25885,    430,    706,
           1027,  13468,    323,  20041,    369,   9214,    315,   1667,     13,
           4452,     11,   1401,  12678,   1093,  30411,  19372,    323,   8096,
          13759,  65726,   1903,   5199,  19564,    311,   1057,   8830,    315,
          18200,     13, 128009]], device='cuda:0')


In [28]:
tokenizer.decode(outputs[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nAlways answer the question in a concise manner.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho invented electricity?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThere is no single inventor of electricity. Electricity is a natural phenomenon that has been observed and studied for thousands of years. However, key figures like Benjamin Franklin and Michael Faraday made significant contributions to our understanding of electricity.<|eot_id|>'

# analysing outputs

In [22]:
flattened_list = np.array([item for sublist in outputs.to("cpu") for item in sublist])
np.where(flattened_list == 128009)

(array([ 34,  43, 113]),)

# pipeline

In [52]:
inference = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=2048,
)

In [23]:
# inference("Who invented electricity?")