<a href="https://colab.research.google.com/github/Brownwang0426/customLlama/blob/main/Cullama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 0 - Pip install

In [1]:
!pip install huggingface_hub==0.24.5 transformers==4.44.0 trl==0.9.6 peft==0.12.0 bitsandbytes==0.43.3 datasets==2.20.0 openpyxl==3.1.5 xlrd==2.0.1 protobuf==5.27.3 safetensors==0.4.4 faiss-cpu==1.8.0 langchain==0.2.14

Collecting huggingface_hub==0.24.5
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.44.0
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.9.6
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes==0.43.3
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting protobuf==5.27.3
  Downloading protobuf-5.27.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tyro>=0.5.11 (from trl==0.9.6)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting pyarrow>=15.0.0 (from datasets==2.20.0)
  Downloading pyarrow-17.0

### 1 - Log in

In [1]:
your_huggingface_finegrained_token = "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"

In [2]:
from huggingface_hub import login

login(token = your_huggingface_finegrained_token )

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### 2 - Download pretrained model and tokenizer

Select your llama-3 model and the number of layers you want to preserve from huggingface. We support only llama-3 for the present time.

In [None]:
your_model_name              = "yentinglin/Llama-3-Taiwan-8B-Instruct"
your_preserved_hidden_layers = 2

In [None]:
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, BitsAndBytesConfig
import torch
import torch.nn as nn

if your_preserved_hidden_layers != "all":
    model_name        = your_model_name
    num_hidden_layers = your_preserved_hidden_layers

    config            = LlamaConfig.from_pretrained(model_name)
    config.num_hidden_layers = num_hidden_layers
    new_model         = LlamaForCausalLM(config)
    new_state_dict    = {}

    old_model         = LlamaForCausalLM.from_pretrained(model_name)
    old_state_dict    = old_model.state_dict()

    for name, param in new_model.state_dict().items():
        if name in old_state_dict:
            if 'layers.' in name:
                # Check if the layer index is within the range of the new model
                original_layer_index = int(name.split('.')[2])
                if original_layer_index < config.num_hidden_layers:
                    new_state_dict[name] = old_state_dict[name]
            else:
                new_state_dict[name] = old_state_dict[name]

    # Load the filtered state dictionary into the new model
    new_model.load_state_dict(new_state_dict, strict=False)

    model = new_model

    tokenizer   = PreTrainedTokenizerFast.from_pretrained(model_name )

    # if tokenizer.pad_token is None:
    #     tokenizer.pad_token = tokenizer.eos_token

    del old_model

else:

    model       = LlamaForCausalLM.from_pretrained(your_model_name)
    tokenizer   = PreTrainedTokenizerFast.from_pretrained(your_model_name )

    # if tokenizer.pad_token is None:
    #     tokenizer.pad_token = tokenizer.eos_token


#### *** Check layers, modules, params

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 3 - Customize your attention

#### 3.1 - Append your attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_0  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_1  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_2  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_final  = nn.Linear(d_model, d_model, bias=self.bias)

        def scaled_dot_product_attention(self, Q, K, V, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q_0 = self.split_heads(self.W_q_0(Q))
            K_0 = self.split_heads(self.W_k_0(K))
            V_0 = self.split_heads(self.W_v_0(V))
            attn_output_0 = self.scaled_dot_product_attention(Q_0, K_0, V_0, mask)
            output_0      = self.W_o_0(self.combine_heads(attn_output_0))

            Q_1 = self.split_heads(self.W_q_1(Q))
            K_1 = self.split_heads(self.W_k_1(K))
            V_1 = self.split_heads(self.W_v_1(V))
            attn_output_1 = self.scaled_dot_product_attention(Q_1, K_1, V_1, mask)
            output_1      = self.W_o_1(self.combine_heads(attn_output_1))

            Q_2 = self.split_heads(self.W_q_2(Q))
            K_2 = self.split_heads(self.W_k_2(K))
            V_2 = self.split_heads(self.W_v_2(V))
            attn_output_2 = self.scaled_dot_product_attention(Q_2, K_2, V_2, mask)
            output_2      = self.W_o_2(self.combine_heads(attn_output_2))

            Q_final = self.split_heads(self.W_q_final(output_0))
            K_final = self.split_heads(self.W_k_final(output_1))
            V_final = self.split_heads(self.W_v_final(output_2))
            attn_output_final = self.scaled_dot_product_attention(Q_final, K_final, V_final, mask)
            output_final      = self.W_o_final(self.combine_heads(attn_output_final))

            return output_final






    class self_attn(LlamaAttention):
        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 2)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=None,
                          use_cache=None,
                          cache_position=None,
                          max_position_embeddings=None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,attention_mask)
            # self_attn_weights, present_key_value skipped

            # Continue with the original LLaMA attention mechanism
            return super().forward(hidden_states       ,
                                  attention_mask=attention_mask,
                                  position_ids=position_ids,
                                  past_key_value=past_key_value,
                                  output_attentions=output_attentions,
                                  use_cache=use_cache,
                                  cache_position=cache_position,
                                  max_position_embeddings=max_position_embeddings,
                                  layer_idx=self.layer_idx,
                                  **kwargs)




    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 3.2 - Append traditional attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o  = nn.Linear(d_model, d_model, bias=self.bias)


        def scaled_dot_product_attention(self, Q, K, V, mask=None):


            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q = self.split_heads(self.W_q(Q))
            K = self.split_heads(self.W_k(K))
            V = self.split_heads(self.W_v(V))
            attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
            output      = self.W_o(self.combine_heads(attn_output))
            return output






    class self_attn(LlamaAttention):
        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            self.brown_layer = custom_attn(config.hidden_size, 2)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=None,
                          use_cache=None,
                          cache_position=None,
                          max_position_embeddings=None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.brown_layer(hidden_states,hidden_states,hidden_states,attention_mask)
            # self_attn_weights, present_key_value skipped

            # Continue with the original LLaMA attention mechanism
            return super().forward(hidden_states       ,
                                  attention_mask=attention_mask,
                                  position_ids=position_ids,
                                  past_key_value=past_key_value,
                                  output_attentions=output_attentions,
                                  use_cache=use_cache,
                                  cache_position=cache_position,
                                  max_position_embeddings=max_position_embeddings,
                                  layer_idx=self.layer_idx,
                                  **kwargs)




    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)


#### 3.3 - Replace llama's attn with your attn (multi-multi-head attn)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_0  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_1  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_2  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_final  = nn.Linear(d_model, d_model, bias=self.bias)

        def scaled_dot_product_attention(self, Q, K, V, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q_0 = self.split_heads(self.W_q_0(Q))
            K_0 = self.split_heads(self.W_k_0(K))
            V_0 = self.split_heads(self.W_v_0(V))
            attn_output_0 = self.scaled_dot_product_attention(Q_0, K_0, V_0, mask)
            output_0      = self.W_o_0(self.combine_heads(attn_output_0))

            Q_1 = self.split_heads(self.W_q_1(Q))
            K_1 = self.split_heads(self.W_k_1(K))
            V_1 = self.split_heads(self.W_v_1(V))
            attn_output_1 = self.scaled_dot_product_attention(Q_1, K_1, V_1, mask)
            output_1      = self.W_o_1(self.combine_heads(attn_output_1))

            Q_2 = self.split_heads(self.W_q_2(Q))
            K_2 = self.split_heads(self.W_k_2(K))
            V_2 = self.split_heads(self.W_v_2(V))
            attn_output_2 = self.scaled_dot_product_attention(Q_2, K_2, V_2, mask)
            output_2      = self.W_o_2(self.combine_heads(attn_output_2))

            Q_final = self.split_heads(self.W_q_final(output_0))
            K_final = self.split_heads(self.W_k_final(output_1))
            V_final = self.split_heads(self.W_v_final(output_2))
            attn_output_final = self.scaled_dot_product_attention(Q_final, K_final, V_final, mask)
            output_final      = self.W_o_final(self.combine_heads(attn_output_final))

            return output_final






    class self_attn(nn.Module):
        def __init__(self, config, layer_idx):
            super(self_attn, self).__init__()
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=False,
                          use_cache=False,
                          cache_position=None,
                          max_position_embeddings  =None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,attention_mask)
            attn_weights = None
            past_key_value = None

            # Continue with the original LLaMA attention mechanism
            return hidden_states, attn_weights, past_key_value



    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 3.4 - Replace llama's attn with your attn (consecutive softmax)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v1 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v2 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v3 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v4 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o  = nn.Linear(d_model, d_model, bias=self.bias)


        def scaled_dot_product_attention(self, Q, K, V1, V2, V3, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask
            attn_probs = torch.softmax(attn_scores, dim=-1)

            output     = torch.matmul(attn_probs, V1)

            attn_scores = torch.matmul(output, V2.transpose(-2, -1)) / (self.d_k ** 0.5)
            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask
            attn_probs = torch.softmax(attn_scores, dim=-1)

            output     = torch.matmul(attn_probs, V3)

            return output


        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V1, V2, V3, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q  = self.split_heads(self.W_q(Q))
            K  = self.split_heads(self.W_k(K))
            V1 = self.split_heads(self.W_v1(V1))
            V2 = self.split_heads(self.W_v2(V2))
            V3 = self.split_heads(self.W_v3(V3))
            attn_output = self.scaled_dot_product_attention(Q, K, V1, V2, V3, mask)
            output      = self.W_o(self.combine_heads(attn_output))

            return output






    class self_attn(nn.Module):
        def __init__(self, config, layer_idx):
            super(self_attn, self).__init__()
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=False,
                          use_cache=False,
                          cache_position=None,
                          max_position_embeddings  =None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,hidden_states,hidden_states,attention_mask)
            attn_weights = None
            past_key_value = None

            # Continue with the original LLaMA attention mechanism
            return hidden_states, attn_weights, past_key_value



    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 3.5 - No Customization! Only fine-tune!

In [None]:
model

In [None]:
target_modules = ["q_proj", "k_proj", "v_proj","o_proj"] # I don't know why these parameters are called "modules"... They should be "parameters"!

In [None]:
from peft import get_peft_model, LoraConfig

lora_config = LoraConfig(
    r=8,  # Rank of the low-rank decomposition
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA
    target_modules= target_modules,  # Target modules to apply LoRA
    bias="none"  # Bias handling, can be "none", "lora_only", or "both"
)

model       = get_peft_model(model, lora_config)

#### *** Check

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 4 - Prep data

#### 4.1 Use HF data

In [8]:
your_train_dataset = "taide/TAIDE-14-tasks" # Jamie0510/taiwan-law-exam
split              = "train"
sub_set            = ''
instruction_col    = 'Prompt'
input_col          = 'Input'
output_col         = 'Positive Response'

In [9]:
from datasets import load_dataset

# Load the dataset
train_dataset = load_dataset(your_train_dataset, sub_set)[split]
original_cols = train_dataset.column_names
print(original_cols)

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    inputs           = [f"{instruction} {input}" for instruction, input in zip(examples[instruction_col], examples[input_col] )]
    # Tokenize combined inputs
    tokenized_inputs = tokenizer(inputs              , padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    # Tokenize outputs (labels)
    tokenized_labels = tokenizer(examples[output_col], padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    combined_inputs = {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': tokenized_labels['input_ids']  # Include labels
    }
    return combined_inputs
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Remove unused columns
train_dataset = train_dataset.remove_columns(original_cols)

# Now train_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(train_dataset)


['Unnamed: 0', 'Topic', 'Task', 'Keywords', 'Prompt', 'Input', 'Positive Response', 'Negative Response']
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


#### 4.2 Use local data

In [None]:
your_train_dataset = './fintech_qa.xlsx'
split              = "train"
instruction_col    = 'instruction'
input_col          = 'input'
output_col         = 'output'

In [None]:
from datasets import load_dataset

# Read local xlsx
import pandas as pd
excel_file = pd.ExcelFile(your_train_dataset, engine='openpyxl')
for sheet_name in excel_file.sheet_names:
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    df.to_csv(f'./{sheet_name}.csv', index=False)

# Load the dataset
from datasets import load_dataset
train_dataset = load_dataset('csv', data_files={'train': f'./{sheet_name}.csv'})[split]
original_cols = train_dataset.column_names
print(original_cols)

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    inputs           = [f"{instruction} {input}" for instruction, input in zip(examples[instruction_col], examples[input_col])]
    # Tokenize combined inputs
    tokenized_inputs = tokenizer(inputs              , padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    # Tokenize outputs (labels)
    tokenized_labels = tokenizer(examples[output_col], padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    combined_inputs = {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': tokenized_labels['input_ids']  # Include labels
    }
    return combined_inputs
train_dataset = train_dataset.map(tokenize_function, batched=True)

# Remove unused columns
train_dataset = train_dataset.remove_columns(original_cols)

# Now train_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(train_dataset)


### 5 - Train

In [10]:
your_learning_rate = 1e-4
your_batch_size    = 1
your_epoch         = 10

In [11]:
from transformers import TrainingArguments, Trainer, TrainerCallback

model = model.to('cuda')

training_args = TrainingArguments(
    output_dir = './result',
    learning_rate= your_learning_rate,
    per_device_train_batch_size=your_batch_size,
    # per_device_eval_batch_size=1,
    num_train_epochs=your_epoch,
    remove_unused_columns=False,
    save_strategy=  "no" # 'epoch',  # Save model every epoch
    # save_total_limit=1,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset
)

trainer_stats = trainer.train()


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 

#### *** Check

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 6 - Inference

In [None]:
your_user_input = "請問你為何外匯帳戶容易受到洗錢者的喜愛?拜託不要跟我講幹話啦"

In [None]:
import torch

model = model.to('cuda')

def generate_response(input_text, model, tokenizer):
    # Tokenize the input text
    inputs = tokenizer(input_text, padding=True, truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt').to(model.device)
    # Generate a response from the model
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs['attention_mask'],
            pad_token_id=tokenizer.pad_token_id,
            max_length=len(inputs["input_ids"][0]) + 100,
            num_return_sequences=1,
            temperature=0.5,
            do_sample=True,
            top_k=50,   # Optional: for more diverse output
            top_p=0.95      # Optional: for more diverse output
        )
    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example user input
user_input = your_user_input

# Generate and print the response
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response.replace(your_user_input, ""))


### 7 - Test

In [None]:
your_test_dataset = "kigner/ruozhiba-llama3"
split = 'train'

In [None]:
from datasets import load_dataset

# Load the dataset
test_dataset = load_dataset(your_test_dataset)[split]

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    inputs           = [f"{instruction} {input}" for instruction, input in zip(examples['instruction'], examples['input'])]
    # Tokenize combined inputs
    tokenized_inputs = tokenizer(inputs            , padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    # Tokenize outputs (labels)
    tokenized_labels = tokenizer(examples['output'], padding="max_length", truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt')
    combined_inputs = {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': tokenized_labels['input_ids']  # Include labels
    }
    return combined_inputs
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unused columns
test_dataset = test_dataset.remove_columns(["instruction", "input", "output"])

# Now train_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(test_dataset)




# ------------------------------------------------------------------------------------------------------




from transformers import TrainingArguments, Trainer, TrainerCallback

testing_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=1
)

model = model.to('cuda')

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=testing_args,
    eval_dataset=test_dataset
)

# Evaluate the model
trainer_stats = trainer.evaluate()

# Print evaluation results
print(trainer_stats)



### 8 - Save & Upload

In [None]:
your_repo_name  = "Brownwang0426"
your_model_name = "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"
your_huggingface_write_token = "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"

In [None]:
model.save_pretrained(your_model_name)
tokenizer.save_pretrained(your_model_name)
model.push_to_hub(your_model_name, token = your_huggingface_write_token) # Online saving
tokenizer.push_to_hub(your_model_name, token = your_huggingface_write_token) # Online saving

### 9- Download

In [1]:
your_model_name              =  "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"

In [2]:
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, BitsAndBytesConfig
import torch
import torch.nn as nn

model_name = your_model_name # "meta-llama/Llama-2-7b-chat-hf"
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)

# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B were not used when initializing LlamaForCausalLM: ['model.layers.0.self_attn.custom_attn.W_k_0.weight', 'model.layers.0.self_attn.custom_attn.W_k_1.weight', 'model.layers.0.self_attn.custom_attn.W_k_2.weight', 'model.layers.0.self_attn.custom_attn.W_k_final.weight', 'model.layers.0.self_attn.custom_attn.W_o_0.weight', 'model.layers.0.self_attn.custom_attn.W_o_1.weight', 'model.layers.0.self_attn.custom_attn.W_o_2.weight', 'model.layers.0.self_attn.custom_attn.W_o_final.weight', 'model.layers.0.self_attn.custom_attn.W_q_0.weight', 'model.layers.0.self_attn.custom_attn.W_q_1.weight', 'model.layers.0.self_attn.custom_attn.W_q_2.weight', 'model.layers.0.self_attn.custom_attn.W_q_final.weight', 'model.layers.0.self_attn.custom_attn.W_v_0.weight', 'model.layers.0.self_attn.custom_attn.W_v_1.weight', 'model.layers.0.self_attn.custom_attn.W_v_2.weight', 'model.layers.0.self_attn.custom_attn.W_v_final.weight'

#### *** Check

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 10 - Customize your attention

#### 10.1 - Append your attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_0  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_1  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_2  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_final  = nn.Linear(d_model, d_model, bias=self.bias)

        def scaled_dot_product_attention(self, Q, K, V, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q_0 = self.split_heads(self.W_q_0(Q))
            K_0 = self.split_heads(self.W_k_0(K))
            V_0 = self.split_heads(self.W_v_0(V))
            attn_output_0 = self.scaled_dot_product_attention(Q_0, K_0, V_0, mask)
            output_0      = self.W_o_0(self.combine_heads(attn_output_0))

            Q_1 = self.split_heads(self.W_q_1(Q))
            K_1 = self.split_heads(self.W_k_1(K))
            V_1 = self.split_heads(self.W_v_1(V))
            attn_output_1 = self.scaled_dot_product_attention(Q_1, K_1, V_1, mask)
            output_1      = self.W_o_1(self.combine_heads(attn_output_1))

            Q_2 = self.split_heads(self.W_q_2(Q))
            K_2 = self.split_heads(self.W_k_2(K))
            V_2 = self.split_heads(self.W_v_2(V))
            attn_output_2 = self.scaled_dot_product_attention(Q_2, K_2, V_2, mask)
            output_2      = self.W_o_2(self.combine_heads(attn_output_2))

            Q_final = self.split_heads(self.W_q_final(output_0))
            K_final = self.split_heads(self.W_k_final(output_1))
            V_final = self.split_heads(self.W_v_final(output_2))
            attn_output_final = self.scaled_dot_product_attention(Q_final, K_final, V_final, mask)
            output_final      = self.W_o_final(self.combine_heads(attn_output_final))

            return output_final






    class self_attn(LlamaAttention):
        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 2)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=None,
                          use_cache=None,
                          cache_position=None,
                          position_embeddings=None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,attention_mask)
            # self_attn_weights, present_key_value skipped

            # Continue with the original LLaMA attention mechanism
            return super().forward(hidden_states       ,
                                  attention_mask=attention_mask,
                                  position_ids=position_ids,
                                  past_key_value=past_key_value,
                                  output_attentions=output_attentions,
                                  use_cache=use_cache,
                                  cache_position=cache_position,
                                  position_embeddings=position_embeddings,
                                  layer_idx=self.layer_idx,
                                  **kwargs)




    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 10.2 - Append traditional attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o  = nn.Linear(d_model, d_model, bias=self.bias)


        def scaled_dot_product_attention(self, Q, K, V, mask=None):


            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q = self.split_heads(self.W_q(Q))
            K = self.split_heads(self.W_k(K))
            V = self.split_heads(self.W_v(V))
            attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
            output      = self.W_o(self.combine_heads(attn_output))
            return output






    class self_attn(LlamaAttention):
        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            self.brown_layer = custom_attn(config.hidden_size, 2)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=None,
                          use_cache=None,
                          cache_position=None,
                          max_position_embeddings=None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.brown_layer(hidden_states,hidden_states,hidden_states,attention_mask)
            # self_attn_weights, present_key_value skipped

            # Continue with the original LLaMA attention mechanism
            return super().forward(hidden_states       ,
                                  attention_mask=attention_mask,
                                  position_ids=position_ids,
                                  past_key_value=past_key_value,
                                  output_attentions=output_attentions,
                                  use_cache=use_cache,
                                  cache_position=cache_position,
                                  max_position_embeddings=max_position_embeddings,
                                  layer_idx=self.layer_idx,
                                  **kwargs)




    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)


#### 10.3 - Replace llama's attn with your attn (multi-multi-head attn)

In [3]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_0  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_0  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_1  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_1  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_2  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_2  = nn.Linear(d_model, d_model, bias=self.bias)

            self.W_q_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v_final  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o_final  = nn.Linear(d_model, d_model, bias=self.bias)

        def scaled_dot_product_attention(self, Q, K, V, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            output     = torch.matmul(attn_probs, V)

            return output

        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q_0 = self.split_heads(self.W_q_0(Q))
            K_0 = self.split_heads(self.W_k_0(K))
            V_0 = self.split_heads(self.W_v_0(V))
            attn_output_0 = self.scaled_dot_product_attention(Q_0, K_0, V_0, mask)
            output_0      = self.W_o_0(self.combine_heads(attn_output_0))

            Q_1 = self.split_heads(self.W_q_1(Q))
            K_1 = self.split_heads(self.W_k_1(K))
            V_1 = self.split_heads(self.W_v_1(V))
            attn_output_1 = self.scaled_dot_product_attention(Q_1, K_1, V_1, mask)
            output_1      = self.W_o_1(self.combine_heads(attn_output_1))

            Q_2 = self.split_heads(self.W_q_2(Q))
            K_2 = self.split_heads(self.W_k_2(K))
            V_2 = self.split_heads(self.W_v_2(V))
            attn_output_2 = self.scaled_dot_product_attention(Q_2, K_2, V_2, mask)
            output_2      = self.W_o_2(self.combine_heads(attn_output_2))

            Q_final = self.split_heads(self.W_q_final(output_0))
            K_final = self.split_heads(self.W_k_final(output_1))
            V_final = self.split_heads(self.W_v_final(output_2))
            attn_output_final = self.scaled_dot_product_attention(Q_final, K_final, V_final, mask)
            output_final      = self.W_o_final(self.combine_heads(attn_output_final))

            return output_final






    class self_attn(nn.Module):
        def __init__(self, config, layer_idx):
            super(self_attn, self).__init__()
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=False,
                          use_cache=False,
                          cache_position=None,
                          max_position_embeddings  =None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,attention_mask)
            attn_weights = None
            past_key_value = None

            # Continue with the original LLaMA attention mechanism
            return hidden_states, attn_weights, past_key_value



    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




Replacing attention layer: 0
Replacing attention layer: 1


#### 10.4 - Replace llama's attn with your attn (consecutive softmax)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        def __init__(self, d_model, num_heads = 8):
            super(custom_attn, self).__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            self.W_q  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_k  = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v1 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v2 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v3 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_v4 = nn.Linear(d_model, d_model, bias=self.bias)
            self.W_o  = nn.Linear(d_model, d_model, bias=self.bias)


        def scaled_dot_product_attention(self, Q, K, V1, V2, V3, mask=None):

            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask
            attn_probs = torch.softmax(attn_scores, dim=-1)

            output     = torch.matmul(attn_probs, V1)

            attn_scores = torch.matmul(output, V2.transpose(-2, -1)) / (self.d_k ** 0.5)
            if mask is not None:
                # llama's values in un-masked poistions are 0
                attn_scores += mask
            attn_probs = torch.softmax(attn_scores, dim=-1)

            output     = torch.matmul(attn_probs, V3)

            return output


        def split_heads(self, x):
            batch_size, seq_length, d_model = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
            #  (batch_size, seq_length, d_model) - > (batch_size, seq_length, self.num_heads, self.d_k) -> (batch_size, self.num_heads, seq_length, self.d_k)

        def combine_heads(self, x):
            batch_size, _, seq_length, d_k = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def forward(self, Q, K, V1, V2, V3, mask=None):
            # Q    -> (batch_size, seq_length, d_model)
            # mask -> (batch_size, 1, seq_length, d_model)
            Q  = self.split_heads(self.W_q(Q))
            K  = self.split_heads(self.W_k(K))
            V1 = self.split_heads(self.W_v1(V1))
            V2 = self.split_heads(self.W_v2(V2))
            V3 = self.split_heads(self.W_v3(V3))
            attn_output = self.scaled_dot_product_attention(Q, K, V1, V2, V3, mask)
            output      = self.W_o(self.combine_heads(attn_output))

            return output






    class self_attn(nn.Module):
        def __init__(self, config, layer_idx):
            super(self_attn, self).__init__()
            self.layer_idx   = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(self, hidden_states,
                          attention_mask=None,
                          position_ids=None,
                          past_key_value=None,
                          output_attentions=False,
                          use_cache=False,
                          cache_position=None,
                          max_position_embeddings  =None,
                          layer_idx = None,
                          **kwargs):

            # Apply the custom attention layer
            hidden_states = self.custom_attn(hidden_states,hidden_states,hidden_states,hidden_states,hidden_states,attention_mask)
            attn_weights = None
            past_key_value = None

            # Continue with the original LLaMA attention mechanism
            return hidden_states, attn_weights, past_key_value



    for i in range(model.config.num_hidden_layers):
      print(f"Replacing attention layer: {i}")
      try:
        model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)
      except:
        model.base_model.model.model.layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 10.5 - No Customization! Only fine-tune!

In [None]:
model

In [None]:
target_modules = ["q_proj", "k_proj", "v_proj","o_proj"] # I don't know why these parameters are called "modules"... They should be "parameters"!

In [None]:
from peft import get_peft_model, LoraConfig

lora_config = LoraConfig(
    r=8,  # Rank of the low-rank decomposition
    lora_alpha=16,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA
    target_modules= target_modules,  # Target modules to apply LoRA
    bias="none"  # Bias handling, can be "none", "lora_only", or "both"
)

model       = get_peft_model(model, lora_config)

#### *** Check

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 11 - Retreive safetensor

In [4]:
your_model_name  = "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"
your_safetensors = ["model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors"]
your_huggingface_read_token = "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"

In [5]:
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import torch
import os

# Replace with your model's repository ID
repo_id = your_model_name

# Specify the filenames of the SafeTensor files you want to download
filenames = your_safetensors

# Specify dir
custom_cache_dir =  "./"

# Download each SafeTensor file
for filename in filenames:
    try:
        filepath = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=custom_cache_dir, token = your_huggingface_read_token )
    except:
        print(f'safetenstor not found: {filename}')

state_dict_combined = {}

# Iterate over all safetensor files in the directory
for filename in os.listdir(os.path.join(custom_cache_dir, "models--" + your_model_name.replace("/", "--") ,"blobs")):
    file_path = os.path.join(custom_cache_dir, "models--" + your_model_name.replace("/", "--") ,"blobs", filename)
    print(f"Loading {file_path}...")

    # Load the safetensor file
    state_dict = load_file(file_path)

    # Merge the state_dict with the merged_state_dict
    state_dict_combined.update(state_dict)

# Merge the weights from the current SafeTensor
for key, tensor in state_dict_combined.items():
    if key in model.state_dict():
        print(f"Merging {key}.")
        model.state_dict()[key].copy_(tensor)
    else:
        print(f"Warning: {key} not found in the model. Skipping.")



Loading ./models--Brownwang0426--Llama-3-Taiwan-8B-Instruct-to-1B/blobs/9f6e706b9e91d5614b3b8d1a6eb9e1a6dc4890e62fdbbed613ed814725c5a3af...
Loading ./models--Brownwang0426--Llama-3-Taiwan-8B-Instruct-to-1B/blobs/c4e05a23c1a1166bcf547324e8e500acee1eaff5949fc390f3bf15b16c1d549c...
Merging model.embed_tokens.weight.
Merging model.layers.0.input_layernorm.weight.
Merging model.layers.0.mlp.down_proj.weight.
Merging model.layers.0.mlp.gate_proj.weight.
Merging model.layers.0.mlp.up_proj.weight.
Merging model.layers.0.post_attention_layernorm.weight.
Merging model.layers.0.self_attn.custom_attn.W_k_0.weight.
Merging model.layers.0.self_attn.custom_attn.W_k_1.weight.
Merging model.layers.0.self_attn.custom_attn.W_k_2.weight.
Merging model.layers.0.self_attn.custom_attn.W_k_final.weight.
Merging model.layers.0.self_attn.custom_attn.W_o_0.weight.
Merging model.layers.0.self_attn.custom_attn.W_o_1.weight.
Merging model.layers.0.self_attn.custom_attn.W_o_2.weight.
Merging model.layers.0.self_attn

#### *** Check

In [None]:
try:
  print(model.model.layers[0])
  print(model.model.layers[0].self_attn.state_dict())
except:
  print(model.base_model.model.model.layers[0])
  print(model.base_model.model.model.layers[0].self_attn.state_dict())

def count_parameters(model):
    return sum(p.numel() for p in model.parameters())
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 12 - Inference

In [12]:
your_user_input = "你是誰"

In [19]:
import torch

model = model.to('cuda')

def generate_response(input_text, model, tokenizer):
    # Tokenize the input text
    inputs = tokenizer(input_text, padding=True, truncation=True, max_length=512, return_attention_mask=True , return_tensors='pt').to(model.device)
    # Generate a response from the model
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs['attention_mask'],
            pad_token_id=tokenizer.pad_token_id,
            max_length=len(inputs["input_ids"][0]) + 100,
            num_return_sequences=1,
            temperature=0.5,
            do_sample=True,
            top_k=50,   # Optional: for more diverse output
            top_p=0.95      # Optional: for more diverse output
        )
    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example user input
user_input = your_user_input

# Generate and print the response
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response.replace(your_user_input, ""))


Chatbot: 約新台幣1, paired with an input:
人從事放款業務，未辦法規，經主管機關….(三)單一)第4條：
一事件罰鍰金控股公司法規：證券上市公司，是否要發重訊?(111/07/07/06/07/24)

### output:
說明：
一銀行法規：…因違反金融控股公司法規：證


### 13 - Let's play with RAG with this tiny model!

In [16]:
from langchain.retrievers import InMemoryRetriever

retriever = InMemoryRetriever(documents=[
    {"text": "Document 1 content"},
    {"text": "Document 2 content"}
    # Add more documents as needed
])


ImportError: cannot import name 'InMemoryRetriever' from 'langchain.retrievers' (/usr/local/lib/python3.10/dist-packages/langchain/retrievers/__init__.py)

In [17]:
from langchain.chains import RAG

rag_chain = RAG(
    retriever=retriever,
    model=model,
    tokenizer=tokenizer,
    return_full_output=False  # Set to True if you want detailed output
)




ImportError: cannot import name 'RAG' from 'langchain.chains' (/usr/local/lib/python3.10/dist-packages/langchain/chains/__init__.py)

In [None]:
query = "Your query here"
result = rag_chain.run(query)
print(result)
