<a href="https://colab.research.google.com/github/Brownwang0426/customLlama/blob/main/Cullama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 0 - Pip install

In [None]:
!pip install huggingface_hub transformers trl peft bitsandbytes datasets openpyxl xlrd protobuf safetensors

### 1 - Log in

In [None]:
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook that is shared or
# committed -- any token that was ever published must be revoked on the Hub.
# Supply the token through the HF_TOKEN environment variable (for example a
# Colab secret exported to the environment).  When the variable is unset,
# token=None makes login() ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

### 2 - Download pretrained model and tokenizer

In [None]:
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, BitsAndBytesConfig
import torch

# Checkpoint to shrink and the number of leading decoder layers to keep.
model_name = "yentinglin/Llama-3-Taiwan-8B-Instruct"  # "meta-llama/Llama-2-7b-chat-hf"
num_hidden_layers = 2

# Build a fresh model from the checkpoint's config, overriding only the depth.
config = LlamaConfig.from_pretrained(model_name, cache_dir="./")
config.num_hidden_layers = num_hidden_layers
new_model = LlamaForCausalLM(config)
new_state_dict = {}

# Load the full pretrained checkpoint so its weights can be copied over.
old_model = LlamaForCausalLM.from_pretrained(model_name, cache_dir="./")
old_state_dict = old_model.state_dict()

# Copy every tensor the truncated model expects and the checkpoint provides.
for name in new_model.state_dict():
    if name not in old_state_dict:
        continue
    # For per-layer tensors the third dot-separated field is read as the layer
    # index; indices at or beyond the kept depth are skipped.
    if 'layers.' in name and int(name.split('.')[2]) >= config.num_hidden_layers:
        continue
    new_state_dict[name] = old_state_dict[name]

# strict=False: keys absent from the filtered dict do not raise.
new_model.load_state_dict(new_state_dict, strict=False)

model = new_model

tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, cache_dir="./")

# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#### *** Check layers, modules, params

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer normally lives at model.model.layers; the fallback path matches a
# wrapped model (presumably PEFT -- TODO confirm).  Only AttributeError selects
# the fallback, so unrelated failures are no longer hidden by a bare except.
try:
    first_layer = model.model.layers[0]
except AttributeError:
    first_layer = model.base_model.model.model.layers[0]
print(first_layer)
print(first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of scalar elements over ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 3 - Customize your attention

#### 3.1 - Append your attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Three parallel multi-head attentions fused by a fourth attention.

        Stages ``0``, ``1`` and ``2`` each run a complete multi-head attention
        over the same ``(Q, K, V)`` inputs.  Their outputs are then used as the
        query, key and value inputs (in that order) of the ``final`` stage.
        Every projection is a bias-free ``d_model -> d_model`` linear layer.
        """

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads   # width of a single head

            # Registers W_q_0, W_k_0, W_v_0, W_o_0, W_q_1, ..., W_o_final in this
            # exact order, so parameter names and creation order stay fixed.
            for stage in ("0", "1", "2", "final"):
                for role in ("q", "k", "v", "o"):
                    setattr(self, f"W_{role}_{stage}", nn.Linear(d_model, d_model, bias=self.bias))

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Return ``softmax(Q K^T / sqrt(d_k) + mask) V``.

            ``mask`` is additive and is added in place to the score tensor, so it
            must broadcast to the score shape.  With ``mask=None`` nothing is
            masked at all.
            """
            scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # The original note here says LLaMA's mask holds 0 at un-masked positions.
                scores.add_(mask)

            weights = torch.softmax(scores, dim=-1)
            return torch.matmul(weights, V)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            bsz, seq_len, _ = x.size()
            per_head = x.view(bsz, seq_len, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            bsz, _, seq_len, _ = x.size()
            return x.permute(0, 2, 1, 3).reshape(bsz, seq_len, self.d_model)

        def _attend(self, stage, q_in, k_in, v_in, mask):
            """Run the multi-head attention whose projections carry suffix ``stage``."""
            q = self.split_heads(getattr(self, f"W_q_{stage}")(q_in))
            k = self.split_heads(getattr(self, f"W_k_{stage}")(k_in))
            v = self.split_heads(getattr(self, f"W_v_{stage}")(v_in))
            heads = self.scaled_dot_product_attention(q, k, v, mask)
            return getattr(self, f"W_o_{stage}")(self.combine_heads(heads))

        def forward(self, Q, K, V, mask=None):
            """Apply the three parallel stages, then the fusing ``final`` stage.

            Q, K, V are expected as (batch_size, seq_length, d_model); ``mask`` is
            handed unchanged to every stage.
            """
            fused_q, fused_k, fused_v = (
                self._attend(stage, Q, K, V, mask) for stage in ("0", "1", "2")
            )
            return self._attend("final", fused_q, fused_k, fused_v, mask)






    class self_attn(LlamaAttention):
        """LlamaAttention that first passes its input through ``custom_attn``.

        The custom attention output replaces ``hidden_states`` before the stock
        LLaMA attention runs, so the two mechanisms are applied back to back.
        """

        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            # Two heads over the full hidden size.
            self.custom_attn = custom_attn(config.hidden_size, 2)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=None,
            use_cache=None,
            cache_position=None,
            max_position_embeddings=None,
            layer_idx = None,
            **kwargs,
        ):
            """Run the custom attention, then defer to ``LlamaAttention.forward``.

            The ``layer_idx`` argument is ignored: the index stored at
            construction time is what gets forwarded to the parent.  How the
            parent treats ``max_position_embeddings`` / ``layer_idx`` depends on
            the installed transformers version -- TODO confirm.
            """
            # Self-attention: the same tensor serves as query, key and value.
            # (Attention weights and key/value cache of this step are not kept.)
            mixed_states = self.custom_attn(
                hidden_states, hidden_states, hidden_states, attention_mask
            )

            parent_kwargs = dict(
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                max_position_embeddings=max_position_embeddings,
                layer_idx=self.layer_idx,
            )
            # Continue with the original LLaMA attention mechanism.
            return super().forward(mixed_states, **parent_kwargs, **kwargs)




    # Locate the decoder layers once.  A plain model exposes them at
    # model.model.layers; the fallback path matches a wrapped model (presumably
    # PEFT -- TODO confirm).  Only AttributeError selects the fallback, and the
    # new module is built outside the try, so a failure while constructing
    # self_attn is reported directly instead of being masked by a bare except.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 3.2 - Append traditional attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Plain multi-head scaled dot-product attention with bias-free projections."""

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads   # width of a single head

            # Query / key / value / output projections (created in this order).
            self.W_q, self.W_k, self.W_v, self.W_o = (
                nn.Linear(d_model, d_model, bias=self.bias) for _ in range(4)
            )

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Return ``softmax(Q K^T / sqrt(d_k) + mask) V``.

            ``mask`` is additive and is added in place to the score tensor, so it
            must broadcast to the score shape.  With ``mask=None`` nothing is
            masked at all.
            """
            scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # The original note here says LLaMA's mask holds 0 at un-masked positions.
                scores.add_(mask)

            weights = torch.softmax(scores, dim=-1)
            return torch.matmul(weights, V)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            bsz, seq_len, _ = x.size()
            per_head = x.view(bsz, seq_len, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            bsz, _, seq_len, _ = x.size()
            return x.permute(0, 2, 1, 3).reshape(bsz, seq_len, self.d_model)

        def forward(self, Q, K, V, mask=None):
            """Project Q/K/V, attend per head, merge heads and project the result.

            Q, K, V are expected as (batch_size, seq_length, d_model).
            """
            q_heads = self.split_heads(self.W_q(Q))
            k_heads = self.split_heads(self.W_k(K))
            v_heads = self.split_heads(self.W_v(V))
            merged = self.combine_heads(
                self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
            )
            return self.W_o(merged)






    class self_attn(LlamaAttention):
        """LlamaAttention that first passes its input through a ``custom_attn`` layer.

        The extra layer (stored as ``brown_layer``) rewrites ``hidden_states``
        before the stock LLaMA attention runs on the result.
        """

        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx   = layer_idx
            # Two heads over the full hidden size.
            self.brown_layer = custom_attn(config.hidden_size, 2)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=None,
            use_cache=None,
            cache_position=None,
            max_position_embeddings=None,
            layer_idx = None,
            **kwargs,
        ):
            """Run the extra attention, then defer to ``LlamaAttention.forward``.

            The ``layer_idx`` argument is ignored: the index stored at
            construction time is what gets forwarded to the parent.  How the
            parent treats ``max_position_embeddings`` / ``layer_idx`` depends on
            the installed transformers version -- TODO confirm.
            """
            # Self-attention: the same tensor serves as query, key and value.
            # (Attention weights and key/value cache of this step are not kept.)
            mixed_states = self.brown_layer(
                hidden_states, hidden_states, hidden_states, attention_mask
            )

            parent_kwargs = dict(
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                max_position_embeddings=max_position_embeddings,
                layer_idx=self.layer_idx,
            )
            # Continue with the original LLaMA attention mechanism.
            return super().forward(mixed_states, **parent_kwargs, **kwargs)




    # Locate the decoder layers once.  A plain model exposes them at
    # model.model.layers; the fallback path matches a wrapped model (presumably
    # PEFT -- TODO confirm).  Only AttributeError selects the fallback, and the
    # new module is built outside the try, so a failure while constructing
    # self_attn is reported directly instead of being masked by a bare except.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)


#### 3.3 - Replace llama's attn with your attn (multi-multi-head attn)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Three parallel multi-head attentions fused by a fourth attention.

        Stages ``0``, ``1`` and ``2`` each run a complete multi-head attention
        over the same ``(Q, K, V)`` inputs.  Their outputs are then used as the
        query, key and value inputs (in that order) of the ``final`` stage.
        Every projection is a bias-free ``d_model -> d_model`` linear layer.
        """

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads   # width of a single head

            # Registers W_q_0, W_k_0, W_v_0, W_o_0, W_q_1, ..., W_o_final in this
            # exact order, so parameter names and creation order stay fixed.
            for stage in ("0", "1", "2", "final"):
                for role in ("q", "k", "v", "o"):
                    setattr(self, f"W_{role}_{stage}", nn.Linear(d_model, d_model, bias=self.bias))

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Return ``softmax(Q K^T / sqrt(d_k) + mask) V``.

            ``mask`` is additive and is added in place to the score tensor, so it
            must broadcast to the score shape.  With ``mask=None`` nothing is
            masked at all.
            """
            scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # The original note here says LLaMA's mask holds 0 at un-masked positions.
                scores.add_(mask)

            weights = torch.softmax(scores, dim=-1)
            return torch.matmul(weights, V)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            bsz, seq_len, _ = x.size()
            per_head = x.view(bsz, seq_len, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            bsz, _, seq_len, _ = x.size()
            return x.permute(0, 2, 1, 3).reshape(bsz, seq_len, self.d_model)

        def _attend(self, stage, q_in, k_in, v_in, mask):
            """Run the multi-head attention whose projections carry suffix ``stage``."""
            q = self.split_heads(getattr(self, f"W_q_{stage}")(q_in))
            k = self.split_heads(getattr(self, f"W_k_{stage}")(k_in))
            v = self.split_heads(getattr(self, f"W_v_{stage}")(v_in))
            heads = self.scaled_dot_product_attention(q, k, v, mask)
            return getattr(self, f"W_o_{stage}")(self.combine_heads(heads))

        def forward(self, Q, K, V, mask=None):
            """Apply the three parallel stages, then the fusing ``final`` stage.

            Q, K, V are expected as (batch_size, seq_length, d_model); ``mask`` is
            handed unchanged to every stage.
            """
            fused_q, fused_k, fused_v = (
                self._attend(stage, Q, K, V, mask) for stage in ("0", "1", "2")
            )
            return self._attend("final", fused_q, fused_k, fused_v, mask)






    class self_attn(nn.Module):
        """Stand-in attention module that runs only ``custom_attn``.

        ``forward`` accepts the keyword arguments the surrounding decoder layer
        passes to its attention module, but only ``hidden_states`` and
        ``attention_mask`` are used.
        """

        def __init__(self, config, layer_idx):
            super().__init__()
            self.layer_idx   = layer_idx
            # Eight heads over the full hidden size.
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=False,
            use_cache=False,
            cache_position=None,
            max_position_embeddings  =None,
            layer_idx = None,
            **kwargs,
        ):
            """Return ``(attended_states, None, None)``.

            The two ``None`` entries take the place of attention weights and
            key/value cache: neither is produced, whatever ``output_attentions``
            and ``use_cache`` request.
            """
            # Self-attention: the same tensor serves as query, key and value.
            attended = self.custom_attn(
                hidden_states, hidden_states, hidden_states, attention_mask
            )
            return attended, None, None



    # Locate the decoder layers once.  A plain model exposes them at
    # model.model.layers; the fallback path matches a wrapped model (presumably
    # PEFT -- TODO confirm).  Only AttributeError selects the fallback, and the
    # new module is built outside the try, so a failure while constructing
    # self_attn is reported directly instead of being masked by a bare except.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 3.4 - Replace llama's attn with your attn (consecutive softmax)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Multi-head attention with two consecutive softmax stages.

        Stage one is ordinary attention: ``softmax(Q K^T / sqrt(d_k)) V1``.
        Stage two scores that result against ``V2`` and, after a second softmax,
        mixes ``V3``.  ``W_v4`` is created but never used by ``forward``; it is
        kept so the set of parameters stays unchanged.
        """

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads   # width of a single head

            # Created in this order: W_q, W_k, W_v1, W_v2, W_v3, W_v4, W_o.
            for layer_name in ("W_q", "W_k", "W_v1", "W_v2", "W_v3", "W_v4", "W_o"):
                setattr(self, layer_name, nn.Linear(d_model, d_model, bias=self.bias))

        def _masked_softmax(self, left, right, mask):
            """Return ``softmax(left @ right^T / sqrt(d_k) + mask)`` over the last axis."""
            scores = torch.matmul(left, right.transpose(-2, -1)) / (self.d_k ** 0.5)
            if mask is not None:
                # The original note here says LLaMA's mask holds 0 at un-masked positions.
                scores.add_(mask)
            return torch.softmax(scores, dim=-1)

        def scaled_dot_product_attention(self, Q, K, V1, V2, V3, mask=None):
            """Apply both softmax stages; the same additive ``mask`` is used in each."""
            first_pass = torch.matmul(self._masked_softmax(Q, K, mask), V1)
            return torch.matmul(self._masked_softmax(first_pass, V2, mask), V3)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            bsz, seq_len, _ = x.size()
            per_head = x.view(bsz, seq_len, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            bsz, _, seq_len, _ = x.size()
            return x.permute(0, 2, 1, 3).reshape(bsz, seq_len, self.d_model)

        def forward(self, Q, K, V1, V2, V3, mask=None):
            """Project the five inputs, run the two-stage attention, merge heads.

            All inputs are expected as (batch_size, seq_length, d_model).
            """
            q_heads  = self.split_heads(self.W_q(Q))
            k_heads  = self.split_heads(self.W_k(K))
            v1_heads = self.split_heads(self.W_v1(V1))
            v2_heads = self.split_heads(self.W_v2(V2))
            v3_heads = self.split_heads(self.W_v3(V3))
            merged = self.combine_heads(
                self.scaled_dot_product_attention(q_heads, k_heads, v1_heads, v2_heads, v3_heads, mask)
            )
            return self.W_o(merged)






    class self_attn(nn.Module):
        """Stand-in attention module that runs only the two-stage ``custom_attn``.

        ``forward`` accepts the keyword arguments the surrounding decoder layer
        passes to its attention module, but only ``hidden_states`` and
        ``attention_mask`` are used.
        """

        def __init__(self, config, layer_idx):
            super().__init__()
            self.layer_idx   = layer_idx
            # Eight heads over the full hidden size.
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=False,
            use_cache=False,
            cache_position=None,
            max_position_embeddings  =None,
            layer_idx = None,
            **kwargs,
        ):
            """Return ``(attended_states, None, None)``.

            The two ``None`` entries take the place of attention weights and
            key/value cache: neither is produced, whatever ``output_attentions``
            and ``use_cache`` request.
            """
            # Self-attention: one tensor feeds the query, key and all three value inputs.
            same = hidden_states
            attended = self.custom_attn(same, same, same, same, same, attention_mask)
            return attended, None, None



    # Locate the decoder layers once.  A plain model exposes them at
    # model.model.layers; the fallback path matches a wrapped model (presumably
    # PEFT -- TODO confirm).  Only AttributeError selects the fallback, and the
    # new module is built outside the try, so a failure while constructing
    # self_attn is reported directly instead of being masked by a bare except.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### *** Check

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer normally lives at model.model.layers; the fallback path matches a
# wrapped model (presumably PEFT -- TODO confirm).  Only AttributeError selects
# the fallback, so unrelated failures are no longer hidden by a bare except.
try:
    first_layer = model.model.layers[0]
except AttributeError:
    first_layer = model.base_model.model.model.layers[0]
print(first_layer)
print(first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of scalar elements over ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

In [None]:
# from peft import get_peft_model, LoraConfig, TaskType
#
# # Define LoRA configuration
# lora_config = LoraConfig(
#     task_type=TaskType.CAUSAL_LM,
#     inference_mode=False,
#     r=100,
#     lora_alpha=32,
#     lora_dropout=0.1,
# )
#
# # Apply LoRA to the quantized model
# model = get_peft_model(model, lora_config)

In [None]:
# try:
#   print(model.model.layers[0])
#   print(model.model.layers[0].self_attn.state_dict())
# except:
#   print(model.base_model.model.model.layers[0])
#   print(model.base_model.model.model.layers[0].self_attn.state_dict())
#
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters())
# num_params = count_parameters(model)
# print(f"Number of parameters: {num_params}")

### 4 - Prep data

#### 4.1 Use HF data

In [None]:
from datasets import load_dataset

# Define the prompt and EOS token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### instruction:
{}

### input:
{}

### output:
{}"""
EOS_TOKEN = tokenizer.eos_token

# Function to format the dataset into prompts
def formatting_prompts_func(examples):
    """Render a batch of instruction / input / output rows into prompt strings.

    Each row is substituted into ``alpaca_prompt`` and terminated with
    ``EOS_TOKEN``.  Returns ``{"text": [...]}`` with one string per row.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}

# ---------------------------------------------------

# Load the dataset
# ikala/tmmluplus
# kigner/ruozhiba-llama3
dataset = load_dataset("kigner/ruozhiba-llama3", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating / padding every row to 512 tokens."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    return encoded
train_dataset = dataset.map(tokenize_function, batched=True)

# Prepare labels (same as input_ids)
def prepare_labels(examples):
    """Add a ``labels`` column derived from ``input_ids``.

    Positions whose ``attention_mask`` is 0 (padding) get the label -100, the
    ignore index of the language-modelling loss.  Copying ``input_ids``
    verbatim would make the loss also cover every padding position.  Masking
    by ``attention_mask`` rather than by token id leaves a genuine EOS token in
    the text as a target even when EOS doubles as the padding token.

    Accepts a batch (list of id lists) or a single example (flat id list).
    When no ``attention_mask`` is present the labels are a plain copy of
    ``input_ids``.  Returns the same mapping with ``labels`` added.
    """
    def _mask_padding(ids, mask):
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    input_ids = examples["input_ids"]
    attention = examples["attention_mask"] if "attention_mask" in examples else None

    if attention is None:
        examples["labels"] = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], int):
        # Single, un-batched example.
        examples["labels"] = _mask_padding(input_ids, attention)
    else:
        examples["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    return examples
train_dataset = train_dataset.map(prepare_labels, batched=True)

# Remove unused columns
train_dataset = train_dataset.remove_columns(["instruction", "input", "output", "text"])

# Now train_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(train_dataset)


#### 4.2 Use local data

In [None]:
from datasets import load_dataset

# Define the prompt and EOS token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### instruction:
{}

### input:
{}

### output:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

# Function to format the dataset into prompts
def formatting_prompts_func(examples):
    """Render a batch of instruction / input / output rows into prompt strings.

    Each row is substituted into ``alpaca_prompt`` and terminated with
    ``EOS_TOKEN`` (without it, generation would never learn where to stop).
    A field that is ``None`` is rendered as an empty string instead of the
    literal text "None"; empty spreadsheet cells are expected to arrive as
    ``None`` after the xlsx -> csv -> datasets round trip (TODO confirm).

    Returns ``{"text": [...]}`` with one string per row.
    """
    def _as_text(value):
        return "" if value is None else value

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(_as_text(instruction), _as_text(context), _as_text(response)) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}

# ---------------------------------------------------

# Read local xlsx
import pandas as pd

# Convert every sheet of the workbook to its own CSV file.  The workbook is
# opened in a `with` block so the file handle is closed once the sheets are read.
csv_paths = []
with pd.ExcelFile('./fintech_qa.xlsx', engine='openpyxl') as excel_file:
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(excel_file, sheet_name=sheet_name)
        csv_path = f'./{sheet_name}.csv'
        df.to_csv(csv_path, index=False)
        csv_paths.append(csv_path)

# Load the dataset from all converted sheets.  Previously only the CSV of the
# last sheet was loaded, through the loop variable that leaked out of the loop;
# for a single-sheet workbook the result is the same.
# NOTE(review): all sheets are assumed to share the instruction/input/output columns.
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': csv_paths})
dataset = dataset.map(formatting_prompts_func, batched=True)['train']

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating / padding every row to 512 tokens."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    return encoded
train_dataset = dataset.map(tokenize_function, batched=True)

# Prepare labels (same as input_ids)
def prepare_labels(examples):
    """Add a ``labels`` column derived from ``input_ids``.

    Positions whose ``attention_mask`` is 0 (padding) get the label -100, the
    ignore index of the language-modelling loss.  Copying ``input_ids``
    verbatim would make the loss also cover every padding position.  Masking
    by ``attention_mask`` rather than by token id leaves a genuine EOS token in
    the text as a target even when EOS doubles as the padding token.

    Accepts a batch (list of id lists) or a single example (flat id list).
    When no ``attention_mask`` is present the labels are a plain copy of
    ``input_ids``.  Returns the same mapping with ``labels`` added.
    """
    def _mask_padding(ids, mask):
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    input_ids = examples["input_ids"]
    attention = examples["attention_mask"] if "attention_mask" in examples else None

    if attention is None:
        examples["labels"] = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], int):
        # Single, un-batched example.
        examples["labels"] = _mask_padding(input_ids, attention)
    else:
        examples["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    return examples
train_dataset = train_dataset.map(prepare_labels, batched=True)

# Remove unused columns
train_dataset = train_dataset.remove_columns(["instruction", "input", "output", "text"])

# Now train_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(train_dataset)


### 5 - Train

In [None]:
from transformers import TrainingArguments, Trainer, TrainerCallback

# Training configuration: batch size 1, 100 epochs, no checkpoints written.
# remove_unused_columns=False keeps every dataset column when batches are built.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy='no',
    remove_unused_columns=False,
    num_train_epochs=100,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    learning_rate=2e-5,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    # eval_dataset=eval_dataset
)

# Run the training loop and keep what train() returns.
trainer_stats = trainer.train()


#### *** Check

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer normally lives at model.model.layers; the fallback path matches a
# wrapped model (presumably PEFT -- TODO confirm).  Only AttributeError selects
# the fallback, so unrelated failures are no longer hidden by a bare except.
try:
    first_layer = model.model.layers[0]
except AttributeError:
    first_layer = model.base_model.model.model.layers[0]
print(first_layer)
print(first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of scalar elements over ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 6 - Inference

In [None]:
import torch

def generate_response(input_text, model, tokenizer, max_length=100, do_sample=None):
    """Generate a continuation of ``input_text`` and return it as decoded text.

    Args:
        input_text: Prompt string.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used both to encode the prompt and decode the result.
        max_length: Passed to ``generate`` as ``max_length``.
        do_sample: ``None`` (default) leaves the decoding strategy to the
            model's own generation settings, exactly as before.  ``True`` /
            ``False`` is forwarded to ``generate``; ``top_k`` / ``top_p`` only
            influence the output when sampling is active, so pass ``True`` to
            make them take effect.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Tokenize the input text and move the tensors next to the model.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.eos_token_id,
        "top_k": 50,    # Optional: for more diverse output
        "top_p": 0.95,  # Optional: for more diverse output
    }
    if do_sample is not None:
        generation_kwargs["do_sample"] = do_sample

    # Generate a response from the model.  The attention mask is passed
    # explicitly so that generate() does not have to infer it from the ids,
    # which is ambiguous when the pad token equals the EOS token.
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            **generation_kwargs,
        )

    # Decode the response.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example user input
user_input = "金控負責人以個人名義兼海外，是否要董事會決議?"

# Generate and print the response
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response)


### 7 - Test

In [None]:
from datasets import load_dataset

# Define the prompt and EOS token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### instruction:
{}

### input:
{}

### output:
{}"""
EOS_TOKEN = tokenizer.eos_token

# Function to format the dataset into prompts
def formatting_prompts_func(examples):
    """Render a batch of instruction / input / output rows into prompt strings.

    Each row is substituted into ``alpaca_prompt`` and terminated with
    ``EOS_TOKEN``.  Returns ``{"text": [...]}`` with one string per row.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}

# ---------------------------------------------------

# Load the dataset
dataset = load_dataset("kigner/ruozhiba-llama3", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# ---------------------------------------------------

# Tokenize the formatted dataset
def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating / padding every row to 512 tokens."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    return encoded
test_dataset = dataset.map(tokenize_function, batched=True)

# Prepare labels (same as input_ids)
def prepare_labels(examples):
    """Add a ``labels`` column derived from ``input_ids``.

    Positions whose ``attention_mask`` is 0 (padding) get the label -100, the
    ignore index of the language-modelling loss.  Copying ``input_ids``
    verbatim would make the evaluation loss also cover every padding position.
    Masking by ``attention_mask`` rather than by token id leaves a genuine EOS
    token in the text as a target even when EOS doubles as the padding token.

    Accepts a batch (list of id lists) or a single example (flat id list).
    When no ``attention_mask`` is present the labels are a plain copy of
    ``input_ids``.  Returns the same mapping with ``labels`` added.
    """
    def _mask_padding(ids, mask):
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    input_ids = examples["input_ids"]
    attention = examples["attention_mask"] if "attention_mask" in examples else None

    if attention is None:
        examples["labels"] = input_ids.copy()
    elif input_ids and isinstance(input_ids[0], int):
        # Single, un-batched example.
        examples["labels"] = _mask_padding(input_ids, attention)
    else:
        examples["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    return examples
test_dataset = test_dataset.map(prepare_labels, batched=True)

# Remove unused columns
test_dataset = test_dataset.remove_columns(["instruction", "input", "output", "text"])

# Now test_dataset contains the columns `input_ids`, `attention_mask`, and `labels`
print(test_dataset)




# ------------------------------------------------------------------------------------------------------




from transformers import TrainingArguments, Trainer, TrainerCallback

# Evaluation-only configuration: one example per device per step.
testing_args = TrainingArguments(
    per_device_eval_batch_size=1,
    output_dir="./results",
)

# A fresh Trainer that only carries the evaluation dataset.
trainer = Trainer(
    eval_dataset=test_dataset,
    args=testing_args,
    model=model,
)

# Run the evaluation and show the returned metrics.
trainer_stats = trainer.evaluate()
print(trainer_stats)



### 8 - Save & Upload

In [None]:
import os

# SECURITY: never hard-code an access token in a notebook that is shared or
# committed -- any token that was ever published must be revoked on the Hub.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# token=None is passed and push_to_hub() is left to use the credentials stored
# by an earlier login().
hf_token  = os.environ.get("HF_TOKEN")
local_dir = "Llama-3-Taiwan-8B-Instruct-to-1B"
repo_id   = "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"

# Local saving
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

# Online saving
model.push_to_hub(repo_id, token=hf_token)
tokenizer.push_to_hub(repo_id, token=hf_token)

### 9 - Download

In [None]:
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast, BitsAndBytesConfig
import torch

# Hub repository to load from; the trailing comment names an alternative checkpoint.
model_name = "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B" # "meta-llama/Llama-2-7b-chat-hf"
# Rebinds the module-level `model` and `tokenizer` to the downloaded versions.
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)

# If the tokenizer defines no pad token, reuse its EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#### *** Check

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer list is looked up at `model.model.layers`; if that attribute path
# does not exist, the wrapped path `model.base_model.model.model.layers` is
# used instead. Only AttributeError triggers the fallback, so unrelated errors
# (including KeyboardInterrupt) are no longer swallowed by a bare `except:`.
try:
    _first_layer = model.model.layers[0]
except AttributeError:
    _first_layer = model.base_model.model.model.layers[0]
print(_first_layer)
print(_first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of elements over all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
# Count the parameters of the object currently bound to `model` and print the total.
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 10 - Customize your attention

#### 10.1 - Append your attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Stacked multi-head attention: three parallel branches feed a fourth.

        Branches ``0``, ``1`` and ``2`` each run an independent multi-head
        attention over the same ``Q``/``K``/``V`` inputs.  Their outputs are then
        used as the query, key and value inputs of the ``final`` branch, whose
        output is returned.

        The projection layers keep their attribute names (``W_q_0`` ...
        ``W_o_final``) and creation order, so ``state_dict`` keys are unchanged.

        Args:
            d_model: Width of the input and output features.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """

        # Branch suffixes, in the order their layers are created.
        BRANCHES = ("0", "1", "2", "final")

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            # Raise instead of ``assert`` so the check survives ``python -O``.
            if d_model % num_heads != 0:
                raise ValueError("d_model must be divisible by num_heads")

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            # Creates W_q_<b>, W_k_<b>, W_v_<b>, W_o_<b> for every branch <b>.
            for branch in self.BRANCHES:
                for role in ("q", "k", "v", "o"):
                    setattr(self, f"W_{role}_{branch}", nn.Linear(d_model, d_model, bias=self.bias))

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Return ``softmax(Q K^T / sqrt(d_k) + mask) V`` for per-head tensors.

            ``mask`` is additive (0 where attention is allowed) and must be
            broadcastable to the score tensor
            ``(batch_size, num_heads, query_length, key_length)``.
            """
            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # In-place add on the freshly created score tensor; `mask`
                # itself is left untouched.
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            return torch.matmul(attn_probs, V)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            batch_size, seq_length, _ = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            batch_size, _, seq_length, _ = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def _attend(self, branch, query_in, key_in, value_in, mask=None):
            """Run one complete attention branch (project, attend, merge, project)."""
            q = self.split_heads(getattr(self, f"W_q_{branch}")(query_in))
            k = self.split_heads(getattr(self, f"W_k_{branch}")(key_in))
            v = self.split_heads(getattr(self, f"W_v_{branch}")(value_in))
            heads = self.scaled_dot_product_attention(q, k, v, mask)
            return getattr(self, f"W_o_{branch}")(self.combine_heads(heads))

        def forward(self, Q, K, V, mask=None):
            """Apply the three parallel branches, then the final branch.

            Args:
                Q, K, V: Tensors of shape ``(batch_size, seq_length, d_model)``.
                mask: Optional additive mask broadcastable to
                    ``(batch_size, num_heads, seq_length, seq_length)``; the
                    same mask is applied in all four branches.

            Returns:
                Tensor of shape ``(batch_size, seq_length, d_model)``.
            """
            output_0 = self._attend("0", Q, K, V, mask)
            output_1 = self._attend("1", Q, K, V, mask)
            output_2 = self._attend("2", Q, K, V, mask)

            # The branch outputs become the query / key / value of the last stage.
            return self._attend("final", output_0, output_1, output_2, mask)






    class self_attn(LlamaAttention):
        """LLaMA attention preceded by an extra ``custom_attn`` pass.

        The incoming hidden states first go through ``custom_attn`` (as query,
        key and value, with the same attention mask); the result is then handed
        to the stock ``LlamaAttention.forward``.
        """

        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 2)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=None,
            use_cache=None,
            cache_position=None,
            position_embeddings=None,
            layer_idx = None,
            **kwargs,
        ):
            """Run the custom attention, then the original LLaMA attention.

            The ``layer_idx`` argument is accepted but not read; the value
            stored on the instance is forwarded instead.
            NOTE(review): ``layer_idx`` is passed on as a keyword — presumably
            absorbed by ``LlamaAttention.forward``'s ``**kwargs``; confirm for
            the installed transformers version.
            """
            # Custom attention pass; it returns only the transformed states
            # (no attention weights, no key/value cache).
            pre_attended = self.custom_attn(
                hidden_states, hidden_states, hidden_states, attention_mask
            )

            # Hand over to the original LLaMA attention mechanism.
            llama_kwargs = dict(
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                layer_idx=self.layer_idx,
            )
            return super().forward(pre_attended, **llama_kwargs, **kwargs)




    # Resolve the decoder-layer list once: `model.model.layers` when that
    # attribute path exists, otherwise the wrapped path under `base_model`.
    # Only AttributeError triggers the fallback, so an error raised while
    # building a replacement module is reported as-is instead of being hidden
    # by a bare `except:` and retried on the other path.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    # Swap every layer's attention module for a freshly initialised `self_attn`.
    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 10.2 - Append traditional attn to llama's attn

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Plain multi-head scaled dot-product attention with bias-free projections."""

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias = False
            self.d_model = d_model
            self.num_heads = num_heads
            self.d_k = d_model // num_heads

            def make_projection():
                return nn.Linear(d_model, d_model, bias=self.bias)

            # Query / key / value / output projections, all d_model -> d_model.
            self.W_q = make_projection()
            self.W_k = make_projection()
            self.W_v = make_projection()
            self.W_o = make_projection()

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Compute ``softmax(Q K^T / sqrt(d_k) + mask) V`` on per-head tensors."""
            scores = Q @ K.transpose(-2, -1)
            scores = scores / (self.d_k ** 0.5)

            if mask is not None:
                # Additive mask: LLaMA's mask is 0 at un-masked positions.
                scores += mask

            weights = scores.softmax(dim=-1)
            return weights @ V

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            n_batch, n_tokens, _ = x.size()
            per_head = x.view(n_batch, n_tokens, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            n_batch, _, n_tokens, _ = x.size()
            merged = x.transpose(1, 2).contiguous()
            return merged.view(n_batch, n_tokens, self.d_model)

        def forward(self, Q, K, V, mask=None):
            """Project the inputs, attend per head, merge heads and project out.

            ``Q``, ``K`` and ``V`` are ``(batch_size, seq_length, d_model)``;
            ``mask`` (optional) is added to the attention scores.
            """
            queries = self.split_heads(self.W_q(Q))
            keys = self.split_heads(self.W_k(K))
            values = self.split_heads(self.W_v(V))
            per_head_output = self.scaled_dot_product_attention(queries, keys, values, mask)
            return self.W_o(self.combine_heads(per_head_output))






    class self_attn(LlamaAttention):
        """LLaMA attention preceded by an extra multi-head attention pass.

        The extra pass is stored as ``brown_layer``; its output replaces the
        hidden states given to the stock ``LlamaAttention.forward``.
        """

        def __init__(self, config, layer_idx):
            super().__init__(config, layer_idx)
            self.layer_idx = layer_idx
            self.brown_layer = custom_attn(config.hidden_size, 2)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=None,
            use_cache=None,
            cache_position=None,
            max_position_embeddings=None,
            layer_idx = None,
            **kwargs,
        ):
            """Run the extra attention, then the original LLaMA attention.

            The ``layer_idx`` argument is accepted but not read; the value
            stored on the instance is forwarded instead.
            NOTE(review): ``max_position_embeddings`` and ``layer_idx`` are
            passed on as keywords — presumably absorbed by
            ``LlamaAttention.forward``'s ``**kwargs``; confirm for the
            installed transformers version.
            """
            # Extra attention pass; it returns only the transformed states
            # (no attention weights, no key/value cache).
            pre_attended = self.brown_layer(
                hidden_states, hidden_states, hidden_states, attention_mask
            )

            # Hand over to the original LLaMA attention mechanism.
            llama_kwargs = dict(
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                max_position_embeddings=max_position_embeddings,
                layer_idx=self.layer_idx,
            )
            return super().forward(pre_attended, **llama_kwargs, **kwargs)




    # Resolve the decoder-layer list once: `model.model.layers` when that
    # attribute path exists, otherwise the wrapped path under `base_model`.
    # Only AttributeError triggers the fallback, so an error raised while
    # building a replacement module is reported as-is instead of being hidden
    # by a bare `except:` and retried on the other path.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    # Swap every layer's attention module for a freshly initialised `self_attn`.
    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)


#### 10.3 - Replace llama's attn with your attn (multi-multi-head attn)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Stacked multi-head attention: three parallel branches feed a fourth.

        Branches ``0``, ``1`` and ``2`` each run an independent multi-head
        attention over the same ``Q``/``K``/``V`` inputs.  Their outputs are then
        used as the query, key and value inputs of the ``final`` branch, whose
        output is returned.

        The projection layers keep their attribute names (``W_q_0`` ...
        ``W_o_final``) and creation order, so ``state_dict`` keys are unchanged.

        Args:
            d_model: Width of the input and output features.
            num_heads: Number of attention heads; must divide ``d_model``.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """

        # Branch suffixes, in the order their layers are created.
        BRANCHES = ("0", "1", "2", "final")

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            # Raise instead of ``assert`` so the check survives ``python -O``.
            if d_model % num_heads != 0:
                raise ValueError("d_model must be divisible by num_heads")

            self.bias      = False
            self.d_model   = d_model
            self.num_heads = num_heads
            self.d_k       = d_model // num_heads

            # Creates W_q_<b>, W_k_<b>, W_v_<b>, W_o_<b> for every branch <b>.
            for branch in self.BRANCHES:
                for role in ("q", "k", "v", "o"):
                    setattr(self, f"W_{role}_{branch}", nn.Linear(d_model, d_model, bias=self.bias))

        def scaled_dot_product_attention(self, Q, K, V, mask=None):
            """Return ``softmax(Q K^T / sqrt(d_k) + mask) V`` for per-head tensors.

            ``mask`` is additive (0 where attention is allowed) and must be
            broadcastable to the score tensor
            ``(batch_size, num_heads, query_length, key_length)``.
            """
            attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5)

            if mask is not None:
                # In-place add on the freshly created score tensor; `mask`
                # itself is left untouched.
                attn_scores += mask

            attn_probs = torch.softmax(attn_scores, dim=-1)
            return torch.matmul(attn_probs, V)

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            batch_size, seq_length, _ = x.size()
            return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            batch_size, _, seq_length, _ = x.size()
            return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

        def _attend(self, branch, query_in, key_in, value_in, mask=None):
            """Run one complete attention branch (project, attend, merge, project)."""
            q = self.split_heads(getattr(self, f"W_q_{branch}")(query_in))
            k = self.split_heads(getattr(self, f"W_k_{branch}")(key_in))
            v = self.split_heads(getattr(self, f"W_v_{branch}")(value_in))
            heads = self.scaled_dot_product_attention(q, k, v, mask)
            return getattr(self, f"W_o_{branch}")(self.combine_heads(heads))

        def forward(self, Q, K, V, mask=None):
            """Apply the three parallel branches, then the final branch.

            Args:
                Q, K, V: Tensors of shape ``(batch_size, seq_length, d_model)``.
                mask: Optional additive mask broadcastable to
                    ``(batch_size, num_heads, seq_length, seq_length)``; the
                    same mask is applied in all four branches.

            Returns:
                Tensor of shape ``(batch_size, seq_length, d_model)``.
            """
            output_0 = self._attend("0", Q, K, V, mask)
            output_1 = self._attend("1", Q, K, V, mask)
            output_2 = self._attend("2", Q, K, V, mask)

            # The branch outputs become the query / key / value of the last stage.
            return self._attend("final", output_0, output_1, output_2, mask)






    class self_attn(nn.Module):
        """Attention module that consists solely of ``custom_attn``.

        It accepts the keyword arguments of the attention module it replaces
        but only reads ``hidden_states`` and ``attention_mask``.
        """

        def __init__(self, config, layer_idx):
            super().__init__()
            self.layer_idx = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=False,
            use_cache=False,
            cache_position=None,
            max_position_embeddings  =None,
            layer_idx = None,
            **kwargs,
        ):
            """Return ``(attended_states, None, None)``.

            The two ``None`` entries stand in for the attention weights and the
            key/value cache, neither of which this module produces.
            """
            # The hidden states serve as query, key and value.
            attended = self.custom_attn(
                hidden_states, hidden_states, hidden_states, attention_mask
            )

            # No LLaMA attention follows: the custom result is returned directly.
            return attended, None, None



    # Resolve the decoder-layer list once: `model.model.layers` when that
    # attribute path exists, otherwise the wrapped path under `base_model`.
    # Only AttributeError triggers the fallback, so an error raised while
    # building a replacement module is reported as-is instead of being hidden
    # by a bare `except:` and retried on the other path.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    # Swap every layer's attention module for a freshly initialised `self_attn`.
    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### 10.4 - Replace llama's attn with your attn (consecutive softmax)

In [None]:
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention

customization = True

if customization:

    class custom_attn(nn.Module):
        """Multi-head attention with two consecutive softmax stages.

        Stage 1 is ordinary attention of ``Q`` over ``K`` with values ``V1``.
        Stage 2 uses the stage-1 output as the query against ``V2`` (acting as
        keys) and reads out values ``V3``.

        NOTE(review): ``W_v4`` is created but never used by ``forward`` —
        presumably a leftover; it is kept because it is part of the
        ``state_dict``. Confirm before removing.
        """

        def __init__(self, d_model, num_heads = 8):
            super().__init__()

            assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

            self.bias = False
            self.d_model = d_model
            self.num_heads = num_heads
            self.d_k = d_model // num_heads

            def make_projection():
                return nn.Linear(d_model, d_model, bias=self.bias)

            # Bias-free d_model -> d_model projections.
            self.W_q = make_projection()
            self.W_k = make_projection()
            self.W_v1 = make_projection()
            self.W_v2 = make_projection()
            self.W_v3 = make_projection()
            self.W_v4 = make_projection()
            self.W_o = make_projection()

        def scaled_dot_product_attention(self, Q, K, V1, V2, V3, mask=None):
            """Run both softmax stages on per-head tensors and return the result."""
            scale = self.d_k ** 0.5

            # Stage 1: Q attends over K, reading V1.
            first_scores = (Q @ K.transpose(-2, -1)) / scale
            if mask is not None:
                # Additive mask: LLaMA's mask is 0 at un-masked positions.
                first_scores += mask
            first_output = first_scores.softmax(dim=-1) @ V1

            # Stage 2: the stage-1 output attends over V2, reading V3.
            second_scores = (first_output @ V2.transpose(-2, -1)) / scale
            if mask is not None:
                second_scores += mask
            return second_scores.softmax(dim=-1) @ V3

        def split_heads(self, x):
            """(batch, seq, d_model) -> (batch, num_heads, seq, d_k)."""
            n_batch, n_tokens, _ = x.size()
            per_head = x.view(n_batch, n_tokens, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        def combine_heads(self, x):
            """(batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
            n_batch, _, n_tokens, _ = x.size()
            merged = x.transpose(1, 2).contiguous()
            return merged.view(n_batch, n_tokens, self.d_model)

        def forward(self, Q, K, V1, V2, V3, mask=None):
            """Project all five inputs, run the two-stage attention, project out.

            Every input is ``(batch_size, seq_length, d_model)``; ``mask``
            (optional) is added to the scores of both stages.
            """
            queries = self.split_heads(self.W_q(Q))
            keys = self.split_heads(self.W_k(K))
            values_1 = self.split_heads(self.W_v1(V1))
            values_2 = self.split_heads(self.W_v2(V2))
            values_3 = self.split_heads(self.W_v3(V3))
            per_head_output = self.scaled_dot_product_attention(
                queries, keys, values_1, values_2, values_3, mask
            )
            return self.W_o(self.combine_heads(per_head_output))






    class self_attn(nn.Module):
        """Attention module that consists solely of the two-stage ``custom_attn``.

        It accepts the keyword arguments of the attention module it replaces
        but only reads ``hidden_states`` and ``attention_mask``.
        """

        def __init__(self, config, layer_idx):
            super().__init__()
            self.layer_idx = layer_idx
            self.custom_attn = custom_attn(config.hidden_size, 8)

        def forward(
            self,
            hidden_states,
            attention_mask=None,
            position_ids=None,
            past_key_value=None,
            output_attentions=False,
            use_cache=False,
            cache_position=None,
            max_position_embeddings  =None,
            layer_idx = None,
            **kwargs,
        ):
            """Return ``(attended_states, None, None)``.

            The two ``None`` entries stand in for the attention weights and the
            key/value cache, neither of which this module produces.
            """
            # The hidden states serve as Q, K, V1, V2 and V3 alike.
            attended = self.custom_attn(
                hidden_states,
                hidden_states,
                hidden_states,
                hidden_states,
                hidden_states,
                attention_mask,
            )

            # No LLaMA attention follows: the custom result is returned directly.
            return attended, None, None



    # Resolve the decoder-layer list once: `model.model.layers` when that
    # attribute path exists, otherwise the wrapped path under `base_model`.
    # Only AttributeError triggers the fallback, so an error raised while
    # building a replacement module is reported as-is instead of being hidden
    # by a bare `except:` and retried on the other path.
    try:
        decoder_layers = model.model.layers
    except AttributeError:
        decoder_layers = model.base_model.model.model.layers

    # Swap every layer's attention module for a freshly initialised `self_attn`.
    for i in range(model.config.num_hidden_layers):
        print(f"Replacing attention layer: {i}")
        decoder_layers[i].self_attn = self_attn(model.config, layer_idx=i)




#### *** Check

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer list is looked up at `model.model.layers`; if that attribute path
# does not exist, the wrapped path `model.base_model.model.model.layers` is
# used instead. Only AttributeError triggers the fallback, so unrelated errors
# (including KeyboardInterrupt) are no longer swallowed by a bare `except:`.
try:
    _first_layer = model.model.layers[0]
except AttributeError:
    _first_layer = model.base_model.model.model.layers[0]
print(_first_layer)
print(_first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of elements over all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
# Count the parameters of the object currently bound to `model` and print the total.
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 11 - Retrieve safetensors

In [None]:
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import torch
import os

# Replace with your model's repository ID
repo_id = "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"  # Example: "Brownwang0426/Llama-3-Taiwan-8B-Instruct-to-1B"

# Specify the filenames of the SafeTensor files you want to download
filenames = ["model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors"]

# Specify dir
custom_cache_dir =  "./"

# SECURITY: never hard-code access tokens; the token previously written here is
# exposed and should be revoked. Read it from the HF_TOKEN environment variable
# instead; when unset, None is passed and the Hub client is expected to fall
# back to the credentials stored by the earlier `login()` step.
hf_token = os.environ.get("HF_TOKEN")

# Download each SafeTensor file and remember where it was stored
downloaded_paths = []
for filename in filenames:
    filepath = hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=custom_cache_dir, token=hf_token)
    downloaded_paths.append(filepath)

# Load exactly the files that were requested. Using the returned paths avoids
# scanning the cache's `blobs` directory, which would also pick up any other
# file stored there.
state_dict_combined = {}
for file_path in downloaded_paths:
    print(f"Loading {file_path}...")
    state_dict_combined.update(load_file(file_path))

# Copy every downloaded tensor whose key exists in the model into the model.
# `state_dict()` is taken once: its tensors share storage with the model's
# parameters, so `copy_` updates the model in place.
model_state = model.state_dict()
for key, tensor in state_dict_combined.items():
    if key in model_state:
        print(f"Merging {key}.")
        model_state[key].copy_(tensor)
    else:
        print(f"Warning: {key} not found in the model. Skipping.")



#### *** Check

In [None]:
# Print the first decoder layer and the weights of its attention module.
# The layer list is looked up at `model.model.layers`; if that attribute path
# does not exist, the wrapped path `model.base_model.model.model.layers` is
# used instead. Only AttributeError triggers the fallback, so unrelated errors
# (including KeyboardInterrupt) are no longer swallowed by a bare `except:`.
try:
    _first_layer = model.model.layers[0]
except AttributeError:
    _first_layer = model.base_model.model.model.layers[0]
print(_first_layer)
print(_first_layer.self_attn.state_dict())

def count_parameters(model):
    """Return the total number of elements over all of ``model``'s parameters."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total
# Count the parameters of the object currently bound to `model` and print the total.
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

### 12 - Inference

In [None]:
import torch

def generate_response(input_text, model, tokenizer, max_length=100, do_sample=True):
    """Generate a text continuation of ``input_text`` with ``model``.

    Args:
        input_text: Prompt string.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Value passed to ``generate`` as ``max_length``.
        do_sample: When True (default), sample with ``top_k=50`` and
            ``top_p=0.95``. These two settings only take effect in sampling
            mode, so they were silently ignored before. Pass False for
            deterministic greedy decoding.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    if do_sample:
        sampling_kwargs = {"do_sample": True, "top_k": 50, "top_p": 0.95}
    else:
        sampling_kwargs = {"do_sample": False}

    # Generate a response from the model
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            # Pass the mask explicitly: padding reuses the EOS token id, so it
            # cannot be inferred reliably from the token ids alone.
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )

    # Decode the response
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example user input (the prompt passed to the model as-is)
user_input = "金控負責人以個人名義兼海外，是否要董事會決議?"

# Generate and print the response
# Uses the module-level `model` and `tokenizer` with the default max_length.
response = generate_response(user_input, model, tokenizer)
print("Chatbot:", response)
