In [None]:
# Load the tokenizer
from transformers import AutoTokenizer

# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository — confirm it is actually required for this model.
# NOTE(review): add_special_tokens is usually an argument of the tokenization
# call rather than of from_pretrained — confirm it has the intended effect here.
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Meta-Llama-3-8B-Instruct',
    trust_remote_code=True,
    add_special_tokens=False
)
# Use the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without one — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Placeholder system / user / assistant messages, used below only to preview
# how the chat template lays out a conversation.
system_format = {"role": "system", "content": '這是系統指令'}
question_format = {"role": "user", "content": '這是用戶的輸入'}
answer_format = {"role": "assistant", "content": '這是模型回復'}

# Run the three messages through the chat template, then decode and print the
# result so the rendered prompt (including template markers) can be inspected.
chat_format = tokenizer.apply_chat_template([system_format, question_format, answer_format])
print(tokenizer.decode(chat_format))

In [None]:
from transformers import BitsAndBytesConfig
import torch

# bitsandbytes quantization settings: 4-bit loading with the "nf4" quant type,
# double quantization enabled, and bfloat16 as the compute dtype.
quantization_params = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_params)

In [None]:
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

# Map the whole model (the "" key) to the device index of the current local
# process as reported by Accelerate.
device_map = {"": Accelerator().local_process_index}
# Load the causal-LM checkpoint with the quantization config defined earlier.
# use_cache=False turns off the generation KV cache
# (presumably because gradient checkpointing is enabled later — TODO confirm).
model = AutoModelForCausalLM.from_pretrained(
        'meta-llama/Meta-Llama-3-8B-Instruct',
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
        use_cache=False,
    )
# Print the module tree to inspect the loaded architecture.
print(model)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA settings: rank 32 adapters on the four attention projection layers,
# with dropout 0.1, for a causal language-modelling task.
# NOTE(review): lora_alpha is not set, so the library default applies —
# confirm that scaling is intended together with r=32.
peft_params = dict(
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**peft_params)

# Prepare the quantized model for k-bit training (with gradient checkpointing),
# then wrap it with the LoRA adapters.
# NOTE(review): this rebinds `model`, so re-running the cell without reloading
# the base model wraps an already-wrapped model.
model = get_peft_model(
    prepare_model_for_kbit_training(model, use_gradient_checkpointing=True),
    peft_config,
)
print(model)

In [None]:
from transformers.modeling_utils import unwrap_model

def activate_neftune(model, neftune_noise_alpha = 5):
    """Enable NEFTune noise on the input embeddings of a PEFT-wrapped model.

    The noise magnitude is stored on the embedding module as
    ``neftune_noise_alpha`` and ``neftune_post_forward_hook`` is registered as
    a forward hook on that module.

    The call is safe to repeat (e.g. when a notebook cell is re-run): a hook
    registered by a previous call is removed before the new one is added, so
    the noise is never applied more than once per forward pass.

    Args:
        model: Model whose unwrapped form exposes
            ``base_model.model.get_input_embeddings()`` (the PEFT layout).
        neftune_noise_alpha: Noise scale read by the hook. Defaults to 5.

    Returns:
        The same ``model`` object that was passed in.
    """
    unwrapped_model = unwrap_model(model)
    embeddings = unwrapped_model.base_model.model.get_input_embeddings()

    # Drop the hook left by an earlier call so hooks do not stack up.
    previous_handle = getattr(embeddings, "neftune_hook_handle", None)
    if previous_handle is not None:
        previous_handle.remove()

    # Attach the noise scale to the embedding module so the hook can read it.
    embeddings.neftune_noise_alpha = neftune_noise_alpha
    # Keep the handle on the module so the hook can be removed later.
    embeddings.neftune_hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)

    return model
        
def neftune_post_forward_hook(module, input, output):
    """Forward hook that perturbs embedding outputs with NEFTune noise.

    Formula source: https://github.com/neelsjain/NEFTune
    Paper: https://arxiv.org/abs/2310.05914

    Args:
        module: The hooked module; must carry a ``neftune_noise_alpha``
            attribute and the usual ``training`` flag.
        input: Inputs of the forward call (unused).
        output: Output tensor of the forward call; its dimensions 1 and 2 are
            used to scale the noise.

    Returns:
        ``output`` itself when the module is not in training mode, otherwise
        ``output`` plus uniform noise drawn from ``[-mag, mag]`` where
        ``mag = neftune_noise_alpha / sqrt(output.size(1) * output.size(2))``.
    """
    # Noise is only injected while training; otherwise pass the output through.
    if not module.training:
        return output

    # NEFTune scaling: alpha divided by the square root of (dim 1 * dim 2).
    scale_dims = torch.tensor(output.shape[1] * output.shape[2])
    noise_bound = module.neftune_noise_alpha / torch.sqrt(scale_dims)
    noise = torch.zeros_like(output).uniform_(-noise_bound, noise_bound)
    return output + noise
# Register the NEFTune noise hook on the model's input embeddings (default alpha of 5).
model = activate_neftune(model)

In [None]:
import pandas as pd

def transform_format(questions, answers, system='你是一個zh-tw版本的聊天機器人'):
    """Turn parallel question/answer sequences into chat conversations.

    Args:
        questions: Iterable of user messages.
        answers: Iterable of assistant replies, paired with ``questions`` by
            position (pairing stops at the shorter of the two).
        system: System prompt placed at the start of every conversation.

    Returns:
        A list with one conversation per pair; each conversation is a list of
        three ``{"role", "content"}`` dicts in system / user / assistant order.
    """
    return [
        [
            {"role": "system", "content": system},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
        for question, answer in zip(questions, answers)
    ]

# Read the CSV file
df = pd.read_csv('Gossiping-QA-Dataset-2_0.csv')

# Take the first 5000 questions and answers as plain Python lists
questions = df['question'].iloc[:5000].tolist()
answers = df['answer'].iloc[:5000].tolist()

# Convert the pairs into chat-message conversations
formatted_context = transform_format(questions, answers)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# 定義自定義 Dataset
class PTTDataset(Dataset):
    """Dataset of chat conversations with a tokenizing collate function.

    Items are returned untokenized; tokenization, padding and label creation
    happen per batch in ``collate_fn``.
    """

    def __init__(self, formatted_context, tokenizer):
        """Store the conversations and the tokenizer used to encode them."""
        self.tokenizer = tokenizer
        self.formatted_context = formatted_context

    def __len__(self):
        """Return the number of stored conversations."""
        return len(self.formatted_context)

    def __getitem__(self, index):
        """Return the raw conversation at ``index``."""
        return self.formatted_context[index]

    def collate_fn(self, batch):
        """Tokenize a batch of conversations and attach training labels.

        The batch is passed through the tokenizer's chat template with padding
        and truncation (max length 8192), returning PyTorch tensors. Labels are
        a copy of ``input_ids`` where every position whose attention mask is 0
        is replaced by -100 (presumably so the loss skips padding — -100 is the
        default ignore index of PyTorch's cross-entropy loss).

        Returns:
            The tokenizer output with an added ``'labels'`` entry.
        """
        encoded = self.tokenizer.apply_chat_template(
            batch,
            padding=True,
            return_dict=True,
            max_length=8192,
            return_tensors='pt',
            truncation=True,
        )
        is_padding = encoded['attention_mask'] == 0
        encoded['labels'] = encoded['input_ids'].masked_fill(is_padding, -100)
        return encoded

# Build the datasets.
# Hold out the last 10% of the conversations (at least one) for validation so
# that validation loss is measured on data the model has not been trained on;
# using the same conversations for both would make early stopping meaningless.
valid_size = max(1, len(formatted_context) // 10)
trainset = PTTDataset(formatted_context[:-valid_size], tokenizer)
validset = PTTDataset(formatted_context[-valid_size:], tokenizer)

# Create the DataLoaders.
# Only the training loader shuffles; validation order does not affect the
# metric, so it is kept deterministic.
train_loader = DataLoader(trainset, batch_size=4, shuffle=True, collate_fn=trainset.collate_fn)
valid_loader = DataLoader(validset, batch_size=4, shuffle=False, collate_fn=validset.collate_fn)

In [None]:
import torch.optim as optim
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
from Trainer import Trainer

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Single source of truth for the epoch count: it drives both the scheduler
# length and the Trainer, so the two cannot drift apart.
EPOCHS = 10
steps_per_epoch = len(train_loader)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# Cosine schedule with a single cycle; warmup lasts 20% of one epoch.
# The step counts are passed as integers, as the scheduler API documents.
# NOTE(review): this assumes Trainer steps the scheduler once per batch — confirm.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(steps_per_epoch * 0.2),
    num_training_steps=steps_per_epoch * EPOCHS,
    num_cycles=1,
)

# Project-local Trainer; the optimizer and scheduler are passed as
# single-element lists, as in the original call.
trainer = Trainer(
    epochs=EPOCHS,
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    optimizer=[optimizer],
    scheduler=[scheduler],
    early_stopping=3,
)
trainer.train()