### Default Setting

In [1]:
import torch
import os 
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from transformers import GenerationConfig

In [2]:
# All project directories are resolved relative to the notebook's working directory.
default_path = os.getcwd()
data_path, model_path, config_path = (
    os.path.join(default_path, relative)
    for relative in ('../../data', '../../models', '../../config')
)
# Local copy of the base Mistral checkpoint lives under the models directory.
local_mistral_dir = os.path.join(model_path, "mistral_origin")

In [8]:
# Global random seed for the notebook.
# Defining the value alone has no effect, so the torch RNG is seeded right away;
# this makes random initialisation performed in later cells repeatable.
seed = 42
torch.manual_seed(seed);  # trailing ';' hides the returned Generator repr in the cell output

In [3]:
# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [18]:
# Fetch the flattened UltraChat corpus from the Hugging Face Hub.
# The displayed DatasetDict has a `train` and a `test` split, each with a single `text` column.
dataset_name = "kaitchup/ultrachat-100k-flattened"
dataset = load_dataset(dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 5140
    })
})

In [29]:
# Which fields does a single training record expose?
first_record = dataset['train'][0]
first_record.keys()

dict_keys(['text'])

In [32]:
# Show the raw text of the first training example to see the
# "### Human: ...### Assistant: ..." conversation layout.
train_split = dataset['train']
train_split[0]['text']

"### Human: Create a salsa song with lyrics that capture the intensity and fervor of a passionate love affair between two people. The song should have a fast-paced tempo and incorporate traditional salsa instrumentation such as brass, percussion, and piano. The lyrics should paint a vivid picture of the emotions involved in a passionate romance, including love, desire, yearning, and heartbreak. Feel free to draw inspiration from real-life experiences or fictional narratives. At the end of the song, listeners should be left feeling captivated and moved by the powerful message of the song.### Assistant: Verse 1:\nDesde el momento en que te vi\nYo supe que ibas a ser para mí\nTu cariño me envuelve \nCon tu amor siempre quiero seguir\n\nChorus:\nEl fuego arde en mi pecho\nCuando estoy contigo, mi amor\nEste romance es intenso\nY nunca se acabará, jamás\n\nVerse 2:\nTu sonrisa me encanta\nY tus ojos me hacen sentir \nQue eres mi otra mitad\nY no puedo vivir sin ti \n\nChorus:\nEl fuego arde

### Load Tokenizer & Model

In [5]:
# The tokenizer files sit in a `tokenizer` sub-folder of the local Mistral checkpoint.
tokenizer_dir = os.path.join(local_mistral_dir, 'tokenizer')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=True)

In [30]:
# Inspect the tokenizer's unknown-token and padding-token strings
# (the padding token is configured in the next cell).
tokenizer.unk_token, tokenizer.pad_token

('<unk>', '<unk>')

In [33]:
# Pad on the right-hand side of each sequence (the other option is 'left').
tokenizer.padding_side = 'right'
# Reuse the unknown token as the padding token, for both the string and its id.
tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.unk_token, tokenizer.unk_token_id

#### Quantize model with BitsAndBytes

In [34]:
# 4-bit quantization settings for loading the base model.
# bfloat16 is used as the compute dtype instead of float16 for more stable training.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",              # NormalFloat4 data type
    bnb_4bit_compute_dtype=compute_dtype,   # dtype used for the matmuls
)

In [None]:
# Load the base Mistral checkpoint from disk, quantizing it to 4-bit while loading.
# The whole model is placed on GPU 0 via the device map.
# NOTE(review): torch_dtype is float16 while the bnb compute dtype is bfloat16 -
# confirm the mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    local_mistral_dir,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map={"": 0},
)

loading configuration file /rag/jupyter/llm/../../models/mistral_origin/config.json
Model config MistralConfig {
  "_name_or_path": "/rag/jupyter/llm/../../models/mistral_origin",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.36.2",
  "use_cache": false,
  "vocab_size": 32000
}

loading weights file /rag/jupyter/llm/../../models/mistral_origin/model.safetensors.index.json
Instantiating MistralForCausalLM model under default dtype torch.float16.
Generate config GenerationConfig {
  "bos_token_id": 1,
  

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Prepare the quantized model for k-bit training; this casts some modules of the model to fp32.
# NOTE: the call re-binds `model`, so this cell is meant to run once per freshly loaded model.
model = prepare_model_for_kbit_training(model)

In [None]:
# The KV cache must be off while training: gradient checkpointing is used by default
# and is not compatible with caching.
model.config.use_cache = False
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

### QLoRA Setting (PEFT) 

In [None]:
# LoRA adapters are attached to every attention projection and every MLP projection.
lora_target_modules = [
    'k_proj', 'q_proj', 'v_proj', 'o_proj',   # attention projections
    "gate_proj", "down_proj", "up_proj",      # MLP projections
]
peft_config = LoraConfig(
    r=16,                # adapter rank
    lora_alpha=16,       # scaling numerator
    lora_dropout=0.05,
    bias="none",         # bias terms are not trained
    target_modules=lora_target_modules,
    task_type="CAUSAL_LM",
)

In [None]:
# Hyper-parameters for the short QLoRA fine-tuning run.
# - The run length is fixed by `max_steps` (100 optimizer steps). `num_train_epochs` was
#   dropped because `max_steps` overrides it whenever it is positive.
# - Warmup is limited to the first 10 steps. With the previous value (100 == max_steps) the
#   whole run was spent ramping the learning rate up and the linear decay never started.
# - The notebook-level `seed` is passed through so the trainer uses the same value.
# - Checkpoints are written every 50 steps below `model_path`, which yields the
#   `checkpoint-100` directory that the inference cells load.
training_arguments = TrainingArguments(
        output_dir=model_path,
        #evaluation_strategy="steps",
        #do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,   # effective batch size per device: 4 * 4 = 16
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=50,
        logging_steps=10,
        learning_rate=4e-4,
        #eval_steps=200,
        max_steps=100,
        warmup_steps=10,
        lr_scheduler_type="linear",
        seed=seed,
)

In [None]:
# Supervised fine-tuning trainer: wraps the quantized base model with the LoRA config and
# trains on the `text` column of the training split, capping sequences at 512 tokens.
# Evaluation is currently disabled (no eval_dataset is passed).
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset['train'],
    dataset_text_field="text",
    max_seq_length=512,
)

In [None]:
# Run the fine-tuning loop configured above; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Quantization settings for reloading the base model at inference time:
# 4-bit NF4 weights, double quantization, bfloat16 compute.
compute_dtype = torch.bfloat16
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
# Reload the base model for inference and quantize it on the fly.
# The adapter has to sit on the same base checkpoint it was trained on, so this loads from
# `local_mistral_dir`. The previous `model_dir` was never defined and raised a NameError.
model = AutoModelForCausalLM.from_pretrained(
      local_mistral_dir, quantization_config=bnb_config, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map={"": 0}
)

In [None]:
# Directory of the adapter checkpoint saved at training step 100.
adapter_dir = "../../models/checkpoint-100/"
# Caching is turned back on for generation.
model.config.use_cache = True
# Attach the fine-tuned adapter on top of the base model.
model = PeftModel.from_pretrained(model, adapter_dir)

In [None]:
def generate(instruction, max_new_tokens=256):
    """Print the model's reply to a single instruction.

    The instruction is wrapped in the same "### Human: ...### Assistant: " layout
    as the training text, passed through the global `model`, and every returned
    sequence is decoded and printed.

    Args:
        instruction: The user request as plain text.
        max_new_tokens: Upper bound on the number of generated tokens (default 256).

    Returns:
        None. Replies are written to stdout.
    """
    assistant_tag = "### Assistant: "
    prompt = "### Human: "+instruction+assistant_tag
    inputs = tokenizer(prompt, return_tensors="pt")
    # Follow the model's own device instead of assuming CUDA.
    target_device = model.device
    input_ids = inputs["input_ids"].to(target_device)
    # Pass the mask explicitly so generate() does not have to guess it from the pad token.
    attention_mask = inputs["attention_mask"].to(target_device)
    # NOTE: do_sample is not set, so the sampling knobs below (temperature/top_p/top_k)
    # have no effect on the decoding.
    generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=GenerationConfig(pad_token_id=tokenizer.pad_token_id, temperature=1.0, top_p=1.0, top_k=50, num_beams=1),
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens
    )
    prompt_length = input_ids.shape[1]
    for seq in generation_output.sequences:
        # Decode only the newly generated tokens. Searching the full decoded text for the
        # assistant tag can fail with IndexError and leaks special tokens into the reply.
        completion = tokenizer.decode(seq[prompt_length:], skip_special_tokens=True)
        # As before, cut the reply at the next assistant tag if the model starts a new turn.
        print(completion.split(assistant_tag)[0].strip())

generate("금융 상품 추천해줘")