### Defalut Setting 

In [1]:
import torch
import os 
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast, BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from transformers import GenerationConfig

In [2]:
default_path = os.getcwd()
data_path = os.path.join(default_path, '../../data')
model_path = os.path.join(default_path, '../../models')
local_mistral_dir = os.path.join(model_path, "mistral_origin")
config_path = os.path.join(default_path, '../../config')

In [3]:
seed = 42

In [4]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [18]:
dataset = load_dataset("kaitchup/ultrachat-100k-flattened")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 5140
    })
})

In [29]:
dataset['train'][0].keys()

dict_keys(['text'])

In [32]:
dataset['train'][0]['text']

"### Human: Create a salsa song with lyrics that capture the intensity and fervor of a passionate love affair between two people. The song should have a fast-paced tempo and incorporate traditional salsa instrumentation such as brass, percussion, and piano. The lyrics should paint a vivid picture of the emotions involved in a passionate romance, including love, desire, yearning, and heartbreak. Feel free to draw inspiration from real-life experiences or fictional narratives. At the end of the song, listeners should be left feeling captivated and moved by the powerful message of the song.### Assistant: Verse 1:\nDesde el momento en que te vi\nYo supe que ibas a ser para mí\nTu cariño me envuelve \nCon tu amor siempre quiero seguir\n\nChorus:\nEl fuego arde en mi pecho\nCuando estoy contigo, mi amor\nEste romance es intenso\nY nunca se acabará, jamás\n\nVerse 2:\nTu sonrisa me encanta\nY tus ojos me hacen sentir \nQue eres mi otra mitad\nY no puedo vivir sin ti \n\nChorus:\nEl fuego arde

### Load Tokenizer & Model

In [5]:
tokenizer = AutoTokenizer.from_pretrained(os.path.join(local_mistral_dir, 'tokenizer'), use_fast=True)

In [30]:
tokenizer.unk_token, tokenizer.pad_token

('<unk>', '<unk>')

In [33]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.padding_side = 'right'   # right, left

#### Quantize model with BitsAndBytes

In [34]:
compute_dtype = getattr(torch, "bfloat16")   # float16 -> bfloat16 : training more stable 
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

In [35]:
#Load the model and quantize it on the fly
model = AutoModelForCausalLM.from_pretrained(
      local_mistral_dir, quantization_config=bnb_config, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map={"": 0}
)

loading configuration file /rag/jupyter/llm/../../models/mistral_origin/config.json
Model config MistralConfig {
  "_name_or_path": "/rag/jupyter/llm/../../models/mistral_origin",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.36.2",
  "use_cache": false,
  "vocab_size": 32000
}

loading weights file /rag/jupyter/llm/../../models/mistral_origin/model.safetensors.index.json
Instantiating MistralForCausalLM model under default dtype torch.float16.
Generate config GenerationConfig {
  "bos_token_id": 1,
  

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at /rag/jupyter/llm/../../models/mistral_origin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file /rag/jupyter/llm/../../models/mistral_origin/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



In [36]:
# Cast some modules of the model to fp32 
model = prepare_model_for_kbit_training(model)

In [37]:
# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False   # Gradient checkpointing is used by default but not compatible with caching

### QLoRA Setting (PEFT) 

In [38]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
)

In [49]:
training_arguments = TrainingArguments(
        output_dir=os.path.join(model_path, 'mistral-QLoRA'),
        #evaluation_strategy="steps",
        #do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=50,
        logging_steps=10,
        learning_rate=4e-4,
        #eval_steps=200,
        num_train_epochs=100,
        max_steps=100,
        warmup_steps=100,
        lr_scheduler_type="linear",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### QLoRA Tuning 

In [40]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        #eval_dataset=dataset['test'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
)

PyTorch: setting up devices


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [41]:
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 100,000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 100
  Number of trainable parameters = 41,943,040
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.3952
20,1.2898
30,1.2943
40,1.2046
50,1.178
60,1.163
70,1.1519
80,1.1565
90,1.1534
100,1.1393


Checkpoint destination directory /rag/jupyter/llm/../../models/checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Saving model checkpoint to /rag/jupyter/llm/../../models/checkpoint-20
tokenizer config file saved in /rag/jupyter/llm/../../models/checkpoint-20/tokenizer_config.json
Special tokens file saved in /rag/jupyter/llm/../../models/checkpoint-20/special_tokens_map.json
Checkpoint destination directory /rag/jupyter/llm/../../models/checkpoint-40 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Saving model checkpoint to /rag/jupyter/llm/../../models/checkpoint-40
tokenizer config file saved in /rag/jupyter/llm/../../models/checkpoint-40/tokenizer_config.json
Special tokens file saved in /rag/jupyter/llm/../../models/checkpoint-40/special_tokens_map.json
Checkpoint destination directory /rag/jupyter/llm/../../models/checkpoint-60 already exists and is non-empty.Saving will proceed but saved resul

TrainOutput(global_step=100, training_loss=1.2126054763793945, metrics={'train_runtime': 1051.0047, 'train_samples_per_second': 1.522, 'train_steps_per_second': 0.095, 'total_flos': 3.51564749340672e+16, 'train_loss': 1.2126054763793945, 'epoch': 0.02})

### Load QLoRA tuned model 

In [5]:
tokenizer = AutoTokenizer.from_pretrained(os.path.join(local_mistral_dir, 'tokenizer'), use_fast=True)

In [6]:
tokenizer.unk_token, tokenizer.pad_token

('<unk>', '</s>')

In [7]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.padding_side = 'right'   # right, left

In [8]:
local_mistral_dir

'/rag/jupyter/llm/../../models/mistral_origin'

In [9]:
compute_dtype = getattr(torch, "bfloat16")   # float16 -> bfloat16 : training more stable 
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

In [10]:
# Load the model and quantize it on the fly
model = AutoModelForCausalLM.from_pretrained(
      local_mistral_dir, quantization_config=bnb_config, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map={"": 0}
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
model.config.use_cache = True
model = PeftModel.from_pretrained(model, "../../models/checkpoint-100/")   # top layer에 fine-tuned adapter 추가

In [12]:
def generate(instruction):
    prompt = "### Human: "+instruction+"### Assistant: "
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
            input_ids=input_ids,
            generation_config=GenerationConfig(pad_token_id=tokenizer.pad_token_id, temperature=1.0, top_p=1.0, top_k=50, num_beams=1),
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=256
    )
    for seq in generation_output.sequences:
        output = tokenizer.decode(seq)
        print(output.split("### Assistant: ")[1].strip())
        
generate("금융 상품 추천해줘")

저는 인공지능 모델이기 때문에 개인의 상황에 따라 추천할 수 있는 금융 상품이 다를 수 있습니다. 하지만 일반적으로 추천할 수 있는 몇 가지 금융 상품은 다음과 같습니다:

1. 저금리 대출: 저금리 대출은 대출 금리를 낮게 주는 대출 상품입니다. 이는 대출 금리를 낮게 주는 대출 상품으로 대출 금리를 낮게 주는 대출 상품입니다.

2. 저금리 예금: 저금리 예금은 예금 금리를 낮게 주는 예금 상품입니다. 이는 예금 금리를


In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [14]:
count_parameters(model)

0

In [16]:
torch.randn(1, 2, 3, 4)

tensor([[[[ 0.2798, -1.3275, -2.5542,  0.5418],
          [-0.3713,  0.2289,  0.0246, -1.0159],
          [ 0.2753,  0.9804,  0.7752,  0.1774]],

         [[ 0.5081, -2.3607,  0.5605, -0.4808],
          [-1.9072, -0.8845, -0.3196,  0.1949],
          [ 0.1367,  0.0303, -0.3085, -0.3469]]]])