In [None]:
! pip install transformers peft datasets

In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from datasets import Dataset
dataset = Dataset.from_json('/home/ubuntu/remove_link_500_500.jsonl')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "kakaocorp/kanana-nano-2.1b-instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",  # Ï∂îÍ∞Ä
    low_cpu_mem_usage=True  # Ï∂îÍ∞Ä
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [3]:
def formatting_prompts_func(examples):
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    # üí° 'instruction' ÎåÄÏã† Îç∞Ïù¥ÌÑ∞ÏÖã ÌîºÏ≤òÏù∏ 'introduction'ÏùÑ ÏÇ¨Ïö©Ìï©ÎãàÎã§.
    introductions = examples["introduction"]
    inputs = examples["input"]
    outputs = examples["output"]

    # tokenizerÎäî Ìï¥Îãπ ÏÖÄÏù¥ Ïã§ÌñâÎêòÍ∏∞ Ï†ÑÏóê Ï†ïÏùòÎêòÏñ¥ ÏûàÏñ¥Ïïº Ìï©ÎãàÎã§.
    EOS_TOKEN = tokenizer.eos_token 

    texts = []
    # üí° 'instruction' ÎåÄÏã† 'introduction'ÏùÑ ÏÇ¨Ïö©ÌïòÏó¨ Î∞òÎ≥µÎ¨∏ÏùÑ Ïã§ÌñâÌï©ÎãàÎã§.
    for introduction, input, output in zip(introductions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # üí° instruction ÎåÄÏã† introduction Î≥ÄÏàòÎ•º formatÏóê Ï†ÑÎã¨Ìï©ÎãàÎã§.
        text = alpaca_prompt.format(introduction, input, output) + EOS_TOKEN
        texts.append(text)

    # Ïù¥ Ìï®ÏàòÎäî ÏÉàÎ°úÏö¥ 'text' Ïó¥ÏùÑ Îç∞Ïù¥ÌÑ∞ÏÖãÏóê Ï∂îÍ∞ÄÌï©ÎãàÎã§.
    return {"text": texts}
dataset = dataset.map(formatting_prompts_func, batched=True)
dataset

Dataset({
    features: ['introduction', 'input', 'output', 'text'],
    num_rows: 1002
})

In [4]:
def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"], 
        padding=True, 
        truncation=True,  # Ï∂îÍ∞Ä
        max_length=4096,  # Ï∂îÍ∞Ä (ÌïÑÏöîÏóê Îî∞Îùº Ï°∞Ï†ï)
        return_tensors="pt"
    )
    tokens["labels"] = tokens["input_ids"]
    return tokens

dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1002/1002 [00:03<00:00, 286.86 examples/s]


In [26]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Î™®Îç∏ Ï§ÄÎπÑ
base_model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", 
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj" 
    ],
    bias="none",
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters() 

trainable params: 23,003,136 || all params: 2,109,982,464 || trainable%: 1.0902


In [27]:
from transformers import Trainer, TrainingArguments

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=5,
        num_train_epochs=3,
        learning_rate=2e-4, 
        bf16=True,
        logging_steps=1,
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=1234,
        output_dir="outputs",
        report_to="none",
        ddp_find_unused_parameters=False
    )
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [28]:
trainer_stats = trainer.train()

Step,Training Loss
1,7.6604
2,6.9429
3,4.387
4,2.6654
5,1.1184
6,0.7116
7,1.0995
8,1.0194
9,0.8401
10,1.2765


In [30]:
FINAL_ADAPTER_PATH = "./final_lora_adapter"

model.save_pretrained(FINAL_ADAPTER_PATH) 

print(f" ÏµúÏ¢Ö LoRA Ïñ¥ÎåëÌÑ∞ Í∞ÄÏ§ëÏπòÍ∞Ä {FINAL_ADAPTER_PATH}Ïóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.")

 ÏµúÏ¢Ö LoRA Ïñ¥ÎåëÌÑ∞ Í∞ÄÏ§ëÏπòÍ∞Ä ./final_lora_adapterÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.


In [33]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL_ID = "kakaocorp/kanana-nano-2.1b-instruct"
MERGED_MODEL_DIR = "./merged_full_model_for_gguf" # Î≥ëÌï©Îêú Î™®Îç∏ÏùÑ Ï†ÄÏû•Ìï† ÎîîÎ†âÌÜ†Î¶¨ Ïù¥Î¶Ñ
# ----------------------------------------------------------------------

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# 2. Í∏∞Î≥∏ Î™®Îç∏Ïóê LoRA Ïñ¥ÎåëÌÑ∞Î•º Ïó∞Í≤∞Ìï©ÎãàÎã§.
model = PeftModel.from_pretrained(base_model, FINAL_ADAPTER_PATH)

# 3. LoRA Í∞ÄÏ§ëÏπòÎ•º Í∏∞Î≥∏ Î™®Îç∏Ïóê ÏòÅÍµ¨Ï†ÅÏúºÎ°ú Î≥ëÌï©Ìï©ÎãàÎã§.
model = model.merge_and_unload() 

# 4. Î≥ëÌï©Îêú Full Î™®Îç∏ÏùÑ Hugging Face ÌòïÏãùÏúºÎ°ú Ï†ÄÏû•Ìï©ÎãàÎã§.
model.save_pretrained(MERGED_MODEL_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_MODEL_DIR)

print(f" Î™®Îç∏ Î≥ëÌï© ÏôÑÎ£å. ÏµúÏ¢Ö ÌååÏùºÏùÄ {MERGED_MODEL_DIR}Ïóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.")

‚úÖ Î™®Îç∏ Î≥ëÌï© ÏôÑÎ£å. ÏµúÏ¢Ö ÌååÏùºÏùÄ ./merged_full_model_for_ggufÏóê Ï†ÄÏû•ÎêòÏóàÏäµÎãàÎã§.


In [46]:
# llama.cpp Ï†ÄÏû•ÏÜå ÌÅ¥Î°† Î∞è Ïù¥Îèô
!cd llama && pip install -r ./requirements/requirements-convert_hf_to_gguf.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly
Ignoring torch: markers 'platform_machine == "s390x"' don't match your environment
Collecting numpy~=1.26.4 (from -r /home/ubuntu/llama.cpp/requirements/requirements-convert_legacy_llama.txt (line 1))
  Downloading https://download.pytorch.org/whl/nightly/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m18.0/18.0 MB[0m [31m14.1 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hCollecting sentencepiece~=0.2.0 (from -r /home/ubuntu/llama.cpp/requirements/requirements-convert_legacy_llama.txt (line 2))
  Downloading https://download.pytorch.org/whl/nightly/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting gguf>=0.1.0 (from -r /hom

In [53]:
MERGED_MODEL_DIR="./merged_full_model_for_gguf" 
OUTPUT_GGUF_FILE="./my_lora_finetuned_model.gguf"

# 1. Hugging Face Î™®Îç∏ÏùÑ GGUF ÌòïÏãùÏúºÎ°ú Î≥ÄÌôòÌï©ÎãàÎã§. (FP16 Ï†ïÎ∞ÄÎèÑÎ°ú Ï†ÄÏû•)
# Ïù¥ Í≥ºÏ†ïÏùÄ ÏñëÏûêÌôî Ï†Ñ Ï§ÄÎπÑ Îã®Í≥ÑÏûÖÎãàÎã§.
!python ./llama.cpp/convert_hf_to_gguf.py $MERGED_MODEL_DIR --outtype f16

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: merged_full_model_for_gguf
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:hf-to-gguf:gguf: indexing model part 'model.safetensors'
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {1792, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {1792}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8064, 1792}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {1792, 8064}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {1792, 8064}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {1792}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {1792, 1024}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> F16, shape = {3072, 1792}
INFO:hf-to-gguf:blk.0

In [62]:
!cd llama.cpp && mkdir build && cd build && cmake .. && make -j4

/home/ubuntu/llama.cpp


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


-- The C compiler identification is GNU 13.3.0
-- The CXX compiler identification is GNU 13.3.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.43.0") 
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (fou

In [71]:
!./llama.cpp/build/bin/llama-quantize ./merged_full_model_for_gguf/Merged_Full_Model_For_Gguf-2.1B-F16.gguf ./model-Q5_K_M.gguf Q5_K_M

main: build = 6912 (961660b8c)
main: built with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
main: quantizing './merged_full_model_for_gguf/Merged_Full_Model_For_Gguf-2.1B-F16.gguf' to './model-Q5_K_M.gguf' as Q5_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 290 tensors from ./merged_full_model_for_gguf/Merged_Full_Model_For_Gguf-2.1B-F16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Full_Model_For_Gguf
llama_model_loader: - kv   3:                         general.size_label str              = 2.1B
llama_model_loader: - kv   4:                          llama.block_count u32 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[   1/ 290]                   output_norm.weight - [ 1792,     1,     1,     1], type =    f32, size =    0.007 MiB
[   2/ 290]                    token_embd.weight - [ 1792, 128256,     1,     1], type =    f16, converting to q6_K .. size =   438.38 MiB ->   179.80 MiB
[   3/ 290]                  blk.0.attn_k.weight - [ 1792,  1024,     1,     1], type =    f16, converting to q5_K .. size =     3.50 MiB ->     1.20 MiB
[   4/ 290]               blk.0.attn_norm.weight - [ 1792,     1,     1,     1], type =    f32, size =    0.007 MiB
[   5/ 290]             blk.0.attn_output.weight - [ 3072,  1792,     1,     1], type =    f16, converting to q5_K .. size =    10.50 MiB ->     3.61 MiB
[   6/ 290]                  blk.0.attn_q.weight - [ 1792,  3072,     1,     1], type =    f16, converting to q5_K .. size =    10.50 MiB ->     3.61 MiB
[   7/ 290]                  blk.0.attn_v.weight - [ 1792,  1024,     1,     1], type =    f16, converting to q6_K .. size =     3.50 MiB ->     1.44 M