## OS: Ubuntu 22.04 LTS
## Kernel: Python (gguf)

## 모델 다운로드 및 변환(로컬)

### 1. Download model

In [1]:
# Hugging Face repository ID of the model to download ("<organization>/<name>").
# Switch models by changing this single line.
model_id = "sh2orc/Llama-3.1-Korean-8B-Instruct"
# model_id = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Local name used for the save directory and the .gguf file names. It is derived
# from model_id (the part after the last "/") so the two values cannot drift
# apart when a different model is selected.
model_name = model_id.rsplit("/", 1)[-1]

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import os
import json
import shutil

# WARNING: key.json is not part of this repository -- download a fresh copy
# from Notion before running this cell.
curpath = os.path.abspath(os.getcwd())

# The key file lives two directories above the working directory, in key/key.json.
# The previous expression concatenated "../../../key/key.json" straight onto
# curpath (no separator), so the first ".." fused with the last directory name
# and the path only *looked* like three levels up. The explicit components
# below resolve to exactly the same location.
key_path = os.path.abspath(os.path.join(curpath, "..", "..", "key", "key.json"))
with open(key_path, 'r', encoding='utf-8') as f:
    key_data = json.load(f)

# Authenticate against the Hugging Face Hub with the token stored in key.json.
login(token=key_data["HUGGING-FACE"])

# Load the model and tokenizer, downloading into a cache directory next to the notebook.
# NOTE(review): trust_remote_code=True allows the model repository to run its own
# Python code while loading -- keep it only for repositories you trust.
cache_path = os.path.join(curpath, ".cache")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_path)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_path)

# Save the model and tokenizer to <cwd>/save/<model_name>.
save_directory = os.path.join(curpath, "save")
save_file = os.path.join(save_directory, model_name)
# makedirs(exist_ok=True) replaces the separate exists()/mkdir() pair and is
# safe to re-run when the directory is already there.
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_file)
tokenizer.save_pretrained(save_file)

# Remove the download cache now that the model has been saved.
if os.path.exists(cache_path):
    shutil.rmtree(cache_path)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/abcd/.cache/huggingface/token
Login successful


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  15%|#5        | 755M/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

### 2. Convert to Float16 GGUF

In [8]:
!python llama.cpp/convert_hf_to_gguf.py ./save/{model_name}/ --outfile ./save/{model_name}-fp16.gguf


INFO:hf-to-gguf:Loading model: Llama-3.1-Korean-8B-Instruct
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00007.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float32 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float32 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float32 --> F16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float32 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float32 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float32 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float32 --> F16, shape = {4096, 1024}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float32 --

### 3. Quantize

In [11]:

# Quantization type handed to llama-quantize; it also becomes part of the
# output file name. Replace the value with "Q4_K_M" to build that variant instead.
quantize = "Q8_0"

In [12]:
# Quantize the fp16 GGUF file with llama.cpp's llama-quantize tool.
# Arguments, in order: input file, output file, quantization type.
# The output file name embeds the selected type (e.g. <model_name>-Q8_0.gguf).
!llama.cpp/llama-quantize ./save/{model_name}-fp16.gguf ./save/{model_name}-{quantize}.gguf {quantize}

main: build = 3608 (50addec9)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing './save/Llama-3.1-Korean-8B-Instruct-fp16.gguf' to './save/Llama-3.1-Korean-8B-Instruct-Q8_0.gguf' as Q8_0
llama_model_loader: loaded meta data with 28 key-value pairs and 292 tensors from ./save/Llama-3.1-Korean-8B-Instruct-fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.1 Korean 8B Instruct
llama_model_loader: - kv   3:                       general.organization str              = Sh2Orc
llama_model_loader: - kv   4:                           general.finetune str              = Instru