## OS: Ubuntu 22.04 LTS
## Kernel: Python (gguf)

## 모델 다운로드 및 변환(로컬)

### 1. Download model

In [None]:
# Select the model: `model_id` is the Hugging Face repository to download,
# `model_name` is the local name used for the saved checkpoint and GGUF files.
#
# Alternative model (swap in to use it instead):
#   model_id = "sh2orc/Llama-3.1-Korean-8B-Instruct"
#   model_name = "Llama-3.1-Korean-8B-Instruct"
model_id, model_name = (
    "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
    "EXAONE-3.0-7.8B-Instruct",
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
import os
import json
import shutil

# Authenticate with Hugging Face using the token stored in the shared key file.
# WARNING: download new key.json in notion
curpath = os.path.abspath(os.getcwd())
# Pass the path components to os.path.join separately. Gluing the relative
# part onto `curpath` with "+" fuses its first ".." into the last directory
# name of `curpath`, so the path reads as three levels up while actually
# resolving two levels up. The components below spell out that effective
# location (<cwd>/../../key/key.json) explicitly.
# NOTE(review): if three levels up was really intended, add one more "..".
key_path = os.path.abspath(os.path.join(curpath, "..", "..", "key", "key.json"))
with open(key_path, "r", encoding="utf-8") as f:
    key_data = json.load(f)

# Raises KeyError if the key file has no "HUGGING-FACE" entry.
login(token=key_data["HUGGING-FACE"])

# Load the model and tokenizer, downloading into a throwaway cache directory
# next to the notebook.
# NOTE: trust_remote_code=True runs Python code shipped in the model
# repository; only use it with repositories you trust.
cache_path = os.path.join(curpath, ".cache")
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_path)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, cache_dir=cache_path)

# Save the model and tokenizer to <cwd>/save/<model_name>.
save_directory = os.path.join(curpath, "save")
save_file = os.path.join(save_directory, model_name)
# exist_ok=True makes re-running the cell safe without a separate exists() check.
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_file)
tokenizer.save_pretrained(save_file)

# Delete the download cache now that the checkpoint has been saved.
if os.path.exists(cache_path):
    shutil.rmtree(cache_path)

### 2. Convert into Float16 gguf

In [None]:
# Convert the saved Hugging Face checkpoint in ./save/{model_name}/ into a
# single GGUF file using llama.cpp's converter script. IPython substitutes
# {model_name} with the Python variable before passing the line to the shell.
# The relative paths assume the notebook runs from the directory that contains
# both llama.cpp/ and save/.
# NOTE(review): no --outtype is passed, so the precision is the script's
# default; the "-fp16" suffix presumes that default is f16 — confirm.
!python llama.cpp/convert_hf_to_gguf.py ./save/{model_name}/ --outfile ./save/{model_name}-fp16.gguf


### 3. Quantize

In [None]:

# Quantization type passed to llama-quantize; it is also embedded in the
# output file name.
# Alternative: quantize = "Q8_0"
quantize = "Q4_K_M"

In [None]:
# Quantize the fp16 GGUF: arguments are input file, output file, then the
# quantization type. IPython substitutes {model_name} and {quantize} with the
# Python variables before passing the line to the shell.
# NOTE(review): assumes the llama-quantize binary has been built and sits
# directly under llama.cpp/ — confirm for your llama.cpp build layout.
!llama.cpp/llama-quantize ./save/{model_name}-fp16.gguf ./save/{model_name}-{quantize}.gguf {quantize}