### Quantize GGUF
Based on TheBloke's script for GGML conversion and quantization.

In [None]:
# Install llama.cpp
# Clones the llama.cpp repository into /kaggle and builds it in place. The
# quantize cell later in this notebook runs /kaggle/llama.cpp/convert.py and
# /kaggle/llama.cpp/quantize, so both must exist after this cell finishes.
%cd /kaggle/
!git clone https://github.com/ggerganov/llama.cpp
%cd /kaggle/llama.cpp
# Install the Python packages listed in the repository's requirements.txt
!pip install -r requirements.txt
# NOTE(review): presumably this produces the `quantize` binary in the repo
# root - confirm against the llama.cpp revision that was cloned.
!make
# Return to /kaggle so the relative paths used by later cells resolve there
%cd /kaggle/

In [None]:
# Login to hub
# NOTE(review): presumably this prompts for a Hugging Face access token; the
# last cell creates a private repo and uploads to it, so the token likely
# needs write permission - confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Fetch every file of the selected Hugging Face repo into a local folder

from huggingface_hub import snapshot_download

# Repository to fetch
repo_id = "TheBloke/Llama-2-13B-fp16"

# Branch of that repository
revision = "main"

# Target folder, relative to the current working directory: the repo id with
# '/' replaced by '_' (e.g. "./TheBloke_Llama-2-13B-fp16")
local_dir = f"./{repo_id.replace('/', '_')}"

snapshot_download(repo_id=repo_id, revision=revision, local_dir=local_dir)

# Report where the files were placed so later cells can point at the folder
print(f"Model dir: '{local_dir}'")

In [None]:
# This step is necessary only if your base model is a standard 32000-vocab
# model AND the uploader accidentally kept added_tokens.json in the repo.
# NOTE(review): the directory below is hard-coded; if a different repo_id was
# downloaded, change it to the folder printed by the download cell.

# Remove added_tokens.json
%cd /kaggle/TheBloke_Llama-2-13B-fp16
%rm added_tokens.json
# Return to /kaggle so the relative paths used by later cells resolve there
%cd /kaggle/

In [None]:
# Set variables
# Directory holding the downloaded model (relative to the current working
# directory); passed to quantize() as `model`
input_dir = "./TheBloke_Llama-2-13B-fp16"
# File-name prefix of every generated .gguf file; passed to quantize() as
# `outbase`
base_model_name = "Llama-2-13B"
# Read by quantize(): when true, the intermediate fp16 GGUF is deleted after
# all quantized files have been written
remove_fp16 = True

# Run quantize
import os
import subprocess

def quantize(model, outbase, outdir):
    """Convert a model directory to GGUF and quantise it with llama.cpp.

    An unquantised f16 GGUF is produced with llama.cpp's ``convert.py``
    (skipped when that file already exists in ``outdir``). The llama.cpp
    ``quantize`` binary is then run once per entry of the hard-coded type
    list, writing ``<outdir>/<outbase>.<type>.gguf`` each time.

    Args:
        model: Path of the model directory; it must contain ``config.json``.
        outbase: File-name prefix shared by every generated ``.gguf`` file.
        outdir: Directory receiving the output files; created if missing.

    Raises:
        Exception: If ``model`` is not a directory or lacks ``config.json``.
        subprocess.CalledProcessError: If a llama.cpp command exits non-zero.
        FileNotFoundError: If the ``quantize`` executable does not exist
            (commands are started directly, without a shell).

    Note:
        Reads the module-level ``remove_fp16`` flag: when it is true the
        intermediate f16 file is deleted after all quants were written.
    """
    import sys  # function-scope import keeps the notebook cell self-contained

    llamabase = "/kaggle/llama.cpp"

    if not os.path.isdir(model):
        raise Exception(f"Could not find model dir at {model}")

    if not os.path.isfile(f"{model}/config.json"):
        raise Exception(f"Could not find config.json in {model}")

    os.makedirs(outdir, exist_ok=True)
    fp16 = f"{outdir}/{outbase}.fp16.gguf"

    if not os.path.isfile(fp16):
        print(f"Making unquantised GGUF at {fp16}")
        # Argument lists (no shell) keep paths with spaces or shell
        # metacharacters intact; sys.executable is the interpreter running
        # this notebook, i.e. the one the requirements were installed into.
        subprocess.run(
            [
                sys.executable,
                f"{llamabase}/convert.py",
                model,
                "--outtype",
                "f16",
                "--outfile",
                fp16,
            ],
            check=True,
        )
    else:
        print(f"Unquantised GGUF already exists at: {fp16}")

    print("Making quants")
    for quant_type in ["q4_K_S", "q5_K_M"]:
        outfile = f"{outdir}/{outbase}.{quant_type}.gguf"
        print(f"Making {quant_type} : {outfile}")
        subprocess.run(
            [f"{llamabase}/quantize", fp16, outfile, quant_type],
            check=True,
        )

    # Only reached when every quantisation succeeded (check=True raises
    # otherwise), so a failed run keeps the f16 file for the next attempt.
    if remove_fp16:
        os.remove(fp16)

# Output files are written to the "quantized" directory, relative to the
# current working directory, as <base_model_name>.<type>.gguf
quantize(input_dir, base_model_name, "quantized")

In [None]:
# Destination settings: Hub account and the file-name prefix of the quants
username = "username"
base_model_name = "Llama-2-13B"

# Upload the quantized files to the Hugging Face Hub
from huggingface_hub import create_repo, HfApi

# Build the target repository id once and reuse it for both calls
gguf_repo_id = f"{username}/{base_model_name}-GGUF"

# Create the private model repository; exist_ok=True is passed so re-running
# the cell does not fail on an already existing repository
create_repo(
    repo_id=gguf_repo_id,
    private=True,
    repo_type="model",
    exist_ok=True,
)

# Only files matching "<base_model_name>*.gguf" inside the folder are uploaded
api = HfApi()
api.upload_folder(
    folder_path="/kaggle/quantized",
    repo_id=gguf_repo_id,
    allow_patterns=f"{base_model_name}*.gguf",
)