In [1]:
# --- Conversion settings ---------------------------------------------------
# Name of the model repository that gets downloaded and quantized.
model_name = "llama3-8b-spaetzle-v13"
# Hugging Face account hosting the original, unquantized model.
repo_orig = "cstr"
# Hugging Face account that receives the quantized upload.
username = "cstr"
# True: use the pre-tokenizer fix (update-script path).
# False: use the plain convert.py path instead.
fix_pretokenizer = True
# The two options below are only read by the convert.py path.
vocab_type = "bpe"
pad_vocab = True

In [None]:
# Fetch the llama.cpp sources, switch into the checkout and build the tools.
# NOTE(review): the clone is not pinned to a commit. Later cells call
# ./convert.py, ./quantize and ./main from this checkout - confirm that the
# revision you clone still provides them.
!git clone https://github.com/ggerganov/llama.cpp
# Every relative path used in the following cells is resolved inside llama.cpp.
%cd 'llama.cpp'
!make

In [None]:
!pip install huggingface_hub

token = 'HF_TOKEN' # you must set the token in Add-ons/Secrets and attach it to this notebook

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token_value = user_secrets.get_secret("HF_TOKEN")

from huggingface_hub import snapshot_download
outpath = snapshot_download(repo_id=repo_orig+"/"+model_name, token=token_value)

In [4]:

import os
import subprocess

if not fix_pretokenizer:
    # Plain conversion path: turn the downloaded checkpoint into an f16 file
    # with convert.py (used only when the pre-tokenizer fix is disabled).
    converted_model_path = f"/kaggle/model/{model_name}.bin"

    # /kaggle/model is otherwise only created by a later `!mkdir` cell, so make
    # sure it exists before convert.py is asked to write into it.
    os.makedirs(os.path.dirname(converted_model_path), exist_ok=True)

    # Start command with basic parameters. An argument list (no shell) keeps
    # paths with spaces or shell metacharacters intact.
    quantize_command = [
        "python", "./convert.py", f"{outpath}/",
        "--outtype", "f16",
        "--outfile", converted_model_path,
    ]

    # Add vocab-type parameter if applicable
    if vocab_type != "":
        quantize_command += ["--vocab-type", vocab_type]

    # Add pad-vocab parameter if true
    if pad_vocab:
        quantize_command.append("--pad-vocab")

    # Execute the command, capturing both streams as text
    result = subprocess.run(quantize_command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Check if the command was successful
    if result.returncode == 0:
        print("Command executed successfully.")
        print("Output:", result.stdout)
    else:
        print("Error in command execution.")
        print("Error:", result.stderr)

# Equivalent one-line shell invocations, kept for reference:
#!python ./convert.py {outpath}/ --outtype f16 --outfile /kaggle/model/{model_name}.bin --vocab-type bpe --pad-vocab
#!python ./convert.py {outpath}/ --outtype f16 --outfile /kaggle/model/{model_name}.bin --pad-vocab

In [None]:
%%writefile "convert-hf-to-gguf-update.py"
#!/usr/bin/env python3

# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
#   python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
#

import logging
import os
import requests
import sys
import json

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer

# Emit everything from DEBUG upwards. basicConfig() without a stream/filename
# installs a handler that writes to stderr, so log records do not mix with the
# print() output this script sends to stdout.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")


class TOKENIZER_TYPE(IntEnum):
    """Tokenizer kind of a model entry; values equal the former auto() numbering."""

    SPM = 1
    BPE = 2
    WPM = 3


# TODO: this string has to exercise as much pre-tokenizer functionality as possible
#       will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

# Exactly one positional argument (the Hugging Face token) is expected;
# anything else logs the usage line and exits with status 1.
if len(sys.argv) != 2:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

token = sys.argv[1]

# TODO: add models here, base models preferred
# Each entry provides:
#   name - used for the local models/tokenizers/<name> directory and as the
#          value assigned to `res` in the generated function
#   tokt - tokenizer type (SPM entries are skipped when hashes are generated)
#   repo - Hugging Face repository URL the tokenizer files are fetched from
# NOTE: the notebook adds its own entry by searching for the exact text of the
# "llama-bpe" line below, so keep that line byte-for-byte unchanged.
models = [
    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    #{"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
]

# make directory "models/tokenizers" if it doesn't exist
# (exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating)
os.makedirs("models/tokenizers", exist_ok=True)


def download_file_with_auth(url, token, save_path, timeout=60):
    """Download ``url`` using a bearer token and store the body at ``save_path``.

    Args:
        url: Address of the file to fetch.
        token: Access token, sent as ``Authorization: Bearer <token>``.
        save_path: Destination file; it is only written on a 200 response.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the script indefinitely.

    Returns:
        True when the file was written, False when the server answered with a
        non-200 status (nothing is written in that case).
    """
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # Name the URL and target so a failed file can be identified in the log.
        logger.error(f"Failed to download {url} to {save_path}. Status code: {response.status_code}")
        return False

    with open(save_path, 'wb') as f:
        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")
    return True


# download the tokenizer models
for model in models:
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    # An existing directory is treated as "already downloaded": its contents
    # are not refreshed or re-validated.
    if not os.path.exists(f"models/tokenizers/{name}"):
        os.makedirs(f"models/tokenizers/{name}")
    else:
        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
        continue

    logger.info(f"Downloading {name} to models/tokenizers/{name}")

    url = f"{repo}/raw/main/config.json"
    save_path = f"models/tokenizers/{name}/config.json"
    download_file_with_auth(url, token, save_path)

    url = f"{repo}/raw/main/tokenizer.json"
    save_path = f"models/tokenizers/{name}/tokenizer.json"
    download_file_with_auth(url, token, save_path)

    # if the raw download produced no file at all, or the downloaded file is
    # less than 1KB, we likely need to download an LFS instead.
    # The existence check prevents getsize() from raising FileNotFoundError
    # when the first request did not return 200.
    if not os.path.exists(save_path) or os.path.getsize(save_path) < 1024:
        # remove the file (if there is one)
        if os.path.exists(save_path):
            os.remove(save_path)
        url = f"{repo}/resolve/main/tokenizer.json"
        save_path = f"models/tokenizers/{name}/tokenizer.json"
        download_file_with_auth(url, token, save_path)

    if tokt == TOKENIZER_TYPE.SPM:
        url = f"{repo}/resolve/main/tokenizer.model"
        save_path = f"models/tokenizers/{name}/tokenizer.model"
        download_file_with_auth(url, token, save_path)

    url = f"{repo}/raw/main/tokenizer_config.json"
    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
    download_file_with_auth(url, token, save_path)

# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function

# One generated "if chkhsh == ..." snippet per non-SPM model, joined at the end.
src_if_blocks = []
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # SPM entries do not get a hash branch
    if tokt == TOKENIZER_TYPE.SPM:
        continue

    # load the tokenizer from the locally downloaded files
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    # fingerprint: sha256 over the string form of the encoded check text
    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()

    for label, value in (
        ("model", name),
        ("tokt", tokt),
        ("repo", model['repo']),
        ("chktok", chktok),
        ("chkhsh", chkhsh),
    ):
        logger.info(f"{label}: {value}")

    # log the "pre_tokenizer" section of tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
    pre_tokenizer = cfg["pre_tokenizer"]
    logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

    logger.info("")

    src_if_blocks.append(
        f"        if chkhsh == \"{chkhsh}\":\n"
        f"            # ref: {model['repo']}\n"
        f"            res = \"{name}\"\n"
    )

src_ifs = "".join(src_if_blocks)

# Template for the generated get_vocab_base_pre() method. This is an f-string:
# {repr(chktxt)} and {src_ifs} are substituted right here, while doubled braces
# such as {{chkhsh}} come out as single braces and therefore remain f-string
# fields inside the generated code.
src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

        chktxt = {repr(chktxt)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        logger.debug(f"chktok: {{chktok}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        res = None

        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
{src_ifs}
        if res is None:
            logger.warning("\\n")
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
            logger.warning("**************************************************************************************")
            logger.warning("\\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        return res
"""

# The generated function is written to stdout with print(); the notebook
# redirects stdout to a file and keeps everything up to the first line that
# contains "return res".
print(src_func) # noqa: NP100

logger.info("\n")
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n")

# generate tests for each tokenizer model

tests = [
    "ied 4 ½ months",
    "Führer",
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
    chktxt,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#

# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line

for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # load the tokenizer from the locally downloaded files
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    # inputs: each test string followed by the separator line
    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as inp_file:
        inp_file.writelines(f"{text}\n__ggml_vocab_test__\n" for text in tests)

    # expected outputs: one line per test, every token id prefixed by a space
    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as out_file:
        for text in tests:
            token_ids = tokenizer.encode(text, add_special_tokens=False)
            out_file.write("".join(f" {token_id}" for token_id in token_ids) + "\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files

logger.info("\nRun the following commands to generate the vocab files for testing:\n")

# one ready-to-run command per configured model, written to stdout
for name in (entry["name"] for entry in models):
    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

logger.info("\n")


In [None]:
import subprocess

if fix_pretokenizer:
    # Read the update script written by the %%writefile cell. %%writefile
    # produces UTF-8 and the script contains non-ASCII test text, so the
    # encoding is stated explicitly instead of relying on the locale default.
    with open("convert-hf-to-gguf-update.py", "r", encoding="utf-8") as file:
        script_contents = file.read()

    new_repo_name = repo_orig + "/" + model_name

    # Define the new line to be inserted
    new_line = f'    {{"name": "llama-bpe-1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/{new_repo_name}", }},'
    marker_line = '{"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },'

    lines = script_contents.split('\n')

    if new_line in lines:
        # Re-running this cell must not add the same model entry a second time.
        print("Script already contains the model entry - nothing to do.")
    else:
        # Insert the new line after the marker line
        marker_found = False
        for i, line in enumerate(lines):
            if marker_line in line:
                lines.insert(i + 1, new_line)
                marker_found = True
                break

        if not marker_found:
            print("Marker line not found in the file.")
        else:
            updated_script = "\n".join(lines)
            # Write the updated script to the file
            with open("convert-hf-to-gguf-update.py", "w", encoding="utf-8") as file:
                file.write(updated_script)
            print("Script updated successfully.")


In [None]:
if fix_pretokenizer:
    # Execute the script and capture its stdout in output.txt (stderr is left
    # untouched, exactly like the former `> output.txt` redirection).
    # The token is passed as a separate argv entry instead of being pasted into
    # a shell command line, so it is never interpreted by a shell.
    with open("output.txt", "w") as output_file:
        update_result = subprocess.run(
            ["python", "convert-hf-to-gguf-update.py", token_value],
            stdout=output_file,
        )

    # Surface a failing run instead of silently continuing with a partial file.
    if update_result.returncode != 0:
        print(f"convert-hf-to-gguf-update.py exited with code {update_result.returncode}; output.txt may be incomplete.")

In [None]:
if fix_pretokenizer:
    # Read the output from the file and keep everything up to and including the
    # first line that contains the end marker.
    end_line = "return res"
    end_line_found = False
    trimmed_output = []
    with open('output.txt', 'r') as file:
        for line in file:
            trimmed_output.append(line)
            if end_line in line:
                end_line_found = True
                break  # Stop adding lines after the end_line is found

    # Without the end marker the captured text is not a complete function, so
    # splicing it into the converter would produce a broken script.
    if not end_line_found:
        raise RuntimeError(f"'{end_line}' not found in output.txt - the update script did not print the generated function.")

    # Join the trimmed lines back into a single string
    script_output = "".join(trimmed_output)
    print("Captured Output:", script_output)

    # Function replacement from update script output
    def replace_function(source_path, output_path, start_marker, end_marker, new_function_content):
        """Copy ``source_path`` to ``output_path`` with one region replaced.

        Every line containing ``start_marker`` is replaced by
        ``new_function_content``; the following lines up to and including the
        next line containing ``end_marker`` are dropped. All other lines are
        copied unchanged.

        Raises:
            ValueError: if ``start_marker`` never occurs, or if no
                ``end_marker`` follows it. ``output_path`` is not written in
                either case.
        """
        # Python sources are UTF-8 by default, so read/write them as such.
        with open(source_path, 'r', encoding='utf-8') as source_file:
            source_lines = source_file.readlines()

        output_lines = []
        in_old_function = False
        replaced = False
        for line in source_lines:
            if start_marker in line:
                in_old_function = True
                replaced = True
                output_lines.append(new_function_content + "\n")
                continue
            if in_old_function:
                # Skip the old body; the end-marker line itself is dropped too.
                if end_marker in line:
                    in_old_function = False
                continue
            output_lines.append(line)

        if not replaced:
            raise ValueError(f"Start marker {start_marker!r} not found in {source_path}")
        if in_old_function:
            # Reaching EOF inside the old function would silently drop the
            # remainder of the file.
            raise ValueError(f"End marker {end_marker!r} not found after start marker in {source_path}")

        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(output_lines)

    # Setup of the function replacement
    start_marker = "def get_vocab_base_pre(self, tokenizer) -> str:"
    end_marker = "return res"
    new_function_content = script_output  # Use the trimmed script output as the new content

    source_path = "convert-hf-to-gguf.py"
    output_path = "convert-hf-to-gguf-updated.py"
    replace_function(source_path, output_path, start_marker, end_marker, new_function_content)
    print("Function replacement complete.")


In [55]:
!mkdir /kaggle/model

In [None]:
#subprocess.run(f"python convert.py {outpath}/ --vocab-only --outfile {outpath}/tokenizer.model --vocab-type bpe", shell = True)

In [None]:
#!python3 convert-hf-to-gguf.py models/tokenizers/llama-bpe-1/ --outfile models/ggml-vocab-llama-bpe-1.gguf --vocab-only

In [70]:
!rm {outpath}/tokenizer.model

In [None]:
if fix_pretokenizer:
    # Copy the tokenizer files that the update script downloaded for the
    # "llama-bpe-1" entry into the model snapshot directory. The glob needs a
    # shell; check=True aborts the cell if the copy fails, so the conversion
    # below never runs against a half-prepared directory.
    subprocess.run(f"cp -Lf models/tokenizers/llama-bpe-1/* {outpath}/", shell=True, check=True)

    # Create new tokenizer.model
    # subprocess.run(f"python convert.py {outpath}/ --vocab-only --outfile {outpath}/tokenizer.model --vocab-type bpe", shell = True)

    # Convert with the patched script; check=True makes a failed conversion
    # visible here instead of in the later quantization cell.
    subprocess.run(f"python convert-hf-to-gguf-updated.py {outpath}/ --outtype f16 --outfile /kaggle/model/{model_name}.bin", shell=True, check=True)


In [None]:
#!./quantize /kaggle/model/{model_name}.bin {model_name}-q4-k-m.gguf 15    
import os
import subprocess

# Define the model name and paths
original_model_path = f"/kaggle/model/{model_name}.bin"
quantized_model_path = f"{model_name}-q4-k-m.gguf"

# Run the quantization command.
# NOTE(review): 15 is the numeric quantization type handed to ./quantize; the
# output file name suggests it stands for Q4_K_M - confirm against the
# llama.cpp revision in use.
quantize_command = f"./quantize {original_model_path} {quantized_model_path} 15"
quantize_result = subprocess.run(quantize_command, shell=True)

if quantize_result.returncode != 0:
    # A quantized file left over from an earlier run must not be mistaken for a
    # fresh result, so the original is kept whenever the tool reports failure.
    print(f"Quantization failed with exit code {quantize_result.returncode}; keeping {original_model_path}.")
# Check if the quantized model exists
elif os.path.exists(quantized_model_path):
    print(f"Quantized model {quantized_model_path} exists.")

    # If the quantized model exists, delete the original .bin file
    os.remove(original_model_path)
    print(f"Deleted the original model file: {original_model_path}")
else:
    print(f"Quantized model {quantized_model_path} does not exist.")

In [None]:
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

api = HfApi(token=token_value)
repo_name = username + "/" + model_name + "-GGUF"

try:
    # Attempt to fetch the repository details.
    repo_info = api.repo_info(repo_name)
except RepositoryNotFoundError:
    # Only a missing repository leads to creation. Other failures (network,
    # authentication, ...) propagate instead of being mistaken for "not found".
    create_repo(repo_name, token=token_value, private=True)
    print(f"Repository '{repo_name}' has been created.")
else:
    print(f"Repository '{repo_name}' already exists.")
    # Check if the repository is private
    if not repo_info.private:
        print(f"Repository '{repo_name}' is public. Updating to private.")
        api.update_repo_visibility(repo_id=repo_name, private=True, token=token_value)
        print(f"Repository '{repo_name}' has been updated to private.")


In [None]:
# Upload the quantized file from the llama.cpp working directory into the
# target repository, keeping the same file name.
gguf_filename = f"{model_name}-q4-k-m.gguf"

api = HfApi(token=token_value)
api.upload_file(
    path_or_fileobj=f"/kaggle/working/llama.cpp/{gguf_filename}",
    path_in_repo=gguf_filename,
    repo_id=repo_name,
    repo_type="model",
)

In [None]:
# Smoke-test the quantized model: run the llama.cpp ./main binary on two
# reasoning prompts and inspect the generated answers by eye.
# NOTE(review): both prompts use <|im_start|> markers without a closing
# <|im_end|>; confirm this matches the chat template the model expects.
ggufpath = "/kaggle/working/llama.cpp/"+model_name+"-q4-k-m.gguf"
!./main -m {ggufpath} -c 512 -b 1024 -n 256 --keep 48 \
    --repeat_penalty 1.0 --color \
    -p "<|im_start|>user \
    I have 10 apples. I find 3 gold coins in the bottom of a river. The river runs near a big city that has something to do with what I can spend the coins on. I then lose 4 apples but gain a gold coin. Three birds run into my path and drop 6 apples each. I play an online game and win 6 gold coins but I have to share them equally with my 2 teammates. I buy apples for all the coins I have. The price of an apple is 0.5 coins. How many apples do I have? And where is the river?  \
    <|im_start|>assistant"
!./main -m {ggufpath} -c 512 -b 1024 -n 256 --keep 48 \
    --repeat_penalty 1.0 --color \
    -p "<|im_start|>user \
    Samantha has 3 brothers. Each brother has 2 sisters. How many sisters does Samantha have? \
    <|im_start|>assistant"