In [None]:
# --- Run configuration -------------------------------------------------------
# The source model is downloaded from the Hub as f"{repo_orig}/{model_name}".
model_name = "phi-2-spaetzle-v4"
repo_orig = "cstr"
# Hub account that receives the quantized upload.
username = "cstr"
# When True, the convert-hf-to-gguf-update.py pretokenizer workflow is used.
fix_pretokenizer = True
# The next two settings are only consumed by the convert.py code path.
vocab_type = "bpe"  # e.g. "bpe" (llama3) or "spm"
pad_vocab = True
# Quantization type handed to ./quantize; "Q4_K_M" is often a good compromise.
quantization = "Q4_K_M"
# Name used for the BPE pretokenizer entry (use "llama-bpe" for llama3).
model_type = "phi2"
# When True, the unquantized .bin is deleted once the quantized file exists.
save_space = False
print("parameters initialized.")

These are the quantization types we can use:
```
   2  or  Q4_0    :  3.56G, +0.2166 ppl @ LLaMA-v1-7B
   3  or  Q4_1    :  3.90G, +0.1585 ppl @ LLaMA-v1-7B
   8  or  Q5_0    :  4.33G, +0.0683 ppl @ LLaMA-v1-7B
   9  or  Q5_1    :  4.70G, +0.0349 ppl @ LLaMA-v1-7B
  19  or  IQ2_XXS :  2.06 bpw quantization
  20  or  IQ2_XS  :  2.31 bpw quantization
  10  or  Q2_K    :  2.63G, +0.6717 ppl @ LLaMA-v1-7B
  21  or  Q2_K_S  :  2.16G, +9.0634 ppl @ LLaMA-v1-7B
  23  or  IQ3_XXS :  3.06 bpw quantization
  12  or  Q3_K    : alias for Q3_K_M
  22  or  Q3_K_XS : 3-bit extra small quantization
  11  or  Q3_K_S  :  2.75G, +0.5551 ppl @ LLaMA-v1-7B
  12  or  Q3_K_M  :  3.07G, +0.2496 ppl @ LLaMA-v1-7B
  13  or  Q3_K_L  :  3.35G, +0.1764 ppl @ LLaMA-v1-7B
  15  or  Q4_K    : alias for Q4_K_M
  14  or  Q4_K_S  :  3.59G, +0.0992 ppl @ LLaMA-v1-7B
  15  or  Q4_K_M  :  3.80G, +0.0532 ppl @ LLaMA-v1-7B
  17  or  Q5_K    : alias for Q5_K_M
  16  or  Q5_K_S  :  4.33G, +0.0400 ppl @ LLaMA-v1-7B
  17  or  Q5_K_M  :  4.45G, +0.0122 ppl @ LLaMA-v1-7B
  18  or  Q6_K    :  5.15G, +0.0008 ppl @ LLaMA-v1-7B
   7  or  Q8_0    :  6.70G, +0.0004 ppl @ LLaMA-v1-7B
   1  or  F16     : 13.00G              @ 7B
   0  or  F32     : 26.00G              @ 7B
```

In [None]:
!git clone https://github.com/ggerganov/llama.cpp
%cd 'llama.cpp'
!make

In [None]:
!pip install huggingface_hub

token = 'HF_TOKEN' # you must set the token in Add-ons/Secrets and attach it to this notebook

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token_value = user_secrets.get_secret("HF_TOKEN")

from huggingface_hub import snapshot_download
outpath = snapshot_download(repo_id=repo_orig+"/"+model_name, token=token_value)

In [None]:
# Show where the downloaded snapshot lives on disk.
print(f"downloaded to:  {outpath}")

In [None]:
import subprocess

if not fix_pretokenizer and not model_type == "phi2":
    # Conversion via convert.py; skipped for phi2 and whenever the pretokenizer
    # fix is enabled.
    import os

    # The output directory must exist before the converter writes into it.
    os.makedirs("/kaggle/model", exist_ok=True)

    # Despite its name this is the conversion command. It is built as an
    # argument list so the paths reach convert.py verbatim, without shell parsing.
    quantize_command = [
        "python", "./convert.py", f"{outpath}/",
        "--outtype", "f16",
        "--outfile", f"/kaggle/model/{model_name}.bin",
    ]

    # Add vocab-type parameter if applicable
    if vocab_type != "":
        quantize_command += ["--vocab-type", vocab_type]

    # Add pad-vocab parameter if true
    if pad_vocab:
        quantize_command.append("--pad-vocab")

    # Execute the command and capture both streams for the report below.
    result = subprocess.run(quantize_command, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if result.returncode == 0:
        print("Command executed successfully.")
        print("Output:", result.stdout)
    else:
        # Show stdout as well: it may hold the context needed to diagnose the failure.
        print("Error in command execution.")
        print("Output:", result.stdout)
        print("Error:", result.stderr)

#!python ./convert.py {outpath}/ --outtype f16 --outfile /kaggle/model/{model_name}.bin --vocab-type bpe --pad-vocab
#!python ./convert.py {outpath}/ --outtype f16 --outfile /kaggle/model/{model_name}.bin --pad-vocab

In [None]:
# in case we must revert:
#!wget "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/convert-hf-to-gguf-update.py" -O convert-hf-to-gguf-update.py

In [None]:
import subprocess

if fix_pretokenizer:
    # Add an entry for our model to convert-hf-to-gguf-update.py, directly after
    # the existing "llama-bpe" entry.
    update_script_path = "convert-hf-to-gguf-update.py"
    with open(update_script_path, "r") as file:
        script_contents = file.read()

    # Repo the newly listed model originates from.
    new_repo_name = repo_orig + "/" + model_name

    # Line to insert, and the existing line it is inserted after.
    new_line = f'    {{"name": "{model_type}", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/{new_repo_name}", }},'
    marker_line = '{"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },'

    lines = script_contents.split('\n')
    marker_found = False

    if any(line.strip() == new_line.strip() for line in lines):
        # Re-running this cell must not insert a second, duplicate entry.
        print("Script already contains the entry; nothing to do.")
    else:
        for i, line in enumerate(lines):
            if marker_line in line:
                lines.insert(i + 1, new_line)
                marker_found = True
                break

        if not marker_found:
            print("Marker line not found in the file.")
        else:
            # Write the updated script back in place.
            with open(update_script_path, "w") as file:
                file.write("\n".join(lines))
            print("Script updated successfully.")


In [None]:
if fix_pretokenizer:
    # Run the update script and capture its stdout in output.txt.
    # The argument list (no shell) passes the token as a single argument, so it
    # is never interpreted by a shell.
    with open("output.txt", "w") as output_file:
        update_result = subprocess.run(
            ["python", "convert-hf-to-gguf-update.py", token_value],
            stdout=output_file,
        )
    # Surface a failure here instead of letting later cells work on bad output.
    if update_result.returncode != 0:
        print(f"convert-hf-to-gguf-update.py exited with code {update_result.returncode}.")

In [None]:
if fix_pretokenizer:
    # Split the captured update-script output at the first line containing
    # `end_line`: everything up to and including that line becomes
    # `script_output`, whatever follows becomes `remainder_script_output`.
    # Without such a line the whole output lands in `script_output`.
    end_line = "return res"
    with open('output.txt', 'r') as captured:
        output_lines = captured.readlines()

    split_at = next(
        (idx + 1 for idx, text in enumerate(output_lines) if end_line in text),
        len(output_lines),
    )
    script_output = "".join(output_lines[:split_at])
    remainder_script_output = "".join(output_lines[split_at:])

    print("Captured Output:", script_output)
    print("Remainder Output:", remainder_script_output)  # consumed by a later cell

    # Function replacement from update script output
    def replace_function(source_path, output_path, start_marker, end_marker, new_function_content):
        """Copy ``source_path`` to ``output_path`` with a marked region replaced.

        Lines are copied unchanged until a line containing ``start_marker`` is
        seen. That line and every following line, up to and including the first
        line containing ``end_marker``, are dropped; ``new_function_content``
        followed by a newline is written in their place.

        Args:
            source_path: File to read.
            output_path: File to write (overwritten).
            start_marker: Substring identifying the first line of the region.
            end_marker: Substring identifying the last line of the region.
            new_function_content: Text written instead of the region.

        Returns:
            True when a complete region (start and end marker) was replaced,
            False otherwise. A warning is printed when the start marker never
            occurs (the output is a plain copy) or when no end marker follows a
            start marker (the remainder of the source is not copied).
        """
        replaced = False
        in_old_function = False
        with open(source_path, 'r') as source_file, open(output_path, 'w') as output_file:
            for line in source_file:
                if start_marker in line:
                    in_old_function = True
                    output_file.write(new_function_content + "\n")
                    continue
                if in_old_function:
                    # Still inside the region being replaced: drop the line and
                    # leave the region once its closing line has been seen.
                    if end_marker in line:
                        in_old_function = False
                        replaced = True
                    continue
                output_file.write(line)

        if in_old_function:
            print(f"Warning: end marker {end_marker!r} not found in {source_path}; the rest of the file was dropped.")
            return False
        if not replaced:
            print(f"Warning: start marker {start_marker!r} not found in {source_path}; nothing was replaced.")
        return replaced

    # Write a copy of convert-hf-to-gguf.py in which the region from the
    # `get_vocab_base_pre` definition through its "return res" line is replaced
    # by the text captured from the update script.
    source_path, output_path = "convert-hf-to-gguf.py", "convert-hf-to-gguf-updated.py"
    start_marker, end_marker = "def get_vocab_base_pre(self, tokenizer) -> str:", "return res"
    new_function_content = script_output  # trimmed update-script output
    replace_function(
        source_path=source_path,
        output_path=output_path,
        start_marker=start_marker,
        end_marker=end_marker,
        new_function_content=new_function_content,
    )
    print("Function replacement complete.")

In [None]:
if fix_pretokenizer:
    # Every non-blank line of the remaining update-script output is run as a
    # shell command, and the outcome of each one is reported.
    # NOTE(review): the lines are executed verbatim through the shell; confirm
    # the remainder really contains only commands before relying on this.
    for command in remainder_script_output.strip().split('\n'):
        if not command.strip():
            continue  # skip lines that are only whitespace
        try:
            print(f"Executing: {command}")
            result = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if result.returncode != 0:
                print("Error in command execution.")
                print("Error:", result.stderr)
            else:
                print("Command executed successfully.")
                print("Output:", result.stdout)
        except Exception as e:
            print(f"An error occurred while executing {command}: {str(e)}")

In [10]:
!mkdir /kaggle/model

In [None]:
if fix_pretokenizer:
    import shlex

    # Copy everything under models/tokenizers/<model_type>/ into the downloaded
    # model directory. The shell is required for the `*` glob, so the directory
    # parts are quoted to survive unusual characters in the paths.
    tokenizer_dir = shlex.quote(f"models/tokenizers/{model_type}")
    target_dir = shlex.quote(f"{outpath}/")
    copy_result = subprocess.run(f"cp -Lf {tokenizer_dir}/* {target_dir}", shell=True)
    if copy_result.returncode != 0:
        print(f"Warning: copying the tokenizer files failed (exit code {copy_result.returncode}).")

    # Convert the model to f16 with the patched conversion script.
    convert_result = subprocess.run([
        "python", "convert-hf-to-gguf-updated.py", f"{outpath}/",
        "--outtype", "f16",
        "--outfile", f"/kaggle/model/{model_name}.bin",
    ])
    if convert_result.returncode != 0:
        print(f"Warning: convert-hf-to-gguf-updated.py failed (exit code {convert_result.returncode}).")


In [None]:
#!./quantize /kaggle/model/{model_name}.bin {model_name}-q4-k-m.gguf 15    
import os
import subprocess

# Input (f16 conversion result) and output (quantized file, relative to the
# current working directory) of the quantization step.
original_model_path = f"/kaggle/model/{model_name}.bin"
quantized_model_path = f"{model_name}_{quantization}.gguf"

# Argument list (no shell) so the paths are handed to ./quantize verbatim.
quantize_command = ["./quantize", original_model_path, quantized_model_path, quantization]
quantize_result = subprocess.run(quantize_command)

# A file left over from an earlier run must not be mistaken for success, so the
# exit code is checked together with the existence of the output file.
if quantize_result.returncode == 0 and os.path.exists(quantized_model_path):
    print(f"Quantized model {quantized_model_path} exists.")

    # Only a successful quantization justifies deleting the original .bin file.
    if save_space:
        os.remove(original_model_path)
        print(f"Deleted the original model file: {original_model_path}")
elif quantize_result.returncode != 0:
    print(f"Quantization failed with exit code {quantize_result.returncode}.")
else:
    print(f"Quantized model {quantized_model_path} does not exist.")

In [None]:
from huggingface_hub import HfApi, create_repo

from huggingface_hub.utils import RepositoryNotFoundError

api = HfApi(token=token_value)
repo_name = username + "/" + model_name + "-" + quantization + "-GGUF"

# Make sure a private target repository exists.
try:
    repo_info = api.repo_info(repo_name)
except RepositoryNotFoundError:
    # Only a missing repository triggers creation. Other failures (network,
    # authentication, ...) propagate instead of being mistaken for "not found".
    create_repo(repo_name, token=token_value, private=True)
    print(f"Repository '{repo_name}' has been created.")
else:
    print(f"Repository '{repo_name}' already exists.")
    # Check if the repository is private
    if not repo_info.private:
        print(f"Repository '{repo_name}' is public. Updating to private.")
        api.update_repo_visibility(repo_id=repo_name, private=True, token=token_value)
        print(f"Repository '{repo_name}' has been updated to private.")


In [None]:
# Upload the quantized file from the llama.cpp checkout to the target repo,
# keeping the same file name on the Hub.
gguf_filename = f"{model_name}_{quantization}.gguf"
api = HfApi(token=token_value)
api.upload_file(
    repo_id=repo_name,
    repo_type="model",
    path_in_repo=gguf_filename,
    path_or_fileobj=f"/kaggle/working/llama.cpp/{gguf_filename}",
)

In [None]:
# Sanity check on the unquantized conversion result: run llama.cpp's ./main on
# the f16 .bin file with a prompt wrapped in <|im_start|> tags.
ggufpath = f"/kaggle/model/{model_name}.bin"
!./main -m {ggufpath} -c 512 -b 1024 -n 256 --keep 48 \
    --repeat_penalty 1.0 --color \
    -p "<|im_start|>user \
    I have 10 apples. I find 3 gold coins in the bottom of a river. The river runs near a big city that has something to do with what I can spend the coins on. I then lose 4 apples but gain a gold coin. Three birds run into my path and drop 6 apples each. I play an online game and win 6 gold coins but I have to share them equally with my 2 teammates. I buy apples for all the coins I have. The price of an apple is 0.5 coins. How many apples do I have? And where is the river?  \
    <|im_start|>assistant"


In [None]:
# Same sanity check on the quantized file inside the llama.cpp checkout; the
# first prompt matches the one used for the unquantized model, so the two
# answers can be compared.
ggufpath = "/kaggle/working/llama.cpp/"+model_name+"_" + quantization + ".gguf"
!./main -m {ggufpath} -c 512 -b 1024 -n 256 --keep 48 \
    --repeat_penalty 1.0 --color \
    -p "<|im_start|>user \
    I have 10 apples. I find 3 gold coins in the bottom of a river. The river runs near a big city that has something to do with what I can spend the coins on. I then lose 4 apples but gain a gold coin. Three birds run into my path and drop 6 apples each. I play an online game and win 6 gold coins but I have to share them equally with my 2 teammates. I buy apples for all the coins I have. The price of an apple is 0.5 coins. How many apples do I have? And where is the river?  \
    <|im_start|>assistant"
# Second, shorter reasoning prompt against the same quantized file.
!./main -m {ggufpath} -c 512 -b 1024 -n 256 --keep 48 \
    --repeat_penalty 1.0 --color \
    -p "<|im_start|>user \
    Samantha has 3 brothers. Each brother has 2 sisters. How many sisters does Samantha have? \
    <|im_start|>assistant"