<a href="https://colab.research.google.com/github/AiratGaliev/llama-cpp-server-colab/blob/main/llama_cpp_server_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Start llama-cpp-server

#@markdown If unsure about the branch, write "main" or leave it blank.
# Work from /content: the config file and server log created later in this
# cell use relative paths, so they end up next to /content/models.
%cd /content
# aria2c is the downloader used for the model files further down.
!apt-get -y install -qq aria2
# Register ngrok's signing key and apt source so that `apt install ngrok` below can find the package.
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | tee /etc/apt/sources.list.d/ngrok.list
!apt update
!apt list --upgradable
# NOTE(review): the next two apt commands omit -y, unlike the aria2 install
# above — confirm they behave as intended when a confirmation prompt appears.
!apt dist-upgrade
!apt install ngrok
# Install llama-cpp-python (with the server extra) pinned to 0.2.39, forcing a
# CMake build with -DLLAMA_CUBLAS=on and nvcc taken from /usr/local/cuda.
!CUDACXX=/usr/local/cuda/bin/nvcc CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python[server]==0.2.39
import json

import uvicorn
from llama_cpp.server.app import create_app
from llama_cpp.server.settings import ConfigFileSettings, ServerSettings

# Parameters (rendered as a Colab form through the `# @param` annotations).
# For the available chat_format values, see
# https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama_chat_format.py#L613

# ngrok settings: the token is passed to `ngrok config add-authtoken` and the
# domain to `ngrok http --domain=...` at the end of this cell.
ngrok_auth_token = "" # @param {type:"string"}
ngrok_domain = "wasp-immortal-factually.ngrok-free.app" # @param {type:"string"}
# The next six values are copied unchanged into every model entry of the
# generated server config.
n_gpu_layers = -1 # @param {type:"integer"}
offload_kqv = True # @param {type:"boolean"}
n_threads = 12 # @param {type:"integer"}
n_batch = 1024 # @param {type:"integer"}
n_ctx = 16384 # @param {type:"integer"}
cache = False # @param {type:"boolean"}
# Quantization tag inserted into the GGUF file name when a model is given as a
# Hugging Face repo id instead of a direct .gguf link.
quant_method = "Q6_K" # @param ["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"]
# For each model below: model_url_N is either a Hugging Face repo id
# ("owner/name") or a direct https link ending in .gguf; model_alias_N and
# chat_format_N are written to the server config as-is; include_model_N
# decides whether the model is kept. Model 1 has no checkbox and is always kept.
# @markdown 1. Model
model_url_1 = "TheBloke/dolphin-2.6-mistral-7B-dpo-laser-GGUF" # @param {type:"string"}
model_alias_1 = "dolphin" # @param {type:"string"}
chat_format_1 = "chatml" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
# @markdown 2. Model
model_url_2 = "TheBloke/openchat-3.5-0106-GGUF" # @param {type:"string"}
model_alias_2 = "openchat" # @param {type:"string"}
chat_format_2 = "openchat" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
include_model_2 = True #@param {type:"boolean"}
# @markdown 3. Model
model_url_3 = "TheBloke/OpenHermes-2.5-Mistral-7B-16k-GGUF" # @param {type:"string"}
model_alias_3 = "openhermes" # @param {type:"string"}
chat_format_3 = "chatml" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
include_model_3 = True # @param {type:"boolean"}
# @markdown 4. Model
model_url_4 = "TheBloke/xDAN-L1-Chat-RL-v1-GGUF" # @param {type:"string"}
model_alias_4 = "xdan" # @param {type:"string"}
chat_format_4 = "chatml" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
include_model_4 = True # @param {type:"boolean"}
# @markdown 5. Model
model_url_5 = "TheBloke/Starling-LM-7B-alpha-GGUF" # @param {type:"string"}
model_alias_5 = "starling" # @param {type:"string"}
chat_format_5 = "openchat" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
include_model_5 = True # @param {type:"boolean"}
# @markdown 6. Model
model_url_6 = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF" # @param {type:"string"}
model_alias_6 = "capybarahermes" # @param {type:"string"}
chat_format_6 = "chatml" # @param ["openchat", "chatml", "llama-2", "alpaca", "qwen", "vicuna", "oasst_llama", "baichuan-2", "baichuan", "openbuddy", "redpajama-incite", "snoozy", "phind", "intel", "open-orca", "mistrallite", "zephyr", "pygmalion", "mistral-instruct", "chatglm3", "saiga", "functionary"]
include_model_6 = True # @param {type:"boolean"}

# Server-wide settings plus one entry per model slot of the form. Every entry
# carries an "include_model" flag taken from the matching checkbox; the first
# slot has no checkbox and is hard-wired to True.
json_cfg = {
    "host": "0.0.0.0",
    "port": 1234,
    "models": [
        {
            "model": slot_url,
            "model_alias": slot_alias,
            "chat_format": slot_format,
            "include_model": slot_enabled,
        }
        for slot_url, slot_alias, slot_format, slot_enabled in (
            (model_url_1, model_alias_1, chat_format_1, True),
            (model_url_2, model_alias_2, chat_format_2, include_model_2),
            (model_url_3, model_alias_3, chat_format_3, include_model_3),
            (model_url_4, model_alias_4, chat_format_4, include_model_4),
            (model_url_5, model_alias_5, chat_format_5, include_model_5),
            (model_url_6, model_alias_6, chat_format_6, include_model_6),
        )
    ],
}

models_dir = "/content/models"
json_cfg["models"] = [model for model in json_cfg["models"] if model.get("include_model", True)]
for model_dict in json_cfg["models"]:
    del model_dict["include_model"]
    model = model_dict["model"].strip()
    model_url = ""
    if model != "":
      if model.startswith('https') and model.endswith('.gguf'):
        model_url = model
      else:
        model_name = model.split('/')[-1].lower().rstrip("-gguf") + f".{quant_method}.gguf"
        model_url = f"https://huggingface.co/{model}/resolve/main/{model_name}"
    download_cmd = f"aria2c --console-log-level=error -c -x 16 -s 16 -k 1M {model_url} -d {models_dir} -o {model_name}"
    print(download_cmd)
    !$download_cmd
    model_dict["model"]: str = f"{models_dir}/{model_name}"
    model_dict["n_gpu_layers"]: int = n_gpu_layers
    model_dict["offload_kqv"] = offload_kqv
    model_dict["n_threads"]: int = n_threads
    model_dict["n_batch"]: int = n_batch
    model_dict["n_ctx"]: int = n_ctx
    model_dict["cache"] = cache

# Keep the port at hand for the ngrok command below, then serialise the final
# settings to the JSON file the server is launched with.
port = json_cfg["port"]
with open('CONFIG_FILE.json', 'w') as config_fh:
    config_fh.write(json.dumps(json_cfg))

# Launch the llama-cpp-python server with the generated config file. The
# trailing `&` puts it in the background; stdout and stderr go to server.log.
!python3 -m llama_cpp.server --config_file CONFIG_FILE.json > server.log 2>&1 &

# Start ngrok tunnel
# The authtoken is read from the `ngrok_auth_token` form field; get the value from https://dashboard.ngrok.com/get-started/your-authtoken
# NOTE(review): an earlier note mentioned a NGROK_AUTHTOKEN secret variable, but this cell does not read one — confirm which is intended.
!ngrok config add-authtoken {ngrok_auth_token}
print(f"Check API: https://{ngrok_domain}/docs")
# Expose the local server port on the configured ngrok domain. This command is
# not backgrounded, so it is the last thing the cell runs.
!ngrok http --domain={ngrok_domain} {port}