#### Cloning the necessary repo and installing dependencies

In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs

##  Quantization

In [14]:
import os
# Keep a token that is already exported in the environment; only fall back to the
# placeholder when none is set. Prefer exporting HF_TOKEN outside the notebook so
# a real token is never saved in this file.
os.environ.setdefault('HF_TOKEN', "[YOUR HF TOKEN HERE]")

In [3]:
# Hugging Face repository id ("<owner>/<repo>") of the model to quantize.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Repository name without the owner prefix; also the local folder the clone creates.
model_name = model_id.rsplit('/', 1)[-1]

In [15]:
!git clone https://huggingface.co/{model_id}

Cloning into 'TinyLlama-1.1B-Chat-v1.0'...



In [24]:
fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"
!python llama.cpp/convert.py {model_name} --outtype f16 --outfile {fp16}

Loading model file TinyLlama-1.1B-Chat-v1.0/model.safetensors
params = Params(n_vocab=32000, n_embd=2048, n_layer=22, n_ctx=2048, n_ff=5632, n_head=32, n_head_kv=4, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('TinyLlama-1.1B-Chat-v1.0'))
Loaded vocab file PosixPath('TinyLlama-1.1B-Chat-v1.0/tokenizer.model'), type 'spm'
Vocab info: <SentencePieceVocab with 32000 base tokens and 0 added tokens>
Special vocab info: <SpecialVocab with 0 merges, special tokens {'bos': 1, 'eos': 2, 'unk': 0, 'pad': 2}, add special tokens unset>
Permuting layer 0
Permuting layer 1
Permuting layer 2
Permuting layer 3
Permuting layer 4
Permuting layer 5
Permuting layer 6
Permuting layer 7
Permuting layer 8
Permuting layer 9
Permuting layer 10
Permuting layer 11
Permuting layer 12
Permuting layer 13
Permuting layer 14
Permuting layer 15
Per

In [25]:
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m"]

for method in QUANTIZATION_METHODS:
    qtype = f"{model_name}/{model_name.lower()}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp16} {qtype} {method}

main: build = 2675 (17e98d4c)
main: built with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for x86_64-linux-gnu
main: quantizing 'TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.fp16.bin' to 'TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 22 key-value pairs and 201 tensors from TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.fp16.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32000
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32       

#### At this point you should see two `.gguf` files in the `model_name` folder

In [7]:
import os

# Collect the quantized model files. Matching on the ".gguf" extension (instead of
# any name containing "gguf") excludes unrelated files, and sorting makes the order
# stable because os.listdir returns entries in arbitrary order.
model_list = sorted(file for file in os.listdir(model_name) if file.endswith(".gguf"))
model_list

['tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
 'tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf']

## Inference

In [3]:
# Redefined here so the inference section does not depend on the quantization cells.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Repository name without the owner prefix, i.e. the local model folder.
model_name = model_id.rsplit('/', 1)[-1]

In [9]:
# Quantized model file to load (one of the .gguf files produced above).
gguf_model = "tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"
# Text the model is asked to continue.
prompt = "Write me a bubble sort in C#:"

In [10]:
# Imported here so the inference section also runs on a fresh kernel.
import os

# Absolute path of the quantized model. The folder comes from model_name instead of
# a repeated string literal, so changing model_id keeps this path in sync.
qtype = os.path.join(os.getcwd(), model_name, gguf_model)
qtype

'/teamspace/studios/this_studio/pratical-llms/TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf'

In [9]:
# Run a one-off completion of `prompt` with the quantized model at `qtype`.
# The prompt is interpolated between double quotes, so a prompt that itself contains
# double quotes would break the shell command.
# NOTE(review): -n 128 presumably caps the number of generated tokens and -ngl 35 the
# number of layers offloaded to the GPU - confirm against the llama.cpp CLI help.
!./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p "{prompt}"

Log start
main: build = 2675 (17e98d4c)
main: built with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for x86_64-linux-gnu
main: seed  = 1713182873
llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /teamspace/studios/this_studio/pratical-llms/TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32000
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   5:                          lla

##### It worked and is also super fast

## Let's run the Server

In [10]:
# Start the llama.cpp HTTP server for the quantized model on port 8080.
# NOTE(review): the command runs in the foreground, so this cell presumably stays busy
# until the server is interrupted - requests have to be sent from a separate terminal.
!./llama.cpp/server --threads 1 --threads-batch 1 --threads-http 1 -m {qtype} -n 128 --batch-size 1 -ngl 23 --main-gpu 0 --port 8080

{"tid":"140136584015872","timestamp":1713182889,"level":"INFO","function":"main","line":2921,"msg":"build info","build":2675,"commit":"17e98d4c"}
{"tid":"140136584015872","timestamp":1713182889,"level":"INFO","function":"main","line":2926,"msg":"system info","n_threads":1,"n_threads_batch":1,"total_threads":8,"system_info":"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | "}
llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /teamspace/studios/this_studio/pratical-llms/TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:       

llm_load_tensors: ggml ctx size =    0.15 MiB
llm_load_tensors: offloading 22 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 23/23 layers to GPU
llm_load_tensors:        CPU buffer size =    42.97 MiB
llm_load_tensors:      CUDA0 buffer size =   702.14 MiB
......................................................................................
llama_new_context_with_model: n_ctx      = 512
llama_new_context_with_model: n_batch    = 1
llama_new_context_with_model: n_ubatch   = 1
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:      CUDA0 KV buffer size =    11.00 MiB
llama_new_context_with_model: KV self size  =   11.00 MiB, K (f16):    5.50 MiB, V (f16):    5.50 MiB
llama_new_context_with_model:  CUDA_Host  output buffer size =     0.24 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =     0.13 MiB
llama_new_context_with_model:  CUDA_Host comp

## The server is up and running

#### Test it with curl 
Open a terminal and send a request like the following:

```bash
curl --request POST \
    --url http://localhost:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```

## Start the server in the background and make requests from the notebook

In [11]:
import threading
import os

def run_server(qtype, port=8080, n_predict=1500, n_gpu_layers=23):
    """Run the llama.cpp HTTP server for a model file and block until it exits.

    The command is passed as an argument list rather than a shell string, so a
    model path containing spaces or shell metacharacters is handed over verbatim.

    Args:
        qtype: Path of the .gguf model file to serve.
        port: Port passed to ``--port``.
        n_predict: Value passed to ``-n``.
        n_gpu_layers: Value passed to ``-ngl``.

    Returns:
        None. The server's exit status is not checked.
    """
    import subprocess

    command = [
        "./llama.cpp/server",
        "--threads", "1",
        "--threads-batch", "1",
        "--threads-http", "1",
        "-m", str(qtype),
        "-n", str(n_predict),
        "--batch-size", "1",
        "-ngl", str(n_gpu_layers),
        "--main-gpu", "0",
        "--port", str(port),
    ]
    # check=False: a non-zero exit (e.g. the server being stopped) must not raise
    # inside the background thread that runs this function.
    subprocess.run(command, check=False)

In [12]:
# Run the blocking server call on its own thread so the notebook stays usable.
server_thread = threading.Thread(
    target=run_server,
    kwargs={"qtype": qtype},
)
server_thread.start()

{"tid":"140261659508736","timestamp":1713184300,"level":"INFO","function":"main","line":2921,"msg":"build info","build":2675,"commit":"17e98d4c"}
{"tid":"140261659508736","timestamp":1713184300,"level":"INFO","function":"main","line":2926,"msg":"system info","n_threads":1,"n_threads_batch":1,"total_threads":8,"system_info":"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | "}


llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /teamspace/studios/this_studio/pratical-llms/TinyLlama-1.1B-Chat-v1.0/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32000
llama_model_loader: - kv   3:                       llama.context_length u32              = 2048
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   5:                          llama.block_count u32              = 22
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 5632
llama_mode

{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"init","line":708,"msg":"initializing slots","n_slots":1}
{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"init","line":717,"msg":"new slot","id_slot":0,"n_ctx_slot":512}
{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"main","line":3021,"msg":"model loaded"}
{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"main","line":3043,"msg":"chat template","chat_example":"<|system|>\nYou are a helpful assistant<|endoftext|>\n<|user|>\nHello<|endoftext|>\n<|assistant|>\nHi there<|endoftext|>\n<|user|>\nHow are you?<|endoftext|>\n<|assistant|>\n","built_in":true}
{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"main","line":3774,"msg":"HTTP server listening","n_threads_http":"1","port":"8080","hostname":"127.0.0.1"}
{"tid":"140261659508736","timestamp":1713184301,"level":"INFO","function":"update_slots","line":1786,"msg":"

In [17]:
!pip install httpx

Collecting httpx
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: httpcore, httpx
Successfully installed httpcore-1.0.5 httpx-0.27.0


In [13]:
import httpx

In [14]:
async def make_request(prompt, num_tokens=500, url='http://localhost:8080/completion', timeout=120.0):
    """POST a completion request to the llama.cpp server and return the response.

    Args:
        prompt: Text to complete; it is converted to ``str`` before being sent.
        num_tokens: Value sent as ``n_predict`` in the JSON body.
        url: Completion endpoint to call.
        timeout: Timeout in seconds handed to the httpx client. httpx defaults to
            5 seconds, which a long, non-streamed generation can easily exceed.

    Returns:
        The ``httpx.Response`` as received; the HTTP status is not checked here.
    """
    # JSON body of the completion request.
    json_data = {"prompt": f"{prompt}", "n_predict": num_tokens}
    headers = {"Content-Type": "application/json"}
    async with httpx.AsyncClient(timeout=timeout) as client:
        response = await client.post(url, json=json_data, headers=headers)
        return response

In [19]:
output = await make_request("Here a very large poem about Naples:",1000)
json_output = output.json()

{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"launch_slot_with_task","line":1037,"msg":"slot is processing task","id_slot":0,"id_task":969}
{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"update_slots","line":2066,"msg":"kv cache rm [p0, end)","id_slot":0,"id_task":969,"p0":0}
{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"update_slots","line":2066,"msg":"kv cache rm [p0, end)","id_slot":0,"id_task":969,"p0":1}
{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"update_slots","line":2066,"msg":"kv cache rm [p0, end)","id_slot":0,"id_task":969,"p0":2}
{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"update_slots","line":2066,"msg":"kv cache rm [p0, end)","id_slot":0,"id_task":969,"p0":3}
{"tid":"140261659508736","timestamp":1713184360,"level":"INFO","function":"update_slots","line":2066,"msg":"kv cache rm [p0, end)","id_slot":0,"id_task":969,"p0":4}
{"tid"

In [20]:
# Show the parsed JSON response; the generated text is under the 'content' key.
json_output

{'content': '\nNaples is a great city,\nA great city, a city of song,\nA city of poems, a city of poets,\nA city of song,\nWhere the wind howls,\nWhere the waves crash,\nWhere the sun sets,\nWhere the moon rises,\nWhere the sun and the moon meet,\nWhere the stars come out\nTo play their magic on the sea,\nWhere the world sings,\nWhere the world is free,\nWhere the world is not.\n\nThe sun and the moon,\nIn Naples, are the same,\nFor they are both the gods,\nThe gods of the sea,\nOf love, of power,\nOf the endless dance,\nOf the eternal beauty,\nOf the eternal youth,\nOf the eternal life.\n\nAnd as the sun sets,\nIn Naples, it is the same,\nAs the moon rises,\nIn Naples, it is the same,\nFor they are both the same,\nFor they are both the same,\nFor they are both the same,\nIn Naples,\nIn Naples,\nIn Naples,\nIn Naples.\n\nAnd in Naples,\nThere is a song,\nA song that goes on and on,\nFor it is the song of the sea,\nOf love, of power,\nOf the endless dance,\nOf the eternal beauty,\nOf th

#### 152 tokens/sec on a T4, not bad.