In [1]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

In [2]:
# Handler that echoes generated tokens to stdout, wrapped in the callback
# manager that is handed to LlamaCpp when the model is constructed.
stdout_streamer = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_streamer])

In [12]:
# Runtime settings forwarded to llama.cpp.
CTX = 1024  # Context window size, passed to the model as n_ctx.
n_batch = 1024  # Should be between 1 and n_ctx; consider the amount of RAM of your Apple Silicon chip.
n_gpu_layers = 1  # Layers placed on the GPU; the rest stay on the CPU. Use -1 to move all layers to the GPU.

# Load the local model with the grammar file ./json.gbnf attached.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="./models/llama-2-7b-chat.Q4_K_M.gguf",
    grammar_path="./json.gbnf",
    n_ctx=CTX,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    f16_kv=True,  # MUST be True, otherwise you will run into problems after a couple of calls
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32       

In [4]:
%%capture captured --no-stdout
result = llm.invoke("Describe a person in JSON format:")

{"name": "John Doe", "age": 32, " occupation": "Software Engineer", "location": "New York"}

In [15]:
# Ask the same `llm` about a different topic; the returned text is kept in `result`.
probability_prompt = "Describe the main concepts of probability in JSON format:"
result = llm.invoke(probability_prompt)

Llama.generate: prefix-match hit


{"probability":1,"randomness":0,"event":[],"sampleSpace":[]}


llama_print_timings:        load time =   27997.82 ms
llama_print_timings:      sample time =     158.21 ms /    22 runs   (    7.19 ms per token,   139.05 tokens per second)
llama_print_timings: prompt eval time =   10863.30 ms /    11 tokens (  987.57 ms per token,     1.01 tokens per second)
llama_print_timings:        eval time =   23905.04 ms /    21 runs   ( 1138.34 ms per token,     0.88 tokens per second)
llama_print_timings:       total time =   35090.62 ms /    32 tokens


# Structured Output with Local Models using Pydantic

In [53]:
from typing import List
from pydantic import BaseModel, Field


class TechnicalArticle(BaseModel):
    # Schema for the structured output requested from the model: one string field.
    # NOTE: documented with comments rather than a class docstring on purpose --
    # pydantic copies a model docstring into the schema's "description", which
    # would change what schema_json() emits.
    tools_resources: str = Field(
        description="Tools and resources mentioned in the article",
    )

In [55]:
# Serialize the TechnicalArticle JSON schema to a string; a later cell parses it
# and writes it to disk.
# NOTE(review): on pydantic v2, schema_json() is deprecated in favor of
# model_json_schema() -- confirm the installed pydantic version.
extract_object_json = TechnicalArticle.schema_json()

In [57]:
import json

# `extract_object_json` is the JSON-schema string produced by
# TechnicalArticle.schema_json() in an earlier cell.

# Parse it into a Python dictionary.
extract_object = json.loads(extract_object_json)

# Write the schema to disk; the grammar-conversion step reads this file.
with open('./extract_pydantic_obj_schema.json', 'w') as f:
    json.dump(extract_object, f)

In [58]:
#https://github.com/ggerganov/llama.cpp/blob/master/examples/json-schema-to-grammar.py
!python ./json-schema-to-grammar.py ./extract_pydantic_obj_schema.json >> extract.gbnf

In [3]:
# Runtime settings forwarded to llama.cpp.
CTX = 512  # Context window size, passed to the model as n_ctx.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of RAM of your Apple Silicon chip.
n_gpu_layers = -1  # Layers placed on the GPU; the rest stay on the CPU. -1 moves all layers to the GPU.

# Rebind `llm` to a model that uses the grammar file ./extract.gbnf.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="./models/llama-2-7b-chat.Q4_K_M.gguf",
    grammar_path="./extract.gbnf",
    n_ctx=CTX,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    f16_kv=True,  # MUST be True, otherwise you will run into problems after a couple of calls
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32       

In [4]:
# Article excerpt used as the prompt. The text (including its whitespace) is
# sent to the model exactly as written.
article_excerpt = """           
            Building RAG from Scratch (Open-source only!)
            In this tutorial, we show you how to build a data ingestion pipeline into a vector database, and then build a retrieval pipeline from that vector database, from scratch.

            Notably, we use a fully open-source stack:

            Sentence Transformers as the embedding model
            Postgres as the vector store (we support many other vector stores too!)
            Llama 2 as the LLM (through llama.cpp)            
            source: https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/
           """

# Kept as the last expression so the returned string is also shown as the cell output.
llm.invoke(article_excerpt)

{"tools_resources": "https://llamaindex.ai/en/stable/"}


llama_print_timings:        load time =    6357.89 ms
llama_print_timings:      sample time =     173.99 ms /    27 runs   (    6.44 ms per token,   155.18 tokens per second)
llama_print_timings: prompt eval time =    6357.72 ms /   153 tokens (   41.55 ms per token,    24.07 tokens per second)
llama_print_timings:        eval time =     875.77 ms /    26 runs   (   33.68 ms per token,    29.69 tokens per second)
llama_print_timings:       total time =    7507.06 ms /   179 tokens


'{"tools_resources": "https://llamaindex.ai/en/stable/"}'