In [2]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

from guidance import gen, select

from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [3]:
from huggingface_hub import hf_hub_download
from anli.config import DEFAULT_MODEL_IDENTIFIER, DEFAULT_MODEL_FILENAME
# Resolve the default model file through the Hugging Face Hub helper; the
# returned value is handed to CombinedLlamaCpp as `model_path` further down.
model_path = hf_hub_download(
    repo_id=DEFAULT_MODEL_IDENTIFIER,
    filename=DEFAULT_MODEL_FILENAME,
)

In [4]:
from anli.llms import CombinedLlamaCpp

In [5]:
# LangChain-only setup: a callback manager with a single stdout streaming
# handler, so generated tokens are printed as they arrive.
stdout_streamer = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_streamer])

In [7]:
# Build the combined wrapper around the downloaded model. The cells below use
# it through three attributes: GU_model (guidance), LC_model (LangChain) and
# LI_model (LlamaIndex).
# NOTE(review): lc_kwargs / li_kwargs are presumably forwarded to the LangChain
# and LlamaIndex backends respectively -- confirm in anli.llms.
# NOTE(review): n_gpu_layers=-1 presumably requests full GPU offload -- confirm.
llms = CombinedLlamaCpp(
    model_path=model_path,
    verbose=True,
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    n_gpu_layers=-1,
    lc_kwargs=dict(callback_manager=callback_manager),
    li_kwargs=dict(
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
    ),
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 2 CUDA devices:
  Device 0: Tesla V100-PCIE-16GB, compute capability 7.0
  Device 1: Tesla V100-PCIE-16GB, compute capability 7.0
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /home/bwen/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/45167a542b6fa64a14aea61a4c468bbbf9f258a8/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_o

# Test guidance: constrained `select` plus free-form `gen` on `llms.GU_model`

In [11]:
# Extend the guidance model left to right: fixed text, a constrained choice
# among three candidate answers, more fixed text, then a free-form generation
# captured under the name 'answer' and stopped at the first newline.
# NOTE(review): confirm that the installed guidance version's gen() honours
# `min_length` / `max_length` (and in which unit) rather than ignoring them.
lm = llms.GU_model + "What's the capital of china? I think it is "
lm = lm + select(["beijing", "paris", "New york"])
lm = lm + ". It is "
lm = lm + gen(name='answer', stop='\n', min_length=1, max_length=10)

In [13]:
# Show the plain-text form of the guidance model state (prompt + completions).
lm_text = str(lm)
print(lm_text)

What's the capital of china? I think it is beijing. It is the largest city in the world.


# Test LangChain: direct call on `llms.LC_model`

In [8]:
# The prompt is the question wrapped in a leading and a trailing newline.
prompt = "\nQuestion: where is new york city?\n"
# Bare last expression: the notebook displays the returned completion string,
# while the streaming callback prints tokens as they are generated.
llms.LC_model(prompt)

Answer: New York City is located in the United States, in the state of New York. It is one of the most populous cities in the world and is known for its iconic landmarks such as the Statue of Liberty and Central Park.


llama_print_timings:        load time =      61.75 ms
llama_print_timings:      sample time =      22.63 ms /    53 runs   (    0.43 ms per token,  2341.82 tokens per second)
llama_print_timings: prompt eval time =     100.21 ms /    13 tokens (    7.71 ms per token,   129.72 tokens per second)
llama_print_timings:        eval time =    1153.37 ms /    52 runs   (   22.18 ms per token,    45.09 tokens per second)
llama_print_timings:       total time =    1416.03 ms


'Answer: New York City is located in the United States, in the state of New York. It is one of the most populous cities in the world and is known for its iconic landmarks such as the Statue of Liberty and Central Park.'

# Test LlamaIndex: `complete` on `llms.LI_model`

In [9]:
# Send the question to the LlamaIndex-facing model and print only the text of
# the completion response.
li_question = "Question: where is new york city?"
response = llms.LI_model.complete(li_question)
print(response.text)

Llama.generate: prefix-match hit


 New York City is located in the United States, specifically in the state of New York. It is one of the most populous cities in the world and is known for its iconic landmarks such as the Statue of Liberty and Times Square.



llama_print_timings:        load time =      61.75 ms
llama_print_timings:      sample time =      24.10 ms /    51 runs   (    0.47 ms per token,  2116.36 tokens per second)
llama_print_timings: prompt eval time =     364.54 ms /    71 tokens (    5.13 ms per token,   194.77 tokens per second)
llama_print_timings:        eval time =    1080.89 ms /    50 runs   (   21.62 ms per token,    46.26 tokens per second)
llama_print_timings:       total time =    1584.03 ms
