In [1]:
# !pip install llama-cpp-python psycopg2-binary pgvector asyncpg "sqlalchemy[asyncio]" greenlet llama_index PyMuPDF

In [2]:
from llama_index.embeddings import HuggingFaceEmbedding

# Local embedding model; it is used further down both to embed the document
# chunks and to embed the query string.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [3]:
from llama_index.llms import LlamaCPP

In [4]:
# !wget https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf -P ./data/

In [5]:
# Download location of the quantized Llama 2 13B chat weights (GGUF file).
# NOTE(review): this variable is not passed to LlamaCPP below (model_url=None);
# the weights are read from the local file given as model_path instead.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically;
    # None here because a pre-downloaded file is used (see model_path)
    model_url=None,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path="./data/llama-2-13b-chat.Q4_0.gguf",
    temperature=0.1,
    # upper bound on the number of tokens generated per answer
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # verbose=True makes llama.cpp print its model-loading and timing logs
    verbose=True,
)


llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ./data/llama-2-13b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  5120,  5120,     1,  

In [6]:
from llama_index import ServiceContext

# Bundle the local LLM and the local embedding model into one ServiceContext;
# it is handed to the query engine later in the notebook.
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

In [7]:
import psycopg2
from psycopg2 import sql

# Connection settings for the local Postgres instance; they are reused when the
# vector store is created. NOTE(review): these look like local-development
# placeholders (empty password) -- do not commit real credentials here.
db_name = "vector_db"
host = "localhost"
password = ""
port = "5432"
user = "shooty"

# Connect to the "postgres" database rather than `db_name`, because `db_name`
# is the database being dropped and recreated below.
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
try:
    # PostgreSQL refuses to run CREATE/DROP DATABASE inside a transaction
    # block, so every statement must be committed on its own.
    conn.autocommit = True

    # Quote the database name as an SQL identifier instead of splicing it into
    # the statement text with an f-string.
    db_identifier = sql.Identifier(db_name)
    with conn.cursor() as c:
        # Start from a clean slate on every run of the notebook.
        c.execute(sql.SQL("DROP DATABASE IF EXISTS {};").format(db_identifier))
        c.execute(sql.SQL("CREATE DATABASE {};").format(db_identifier))
finally:
    # This admin connection is only needed for the two statements above;
    # release it even if one of them fails.
    conn.close()

In [8]:
from sqlalchemy import make_url
from llama_index.vector_stores import PGVectorStore

# Create the pgvector-backed store inside the database created above.
vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="test",
    # Must equal the vector length produced by the embedding model used for the
    # nodes (BAAI/bge-small-en -> 384). This is NOT the OpenAI embedding size.
    embed_dim=384,
)

In [9]:

# !wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "./data/llama2.pdf"

--2023-10-20 02:56:52--  https://arxiv.org/pdf/2307.09288.pdf
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘./data/llama2.pdf’


2023-10-20 02:57:37 (309 KB/s) - ‘./data/llama2.pdf’ saved [13661300/13661300]



In [10]:
from pathlib import Path
import fitz

In [11]:
# Open the downloaded Llama 2 paper with PyMuPDF (imported as `fitz`);
# `doc` is iterated page by page in the chunking step below.
file_path = "./data/llama2.pdf"
doc = fitz.open(file_path)

In [12]:
from llama_index.text_splitter import SentenceSplitter

In [13]:
# Splitter applied to each page's text to cut it into chunks.
text_splitter = SentenceSplitter(
    # size budget per chunk -- NOTE(review): presumably measured in tokens; confirm
    chunk_size=1024,
    # separator=" ",
)

In [14]:
# Two parallel lists: text_chunks[i] is a piece of text and doc_idxs[i] is the
# index of the page it was cut from.
text_chunks = []
doc_idxs = []
for page_number, page in enumerate(doc):
    # Extract the page as plain text and split it; every resulting chunk is
    # recorded together with the page it originated from.
    for chunk in text_splitter.split_text(page.get_text("text")):
        text_chunks.append(chunk)
        doc_idxs.append(page_number)

In [15]:
from llama_index.schema import TextNode
# Wrap every text chunk in a TextNode, preserving chunk order.
# The nodes carry text only. `doc_idxs[i]` (built alongside `text_chunks`)
# still records which page chunk i came from, but it is not attached to the
# node, so the previous per-chunk page lookup was dead code and is removed.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [16]:

# Sanity check: how many nodes (one per text chunk) were built.
print(len(nodes))

110


In [17]:
# Show the first node's content using metadata_mode="all", the same mode that
# is used when the node text is embedded below.
print(nodes[0].get_content(metadata_mode="all"))

Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗
Louis Martin†
Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller
Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou
Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev
Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich
Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra
Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi
Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang
Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang
Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic
Sergey Edunov

In [18]:
# Compute one embedding per node and store it on the node itself, so the
# vector store receives nodes that already carry their vectors.
for current_node in nodes:
    embedding_input = current_node.get_content(metadata_mode="all")
    current_node.embedding = embed_model.get_text_embedding(embedding_input)

In [19]:
# Insert the embedded nodes into the Postgres vector store; the cell output
# is the list of string ids returned by the call.
vector_store.add(nodes)

['1735a34f-0fb7-4164-9983-397e2e27d252',
 'f5dad0f4-8993-4b12-8498-ee49788947ae',
 '509176c7-e87f-4de6-aee6-4b525a148d70',
 '6f3f5aeb-d38d-4d82-ac35-bede3969d3ea',
 'e45a3f7a-d549-4b5e-b469-b89eec35e501',
 '659efc0f-7ab0-4319-9fc1-07ba1d4b7acf',
 '2841783d-c4ef-4f61-93bd-099cbedf9e41',
 '79f8d812-0aa0-4854-bd0b-69ed63a2e94a',
 '39fff096-f047-4274-b32c-c72a4a20540d',
 '4afc0161-55f9-4b1b-bb7f-1e9b32602a7c',
 '7178d777-c5f0-4849-812c-fff3a7332093',
 '6a26da4b-dadd-48d4-bd0d-025d1627cfcb',
 '1164fc1b-c157-4c70-ac69-eb653be221e1',
 '5ca8ad96-1b29-471e-9e19-f10e75d92f8c',
 '10032cf3-e0a3-4e44-8b11-65329a70fc6c',
 '24b75ed6-c8fc-4e65-abc5-aaae48a9ea80',
 'd4325018-7596-4257-9021-3f1cf2b8564b',
 '28c821a9-a7b7-461d-b5db-37b9ad89bc40',
 '39ce6ac2-35e2-4d33-b029-22047bd1f6e7',
 '87f2a6e4-4d58-4f9e-a653-b6e0a00d55b9',
 'da2975fc-83f0-411c-8820-37d477a85188',
 'c1570b24-9360-47c4-911a-d2980cdd7238',
 '55bfbb76-8777-41d6-806b-c9ca73ab2dc5',
 'e7d4fd81-161d-41c0-8626-c29684278f5a',
 'd1f88169-9751-

In [20]:
# Question used for the manual retrieval walkthrough in the next cells.
query_str = "How is Llama better?"

In [21]:
# Embed the question with the same model that embedded the nodes, so the
# query vector and the stored vectors are comparable.
query_embedding = embed_model.get_query_embedding(query_str)

In [22]:
# Display the raw query vector (a plain list of floats) for inspection.
query_embedding

[-0.04711316153407097,
 -0.013195754960179329,
 0.01626790128648281,
 0.0442376434803009,
 0.051039841026067734,
 0.004325954243540764,
 0.025830397382378578,
 0.043357402086257935,
 0.005275543313473463,
 -0.025516021996736526,
 0.0014614594401791692,
 -0.0027818132657557726,
 0.11202631890773773,
 0.043361764401197433,
 0.0301270242780447,
 0.03544178605079651,
 0.01658204011619091,
 -0.0233425572514534,
 -0.0780256986618042,
 0.05841067433357239,
 0.0011741099879145622,
 -0.021807704120874405,
 0.0008599586435593665,
 -0.04505724459886551,
 0.055944446474313736,
 0.000850197859108448,
 -0.05301300063729286,
 -0.001645999844186008,
 -0.02799481898546219,
 -0.17744125425815582,
 0.008523893542587757,
 -0.025998320430517197,
 0.021115107461810112,
 -0.007715263869613409,
 -0.08049935847520828,
 0.025906182825565338,
 0.03500715270638466,
 -0.019048046320676804,
 -0.0227682963013649,
 0.06293562054634094,
 0.03504457697272301,
 0.020488183945417404,
 -0.020726200193166733,
 -0.024651987

In [23]:
from llama_index.vector_stores import VectorStoreQuery

# Retrieval mode passed to the vector store; the commented lines list the
# alternative mode strings.
query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"

# Describe the search: the query vector, how many hits to return, and the mode.
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, similarity_top_k=2, mode=query_mode
)

In [24]:
# Run the search against the vector store and print the text of the first
# returned node.
query_result = vector_store.query(vector_store_query)
print(query_result.nodes[0].get_content())

Figure 1: Helpfulness human evaluation results for Llama
2-Chat compared to other open-source and closed-source
models. Human raters compared model generations on ~4k
prompts consisting of both single and multi-turn prompts.
The 95% confidence intervals for this evaluation are between
1% and 2%. More details in Section 3.4.2. While reviewing
these results, it is important to note that human evaluations
can be noisy due to limitations of the prompt set, subjectivity
of the review guidelines, subjectivity of individual raters,
and the inherent difficulty of comparing generations.
Figure 2: Win-rate % for helpfulness and
safety between commercial-licensed base-
lines and Llama 2-Chat, according to GPT-
4. To complement the human evaluation, we
used a more capable model, not subject to
our own guidance. Green area indicates our
model is better according to GPT-4. To remove
ties, we used win/(win + loss). The orders in
which the model responses are presented to
GPT-4 are randomly swapped to

In [25]:
from llama_index.schema import NodeWithScore
from typing import Optional

# Pair every retrieved node with its similarity score. The store may return no
# similarities at all, in which case each node gets a score of None.
similarities = query_result.similarities
nodes_with_scores = [
    NodeWithScore(
        node=retrieved_node,
        score=None if similarities is None else similarities[position],
    )
    for position, retrieved_node in enumerate(query_result.nodes)
]

In [26]:
from llama_index import QueryBundle
from llama_index.retrievers import BaseRetriever
from typing import Any, List


class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store.

    Embeds the incoming query with the supplied embedding model, runs a
    similarity search against the supplied vector store, and returns the
    matching nodes paired with their similarity scores.

    The retriever only uses the objects passed to its constructor and the
    query it is called with; it does not read notebook-level globals.
    """

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Store the collaborators and search settings.

        Args:
            vector_store: Store that is searched in `_retrieve`.
            embed_model: Object exposing `get_query_embedding(str)`.
            query_mode: Mode string forwarded to `VectorStoreQuery`.
            similarity_top_k: Number of hits requested from the store.
        """
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        # Let the base class run its own initialisation as well; this is a
        # no-op when the base class defines none.
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve the nodes most similar to the query in `query_bundle`.

        Args:
            query_bundle: Carries the query text in `query_bundle.query_str`.

        Returns:
            One `NodeWithScore` per node returned by the store, in the store's
            order. The score is None when the store reports no similarities.
        """
        # Embed the query that was actually passed in, using this instance's
        # own embedding model (previously the global `query_str` and
        # `embed_model` were used, so the argument was ignored).
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        # Search this instance's store, not the notebook-level `vector_store`.
        query_result = self._vector_store.query(vector_store_query)

        similarities = query_result.similarities
        nodes_with_scores = []
        # `or []` keeps the loop safe if the store returns no node list.
        for index, node in enumerate(query_result.nodes or []):
            score = similarities[index] if similarities is not None else None
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [27]:
# Build the custom retriever with the same mode ("default") and top-k (2)
# that were used for the manual query earlier in the notebook.
retriever = VectorDBRetriever(
    vector_store, embed_model, query_mode="default", similarity_top_k=2
)

In [28]:
from llama_index.query_engine import RetrieverQueryEngine

# Plug the custom retriever into a query engine; the service context carries
# the local LLM and embedding model configured earlier.
query_engine = RetrieverQueryEngine.from_args(
    retriever, service_context=service_context
)

In [29]:
# Ask the question end to end through the query engine.
query_str = "How is Llama better?"

response = query_engine.query(query_str)


llama_print_timings:        load time = 54863.63 ms
llama_print_timings:      sample time =    52.35 ms /    80 runs   (    0.65 ms per token,  1528.09 tokens per second)
llama_print_timings: prompt eval time = 201552.55 ms /  1707 tokens (  118.07 ms per token,     8.47 tokens per second)
llama_print_timings:        eval time = 20643.21 ms /    79 runs   (  261.31 ms per token,     3.83 tokens per second)
llama_print_timings:       total time = 222400.56 ms


In [32]:
# Print the generated answer text.
print(str(response))

 According to GPT-4, Llama is better than other models in terms of helpfulness and safety. The win rate for helpfulness and safety between commercial-licensed baselines and Llama 2-Chat, according to GPT-4, is shown in Figure 2. The green area indicates that Llama is better according to GPT-4.


In [31]:
# Print the text of the first source node attached to the response.
print(response.source_nodes[0].get_content())

Figure 1: Helpfulness human evaluation results for Llama
2-Chat compared to other open-source and closed-source
models. Human raters compared model generations on ~4k
prompts consisting of both single and multi-turn prompts.
The 95% confidence intervals for this evaluation are between
1% and 2%. More details in Section 3.4.2. While reviewing
these results, it is important to note that human evaluations
can be noisy due to limitations of the prompt set, subjectivity
of the review guidelines, subjectivity of individual raters,
and the inherent difficulty of comparing generations.
Figure 2: Win-rate % for helpfulness and
safety between commercial-licensed base-
lines and Llama 2-Chat, according to GPT-
4. To complement the human evaluation, we
used a more capable model, not subject to
our own guidance. Green area indicates our
model is better according to GPT-4. To remove
ties, we used win/(win + loss). The orders in
which the model responses are presented to
GPT-4 are randomly swapped to