**SOURCE**: https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/

In [2]:
import os
import glob

# Get the absolute path of the current project directory
project_dir = os.path.abspath('.')

# Get the parent of the parent directory
WORK_DIR = os.path.abspath(os.path.join(project_dir, '../../'))

# Change the working directory to the parent of the parent directory
os.chdir(WORK_DIR)

# Verify the change by printing the current working directory
print("Current Working Directory:", os.getcwd())

Current Working Directory: /Users/david.amat/Documents/david/pdf-search-llm-rag


In [1]:
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

data/llama2.pdf: No such file or directory


In [3]:
from llama_index.readers.file import PyMuPDFReader
from llama_index.core.node_parser import SentenceSplitter
import tqdm

# 1. Load Data

In [4]:
loader = PyMuPDFReader()
file_title = "llama2"
pdf_path = f"data/{file_title}.pdf"
documents = loader.load(file_path=pdf_path)

### Trying sentence splitter

In [5]:
text_parser = SentenceSplitter(
    chunk_size=10,
    chunk_overlap=2,
    paragraph_separator=".\n",
)

In [6]:
text_parser.split_text("Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks.\n In particular we are interested in knowing what splits of the text this splitter can provide")

['Recurrent neural networks, long short-term memory',
 'memory [13] and gated recurrent [7]',
 'neural networks.\n In particular we are interested in',
 'interested in knowing what splits of the text this splitter',
 'this splitter can provide']

## Split PDF document

In [7]:
text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [8]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

## Manually Construct Nodes from Text Chunks

In [9]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

In [10]:
len(nodes)

107

In [11]:
# print(nodes[0].text)

### Generate Embeddings for each Node

In [12]:
from sentence_transformers import SentenceTransformer
import torch
MODEL_SENTENCE_TRANSFORMER = 'all-MiniLM-L6-v2'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  from tqdm.autonotebook import tqdm, trange


In [13]:
# Load the SentenceTransformer model
model = SentenceTransformer(
    MODEL_SENTENCE_TRANSFORMER,
).to(DEVICE)



In [14]:
node.metadata

{'total_pages': 77, 'file_path': 'data/llama2.pdf', 'source': '77'}

In [15]:
# Generate embeddings
for node in tqdm.tqdm(nodes):
    node_embedding = model.encode(node.text)
    node.embedding = list(node_embedding)

100%|██████████| 107/107 [00:02<00:00, 41.43it/s]


In [16]:
len(node_embedding)

384

# Configure Database

Set Up PostgreSQL with: https://www.sqlshack.com/setting-up-a-postgresql-database-on-mac/

```bash
brew install postgresql
brew services start postgresql
# brew services stop postgresql

# Configure psql
psql postgres

# <Inside PostgreSQL>
CREATE ROLE david WITH LOGIN PASSWORD 'qrks';
ALTER ROLE david WITH SUPERUSER;
ALTER ROLE david CREATEDB;

# New Logins
psql postgres -U david

# List users and roles
\du

```

In [17]:
import psycopg2

db_name = "vector_db"
host = "localhost"
password = "qrks"
port = "5432"
user = "david"
# conn = psycopg2.connect(connection_string)
conn = psycopg2.connect(
    dbname="postgres",
    host=host,
    password=password,
    port=port,
    user=user,
)
conn.autocommit = True

#with conn.cursor() as c:
#    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
#    c.execute(f"CREATE DATABASE {db_name}")

```bash
# New Logins
psql postgres -U david

# in PostgreSQL CLI
\t
# Connect to vector_db
\c vector_db
# list tables
\dt
```

 ### Create a Vector Store

In [19]:
%pip install llama-index-vector-stores-postgres

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-vector-stores-postgres
  Downloading llama_index_vector_stores_postgres-0.1.14-py3-none-any.whl.metadata (853 bytes)
Collecting pgvector<0.3.0,>=0.2.4 (from llama-index-vector-stores-postgres)
  Downloading pgvector-0.2.5-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading llama_index_vector_stores_postgres-0.1.14-py3-none-any.whl (8.4 kB)
Downloading pgvector-0.2.5-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: pgvector, llama-index-vector-stores-postgres
  Attempting uninstall: pgvector
    Found existing installation: pgvector 0.3.2
    Uninstalling pgvector-0.3.2:
      Successfully uninstalled pgvector-0.3.2
Successfully installed llama-index-vector-stores-postgres-0.1.14 pgvector-0.2.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may 

In [22]:
%pip install pgvector

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [30]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

vector_store = PGVectorStore.from_params(
    database=db_name,
    host=host,
    password=password,
    port=port,
    user=user,
    table_name="llama2_paper",
    embed_dim=384,  # openai embedding dimension
)

### Configure PGVector locally

```bash
brew install pgvector

# Go to PostgreSQL and run
CREATE EXTENSION vector;

```

### Load Nodes into Vector Store

In [32]:
vector_store.add(nodes)

['44783819-1990-45a2-8956-0e448558eb33',
 'f912c161-b2bb-4f7f-8fee-8c8bc4eaeddc',
 '249b83ef-8dfa-48ed-b511-69cac739a5e2',
 '34d3139c-0770-4382-a69c-ed0506cab9dd',
 '482c3a7d-e4bf-4cea-bd41-ac49cdbeede4',
 'f7442ce2-6a1a-4329-be44-cf7fc1537abc',
 '693af066-e688-4db8-954b-a428455bc51c',
 '183a15ee-2c88-4b23-979c-89a3a61e0f04',
 'f36bc684-cbd4-4b7a-ae3d-c9fc257a1b4a',
 'fe7ba88e-4472-4ab6-924e-f2fc89711975',
 'b36b16a3-3a02-4afc-98f8-04f18cb3efce',
 'ba0ae9a1-2a09-42cf-9708-4104afc7c59c',
 'b86ca3f9-4e71-411d-9bed-8c358a245f98',
 '4c95b058-5951-4489-9bcd-3299fe0f864d',
 '484994df-598c-493d-80e6-918db55159fa',
 '4b030fe5-a752-4eb8-83f2-356b4e8b8633',
 'b2c4837e-f13e-4185-b138-405bb67eccfd',
 'b0c00856-688e-4811-9ecc-02b12c19804c',
 'af94843d-c35a-4a0b-8b99-98e954458816',
 'cbdd94fd-10fd-47e4-b9e7-e66593478460',
 'c24e32d2-6c37-4a16-94d7-5968a62f2a4c',
 '8bffb065-b9bb-4725-923b-51278e60a4c0',
 'f824f358-3636-4b0c-9971-df931cd8f43a',
 '1eb38f0a-7e0f-473f-ba54-65053caf1d22',
 'ccc9f16f-7e07-

# Retrieval Pipeline

In [33]:
query_str = "Can you tell me about the key concepts for safety finetuning"

### Generate query embedding

In [36]:
query_embedding = model.encode(query_str)
query_embedding = list(query_embedding)

### Query the Vector Database


In [38]:
# construct vector store query
from llama_index.core.vector_stores import VectorStoreQuery

query_mode = "default"
# query_mode = "sparse"
# query_mode = "hybrid"

vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding, 
    similarity_top_k=5, 
    mode=query_mode
)

In [39]:
# Run query
query_result = vector_store.query(vector_store_query)

In [40]:
len(query_result.nodes)

5

In [43]:
query_result.nodes[0].text[:100]



In [44]:
query_result.nodes[1].text[:100]

'0\n20\n40\n60\n80\n100\nSafety Data Pct. (%)\n0.01\n0.02\n0.03\n0.04\n0.05\nFalse Refusal Rate (%)\nHelpfulness\n0'

In [45]:
query_result.nodes[2].text[:100]

'advice). The attack vectors explored consist of psychological manipulation (e.g., authority manipula'

In [48]:
for i in range(len(query_result.nodes)):
    print(f"Sum {i}: {query_result.similarities[i]}")


Sum 0: 0.48233445251967333
Sum 1: 0.4793740858139923
Sum 2: 0.4716346697165157
Sum 3: 0.46611180118375883
Sum 4: 0.380216889394391


### Parse Results into a set of Nodes

In [49]:
from llama_index.core.schema import NodeWithScore
from typing import Optional

nodes_with_scores = []
for index, node in enumerate(query_result.nodes):
    score: Optional[float] = None
    if query_result.similarities is not None:
        score = query_result.similarities[index]
    nodes_with_scores.append(NodeWithScore(node=node, score=score))

# Retriever

In [51]:
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List

In [52]:
class VectorDBRetriever(BaseRetriever):
    """Retriever over a postgres vector store."""

    def __init__(
        self,
        vector_store: PGVectorStore,
        embed_model: Any,
        query_mode: str = "default",
        similarity_top_k: int = 2,
    ) -> None:
        """Init params."""
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve."""
        arr_query_embedding = self.embed_model(
            query_bundle.query_str
        )
        query_embedding = list(arr_query_embedding)
        vector_store_query = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode,
        )
        query_result = vector_store.query(vector_store_query)

        nodes_with_scores = []
        for index, node in enumerate(query_result.nodes):
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[index]
            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [53]:
retriever = VectorDBRetriever(
    vector_store, model, query_mode="default", similarity_top_k=2
)

### Get the main LLM that will respond the answer

In [None]:
%pip install llama-index-llms-llama-cpp

In [60]:
from llama_index.llms.llama_cpp import LlamaCPP

In [61]:
# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

Downloading url https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf to path /Users/david.amat/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf
total size (MB): 7365.83


7025it [04:29, 26.10it/s]                          
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/david.amat/Library/Caches/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32         

### Retriever Query Engine

In [62]:
from llama_index.core.query_engine import RetrieverQueryEngine

In [63]:
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

In [64]:
query_str = "How does Llama 2 perform compared to other open-source models?"

response = query_engine.query(query_str)


llama_print_timings:        load time =   16245.57 ms
llama_print_timings:      sample time =       1.52 ms /    64 runs   (    0.02 ms per token, 42077.58 tokens per second)
llama_print_timings: prompt eval time =   46628.83 ms /  1766 tokens (   26.40 ms per token,    37.87 tokens per second)
llama_print_timings:        eval time =    9433.39 ms /    63 runs   (  149.74 ms per token,     6.68 tokens per second)
llama_print_timings:       total time =   56095.17 ms /  1829 tokens


In [65]:
print(str(response))

 Llama 2 outperforms all open-source models, with results on par or better than PaLM (540B) on almost all benchmarks, except for coding benchmarks where there is a significant gap.

Please let me know if you need any further information or clarification.


In [67]:
# print(response.source_nodes[0].get_content())