In [1]:
from fastembed import TextEmbedding
import os 
os.chdir("/home/david/Documents/data_science/rag/david/deploy-private-llama-32-api-rag")

In [2]:
embed_model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.onnx:   0%|          | 0.00/548M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

In [3]:
directory = "./data"

In [4]:
os.listdir(directory)

['curriculum_v10.pdf']

# Ingest PDFs

In [5]:
pdf_directory = "./data"

### Qdrant Client

In [6]:
from ingestion import setup_qdrant_client

In [7]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, OptimizersConfigDiff, VectorParams

In [8]:
qdrant_url: str = "http://localhost:6333"
collection_name: str = "qdrant_collection"

In [9]:
client = QdrantClient(url=qdrant_url, prefer_grpc=False)

In [10]:
client.collection_exists(collection_name=collection_name)

True

In [22]:
vector_config = VectorParams(
    size=768,
    distance=Distance.DOT,
    on_disk=True,
)

In [23]:
vector_config

VectorParams(size=768, distance=<Distance.DOT: 'Dot'>, hnsw_config=None, quantization_config=None, on_disk=True, datatype=None, multivector_config=None)

In [24]:
Distance.DOT.

<Distance.DOT: 'Dot'>

## Client

In [10]:
client = setup_qdrant_client(qdrant_url, collection_name)

## Process PDFS

In [26]:
directory = pdf_directory
collection_name = collection_name
embedding_model = embed_model

In [43]:
from typing import Iterator, Tuple, Dict, List
from PyPDF2 import PdfReader

class PDFDataLoader:
    def __init__(self, directory: str):
        self.directory = directory
        self.pdf_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]

    def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]:
        for pdf_file in self.pdf_files:
            file_path = os.path.join(self.directory, pdf_file)
            reader = PdfReader(file_path)
            for page_num, page in enumerate(reader.pages):
                text = page.extract_text()
                metadata = {
                    'file_name': pdf_file,
                    'page_number': str(page_num + 1),
                    'text': text
                }
                yield text, metadata

    def __len__(self) -> int:
        return len(self.pdf_files)


In [44]:
# Create the data loader
data_loader = PDFDataLoader(directory)

In [31]:
data_loader.pdf_files

['curriculum_v10.pdf']

In [32]:
pdf_file = data_loader.pdf_files[0]
reader = PdfReader(os.path.join(directory, pdf_file))

In [35]:
content = []
metadata = {}
for page_num, page in enumerate(reader.pages):
    text = page.extract_text()
    content.append(text)
    metadata = {
        'file_name': pdf_file,
        'page_number': str(page_num + 1),
        'text': text
    }

In [38]:
# print(content[1])

### Processing in batches

In [40]:
from tqdm import tqdm

In [39]:
def ingest_data(client: QdrantClient, collection_name: str, embeddings: List[List[float]], batch_metadata: List[Dict[str, str]]) -> None:
    client.upload_collection(
        collection_name=collection_name,
        vectors=embeddings,
        payload=batch_metadata,
    )

In [46]:
# Create the data loader
data_loader = PDFDataLoader(directory)
batch_size = 32

# Process the PDFs in batches
batch_texts: List[str] = []
batch_metadata: List[Dict[str, str]] = []

for text, metadata in tqdm(data_loader, desc="Processing PDFs"):
    batch_texts.append(text)
    batch_metadata.append(metadata)
    
    if len(batch_texts) == batch_size:
        # Generate embeddings for the batch
        embeddings = embedding_model.embed(batch_texts)
        
        try:
            ingest_data(client, collection_name, embeddings, batch_metadata)
        except Exception as e:
            print(f"Error ingesting data: {e}")
        
        # Clear the batch
        batch_texts = []
        batch_metadata = []

Processing PDFs: 2it [00:00, 49.95it/s]               


In [49]:
embeddings = embedding_model.embed(batch_texts)

In [50]:
batch_texts

['David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin\nAI/ML\nsystem\ndesign\nand \nstaying\nupdated\nwith\nstate-of-the-ar t\ntechnologies .\nWork\nexperience\nGlo vo\n|\nSenior\nData\nScientist\nBarcelona,

In [54]:
# 3) Convert the generator to a list
embeddings_list = list(embeddings)

In [56]:
len(embeddings_list)

2

In [57]:
emb1 = embeddings_list[0]
emb2 = embeddings_list[1]

In [59]:
emb1.shape

(768,)

In [61]:
ingest_data(client, collection_name, embeddings, batch_metadata)

In [62]:
batch_metadata

[{'file_name': 'curriculum_v10.pdf',
  'page_number': '1',
  'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin\nAI/ML\nsystem\ndesign\nand \nstaying\nupdated\nwith\nstate-of-the-ar t\ntechnologies

### Delete a collection and create it again

In [66]:
def recreate_collection(
    client: QdrantClient,
    collection_name: str
):
    # 1) Delete the collection if it exists
    if client.collection_exists(collection_name=collection_name):
        client.delete_collection(collection_name=collection_name)
        print(f"Deleted existing collection: {collection_name}")

    # 2) Create the collection again with desired settings
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=768,               # Dimension of your vectors
            distance=Distance.DOT,  # Cosine similarity or dot-product metric
            on_disk=True           # Store vectors on disk
        ),
        optimizers_config=OptimizersConfigDiff(
            default_segment_number=5,
            indexing_threshold=1000,
        ),
    )
    print(f"Re-created collection: {collection_name}")

In [67]:
# Call the function
recreate_collection(
    client=client,
    collection_name=collection_name
)

Deleted existing collection: qdrant_collection
Re-created collection: qdrant_collection


### Upload to collection the embeddings

In [76]:
embeddings = embedding_model.embed(batch_texts)
client.upload_collection(
    collection_name=collection_name,
    vectors=embeddings,
    payload=batch_metadata,
)

In [73]:
embeddings

<generator object TextEmbedding.embed at 0x7ece081d3a00>

### Retrieve embeddings from qdrant

In [12]:
# Scroll through collection
offset = None
batch_size = 10
records, next_offset = client.scroll(
    collection_name=collection_name,
    offset=offset,
    limit=batch_size,
    with_vectors=True  # Important to actually retrieve the vectors
)

In [13]:
records

[Record(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin\nAI/ML\nsystem\ndesign\nand \ns

In [80]:
import numpy as np

In [79]:
all_embeddings = []

for record in records:
    all_embeddings.append(record.vector)

In [81]:
all_embs = np.array(all_embeddings, dtype=np.float32)

In [83]:
all_embs.shape

(2, 768)

### Retrieval example

In [88]:
original_vector = np.array(all_embeddings[0], dtype=np.float32)
original_record = records[0]

In [89]:
original_record

Record(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin\nAI/ML\nsystem\ndesign\nand \nst

In [90]:
# 2) Add small Gaussian noise
noise_std: float = 0.01
noisy_vector = original_vector + np.random.normal(
    loc=0.0, 
    scale=noise_std, 
    size=original_vector.shape
).astype(np.float32)

In [92]:
noisy_vector[:3]

array([-0.14387539,  1.493144  , -3.3426669 ], dtype=float32)

In [93]:
original_vector[:3]

array([-0.15329383,  1.4898468 , -3.332744  ], dtype=float32)

In [94]:
# 3) Perform similarity search with the noisy vector
search_results = client.search(
    collection_name=collection_name,
    query_vector=noisy_vector.tolist(),  # convert NumPy array -> Python list
    limit=1
)

In [95]:
search_results

[ScoredPoint(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', version=0, score=301.38318, payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\ni

In [96]:
retrieved_record = search_results[0]
retrieved_id = retrieved_record.id
retrieved_vector = np.array(retrieved_record.vector, dtype=np.float32)

In [101]:
retrieved_record

ScoredPoint(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', version=0, score=301.38318, payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin

In [98]:
retrieved_id == original_record.id

True

In [120]:
original_record.id

'0c9a48bd-4eba-47db-84e1-bf97ad6408fc'

In [102]:
# Retrieve the vector by ID
retrieved_by_id_records = client.retrieve(
    collection_name=collection_name,
    ids=[retrieved_id],  # Pass the ID(s) you want
    with_vectors=True     # Make sure you request vectors
)

In [103]:
retrieved_by_id_records

[Record(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\nin\nAI/ML\nsystem\ndesign\nand \ns

In [107]:
np.array(retrieved_by_id_records[0].vector, dtype=np.float32)[:3]

array([-0.15329383,  1.4898468 , -3.332744  ], dtype=float32)

In [106]:
original_vector[:3]

array([-0.15329383,  1.4898468 , -3.332744  ], dtype=float32)

# Retriever

Quantization in Qdrant refers to the process of approximating or compressing vector representations to reduce memory and computational requirements during vector search.

In [11]:
from qdrant_client import models

In [12]:
search_params = models.SearchParams(
    quantization=models.QuantizationSearchParams(
        ignore=True,
        rescore=True,
        oversampling=2.0,
    )
),

In [13]:
search_params

(SearchParams(hnsw_ef=None, exact=False, quantization=QuantizationSearchParams(ignore=True, rescore=True, oversampling=2.0), indexed_only=False),)

In [14]:
quant = models.QuantizationSearchParams(
        ignore=True,
        rescore=True,
        oversampling=2.0,
    )

In [15]:
quant

QuantizationSearchParams(ignore=True, rescore=True, oversampling=2.0)

In [114]:
collection_info = client.get_collection(collection_name=collection_name)


In [116]:
print(collection_info.config.params)


vectors=VectorParams(size=768, distance=<Distance.DOT: 'Dot'>, hnsw_config=None, quantization_config=None, on_disk=True, datatype=None, multivector_config=None) shard_number=1 sharding_method=None replication_factor=1 write_consistency_factor=1 read_fan_out_factor=None on_disk_payload=True sparse_vectors=None


In [118]:
result = client.search(
    collection_name=collection_name,
    query_vector=noisy_vector.tolist(),
    search_params=models.SearchParams(
        quantization=models.QuantizationSearchParams(
            ignore=True,
            rescore=True,
            oversampling=2.0,
        )
    ),
    timeout=1000,
)

In [119]:
result

[ScoredPoint(id='0c9a48bd-4eba-47db-84e1-bf97ad6408fc', version=0, score=301.38318, payload={'file_name': 'curriculum_v10.pdf', 'page_number': '1', 'text': 'David\nAmat\nSenior\nData\nScientist \n(Glovo)\nCAT\n|\xa0ENG\n|\nSPA\nLocation:\nBarcelona,\nSpain\nDate\nof\nbirth:\n20-04-1994\nNumber:\n+(34)-608526629\ndaolondrizdaolondriz@gmai.com\nin:\ndavid-amat-olondriz/\ngithub:\nDavidAmat\nFull\nStack\nData\nScientist\nwith\n6+\nyears\nof\nexperience\nin\nend-to-end\nML\ndevelopment,\nspecializing\nin\nrecommender \nsystems ,\ndeep\nlearning,\nNLP,\ncomputer\nvision,\nand\nAI/LLM\ntechnologies .\nCurrently\nleading\nprojects\nat\nGlovo, \ndriving\nimpro vements\nin\nML\nplatfor m\nlifecycle ,\nfeature\nengineering,\nand\nreal-time\nAPI\nserving\nwith\na\nfocus\non \nscalable ,\nproduct-dri ven\nsolutions .\nProven\ntrack\nrecord\nof\ncollaborating\ncross-functionally\nwith\nProduct\nand \nBackend\nteams ,\ndelivering\nimpactful\ndata-dri ven\nproducts .\nConstantly\nadvancing\nskills\ni

In [121]:
collection_info = client.get_collection(collection_name=collection_name)


# Replicate Quantization with FAISS

In [126]:
!pip install -U faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [127]:
import numpy as np
import faiss

In [130]:
# Example: Simulate Product Quantization with FAISS
dimension = 768  # Example vector size
num_centroids = 256  # Number of clusters for each subspace
subvector_count = 8  # Split the vector into 8 subspaces


In [133]:
# Create random training data
num_train_vectors = 10000
train_vectors = np.random.random((num_train_vectors, dimension)).astype('float32')

In [134]:
pq = faiss.ProductQuantizer(dimension, subvector_count, 8)  # 8 bits per subvector

In [135]:
# Train the product quantizer
pq.train(train_vectors)

In [136]:
# Create some test vectors to encode
num_test_vectors = 1000
test_vectors = np.random.random((num_test_vectors, dimension)).astype('float32')

In [138]:
# Encode vectors - corrected version
codes = pq.compute_codes(test_vectors)

In [146]:
# Decode vectors
reconstructed_vectors = pq.decode(codes)

In [140]:
# Calculate reconstruction error
mse = np.mean((test_vectors - reconstructed_vectors) ** 2)
print(f"Mean Squared Error: {mse:.6f}")


# Example of how to compute distances between vectors using the PQ codes
# Select two vectors to compare
vector1 = test_vectors[0:1]  # Take first vector
vector2 = test_vectors[1:2]  # Take second vecto

# Compute their codes
code1 = pq.compute_codes(vector1)
code2 = pq.compute_codes(vector2)


Mean Squared Error: 0.077343


In [149]:
# Compute approximate distance between encoded vectors using decoded vectors
decoded_vector1 = pq.decode(code1)
decoded_vector2 = pq.decode(code2)

In [150]:
# Calculate the approximate L2 distance using decoded vectors
approx_distance = np.linalg.norm(decoded_vector1 - decoded_vector2)
print(f"Approximate L2 Distance using decoded vectors: {approx_distance:.6f}")

# Compare with the true L2 distance for validation
true_distance = np.linalg.norm(vector1 - vector2)
print(f"True L2 Distance: {true_distance:.6f}")

Approximate L2 Distance using decoded vectors: 4.147590
True L2 Distance: 11.538622


# Retriever Generate Context

In [16]:
from qdrant_client import models

In [17]:
def search(query, top_k=10):
    result = client.search(
            collection_name=collection_name,
            query_vector=query,
            search_params=models.SearchParams(
                quantization=models.QuantizationSearchParams(
                    ignore=True,
                    rescore=True,
                    oversampling=2.0,
                )
            ),
            timeout=1000,
        )
    return result

In [19]:
query = "hello"
query_embedding = next(embed_model.embed(query))

In [20]:
type(query_embedding)

numpy.ndarray

In [21]:
query = "Location"
result = search(query=query_embedding, top_k=2)

In [22]:
result

[ScoredPoint(id='da9c3221-1ae8-4632-888e-033d9ed9ceb4', version=0, score=188.23242, payload={'file_name': 'curriculum_v10.pdf', 'page_number': '2', 'text': 'Pr ofessional\nDev elopment\nContin uously\ndeveloping\nexper tise\nin\nAI/ML\nthrough\ncourses\non\nLLMOps ,\nNLP,\nand\nBig\nData\ntechnologies , \nincluding\nhands-on\nexperience\nwith\ncloud\nplatfor ms\nlike\nAWS.\nMOOCs:\nLLMOps\n&\nML\nDeployment:\nUdemy\n|\nCurrent \nNLP\nSpecialization\nCoursera\n|\n01-2022 \nDeployment\nof\nML\nmodels\nUdemy\n|\n12-2020 \nNeo4J\nIntr oduction\nNeo4j\n|\n06-2020 \nPlotl y/Dash\ndashboards\nUdemy\n|\n02-2020 \nBig\nData\nSpecialization\nCoursera\n|\n03-2019 \nMathematical\nBiostatistics\nCoursera\n|\n05-2021\nBooks:\nHands-On\nGenerati ve\nAI\nO’Reilly\n|\nCurrent \nHands-On\nGraph\nNN\nO’Reilly\n|\n01-2023 \nNLP\nwith\nTransf ormer s\nO’Reilly\n|\n07-2022 \nApplied\nrecsys\nwith\nPython\nO’Reilly\n|\n01-2022 \nDeep\nRL\nHands-On\nO’Reilly\n|\n12-2021 \nHands-On\nExplaina ble\nAI\n(XAI)\nO’

In [23]:
text_top = result[0].payload["text"]
text_sec = result[1].payload["text"]

In [24]:
"ocation" in text_top

False

In [25]:
"ocation" in text_sec

True

In [26]:
context = [dict(data) for data in result]

In [27]:
context

[{'id': 'da9c3221-1ae8-4632-888e-033d9ed9ceb4',
  'version': 0,
  'score': 188.23242,
  'payload': {'file_name': 'curriculum_v10.pdf',
   'page_number': '2',
   'text': 'Pr ofessional\nDev elopment\nContin uously\ndeveloping\nexper tise\nin\nAI/ML\nthrough\ncourses\non\nLLMOps ,\nNLP,\nand\nBig\nData\ntechnologies , \nincluding\nhands-on\nexperience\nwith\ncloud\nplatfor ms\nlike\nAWS.\nMOOCs:\nLLMOps\n&\nML\nDeployment:\nUdemy\n|\nCurrent \nNLP\nSpecialization\nCoursera\n|\n01-2022 \nDeployment\nof\nML\nmodels\nUdemy\n|\n12-2020 \nNeo4J\nIntr oduction\nNeo4j\n|\n06-2020 \nPlotl y/Dash\ndashboards\nUdemy\n|\n02-2020 \nBig\nData\nSpecialization\nCoursera\n|\n03-2019 \nMathematical\nBiostatistics\nCoursera\n|\n05-2021\nBooks:\nHands-On\nGenerati ve\nAI\nO’Reilly\n|\nCurrent \nHands-On\nGraph\nNN\nO’Reilly\n|\n01-2023 \nNLP\nwith\nTransf ormer s\nO’Reilly\n|\n07-2022 \nApplied\nrecsys\nwith\nPython\nO’Reilly\n|\n01-2022 \nDeep\nRL\nHands-On\nO’Reilly\n|\n12-2021 \nHands-On\nExplaina ble\n

In [28]:
combined_prompt = []

for entry in context:
    text = entry["payload"]["text"]
    prompt = "" if text is None else text
    combined_prompt.append(prompt)

context = "\n\n---\n\n".join(combined_prompt)

In [29]:
print(len(context))

7048


In [30]:
print(len(text_sec))
print(len(text_top))

4053
2988


In [31]:
from prompt_template import qa_prompt_tmpl_str

In [32]:
print(qa_prompt_tmpl_str)

Context information is below.
---------------------
{context}
---------------------
Given the context information above I want you to think step by step to answer the query in a crisp manner, incase case you don't know the answer say 'I don't know!'.
Query: {query}
Answer: 


In [33]:
query = "What is the Localtion of the person of this CV ?"
prompt = qa_prompt_tmpl_str.format(context=context, query=query)

In [34]:
print(prompt[-300:])

rade:
7.6
Barcelona,
Spain
|
09.2012
-
06.2016
---------------------
Given the context information above I want you to think step by step to answer the query in a crisp manner, incase case you don't know the answer say 'I don't know!'.
Query: What is the Localtion of the person of this CV ?
Answer: 


In [35]:
messages = [{"role": "user", "content": [
    {"type": "text", "text": prompt}
]}]

## Llama 3.2

In [36]:
import transformers
import torch

In [37]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [38]:
outputs = pipeline(
    messages,
    max_new_tokens=256,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [39]:
outputs

[{'generated_text': [{'role': 'user',
    'content': [{'type': 'text',
      'text': "Context information is below.\n---------------------\nPr ofessional\nDev elopment\nContin uously\ndeveloping\nexper tise\nin\nAI/ML\nthrough\ncourses\non\nLLMOps ,\nNLP,\nand\nBig\nData\ntechnologies , \nincluding\nhands-on\nexperience\nwith\ncloud\nplatfor ms\nlike\nAWS.\nMOOCs:\nLLMOps\n&\nML\nDeployment:\nUdemy\n|\nCurrent \nNLP\nSpecialization\nCoursera\n|\n01-2022 \nDeployment\nof\nML\nmodels\nUdemy\n|\n12-2020 \nNeo4J\nIntr oduction\nNeo4j\n|\n06-2020 \nPlotl y/Dash\ndashboards\nUdemy\n|\n02-2020 \nBig\nData\nSpecialization\nCoursera\n|\n03-2019 \nMathematical\nBiostatistics\nCoursera\n|\n05-2021\nBooks:\nHands-On\nGenerati ve\nAI\nO’Reilly\n|\nCurrent \nHands-On\nGraph\nNN\nO’Reilly\n|\n01-2023 \nNLP\nwith\nTransf ormer s\nO’Reilly\n|\n07-2022 \nApplied\nrecsys\nwith\nPython\nO’Reilly\n|\n01-2022 \nDeep\nRL\nHands-On\nO’Reilly\n|\n12-2021 \nHands-On\nExplaina ble\nAI\n(XAI)\nO’Reilly\n|\n07-202

In [42]:
print(outputs[0]["generated_text"][-1]["content"])

To find the location of the person in the CV, I will look for the relevant information in the given context. 

The relevant line in the context is:

"Location: Barcelona, Spain"

Therefore, the answer is: "Barcelona, Spain"
