In [None]:
import os

def chunk_text(text, chunk_size=1024, overlap_ratio=0.1):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap_ratio: Fraction of ``chunk_size`` shared by consecutive chunks.
            The overlap is ``int(chunk_size * overlap_ratio)`` characters and
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings in document order. Empty text yields ``[]``.
        The last chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the resulting
            overlap is negative or not smaller than ``chunk_size`` (the window
            would never advance, or would skip text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    overlap = int(chunk_size * overlap_ratio)
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap_ratio must give an overlap in [0, chunk_size), got {overlap_ratio}"
        )
    step = chunk_size - overlap  # how far the window advances each iteration

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, stop: another iteration
        # would only emit a tail made entirely of already-covered overlap.
        if end >= text_length:
            break
        start += step

    return chunks

# Chunk every .txt file under data/ into one flat list.
# The position of a chunk in `all_chunks` is later used as its id in the vector
# index, so the order must be reproducible: os.listdir() returns entries in
# arbitrary order, hence the sort.
all_chunks = []

for filename in sorted(os.listdir("data")):
    if filename.endswith(".txt"):
        with open(os.path.join("data", filename), "r", encoding="utf-8") as f:
            text = f.read()
        # Chunk after the file is closed; the handle is only needed for the read.
        chunks = chunk_text(text)
        all_chunks.extend(chunks)


In [None]:
# Sanity check: total number of chunks collected from the .txt files in data/.
len(all_chunks)

In [None]:
import json
# Persist the raw chunks as human-readable JSON (non-ASCII characters kept
# as-is, 4-space indentation) so they can be inspected outside the notebook.
chunks_json = json.dumps(all_chunks, ensure_ascii=False, indent=4)
with open("dataChunks.json", mode="w", encoding="utf-8") as f:
    f.write(chunks_json)

In [None]:
import requests
from dotenv import load_dotenv
load_dotenv()
def getEmbeddings(payload, timeout=60):
    """Request embeddings for ``payload`` from the Hugging Face inference router.

    Args:
        payload: Value sent as the request's ``inputs`` field. This notebook
            passes either a single string or a list of strings.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of a successful response (a list).

    Raises:
        KeyError: If the ``HF_TOKEN`` environment variable is not set.
        requests.HTTPError: If the API answers with a non-2xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        ValueError: If the body is not a JSON list. Callers ``extend`` their
            results with the return value, so an error object must not be
            returned as if it were embeddings.
    """
    API_URL = "https://router.huggingface.co/hf-inference/models/BAAI/bge-base-en-v1.5/pipeline/feature-extraction"
    headers = {
        "Authorization": f"Bearer {os.environ['HF_TOKEN']}",
    }
    response = requests.post(
        API_URL, headers=headers, json={"inputs": payload}, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of handing the error JSON to the caller.
    response.raise_for_status()

    result = response.json()
    if not isinstance(result, list):
        raise ValueError(f"Unexpected embedding response: {result!r}")
    return result

# Embed all chunks in batches; embeddings[j] must correspond to all_chunks[j].
embeddings = []
batch_size = 32

for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i + batch_size]
    batch_emb = getEmbeddings(batch)
    # A short or malformed response would silently shift every later
    # chunk/embedding pair, so stop as soon as the counts disagree.
    if len(batch_emb) != len(batch):
        raise RuntimeError(
            f"Batch starting at {i}: expected {len(batch)} embeddings, got {len(batch_emb)}"
        )
    embeddings.extend(batch_emb)
    # Report the running total (the old expression always printed batch_size).
    print(f"embedding for batch {i} done. Chunks processed : {len(embeddings)}")

In [None]:
# Sanity check: number of embedding vectors collected (one is expected per chunk).
len(embeddings) 

In [None]:
# Prints (embedding dimension, number of embeddings). One embedding is made per
# chunk; the original run reported 768 dimensions and 599 embeddings.
print(len(embeddings[0]), len(embeddings))

In [None]:
# Inspect the container type of the collected embeddings.
type(embeddings)

In [None]:
import pickle

# Pair each chunk with its embedding as a list of dicts (simple, readable).
# zip() stops at the shorter input, so a count mismatch would silently drop
# items; check it explicitly before saving.
if len(all_chunks) != len(embeddings):
    raise ValueError(
        f"Have {len(all_chunks)} chunks but {len(embeddings)} embeddings; refusing to save misaligned data"
    )
data = [{"text": text, "embedding": emb} for text, emb in zip(all_chunks, embeddings)]

# Save to a file
with open("dataEmbeddings.pkl", "wb") as f:
    pickle.dump(data, f)

In [None]:
import pickle
import numpy as np
import faiss

# Load the chunk/embedding pairs saved earlier.
# NOTE: pickle.load can execute arbitrary code from the file; only load a
# dataEmbeddings.pkl that this notebook produced itself.
with open("dataEmbeddings.pkl", "rb") as f:
    data = pickle.load(f)

# Separate the chunk texts from their vectors; FAISS needs a float32 matrix.
texts = [d["text"] for d in data]
embeddings = np.array([d["embedding"] for d in data]).astype("float32")

# (number of vectors, embedding dimension)
print(embeddings.shape)

# Embedding dimension.
d = embeddings.shape[1]

# Scale every row to unit L2 norm, in place, so that inner product between
# vectors equals their cosine similarity.
faiss.normalize_L2(embeddings)

# Coarse quantizer used to assign vectors to clusters (inner-product metric).
quantizer = faiss.IndexFlatIP(d)
# Positional arguments below are: nlist=256, m=8, nbits=4
# (equivalent to: faiss.IndexIVFPQ(quantizer, d, nlist=256, m=8, nbits=4), plus the metric).
index = faiss.IndexIVFPQ(quantizer, d, 256, 8, 4, faiss.METRIC_INNER_PRODUCT)

# nlist - number of clusters the vectors are grouped into. Higher nlist = finer partitioning, but more memory and slower training.
# m     - each embedding vector is split into m sub-vectors that are quantized separately. With m = 8, a d-dimensional vector is cut into 8 parts of d/8 dimensions each (e.g. 768-D -> 8 parts of 96-D).
# nbits - bits used to encode each sub-vector after quantization. With nbits = 4 and m = 8, each stored code takes 8 x 4 = 32 bits = 4 bytes.

# Why nbits is 4 instead of the usual 8: each sub-quantizer learns 1 << nbits
# centroids (256 for nbits=8, 16 for nbits=4). Per the original debugging note,
# training with 256 centroids failed on a small training set, and lowering
# nbits to 4 fixed it.
# NOTE(review): nlist=256 looks large for a corpus of only a few hundred
# vectors -- confirm that training does not warn about too few points per cluster.

# Train the quantizers on the vectors, add them, and persist the index to disk.
index.train(embeddings)
index.add(embeddings)
faiss.write_index(index, "dataIndexed.faiss")


In [None]:
def retrieve_top_k(index, q_vec, k=40, similarity_threshold=0.20, nprobe=8, verbose=True):
    """Search ``index`` for the neighbours of one query and keep the good ones.

    The scores returned by ``index.search`` are compared directly against
    ``similarity_threshold`` with "higher is better" semantics. That matches
    the index built in this notebook (inner-product metric over L2-normalised
    vectors), where a score approximates cosine similarity. It is NOT valid
    for an L2-distance index, where smaller means closer.

    Args:
        index: FAISS index to search. Its ``nprobe`` attribute is overwritten.
        q_vec: Query embedding; a flat vector or a ``(1, dim)`` array/list.
            Only the first query row is used. The caller's array is not
            modified.
        k: Maximum number of neighbours requested from the index.
        similarity_threshold: Minimum score a neighbour needs to be kept.
        nprobe: Number of clusters the index scans per query.
        verbose: When true, print the before/after counts and every score
            (the original debugging output).

    Returns:
        A list of ``(chunk_index, similarity)`` tuples in the order the index
        returned them, restricted to real hits whose score is at least
        ``similarity_threshold``.
    """
    index.nprobe = nprobe

    # Work on a private float32, C-contiguous, 2-D copy: normalize_L2 rescales
    # its argument in place and must not alter the caller's data.
    query = np.array(q_vec, dtype=np.float32, order="C", ndmin=2)
    faiss.normalize_L2(query)

    scores, ids = index.search(query, k)
    sims = scores[0]
    idxs = ids[0]

    if verbose:
        print( "Before :", len(idxs))
    candidates = []
    for sim, idx in zip(sims, idxs):
        if verbose:
            print(sim)
        # FAISS pads the result with id -1 when fewer than k neighbours were
        # found. Used as a Python list index, -1 would silently select the
        # last chunk, so padding entries are never returned.
        if idx < 0:
            continue
        if sim >= similarity_threshold:
            candidates.append((int(idx), float(sim)))
    if verbose:
        print("After :", len(candidates))
    return candidates  # list of (index, similarity)

In [None]:
query_vector = getEmbeddings("history of car?")
# index.search needs a 2-D float32 array of shape (n_queries, dim), not the raw
# JSON list returned by the API. The query is also L2-normalised, exactly like
# the indexed vectors, so inner-product scores are comparable.
query_matrix = np.array(query_vector, dtype=np.float32).reshape(1, -1)
faiss.normalize_L2(query_matrix)

# I -> indices of the top-k most similar vectors (positions in all_chunks).
# D -> the score of each match. This index uses inner product on normalised
#      vectors, so higher means more similar.

index.nprobe = 8   # search in 8 clusters
D, I = index.search(query_matrix, k=40)
# k is the number of nearest neighbours (chunks) FAISS returns per query. It
# controls the size of the output, not how much of the index is scanned (that
# is nprobe).
# To inspect the hits (ids of -1 mean "no result" and must be skipped):
# for idx in I[0]:
#     print(all_chunks[idx])

In [None]:
# Embed the question and shape it as a single-row float32 matrix for FAISS.
query_vector = getEmbeddings("what are cars?")
query_matrix = np.array(query_vector, dtype=np.float32).reshape(1, -1)

# Ask for up to 20 neighbours and keep those scoring at least 0.55.
retrivedChunksList = retrieve_top_k(index, query_matrix, 20, 0.55)

# Map each hit id back to its chunk text, keeping the score alongside it.
retrivedChunks = [
    (all_chunks[chunk_id], score) for chunk_id, score in retrivedChunksList
]
textPart = [chunk for chunk, _score in retrivedChunks]

In [None]:
# Stitch the retrieved chunks into one context string, separated by blank lines.
chunk_separator = "\n\n"
context = chunk_separator.join(textPart)
print(context)

# Keep a copy on disk so the retrieved context can be inspected later.
with open("testData.txt", mode="w", encoding="utf-8") as f:
    f.write(context)

In [None]:
# Length of the raw API result for the last query -- presumably the embedding
# dimension; TODO confirm the API returns a flat vector for a single string.
len(query_vector)