In [4]:
import os

def chunk_text(text, chunk_size=1024, overlap_ratio=0.1):
    chunks = []
    start = 0
    text_length = len(text)
    overlap = int(chunk_size * overlap_ratio)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap  # move ahead keeping 10% overlap

    return chunks

all_chunks = []

for filename in os.listdir("data"):
    if filename.endswith(".txt"):
        with open(f"data/{filename}", "r", encoding="utf-8") as f:
            text = f.read()
            chunks = chunk_text(text)
            all_chunks.extend(chunks)


In [74]:
len(all_chunks)

599

In [76]:
import json
with open("dataChunks.json", "w", encoding="utf-8") as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=4)

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
# Save it locally
model.save('models/all-MiniLM-L6-v2')

In [79]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('models/all-MiniLM-L6-v2')
embeddings = model.encode(all_chunks, show_progress_bar=True)

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

In [80]:
len(embeddings) # we made 92 vectors

599

In [81]:
embeddings.shape

(599, 384)

In [82]:
import pickle

# Combine them in a list of dicts (simple, readable)
data = [{"text": text, "embedding": emb.tolist()} for text, emb in zip(all_chunks, embeddings)]

# Save to a file
with open("dataEmbeddings.pkl", "wb") as f:
    pickle.dump(data, f)

In [83]:
import pickle
import numpy as np
import faiss

# Load saved data
with open("dataEmbeddings.pkl", "rb") as f:
    data = pickle.load(f)

# Separate text and vectors
texts = [d["text"] for d in data]
embeddings = np.array([d["embedding"] for d in data]).astype("float32")

print(embeddings.shape)

d = embeddings.shape[1]

faiss.normalize_L2(embeddings)

quantizer = faiss.IndexFlatIP(d)
# index = faiss.IndexIVFPQ(quantizer, d, nlist=256, m=8, nbits=4)
index = faiss.IndexIVFPQ(quantizer, d, 256, 8, 4, faiss.METRIC_INNER_PRODUCT)

# nlist - groups all thw vectors into nlist clusters -> Higher nlist = better accuracy but more memory and slower training.
# m (pq_m) - Each embedding vector (say 384-D) is split into pq_m smaller parts. Example: 384 dimensions → 8 parts → each subvector = 48-D.
# pq_bits - Defines how many bits FAISS uses to represent each subvector after quantization. If you set pq_bits = 10 and pq_m = 8, then each vector uses 8 × 10 = 80 bits = 10 bytes of storage.


# A bit of debugging shows nx==111 (good) and k==256 (1 << nbits in ProductQuantizer::set_derived_values). I can fix the issue by setting the number of bits per subvector to 4, instead of 8, as in:

index.train(embeddings)
index.add(embeddings)
faiss.write_index(index, "dataIndexed.faiss")


(599, 384)


In [50]:
def retrieve_top_k(index, q_vec, k=40, similarity_threshold=0.20):
    index.nprobe = 8
    faiss.normalize_L2(q_vec)
    D, I = index.search(q_vec, k)     # D: distances (if normalized embeddings, distance is 2 - 2*cos); depends on index
    # If normalized and using inner product: D are distances; if using cosine directly compute via dot
    # Here assume we have cosine scores precomputed; otherwise convert appropriately
    # Convert distances to cosine similarity if needed
    # Example: if index uses IndexFlatIP (inner product) and vectors normalized, D are cosines
    sims = D[0]   # array of similarities or distances depending on index setup
    idxs = I[0]
    # Keep only those above threshold
    print( "Before :", len(I[0]))
    candidates = []
    for sim, idx in zip(sims, idxs):
        print(sim)
        if sim >= similarity_threshold:
            candidates.append((idx, sim))
    print("After :", len(candidates))
    return candidates  # list of (index, similarity)

In [20]:
query_vector = model.encode(["history of car?"]).astype('float32')
# I → gives the indices of the top-k most similar vectors (your chunks).
# D → gives the distance values (how far or close each match is).

index.nprobe = 8   # search in 8 clusters
D, I = index.search(query_vector, k=40)
# here k is - number of nearest neighbors (chunks) you want FAISS to return for each query. This is not searching, just controlling the results return and mainly affects output size
# for idx in I[0]:
#     print(all_chunks[idx])

In [77]:
query_vector = model.encode(["What is a hybrid car??"]).astype('float32')
retrivedChunksList = retrieve_top_k(index, query_vector, 40, 0.25)
retrivedChunks = [(all_chunks[k[0]], k[1]) for k in retrivedChunksList]
textPart = [k[0] for k in retrivedChunks]

Before : 40
0.6666509
0.6595636
0.6372862
0.6344248
0.63149166
0.62552696
0.61763597
0.60430396
0.5971689
0.5936491
0.5913357
0.5897729
0.5849557
0.54924846
0.53535116
0.5321405
0.5255895
0.51595706
0.51414657
0.5093974
0.5082947
0.5054675
0.49767733
0.49699393
0.48841614
0.48740345
0.4852809
0.4849311
0.46972105
0.46540344
0.45565647
0.44790575
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
-3.4028235e+38
After : 32


In [78]:
context = "\n\n".join(textPart)
print(context)

with open("testData.txt", "w", encoding="utf-8") as f:
    f.write(context)

 term hybrid vehicle is used, it most often refers to a Hybrid electric vehicle. These encompass such vehicles as the Saturn Vue, Toyota Prius, Toyota Yaris, Toyota Camry Hybrid, Ford Escape Hybrid, Ford Fusion Hybrid, Toyota Highlander Hybrid, Honda Insight, Honda Civic Hybrid, Lexus RX 400h, and 450h, Hyundai Ioniq Hybrid, Hyundai Sonata Hybrid, Hyundai Elantra Hybrid, Kia Sportage Hybrid, Kia Niro Hybrid, Kia Sorento Hybrid and others. A petroleum-electric hybrid most commonly uses internal combustion engines (using a variety of fuels, generally gasoline or Diesel engines) and electric motors to power the vehicle. The energy is stored in the fuel of the internal combustion engine and an electric battery set. There are many types of petroleum-electric hybrid drivetrains, from Full hybrid to Mild hybrid, which offer varying advantages and disadvantages.
William H. Patton filed a patent application for a gasoline-electric hybrid rail-car propulsion system in early 1889, and for a simil

In [None]:
with open("testData.txt", "w", encoding="utf-8") as f:
    f.write(context)