In [None]:
import json
from langchain_core.documents import Document

# Read the pre-parsed records from disk. Each record must provide the
# "page_content", "anchor_id", "parent_id" and "source" keys used below.
with open("./data-parsing.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Wrap every record in a Document, carrying its identifiers and source in the
# metadata together with the character length of its text.
documents = [
    Document(
        page_content=record["page_content"],
        metadata={
            "anchor_id": record["anchor_id"],
            "parent_id": record["parent_id"],
            "source": record["source"],
            "content_length": len(record["page_content"]),
        },
    )
    for record in data
]

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def split_documents(documents, chunk_size=500, chunk_overlap=120):
    """Split documents into overlapping chunks and print how many were produced.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter; lengths are measured
            with ``len``, i.e. in characters.
        chunk_overlap: ``chunk_overlap`` given to the splitter, in the same unit.

    Returns:
        The chunks returned by the splitter.
    """
    # Coarsest boundary first: level-2 and level-3 markdown headings, blank
    # lines, "-" and "*" list items, single newlines, then spaces.
    markdown_separators = ["\n\n## ", "\n\n### ", "\n\n", "\n- ", "\n* ", "\n", " "]

    chunks = RecursiveCharacterTextSplitter(
        separators=markdown_separators,
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)

    print(f"Split into {len(chunks)} chunks")
    return chunks

In [4]:
# Chunk the loaded documents using split_documents' default size and overlap.
chunks = split_documents(documents)

Split into 3963 chunks


In [6]:
# Store each chunk's position within the full chunk list in its metadata.
for position, chunk in enumerate(chunks):
    chunk.metadata["chunk_index"] = position

In [8]:
def build_embedding_text(doc):
    """Build the text that is embedded for a chunk.

    The chunk body is prefixed with its section context so the embedding
    carries that context along with the content:

        Document section: <parent_id>
        Subsection: <anchor_id>
        <page_content>

    A context line is emitted only when the corresponding metadata value is
    actually usable. A key that is present but holds ``None`` or a blank
    string is skipped, so the embedded text never contains noise such as
    "Document section: None".

    Args:
        doc: Object exposing a ``metadata`` dict and a ``page_content`` string.

    Returns:
        The context lines (if any) and the page content joined by newlines.
    """

    def _usable(value):
        # Present-but-empty ids carry no information worth embedding.
        return value is not None and str(value).strip() != ""

    parts = []

    parent_id = doc.metadata.get("parent_id")
    if _usable(parent_id):
        parts.append(f"Document section: {parent_id}")

    anchor_id = doc.metadata.get("anchor_id")
    if _usable(anchor_id):
        parts.append(f"Subsection: {anchor_id}")

    parts.append(doc.page_content)

    return "\n".join(parts)


In [None]:
from sentence_transformers import SentenceTransformer

def embed_documents(chunks, model_name="multi-qa-MiniLM-L6-cos-v1"):
    """Encode every chunk with a SentenceTransformer model.

    Args:
        chunks: Chunk documents; each one is converted to its embedding text
            by ``build_embedding_text``.
        model_name: Model name passed to ``SentenceTransformer``.

    Returns:
        The result of ``encode`` for the prepared texts, requested with
        ``normalize_embeddings=True`` and a visible progress bar.
    """
    encoder = SentenceTransformer(model_name)
    prepared_texts = list(map(build_embedding_text, chunks))
    return encoder.encode(
        prepared_texts,
        show_progress_bar=True,
        normalize_embeddings=True,
    )

In [10]:
# Embed all chunks with the default model; the recorded run took about
# 18 minutes for 124 batches.
embeddings = embed_documents(chunks)

Batches: 100%|██████████| 124/124 [18:10<00:00,  8.80s/it]


In [30]:
import faiss
import numpy as np

# A mismatch means `chunks` and `embeddings` come from different runs (stale
# kernel state), which would silently map ids to the wrong chunks.
assert len(embeddings) == len(chunks), "embeddings and chunks are out of sync"

dim = embeddings.shape[1]

# FAISS operates on contiguous float32 vectors; this is a no-op when the
# encoder already returned that layout.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

# Inner-product index. The embeddings are requested normalized in
# embed_documents, so inner-product scores should correspond to cosine
# similarity. IndexIDMap lets each vector carry an explicit id.
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

# FAISS ids are 64-bit integers, while np.arange's default integer dtype is
# platform-dependent, so request int64 explicitly.
ids = np.arange(len(vectors), dtype=np.int64)
index.add_with_ids(vectors, ids)

# Vector id -> chunk, using the same 0..n-1 positions that were used as ids.
id_to_doc = dict(enumerate(chunks))

faiss.write_index(index, "docs.index")
np.save("embeddings.npy", embeddings)

In [31]:
import pickle

# Persist the id -> chunk mapping that was built alongside the FAISS index.
# NOTE: pickle files must only ever be loaded back from a trusted source.
with open("docstore.pkl", mode="wb") as docstore_file:
    pickle.dump(id_to_doc, docstore_file)
