In [1]:
import json
from langchain_core.documents import Document

# Load the parsed records from disk and wrap each one in a LangChain Document.
documents = []
with open("./data-parsing.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The file is fully parsed at this point, so the handle is no longer needed.
for d in data:
    documents.append(
        Document(
            page_content=d["page_content"],
            metadata=dict(
                anchor_id=d["anchor_id"],
                parent_id=d["parent_id"],
                source=d["source"],
                # Length of the full record text, measured before any splitting.
                content_length=len(d["page_content"]),
                # True when the record text contains a fenced code marker.
                has_code="```" in d["page_content"],
            ),
        )
    )

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def split_documents(documents, chunk_size=500, chunk_overlap=120):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; passed straight to the splitter.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``splitter.split_documents`` returns for ``documents``.
        The number of chunks is also printed.
    """
    # Listed from markdown headings, through paragraph and list-item breaks,
    # down to single newlines and spaces.
    separator_order = [
        "\n\n## ",
        "\n\n### ",
        "\n\n",
        "\n- ",
        "\n* ",
        "\n",
        " ",
    ]
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,  # chunk sizes are measured in characters
        "separators": separator_order,
    }

    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split into {len(chunks)} chunks")
    return chunks

In [4]:
# Split every loaded document using split_documents' default chunk_size and chunk_overlap.
chunks = split_documents(documents)

Split into 3963 chunks


In [5]:
# Prefix each chunk's text with its section anchor so the chunk keeps its
# context once it has been separated from the rest of the section.
for c in chunks:
    header = c.metadata.get("anchor_id", "")
    prefix = f"[Section: {header}]\n"
    # This cell mutates the chunks in place, so skip any chunk that already
    # carries the prefix; otherwise every re-run of the cell would stack
    # another "[Section: ...]" line on top.
    if header and not c.page_content.startswith(prefix):
        c.page_content = prefix + c.page_content

In [6]:
# Store each chunk's position in the list under the "chunk_index" metadata key.
for i in range(len(chunks)):
    c = chunks[i]
    c.metadata["chunk_index"] = i

In [7]:
# Preview only the first few chunks; displaying the whole list would dump
# thousands of Document reprs into the notebook output.
chunks[:3]

[Document(metadata={'anchor_id': 'intro', 'parent_id': 'introduction', 'source': 'https://api.freshservice.com/', 'content_length': 1386, 'has_code': False, 'chunk_index': 0}, page_content='[Section: intro]\nIntroduction\n\nThis document provides comprehensive information about the REST APIs available for:'),
 Document(metadata={'anchor_id': 'intro', 'parent_id': 'introduction', 'source': 'https://api.freshservice.com/', 'content_length': 1386, 'has_code': False, 'chunk_index': 1}, page_content='[Section: intro]\nFreshservice – A cloud-based IT Service Management (ITSM) solution that enables IT teams to streamline service delivery within their organization, improve user experience, and enhance employee satisfaction. Non-IT teams can be onboarded onto the same account via the Business Agent add-on. The applicable APIs are tagged with “Freshservice”.'),
 Document(metadata={'anchor_id': 'intro', 'parent_id': 'introduction', 'source': 'https://api.freshservice.com/', 'content_length': 1386

In [8]:
def build_embedding_text(doc):
    """Build the text that is embedded for one chunk.

    The result is the chunk's ``page_content`` preceded by up to two context
    lines taken from its metadata: ``Document section: <parent_id>`` and
    ``Subsection: <anchor_id>``.

    A context line is only added when the corresponding metadata value is
    present and truthy. A missing key, ``None`` or an empty string is
    skipped, so the text never contains lines such as
    ``Document section: None``.

    Args:
        doc: Object exposing a ``metadata`` mapping and a ``page_content``
            string.

    Returns:
        The context lines and the page content joined with newlines.
    """
    parts = []

    parent_id = doc.metadata.get("parent_id")
    if parent_id:
        parts.append(f"Document section: {parent_id}")

    anchor_id = doc.metadata.get("anchor_id")
    if anchor_id:
        parts.append(f"Subsection: {anchor_id}")

    parts.append(doc.page_content)

    return "\n".join(parts)


In [9]:
from sentence_transformers import SentenceTransformer

def embed_documents(chunks, model_name="multi-qa-MiniLM-L6-cos-v1"):
    """Embed every chunk with a SentenceTransformer model.

    Args:
        chunks: Iterable of chunk documents; each one is turned into text
            with ``build_embedding_text`` before encoding.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The value returned by ``model.encode`` for the built texts.
    """
    encoder = SentenceTransformer(model_name)
    passages = list(map(build_embedding_text, chunks))

    # Normalization matters: the FAISS index built from these vectors is an
    # inner-product index, and inner products of unit-length vectors are
    # cosine similarities.
    encode_options = {"normalize_embeddings": True, "show_progress_bar": True}
    return encoder.encode(passages, **encode_options)

In [10]:
# Embed every chunk with the default model; the recorded run below took about 18 minutes (124 batches).
embeddings = embed_documents(chunks)

Batches: 100%|██████████| 124/124 [18:10<00:00,  8.80s/it]


In [11]:
import faiss
import numpy as np

# Build a flat inner-product FAISS index over the chunk embeddings, keyed by
# each chunk's position in `chunks`.

# FAISS works on contiguous float32 vectors; make that explicit instead of
# relying on whatever dtype the encoder happened to return.
vectors = np.ascontiguousarray(embeddings, dtype="float32")

# Vector i must describe chunks[i], otherwise id_to_doc lookups return the
# wrong document.
assert len(vectors) == len(chunks), "embeddings and chunks are out of sync"

dim = vectors.shape[1]

index = faiss.IndexFlatIP(dim)
index = faiss.IndexIDMap(index)

# FAISS ids are 64-bit integers; np.arange's default integer type is
# platform-dependent, so request int64 explicitly.
ids = np.arange(len(vectors), dtype="int64")
index.add_with_ids(vectors, ids)

# Map each id stored in the index back to its chunk.
id_to_doc = dict(enumerate(chunks))

In [15]:
# Example user question used to exercise the retrieval pipeline.
query="Give me the curl command to delete a ticket"

# Models already loaded by embed_query, keyed by model name. Loading a
# SentenceTransformer is expensive, so each model is created once and reused
# for later queries (until this cell is re-run, which resets the cache).
_QUERY_MODEL_CACHE = {}


def _get_query_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _QUERY_MODEL_CACHE:
        _QUERY_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _QUERY_MODEL_CACHE[model_name]


def embed_query(chunks, model_name="multi-qa-MiniLM-L6-cos-v1"):
    """Embed one query string (or several) with a SentenceTransformer model.

    Args:
        chunks: The query text. Despite the name, this is normally a single
            string; it is wrapped in a one-element list before encoding. Any
            other iterable of strings is encoded as given, one item per query.
        model_name: Name passed to ``SentenceTransformer``. It defaults to the
            same model name that ``embed_documents`` uses by default.

    Returns:
        The value returned by ``model.encode`` for the query text(s), with
        ``normalize_embeddings=True`` as for the chunk embeddings.
    """
    texts = [chunks] if isinstance(chunks, str) else list(chunks)

    return _get_query_model(model_name).encode(
        texts,
        normalize_embeddings=True,
        show_progress_bar=True,
    )
# Embed the example question with embed_query's default model name.
query_embedding = embed_query(query)

Batches: 100%|██████████| 1/1 [00:00<00:00, 14.52it/s]


In [28]:

# Retrieval settings: how many neighbours to request, and the minimum score a
# hit must exceed to be kept. Both the chunk and query embeddings are
# normalized and the index is inner-product, so the score is a cosine
# similarity.
TOP_K = 20
MIN_SCORE = 0.3

# Convert to float32 under its own name and actually search with it. The
# converted array used to be stored in `query`, which overwrote the question
# text and was then never passed to the search.
query_vector = np.asarray(query_embedding, dtype="float32")

D, I = index.search(query_vector, k=TOP_K)

# Row 0 holds the hits for the first (only) query; keep the sufficiently
# similar ones, in the order the search returned them.
results = [
    id_to_doc[int(i)]
    for score, i in zip(D[0], I[0])
    if score > MIN_SCORE
]

In [29]:
from collections import defaultdict

# Bucket the retrieved chunks by their parent section, keeping the order in
# which they appear in `results`.
grouped = defaultdict(list)
for doc in results:
    section_key = doc.metadata["parent_id"]
    grouped[section_key].append(doc)

# Collapse each bucket into one string, with the chunk texts separated by a
# single space.
final_docs = []
for section_docs in grouped.values():
    section_texts = [d.page_content for d in section_docs]
    final_docs.append(" ".join(section_texts))
final_docs

['[Section: delete_a_ticket]\nDELETE\n/api/v2/tickets/[id]\nOAuth 2.0 Scope\nfreshservice.tickets.delete\nSample code | Curl\ncurl -v -u api_key:X -X DELETE \'https://domain.freshservice.com/api/v2/tickets/1\'\nResponse\n1\n\tHTTP Status: 204 No Content [Section: delete_a_ticket_attachment]\nLearn more about Freshservice, Freshservice for Business Teams, and Freshservice for MSPs.\n\nThis API helps you delete an attachment from a ticket.\n\nDELETE\n/api/v2/tickets/[ticket_id]/attachments/[id]\nOAuth 2.0 Scope\nfreshservice.tickets.edit\nSample code | Curl\ncurl -v -u api_key:X -X DELETE \'https://domain.freshservice.com/api/v2/tickets/1/attachments/1\'\nResponse\n1\n\tHTTP Status: 204 No Content [Section: delete_ticket_time_entry]\nLearn more about Freshservice, Freshservice for Business Teams, and Freshservice for MSPs.\n\nThis API can be used to delete an existing Time Entry. Deleted time entries cannot be restored.\n\nDELETE\n/api/v2/tickets/[ticket_id]/time_entries/[id]\nOAuth 2.0 