In [2]:
!pip install -qU faiss-cpu langchain langchain_community langchain-huggingface

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [12]:
import json
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss
from langchain_community.vectorstores import FAISS
from uuid import uuid4

1. Load Prepared Chunks

In [6]:
# Load prepared chunks from JSON file
with open("chunks.json", "r", encoding="utf-8") as f:
    chunk_data = json.load(f)

# Convert to Document format with metadata
documents = [
    Document(page_content=item["content"], metadata={"section": item["section"]})
    for item in chunk_data
]

print(f"Loaded {len(documents)} document chunks.")


Loaded 89 document chunks.


2. Load Summary Document

In [7]:
# Load summary content
with open("summary.md", "r") as f:
    summary_content = f.read()

# Create a Document object for the summary
summary_doc = Document(
    page_content=summary_content,
    metadata={"section": "summary", "source": "summary.md"}
)

print("Summary document loaded successfully.")


Summary document loaded successfully.


3. Initialize Embeddings & FAISS Vector Store

In [8]:
# Define model
model_name = "BAAI/bge-m3"

# Initialize embeddings
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True}
)

# Determine the embedding dimension
dimension = len(embeddings.embed_query("test"))

# Initialize FAISS index
index = faiss.IndexFlatL2(dimension)

print(f"Initialized FAISS index with dimension: {dimension}")


  embeddings = HuggingFaceBgeEmbeddings(


Initialized FAISS index with dimension: 1024


4. Create Vector Store with Documents

In [13]:
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [14]:
# Combine summary with all document chunks
all_docs = documents

uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['e441ae97-9f6f-4f2d-b41c-319ef2d35716',
 '6fbe16bc-58ac-4810-90d2-8936e8a4fc0a',
 '084133a2-8e9e-4119-a3b7-f6a764a6a43a',
 '81adb2c4-2000-44c5-af45-434b1f64db00',
 'bfdb3d83-d16b-4203-9bb4-b2d8cf4507bc',
 '271fefbb-1c2e-42cb-ba7c-f7734435a669',
 'e9bd0197-488a-4ca5-8736-7db47acf92b0',
 '56b88d95-383c-473b-b29b-d77d3f81d589',
 '2ae1e3c1-7100-4fc8-9419-9feb2ca07ad3',
 'bbed0952-9992-4f76-80e4-c7d1af667922',
 '6db022b2-ce09-4ed9-ba84-344e4ec59cc9',
 '0de3c32a-c1d7-4d25-8274-9b1d7f3c8dfa',
 '40c60786-c9f5-452e-97c3-e62e2fcaad98',
 '49f27c15-5f69-4e22-b85b-bab3c1a95f3d',
 '070029cd-63a3-46eb-9925-99bfea59cdab',
 'c5016ad5-abee-426b-a6d0-3f935960453e',
 'c45e1051-0e5c-4da7-95c8-0073d8851610',
 '94809f2d-d1ce-4bd7-b480-4254c4452b9d',
 '7a72489b-119c-44ba-9016-d1cb14218697',
 'd13e2dbd-736f-40ca-a7a0-8bfb098762c1',
 '81fbd1ee-a294-4eac-aeeb-d3c9342199eb',
 '9b95367d-3163-4dc7-a868-9d4bda86f37d',
 'b1c95a3a-5524-4658-a79a-be2303ae4224',
 'cfc5434c-5e4b-4188-9418-7c34b0dab8c6',
 'd9b9beec-d82a-

In [15]:
print(f"FAISS vector store created with {len(all_docs)} documents.")


FAISS vector store created with 89 documents.


5. Generate Merged Contexts

In [19]:
documents[0]

Document(metadata={'section': 'What is TurboML?'}, page_content='# What is TurboML?\n@ TurboML - page_link: https://docs.turboml.com/intro/\n<page_content>\nIntroduction  \nTurboML is a machine learning platform that’s reinvented for real-time. What does that mean? All the steps in the ML lifecycle, from data ingestion, to feature engineering, to ML modelling to post deployment steps like monitoring, are all designed so that in addition to batch data, they can also handle real-time data.  \n## Data Ingestion [Permalink for this section](https://docs.turboml.com/intro/\\#data-ingestion)  \nThe first step is to bring your data to the TurboML platform. There are two major ways to ingest your data. Pull-based and Push-based.  \n### Pull-based ingestion [Permalink for this section](https://docs.turboml.com/intro/\\#pull-based-ingestion)  \nWith this approach, you use TurboML’s prebuilt connectors to connect to your data source. The connectors will continuously pull data from your data sourc

In [29]:
merged_contexts = []

for idx, doc in enumerate(documents):
    try:
        # Find similar chunks (excluding self and summary)
        similar_docs = vector_store.similarity_search(
            query=doc.page_content,
            k=4,  # Fetch extra in case of self-match
        )
        
        # Remove accidental self-matches
        filtered_docs = [d for d in similar_docs if d.metadata["section"] != doc.metadata["section"]]
        
        # Select top 2 similar documents + current doc + summary
        selected_docs = [summary_doc, doc] + filtered_docs[:2]
        
        # Merge content with section headers
        merged_content = "\n\n".join([
            d.page_content
            for d in selected_docs
        ])
        
        # Store merged context data
        merged_contexts.append({
            "base_chunk": doc.metadata["section"],
            "context_sections": [d.metadata["section"] for d in selected_docs],
            "content": merged_content,
            "embedding": embeddings.embed_query(merged_content)
        })
        
    except Exception as e:
        print(f"Error processing chunk {idx}: {str(e)}")
        continue

print(f"Generated {len(merged_contexts)} merged contexts.")


Generated 89 merged contexts.


6. Save Merged Contexts to File

In [30]:
# Save merged contexts as JSON file
with open("merged_contexts.json", "w") as f:
    json.dump(merged_contexts, f, indent=2)

print("Merged contexts saved successfully.")


Merged contexts saved successfully.


7. Verify Sample Output

In [31]:
for i, sample in enumerate(merged_contexts):
    print(f"Sample {i+1}:")
    print(f"Base chunk: {sample['base_chunk']}")
    print(f"Merged sections: {sample['context_sections']}")
    print(f"Content length: {len(sample['content'])} characters")
    print("=" * 50)  # Separator for readability


Sample 1:
Base chunk: What is TurboML?
Merged sections: ['summary', 'What is TurboML?', 'TurboML Quickstart', 'Batch APIs']
Content length: 58376 characters
Sample 2:
Base chunk: TurboML Quickstart
Merged sections: ['summary', 'TurboML Quickstart', 'What is TurboML?', 'Batch APIs']
Content length: 58376 characters
Sample 3:
Base chunk: String Encoding
Merged sections: ['summary', 'String Encoding', 'Hyperparameter Tuning', 'Algorithm Tuning']
Content length: 36758 characters
Sample 4:
Base chunk: AMF Regressor
Merged sections: ['summary', 'AMF Regressor', 'AMF Classifier', 'FFM Regressor']
Content length: 35606 characters
Sample 5:
Base chunk: MultinomialNB
Merged sections: ['summary', 'MultinomialNB', 'Gaussian Naive Bayes', 'HeteroAdaBoostClassifier']
Content length: 30644 characters
Sample 6:
Base chunk: PreProcessors
Merged sections: ['summary', 'PreProcessors', 'EmbeddingModel', 'What is TurboML?']
Content length: 48675 characters
Sample 7:
Base chunk: Python Model: Batch Example


In [32]:
# Check first merged context
sample = merged_contexts[3]

print(sample["content"])

# **TurboML: A Real-Time Machine Learning Platform - Detailed Summary**

TurboML is a platform designed for building, deploying, and managing real-time machine learning applications. It emphasizes streaming data and provides tools for the entire ML lifecycle, from data ingestion to model monitoring.

**1. Data Ingestion and Management:**

*   **Core Principle:** TurboML treats data as continuous streams, enabling real-time processing and updates.
*   **Ingestion Methods:**
    *   **Pull-based:**
        *   Uses pre-built connectors to continuously pull data from various sources.
        *   Supported sources include cloud storage (e.g., S3) and databases (e.g., Postgres). *While Kafka is used internally, the documentation doesn't explicitly present it as a direct pull-based source for end-users in the introductory sections.*
        *   Connectors are configured to handle data formats and connection details.
    *   **Push-based:**
        *   Allows direct data injection into the Tu