In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the project folder used by
# the following cells is reachable from this runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/rag_training

/content/drive/MyDrive/Colab Notebooks/rag_training


In [3]:
%%bash

# Install all dependencies in a single uv invocation so that they are resolved
# together and a failure of any package makes the whole cell fail (with four
# separate commands only the exit status of the last one was reported).
uv pip install haystack-ai milvus_haystack pymilvus python-docx

Using Python 3.11.13 environment at: /usr
Audited 1 package in 154ms
Using Python 3.11.13 environment at: /usr
Audited 1 package in 150ms
Using Python 3.11.13 environment at: /usr
Audited 1 package in 100ms
Using Python 3.11.13 environment at: /usr
Audited 1 package in 141ms


In [4]:
from pathlib import Path
import glob
import os
from dotenv import load_dotenv
from getpass import getpass
from haystack import Pipeline
from haystack.components.converters import DOCXToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder

from milvus_haystack import MilvusDocumentStore
from milvus_haystack.milvus_embedding_retriever import MilvusEmbeddingRetriever
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage

In [5]:
# Ask for the key interactively when it is missing *or* set to an empty
# string; an empty value would otherwise pass the check and only fail later,
# when the first OpenAI request is made.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

Enter OpenAI API key:··········


In [6]:
# Folder (relative to the project directory) that holds the source documents.
DOCUMENTS_DIR = Path("./dummy_rag/documents_dir")

# Absolute paths of every Word document below DOCUMENTS_DIR.
# - Only ``.docx`` files are kept (case-insensitive): the folder is fed to a
#   DOCX converter, so stray files such as checkpoints or OS metadata are noise.
# - ``~$`` files are the lock files Word creates while a document is open.
# - The list is sorted so the processing order is the same on every run.
FILES = sorted(
    path.resolve()
    for path in DOCUMENTS_DIR.rglob("*")
    if path.is_file()
    and path.suffix.lower() == ".docx"
    and not path.name.startswith("~$")
)

In [7]:
# Sanity check: convert the collected files on their own (outside any pipeline)
# and show the file path and text of the first converted document.
converter = DOCXToDocument()
result = converter.run(sources=FILES)

first_document = result["documents"][0]
print(first_document.meta["file_path"], first_document.content, sep="\n")

New Office Layout Plan.docx

🏢 New Office Layout Plan – Q3 2025 Facilities Update
📅 Effective Date: August 1, 2025
🏗️ Released By: Facilities & Workplace Experience Team
📍 Location: Main Corporate Headquarters
🎯 Objective: "Optimizing Space for Productivity, Collaboration, and Comfort"

🧠 Overview
As part of our ongoing efforts to enhance the workplace environment and support evolving team needs, the Facilities team has finalized the Q3 2025 Office Layout Plan. This update reflects strategic changes in departmental locations, the introduction of new work zones, and improvements to shared spaces.
The redesign aims to foster better collaboration, reduce noise distractions, and provide flexible workspaces that accommodate both focused and team-based work styles.

🗺️ Department Relocations
Effective August 1, the following departmental moves will take place:
Engineering Team → 3rd Floor 
Dedicated development pods 
Enhanced technical infrastructure 
Proximity to server and testing labs
Mar

In [10]:
# Vector store backing both the indexing and the retrieval pipelines.
# The URI is a local file path inside the project directory.
# NOTE(review): drop_old=True presumably discards an existing collection each
# time this cell runs, so the indexing cell must be re-run afterwards - confirm
# against the milvus_haystack documentation.
connection_args = dict(uri="./rag_vectordb/milvus.db")
document_store = MilvusDocumentStore(connection_args=connection_args, drop_old=True)

In [11]:
# Indexing pipeline: a linear chain in which each stage feeds the next one.
#   converter -> cleaner -> splitter -> embedder -> writer
indexing_stages = [
    ("converter", DOCXToDocument()),
    ("cleaner", DocumentCleaner()),
    # Chunks of two sentences each.
    ("splitter", DocumentSplitter(split_by="sentence", split_length=2)),
    ("embedder", OpenAIDocumentEmbedder()),
    ("writer", DocumentWriter(document_store)),
]

indexing_pipeline = Pipeline()
for stage_name, stage in indexing_stages:
    indexing_pipeline.add_component(stage_name, stage)

# Wire every stage to its successor, in the order listed above.
stage_names = [name for name, _component in indexing_stages]
for upstream, downstream in zip(stage_names, stage_names[1:]):
    indexing_pipeline.connect(upstream, downstream)

# Kept as the last expression so the run summary is displayed as cell output.
indexing_pipeline.run({"converter": {"sources": FILES}})

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Calculating embeddings: 5it [00:04,  1.09it/s]


{'embedder': {'meta': {'model': 'text-embedding-ada-002-v2',
   'usage': {'prompt_tokens': 9773, 'total_tokens': 9773}}},
 'writer': {'documents_written': 135}}

In [12]:
question = "Tell me a bit about QuantumStream"  # You can replace it with your own question.

# Retrieval-only pipeline: embed the question, then fetch the three closest
# chunks from the document store.
query_embedder = OpenAITextEmbedder()
chunk_retriever = MilvusEmbeddingRetriever(document_store=document_store, top_k=3)

retrieval_pipeline = Pipeline()
retrieval_pipeline.add_component("embedder", query_embedder)
retrieval_pipeline.add_component("retriever", chunk_retriever)
retrieval_pipeline.connect("embedder", "retriever")

retrieval_results = retrieval_pipeline.run({"embedder": {"text": question}})

# Print each retrieved chunk followed by a separator line.
separator = "-" * 10
for retrieved_doc in retrieval_results["retriever"]["documents"]:
    print(retrieved_doc.content, separator, sep="\n")

The QuantumStream CLI is your command-line gateway to building, managing, and scaling real-time data pipelines with confidence and control.
----------
The QuantumStream Scheduler brings automation, reliability, and control to your data operations—so your pipelines run on time, every time.
----------
QuantumStream supports integration with AWS KMS, Azure Key Vault, and GCP Secret Manager via plugin modules. QuantumStream’s encryption capabilities ensure your data remains secure, compliant, and resilient—no matter where it flows.
----------


In [13]:
# Jinja template for the user message: `query` is supplied when the pipeline
# is run, `documents` comes from the retriever.
prompt_template = """Answer the following query based on the provided context. If the context does
                     not include an answer, reply with 'I don't know'.\n
                     Query: {{query}}
                     Documents:
                     {% for doc in documents %}
                        {{ doc.content }}
                     {% endfor %}
                     Answer:
                  """

# Chat template handed to the prompt builder. Built once and reused, instead
# of constructing an identical second list that was never used.
messages = [ChatMessage.from_user(prompt_template)]

llm = OpenAIChatGenerator(model="gpt-4o-mini")

# RAG pipeline: text_embedder -> retriever -> prompt_builder -> llm
rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", OpenAITextEmbedder())
rag_pipeline.add_component("retriever", MilvusEmbeddingRetriever(document_store=document_store, top_k=3))
rag_pipeline.add_component("prompt_builder", ChatPromptBuilder(template=messages))
rag_pipeline.add_component("llm", llm)

# All connections name their sockets explicitly so the data flow is readable
# without relying on automatic socket matching.
rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.messages")



In [14]:
# The question is needed twice: once to be embedded for retrieval and once to
# fill the `query` variable of the prompt template.
rag_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"query": question},
}
results = rag_pipeline.run(rag_inputs)

# Show the text of the first reply produced by the generator.
first_reply = results["llm"]["replies"][0]
print('RAG answer:\n\n', first_reply.text)

RAG answer:

 QuantumStream is a platform that facilitates the building, managing, and scaling of real-time data pipelines through its command-line interface (CLI). It offers a Scheduler for automating data operations to ensure pipelines run reliably and on time. Additionally, QuantumStream supports integration with various cloud services for secure data management and compliance, using encryption capabilities to protect data as it flows through the system.
