In [1]:
# %pip install pypdf haystack-ai "sentence-transformers>=3.0.0" google-ai-haystack python-dotenv

In [2]:
from dotenv import load_dotenv
import os

# Read a local .env file (python-dotenv) into the process environment.
# GOOGLE_API_KEY is looked up from the environment later in this notebook.
load_dotenv()

True

In [3]:
from haystack_integrations.components.generators.google_ai import (
    GoogleAIGeminiGenerator,
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.converters import PyPDFToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from pathlib import Path

# Raw strings keep the Windows backslashes literal. Without the r-prefix,
# sequences such as "\S", "\d" and "\p" are invalid escape sequences that only
# work by accident and trigger a warning on current Python versions.
file_path1 = r"D:\SIH\documentation\hello_file.txt"
file_path2 = r"D:\SIH\documentation\progit.pdf"

# Sanity check: the PDF that gets indexed below must exist on this machine.
Path(file_path2).exists()

True

In [5]:
# Jinja2 template for the final LLM prompt: the retrieved context followed by
# the question. The leading and trailing newlines are part of the template.
context_template = (
    "\n"
    "Given the following information, answer the question.\n"
    "Context: {{ context }}\n"
    "Question: {{ question }}\n"
)

# Template for the question itself; a literal "?" is appended to whatever is passed in.
query_template = "{{ question }}?"

In [15]:
# Storage: an in-memory store configured for cosine similarity, plus the
# writer component that persists documents into it.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
writer = DocumentWriter(document_store=document_store)

# Chunking: split by word into 400-word pieces with a 50-word overlap.
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=400,
    split_overlap=50,
)

# Embedding model applied to every chunk before it is written to the store.
embedder = SentenceTransformersDocumentEmbedder(model="all-MiniLM-L6-v2")

In [16]:
# Indexing pipeline: PDF -> cleaned text -> word chunks -> embeddings -> document store.
pipeline = Pipeline()

# Stages listed in processing order; dict insertion order drives both the
# registration and the wiring below.
indexing_stages = {
    "converter": PyPDFToDocument(),
    "cleaner": DocumentCleaner(),
    "splitter": document_splitter,
    "embedder": embedder,
    "writer": writer,
}
for stage_name, stage in indexing_stages.items():
    pipeline.add_component(stage_name, stage)

# Connect each stage to the one that follows it.
stage_names = list(indexing_stages)
for upstream, downstream in zip(stage_names, stage_names[1:]):
    pipeline.connect(upstream, downstream)

# Kept as the last expression so the run summary is shown as the cell output.
pipeline.run({"converter": {"sources": [file_path2]}})

Batches: 100%|██████████| 13/13 [00:10<00:00,  1.28it/s]


{'writer': {'documents_written': 388}}

In [17]:
# Query-side components. The text embedder is built with the same model name
# as the document embedder, and the retriever reads from the store that was
# filled by the indexing pipeline.
query_embedder = SentenceTransformersTextEmbedder(model="all-MiniLM-L6-v2")
document_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

In [10]:
# Question used for retrieval (embedded as text) and for generation (inserted into the prompt).
# NOTE(review): query_template appends "?" to the question, so a query that
# already ends in "?" is rendered with a doubled question mark.
query = "what is git?"

In [18]:
# Retrieval pipeline: embed the question, then look up matching chunks in the store.
retrieve_pipeline = Pipeline()
retrieve_pipeline.add_component(name="text_embedder", instance=query_embedder)
retrieve_pipeline.add_component(name="retriever", instance=document_retriever)

# The query embedding produced by the text embedder is the retriever's input.
retrieve_pipeline.connect(
    "text_embedder.embedding",
    "retriever.query_embedding",
)

# `relevant_documents` holds the full pipeline result; the retrieved documents
# live under ["retriever"]["documents"].
relevant_documents = retrieve_pipeline.run(
    {"text_embedder": {"text": query}}
)
print(relevant_documents["retriever"]["documents"])

Batches: 100%|██████████| 1/1 [00:00<00:00, 53.36it/s]

[Document(id=e6bbe1a198c135da5cf8a235d188521e527803243e3b7afa3e98fe22eac3407b, content: 'to use it to find more
information about the git shell in Setting Up the Server .
Getting and Creati...', meta: {'file_path': 'D:\\SIH\\documentation\\progit.pdf', 'source_id': '12ab6505563397319cfb5f16535d4bc178ff46b6492082fa86cc721182b676da', 'page_number': 486, 'split_id': 374, 'split_idx_start': 840772, '_split_overlap': [{'doc_id': '9218483da56dab9e8fda03788d11c371c76c775197900555a4f424887080b93c', 'range': (2878, 3148)}, {'doc_id': 'e441f05866a6b3b971e5ba14fe2e6b36552dc0ac8dd66186938207b69c1dffc3', 'range': (0, 286)}]}, score: 0.6621929776398874), Document(id=42d84c34f49822a65ccc2f4382e15033f7a8be1b2842ec17ab87873b6cd62f8d, content: 'how Git stores its
objects, what the object model is, details of packfiles, server protocols, and mo...', meta: {'file_path': 'D:\\SIH\\documentation\\progit.pdf', 'source_id': '12ab6505563397319cfb5f16535d4bc178ff46b6492082fa86cc721182b676da', 'page_number': 15,




In [12]:
# The Gemini generator is expected to pick up its API key from the
# GOOGLE_API_KEY environment variable. Fail fast with a clear message when it
# is missing: re-assigning os.environ from os.getenv() did nothing when the
# key was set and raised an opaque TypeError (None is not a str) when it was not.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

# LLM used to produce the final answer.
response_generator = GoogleAIGeminiGenerator(model="gemini-1.5-flash")

# One prompt builder per template: the question wrapper and the full context prompt.
context_prompt_builder = PromptBuilder(template=context_template)
query_prompt_builder = PromptBuilder(template=query_template)

In [13]:
# Generation pipeline: question template -> context prompt -> Gemini.
query_pipeline = Pipeline()

generation_components = {
    "context_prompt": context_prompt_builder,
    "query_prompt": query_prompt_builder,
    "response_generator": response_generator,
}
for component_name, component in generation_components.items():
    query_pipeline.add_component(component_name, component)

# connect() returns the pipeline, so the two connections can be chained; the
# chained call stays the last expression so the pipeline summary is displayed.
query_pipeline.connect("query_prompt", "context_prompt.question").connect(
    "context_prompt", "response_generator"
)

<haystack.core.pipeline.pipeline.Pipeline object at 0x0000029DED787A30>
🚅 Components
  - context_prompt: PromptBuilder
  - query_prompt: PromptBuilder
  - response_generator: GoogleAIGeminiGenerator
🛤️ Connections
  - context_prompt.prompt -> response_generator.parts (str)
  - query_prompt.prompt -> context_prompt.question (str)

In [19]:
# Build the context from the full text of each retrieved chunk. Passing the
# Document objects themselves makes the `{{ context }}` placeholder render
# their repr, which shortens the content to a brief snippet and mixes in ids
# and metadata, so the model never sees the actual passages.
retrieved_docs = relevant_documents["retriever"]["documents"]
context_text = "\n\n".join(doc.content for doc in retrieved_docs if doc.content)

rag_response = query_pipeline.run(
    {
        "context_prompt": {"context": context_text},
        "query_prompt": {"question": query},
    }
)
print(rag_response)

{'response_generator': {'replies': ['The provided documents don\'t directly answer "What is Git?" but they offer information about Git\'s functions and its use in version control. Here\'s what we can glean from them:\n\n* **Git is a version control system (VCS)**: This is stated directly in some documents, like the one that mentions it being "special" and setting it apart in the VCS community.\n* **Git is used for tracking changes in code:** This is implied in the documents that talk about "objects," "packfiles," and "server protocols."  These are terms related to how Git stores and manages code history.\n* **Git is powerful and has many advanced features:**  Documents mention using Git in "advanced ways" and its use by the "Linux development community."  \n\n**In a nutshell:** Git is a powerful system designed to track changes in code, making it easier for developers to collaborate and manage project history. \n']}}


In [21]:
# Print only the first generated reply so its newlines and markdown render
# as text instead of an escaped dict repr.
replies = rag_response["response_generator"]["replies"]
print(replies[0])

The provided documents don't directly answer "What is Git?" but they offer information about Git's functions and its use in version control. Here's what we can glean from them:

* **Git is a version control system (VCS)**: This is stated directly in some documents, like the one that mentions it being "special" and setting it apart in the VCS community.
* **Git is used for tracking changes in code:** This is implied in the documents that talk about "objects," "packfiles," and "server protocols."  These are terms related to how Git stores and manages code history.
* **Git is powerful and has many advanced features:**  Documents mention using Git in "advanced ways" and its use by the "Linux development community."  

**In a nutshell:** Git is a powerful system designed to track changes in code, making it easier for developers to collaborate and manage project history. 

