In [None]:
# Fetch the training repository; the following cell changes into this checkout
# and later cells read files through paths relative to it.
!git clone https://github.com/CaSToRC-CyI/AI-Agents-Training.git

In [None]:
# Make the cloned repository the working directory so the relative path
# "./dummy_data/documents_dir" used further down resolves against it.
%cd ./AI-Agents-Training

In [None]:
%%bash

# Install the notebook's Python dependencies with uv, one package per command.
# haystack-ai supplies the `haystack` package imported in the next cell.
uv pip install haystack-ai
# NOTE(review): datasets, sentence-transformers and huggingface_hub are not
# referenced by any cell visible in this notebook - confirm they are still needed.
# -U upgrades the package if an older version is already installed.
uv pip install datasets -U
uv pip install "sentence-transformers>=4.1.0"
uv pip install huggingface_hub -U
# Presumably the DOCX parsing backend used by DOCXToDocument - TODO confirm.
uv pip install python-docx

In [None]:
import os
from getpass import getpass
from pathlib import Path

from haystack import Document
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.components.converters import DOCXToDocument
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.dataclasses import ChatMessage
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy


In [4]:
# Ask for the key only when it is missing (or empty). This keeps the cell
# re-runnable: "Restart & Run All" with the variable already exported does not
# block on input, and an existing key is never overwritten by accident.
# NOTE(review): the OpenAI-backed components created later are expected to read
# the key from this environment variable - confirm against the Haystack docs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [2]:
# Folder holding the source documents, relative to the current working directory.
DOCUMENTS_DIR = Path("./dummy_data/documents_dir")

# Fail fast when the notebook runs from the wrong directory; otherwise the
# glob below would silently yield nothing and the pipeline would be built
# on an empty document store.
if not DOCUMENTS_DIR.is_dir():
    raise FileNotFoundError(
        f"Documents directory not found: {DOCUMENTS_DIR.resolve()} "
        "(is the working directory the repository root?)"
    )

# Only .docx files are handed to the DOCX converter (case-insensitive suffix
# match). Sorting makes the document order - and therefore the preview of the
# first documents in the next cell - independent of filesystem listing order.
FILES = sorted(
    path.resolve()
    for path in DOCUMENTS_DIR.rglob("*")
    if path.is_file() and path.suffix.lower() == ".docx"
)

converter = DOCXToDocument()

# The converter takes a list of sources, so a single call converts every file.
# `docs` is the list of converted documents found under the "documents" key.
docs = converter.run(sources=FILES)["documents"]

In [None]:
# Preview at most the first two converted documents: index, content and metadata.
# Pairing with range(2) stops after two items, or earlier if `docs` is shorter.
for doc_idx, doc in zip(range(2), docs):
    print(
        f"Document idx {doc_idx}\n",
        f"Content: {doc.content}",
        f"Meta: {doc.meta}\n\n",
        sep="\n",
    )

In [8]:
# In-memory store that the embedded documents are written to in the next cell
# and that the retriever is constructed from later in the notebook.
document_store = InMemoryDocumentStore()
# Embedder applied to the converted documents before they are stored.
# Built with library defaults; no model is specified explicitly here.
doc_embedder = OpenAIDocumentEmbedder()

In [None]:
# Run the document embedder over the converted documents; the embedded
# documents come back under the "documents" key of the result.
docs_with_embeddings = doc_embedder.run(documents=docs)

# OVERWRITE keeps this cell re-runnable: per the Haystack documentation the
# in-memory store rejects documents whose id already exists under its default
# policy, so a second execution of this cell would otherwise raise.
# Side effect of the policy: documents sharing an id replace each other
# instead of raising.
written_count = document_store.write_documents(
    docs_with_embeddings["documents"],
    policy=DuplicatePolicy.OVERWRITE,
)

# Report the count returned by the store itself rather than the input length.
print(f"Stored {written_count} documents with embeddings in the document store.")

In [None]:
# Single user message used as the prompt template. It loops over `documents`
# to inline each document's content as context, then appends the `question`.
template = [
    ChatMessage.from_user(
        """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
    )
]

# Declaring both template variables as required makes a missing input raise
# instead of being rendered as empty text, which would silently produce a
# prompt without context or without a question.
prompt_builder = ChatPromptBuilder(
    template=template,
    required_variables=["documents", "question"],
)

In [11]:
# Query-side components of the RAG pipeline.

# Embeds the question text; its "embedding" output feeds the retriever.
text_embedder = OpenAITextEmbedder()

# Retrieves from the in-memory store that was filled with embedded documents.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Chat model that receives the rendered prompt and generates the answer.
chat_generator = OpenAIChatGenerator(model="gpt-4o-mini")

In [12]:
basic_rag_pipeline = Pipeline()

# Register every component under the name that the connection and run cells
# use to address it; registration order matches the data flow.
for component_name, component in (
    ("text_embedder", text_embedder),
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", chat_generator),
):
    basic_rag_pipeline.add_component(component_name, component)

In [None]:
# Wire the components together. Every edge names both sockets explicitly so
# the data flow is readable at a glance and no edge depends on implicit
# socket resolution.

# question embedding -> retriever query
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
# retrieved documents -> `documents` variable of the prompt template
basic_rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
# rendered chat messages -> chat generator
basic_rag_pipeline.connect("prompt_builder.prompt", "llm.messages")

In [None]:
question = "What does Rhodes Statue look like?"

# The same question goes to two components: the text embedder (to build the
# retrieval query) and the prompt builder (to fill the template).
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
}
response = basic_rag_pipeline.run(pipeline_inputs)

# Show the text of the first reply produced by the "llm" component.
first_reply = response["llm"]["replies"][0]
print(first_reply.text)