In [2]:
import os
from dotenv import load_dotenv
# Pull settings from the .env file; override=True lets them replace values
# already present in the process environment.
load_dotenv(override=True)

# OpenAI credential handed to the generator components further down.
# Evaluates to None when OAI_TOKEN is not defined.
api_key = os.environ.get("OAI_TOKEN")

In [3]:
# If using Scrapy
def get_files_dict(directory_path):
    """List the regular files directly inside ``directory_path``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list of ``{"path": <joined path>, "url": <bare file name>}`` dicts,
        ordered by file name so repeated runs index files in the same order.
        Sub-directories and other non-file entries are skipped because they
        cannot be used as converter sources.

    Raises:
        OSError: If ``directory_path`` does not exist or cannot be listed.
    """
    files_dict = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for item in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, item)
        if not os.path.isfile(path):
            continue
        # NOTE(review): "url" carries the file name only; presumably the scraper
        # names files after their source URL -- confirm.
        files_dict.append({"path": path, "url": item})
    return files_dict
  
# Directory holding the scraped HTML pages. Set TUM_DATA_DIR (for example in the
# .env file loaded above) to run the notebook from another machine or checkout;
# the original location stays the default.
abs_path = os.environ.get(
    "TUM_DATA_DIR",
    "/Users/ferdydh/Code/haystack-rag-showcase/tum_data",
)

# One {"path", "url"} record per file; read by the indexing pipeline run below.
result = get_files_dict(abs_path)

In [5]:
# Classifier
# Routers

from haystack import Pipeline
from qdrant_haystack import QdrantDocumentStore
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import MetadataRouter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Document stores: one Qdrant endpoint per language. Both use the same settings
# and differ only in the port they talk to.
def _make_qdrant_store(port):
    """Create a QdrantDocumentStore on localhost at ``port`` with the shared settings."""
    return QdrantDocumentStore(
        location="localhost",
        port=port,
        recreate_index=True,
        return_embedding=True,
        wait_result_from_api=True,
    )


document_store_en = _make_qdrant_store(6333)
document_store_de = _make_qdrant_store(6334)

# One writer per store; each terminates its language branch of the indexing pipeline.
en_writer = DocumentWriter(document_store=document_store_en)
de_writer = DocumentWriter(document_store=document_store_de)


# Indexing pipeline, shared front end: HTML -> cleaned text -> 30-sentence chunks.
pipeline = Pipeline()
pipeline.add_component(name="converter", instance=HTMLToDocument())
pipeline.add_component(name="cleaner", instance=DocumentCleaner())
pipeline.add_component(
    name="splitter",
    instance=DocumentSplitter(split_by="sentence", split_length=30),
)


# Language-specific back end: classify each chunk, route it by its "language"
# metadata, embed it with the matching model and write it to the matching store.
# NOTE(review): the classifier also recognises "id", but the router has no rule
# for it, so such chunks presumably end up on the router's "unmatched" output.
document_classifier = DocumentLanguageClassifier(languages=["en", "de", "id"])
metadata_router = MetadataRouter(
    rules={
        "en": {"language": {"$eq": "en"}},
        "de": {"language": {"$eq": "de"}},
    }
)
english_embedder = SentenceTransformersDocumentEmbedder()
german_embedder = SentenceTransformersDocumentEmbedder(
    model_name_or_path="PM-AI/bi-encoder_msmarco_bert-base_german"
)

pipeline.add_component("document_classifier", document_classifier)
pipeline.add_component("metadata_router", metadata_router)
pipeline.add_component("english_embedder", english_embedder)
pipeline.add_component("german_embedder", german_embedder)
pipeline.add_component("en_writer", en_writer)
pipeline.add_component("de_writer", de_writer)

# Wire the indexing graph. Edges are listed in data-flow order: the shared front
# end first, then the per-language embed-and-write branches.
for sender, receiver in [
    ("converter", "cleaner"),
    ("cleaner", "splitter"),
    ("splitter", "document_classifier.documents"),
    ("document_classifier.documents", "metadata_router.documents"),
    ("metadata_router.en", "english_embedder.documents"),
    ("metadata_router.de", "german_embedder.documents"),
    ("english_embedder", "en_writer"),
    ("german_embedder", "de_writer"),
]:
    pipeline.connect(sender, receiver)


# Converter inputs: one source path and one metadata dict per listed file,
# kept in the same order so each document gets its own "url".
html_sources = [entry["path"] for entry in result]
html_meta = [{"url": entry["url"]} for entry in result]

# Left as the last expression so the cell displays the pipeline's output.
pipeline.run({"converter": {"sources": html_sources, "meta": html_meta}})

Error parsing HTML
Traceback (most recent call last):
  File "/Users/ferdydh/Code/haystack-rag-showcase/.venv/lib/python3.11/site-packages/boilerpy3/extractors.py", line 108, in parse_doc
    bp_parser.feed(input_str)
  File "/Users/ferdydh/Code/haystack-rag-showcase/.venv/lib/python3.11/site-packages/boilerpy3/parser.py", line 658, in feed
    self.end_document()
  File "/Users/ferdydh/Code/haystack-rag-showcase/.venv/lib/python3.11/site-packages/boilerpy3/parser.py", line 461, in end_document
    self.flush_block()
  File "/Users/ferdydh/Code/haystack-rag-showcase/.venv/lib/python3.11/site-packages/boilerpy3/parser.py", line 540, in flush_block
    if self.last_start_tag.lower() == "title":
       ^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'lower'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ferdydh/Code/haystack-rag-showcase/.venv/lib/python3.11/site-packages/boilerpy3/extrac

{'metadata_router': {'unmatched': []},
 'en_writer': {'documents_written': 543},
 'de_writer': {'documents_written': 966}}

In [11]:
# Rankers
# Retrievers
# GPTGenerator

from haystack import Pipeline
from langdetect import detect
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.rankers import TransformersSimilarityRanker
from qdrant_haystack.retriever import QdrantRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator

def _add_retrieval_stages(rag_pipeline, query_embedder, document_store):
    """Register query embedder -> retriever -> ranker on ``rag_pipeline`` and connect them."""
    rag_pipeline.add_component("text_embedder", query_embedder)
    rag_pipeline.add_component(
        "retriever", QdrantRetriever(document_store=document_store, top_k=50)
    )
    rag_pipeline.add_component("ranker", TransformersSimilarityRanker())
    rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    rag_pipeline.connect("retriever.documents", "ranker.documents")


# English: default sentence-transformers query embedder over the English store.
english_pipeline = Pipeline()
_add_retrieval_stages(
    english_pipeline,
    SentenceTransformersTextEmbedder(),
    document_store_en,
)

# German: same model name as the German document embedder, over the German store.
german_pipeline = Pipeline()
_add_retrieval_stages(
    german_pipeline,
    SentenceTransformersTextEmbedder(
        model_name_or_path="PM-AI/bi-encoder_msmarco_bert-base_german"
    ),
    document_store_de,
)


# Prompt templates handed to PromptBuilder. Each one loops over `documents` and
# prints every document's content as context, then states the question.
# `documents` arrives from the ranker connection and `query` from the run inputs.
# Note that a literal "?" is appended after the query text.

# English prompt.
template_en = """
Given the following information, answer the question.

Context: 
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ query }}?
"""

# German prompt; same structure as the English one.
template_de = """
Gegeben die folgenden Informationen, beantworte die Frage.

Kontext:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Frage: {{ query }}?
"""


def _add_generation_stages(rag_pipeline, template):
    """Append prompt builder and OpenAI generator to ``rag_pipeline``, fed by its ranker."""
    rag_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
    rag_pipeline.add_component("llm", OpenAIGenerator(api_key=api_key))
    rag_pipeline.connect("ranker", "prompt_builder.documents")
    rag_pipeline.connect("prompt_builder", "llm")


# Each language gets its own prompt wording; the wiring is identical.
_add_generation_stages(english_pipeline, template_en)
_add_generation_stages(german_pipeline, template_de)



def ask_rag(query: str, top_k: int = 3):
    """Answer ``query`` with the RAG pipeline matching the query's language.

    The query language is detected with langdetect. German ("de") queries go to
    ``german_pipeline``; everything else, including queries whose language
    cannot be detected, goes to ``english_pipeline``.

    Args:
        query: The user's question.
        top_k: Number of ranked documents passed on to the prompt (default 3).

    Returns:
        The dict returned by the selected pipeline's ``run`` call.
    """
    # Imported here so the function does not depend on another cell's imports
    # beyond `detect`.
    from langdetect import DetectorFactory, LangDetectException

    # langdetect is probabilistic; a fixed seed keeps routing of short or
    # ambiguous queries stable between calls.
    DetectorFactory.seed = 0

    try:
        language = detect(query)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. empty or digits
        # only); fall back to the default English route instead of crashing.
        language = "en"

    # Local name deliberately differs from the notebook-level indexing `pipeline`.
    rag_pipeline = german_pipeline if language == "de" else english_pipeline

    return rag_pipeline.run(
        {
            "text_embedder": {"text": query},
            "ranker": {"query": query, "top_k": top_k},
            "prompt_builder": {"query": query},
        }
    )



In [12]:
# NOTE: this rebinds `result`, which the indexing cell reads as the file listing.
# Re-run the file-listing cell before re-running the indexing cell.
question = "When is Summer 2024's semester start for TUM"
result = ask_rag(question)

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.08it/s]


In [13]:
# Show only the first generated reply; use print(result) to inspect the full
# pipeline output instead.
first_reply = result["llm"]["replies"][0]
print(first_reply)

The Summer 2024 semester at TUM is set to start on April 1, 2024.
