In [21]:
# Classifier
# Routers

# Rankers
# Retrievers
# GPTGenerator

In [22]:
import os
from dotenv import load_dotenv
load_dotenv(override=True)

api_key = os.getenv("OAI_TOKEN")


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urlunparse
from concurrent.futures import ThreadPoolExecutor
from urllib.robotparser import RobotFileParser
from tqdm import tqdm
import logging
import urllib3
from haystack.dataclasses import ByteStream

# Set the logging level for urllib3 to WARNING to suppress debug messages
urllib3_logger = logging.getLogger('urllib3')
urllib3_logger.setLevel(logging.WARNING)

def can_fetch(url, user_agent='*'):
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()

    return rp.can_fetch(user_agent, url)

def is_media_url(url):
    media_extensions = ['.pdf', '.jpg', '.jpeg', '.png', '.gif']
    return any(url.lower().endswith(ext) for ext in media_extensions)

result = []

def crawl_website(root_url, max_workers=15, fetch_count=100):
    search_domain = urlparse(root_url).hostname    
    visited_urls = set()
    stack = [root_url]
    visited_count = 0

    def process_page(url):
        try:
            response = requests.get(url, stream=True)
            soup = BeautifulSoup(response.text, 'html.parser')
            visited_urls.add(url)

            # Process the page here, e.g., extract information or save data
            result.append({"url":url, "content": ByteStream.from_string(response.text)})

            # Find and collect outgoing links
            outgoing_links = [urlparse(urljoin(url, link['href'])) for link in soup.find_all('a', href=True)]
            outgoing_stripped_urls = [urlunparse((next_url.scheme, next_url.netloc, next_url.path, '', '', '')) for next_url in outgoing_links]

            # Filter out media URLs
            non_media_links = [outgoing_url for outgoing_url in outgoing_stripped_urls if not is_media_url(outgoing_url)]

            return [non_media_url for non_media_url in non_media_links if search_domain in non_media_url]

        except Exception as e:
            print(f"Error crawling {url}: {e}")
            return []

    with ThreadPoolExecutor(max_workers) as executor, tqdm(total=fetch_count) as pbar:
        while stack:
            if (visited_count >= fetch_count):
                return
            
            current_url = stack.pop()

            if (current_url in visited_urls) or (not search_domain in current_url) or (not can_fetch(current_url)):
                continue

            # Submit the processing of the page to the thread pool
            future = executor.submit(process_page, current_url)

            # Collect outgoing links from the processed page
            outgoing_links = future.result()

            # Add outgoing links to the stack
            stack.extend(filter(lambda url: url not in visited_urls, outgoing_links))
            
            visited_count += 1
            pbar.update(1)

# Example usage:
crawl_website('https://www.cit.tum.de', fetch_count=100)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 100/100 [00:43<00:00,  2.28it/s]


In [3]:
# Classifier
# Routers

from haystack import Pipeline
from haystack.components.converters import HTMLToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import MetadataRouter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores import InMemoryDocumentStore

# Document Store
document_store_en = InMemoryDocumentStore()
document_store_de = InMemoryDocumentStore()


en_writer = DocumentWriter(document_store = document_store_en)
de_writer = DocumentWriter(document_store = document_store_de)


# Data pipeline
pipeline = Pipeline()
pipeline.add_component("converter", HTMLToDocument())
pipeline.add_component("cleaner", DocumentCleaner())
pipeline.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=30))


# Language Pipeline Components
document_classifier = DocumentLanguageClassifier(languages = ["en", "de", "id"])
metadata_router = MetadataRouter(rules={"en": {"language": {"$eq": "en"}}, "de": {"language": {"$eq": "de"}}})
english_embedder = SentenceTransformersDocumentEmbedder()
german_embedder = SentenceTransformersDocumentEmbedder(model_name_or_path="PM-AI/bi-encoder_msmarco_bert-base_german")

pipeline.add_component(instance=document_classifier, name="document_classifier")
pipeline.add_component(instance=metadata_router, name="metadata_router")
pipeline.add_component(instance=english_embedder, name="english_embedder")
pipeline.add_component(instance=german_embedder, name="german_embedder")
pipeline.add_component(instance=en_writer, name="en_writer")
pipeline.add_component(instance=de_writer, name="de_writer")

# Connect all
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "document_classifier.documents")

pipeline.connect("document_classifier.documents", "metadata_router.documents")
pipeline.connect("metadata_router.en", "english_embedder.documents")
pipeline.connect("metadata_router.de", "german_embedder.documents")
pipeline.connect("english_embedder", "en_writer")
pipeline.connect("german_embedder", "de_writer")


pipeline.run(
    {
        "converter": {
            "sources": [x["content"] for x in result],
            "meta": [{"url":x["url"]} for x in result]
            }
        }) 

Batches: 100%|██████████| 2/2 [00:07<00:00,  3.93s/it]
Batches: 100%|██████████| 4/4 [00:08<00:00,  2.15s/it]


{'metadata_router': {'unmatched': []},
 'en_writer': {'documents_written': 64},
 'de_writer': {'documents_written': 107}}

In [23]:
# Rankers
# Retrievers
# GPTGenerator

from haystack import Pipeline
from langdetect import detect
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator

# English 
english_pipeline = Pipeline()
english_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
english_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store_en, top_k=50))
english_pipeline.add_component("ranker", TransformersSimilarityRanker())
english_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
english_pipeline.connect("retriever.documents", "ranker.documents")

# German
german_pipeline = Pipeline()
german_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model_name_or_path="PM-AI/bi-encoder_msmarco_bert-base_german"))
german_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store_de, top_k=50))
german_pipeline.add_component("ranker", TransformersSimilarityRanker())
german_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
german_pipeline.connect("retriever.documents", "ranker.documents")


template_en = """
Given the following information, answer the question.

Context: 
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ query }}?
"""

template_de = """
Gegeben die folgenden Informationen, beantworte die Frage.

Kontext:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Frage: {{ query }}?
"""


english_pipeline.add_component("prompt_builder", PromptBuilder(template=template_en))
english_pipeline.add_component("llm", OpenAIGenerator(api_key=api_key))
english_pipeline.connect("ranker", "prompt_builder.documents")
english_pipeline.connect("prompt_builder", "llm")

german_pipeline.add_component("prompt_builder", PromptBuilder(template=template_de))
german_pipeline.add_component("llm", OpenAIGenerator(api_key=api_key))
german_pipeline.connect("ranker", "prompt_builder.documents")
german_pipeline.connect("prompt_builder", "llm")



def ask_rag(query: str):
  language = detect(query)
  pipeline = german_pipeline if language == "de" else english_pipeline
  
  result = pipeline.run(
      {
        "text_embedder": {"text": query}, 
        "ranker": {"query": query, "top_k": 3},
        "prompt_builder": {"query": query},
        })
  
  return result



In [27]:
result = ask_rag("who is Tum's president")

Batches: 100%|██████████| 1/1 [00:00<00:00, 25.02it/s]


In [29]:
# print(result["llm"]["replies"][0])
print(result)

{'llm': {'replies': ["The name of TUM's president is Prof. Thomas F. Hofmann."], 'meta': [{'model': 'gpt-3.5-turbo-0613', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 16, 'prompt_tokens': 2382, 'total_tokens': 2398}}]}}
