In [1]:
import os

from dotenv import load_dotenv

from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack import Pipeline
from haystack.utils import ComponentDevice
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.joiners import DocumentJoiner
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull variables from a local .env file into the process environment, then
# look up the OpenAI key. `openai_api_key` is None when the variable is unset.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Load Documents

In [4]:
# Store that will receive the indexed chunks in a later cell.
document_store = InMemoryDocumentStore()

# Directory containing one article per file; the file name is used as the title.
ARTICLES_DIR = os.path.join("data", "articles")

# os.listdir() returns entries in arbitrary order and includes sub-directories:
# keep regular files only and sort them so that re-running the notebook always
# builds `docs` in the same order.
articles_titles = sorted(
    name
    for name in os.listdir(ARTICLES_DIR)
    if os.path.isfile(os.path.join(ARTICLES_DIR, name))
)

docs = []
for title in articles_titles:
    path = os.path.join(ARTICLES_DIR, title)
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8 and would garble accented characters.
    with open(path, "r", encoding="utf-8") as file:
        content = file.read()
    docs.append(Document(content=content, meta={"title": title}))


# Process Documents

In [5]:
# Indexing pipeline: split each article into chunks -> embed the chunks -> write them to the store.
indexing_pipeline = Pipeline()

# Chunks of 512 words, consecutive chunks sharing 32 words.
# NOTE(review): confirm that 512-word chunks fit within the embedding model's
# maximum input length; longer inputs may be truncated by the model.
document_splitter = DocumentSplitter(split_by="word", split_length=512, split_overlap=32)
document_embedder = SentenceTransformersDocumentEmbedder(model="BAAI/bge-small-en-v1.5")
document_writer = DocumentWriter(document_store)

# Register the stages in processing order.
indexing_stages = (
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
)
for stage_name, stage in indexing_stages:
    indexing_pipeline.add_component(stage_name, stage)

# Wire each stage into the next one.
for sender, receiver in (
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
):
    indexing_pipeline.connect(sender, receiver)

# Kept as the last expression so the cell displays the run summary.
indexing_pipeline.run({"document_splitter": {"documents": docs}})

Batches: 100%|██████████| 53/53 [12:36<00:00, 14.27s/it]


{'document_writer': {'documents_written': 1688}}

# Create the Retriever

In [15]:
# Two retrievers over the same store: keyword-based (BM25) and embedding-based.
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Query-side embedder; uses the same model name as the document embedder from
# the indexing cell.
# To pin it to a GPU, additionally pass device=ComponentDevice.from_str("cuda:0").
text_embedder = SentenceTransformersTextEmbedder(model="BAAI/bge-small-en-v1.5")

In [16]:
# Re-ranker applied to the merged retrieval results.
ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-base")
# Joiner that merges the document lists produced by both retrievers (default settings).
document_joiner = DocumentJoiner()


In [17]:
# Hybrid retrieval: BM25 + embedding retrieval, joined and then re-ranked.
hybrid_retrieval = Pipeline()

for component_name, component in (
    ("text_embedder", text_embedder),
    ("embedding_retriever", embedding_retriever),
    ("bm25_retriever", bm25_retriever),
    ("document_joiner", document_joiner),
    ("ranker", ranker),
):
    hybrid_retrieval.add_component(component_name, component)

# Dense branch: the query embedding feeds the embedding retriever.
hybrid_retrieval.connect("text_embedder", "embedding_retriever")

# Both retrievers send their documents to the joiner.
for retriever_name in ("bm25_retriever", "embedding_retriever"):
    hybrid_retrieval.connect(retriever_name, "document_joiner")

# Final hop, kept as the last expression so the cell displays the pipeline summary.
hybrid_retrieval.connect("document_joiner", "ranker")

# Uncomment to export a diagram of the pipeline:
# hybrid_retrieval.draw("hybrid-retrieval.png")


<haystack.core.pipeline.pipeline.Pipeline object at 0x1610a6f50>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - embedding_retriever: InMemoryEmbeddingRetriever
  - bm25_retriever: InMemoryBM25Retriever
  - document_joiner: DocumentJoiner
  - ranker: TransformersSimilarityRanker
🛤️ Connections
  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])
  - embedding_retriever.documents -> document_joiner.documents (List[Document])
  - bm25_retriever.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> ranker.documents (List[Document])

In [18]:
# The same query string is given to the three components that take one:
# the embedder (dense branch), the BM25 retriever and the ranker.
query = "Jordan Bardella est-il un bon candidat pour l'écologie ?"

result = hybrid_retrieval.run(
    {
        "text_embedder": {"text": query},
        "bm25_retriever": {"query": query},
        "ranker": {"query": query},
    }
)

Batches: 100%|██████████| 1/1 [00:03<00:00,  3.50s/it]


In [27]:
# Print every re-ranked document: a separator line, its title, then its text.
ranked_documents = result.get("ranker").get("documents")
for ranked_doc in ranked_documents:
    print("===================", ranked_doc.meta["title"], ranked_doc.content, sep="\n")


presidentielle-2022-course-a-la-nullite-des-candidates
grosses bêtises en 20 secondes
, en plaçant notamment la France comme meilleur élève écologique (c’est faux) et en confondant mix électrique et mix énergétique. Heureusement que c’est Jordan Bardella qui est aujourd’hui président du Rassemblement National le temps de la campagne, lui qui avait déclaré en juin dernier que “
Le C20 a été divisé par 1000 depuis les années 60
“. Une équipe prête à diriger la France, sans aucun doute.
Jean-Luc Mélenchon, l’écologie spectacle
S’il y a bien un candidat qui illustre à la perfection la politique des punchlines, des incantations et des petites phrases relayées sur les réseaux sociaux, c’est Jean-Luc Mélenchon.
Mais lorsqu’il s’agit de mettre en pratique les paroles, c’est un tout petit peu plus compliqué. Il y a un an, il proposait déjà de sortir du nucléaire en “
inventant une autre énergie bas-carbone
“. Le débat énergétique mérite mieux que d’attendre qu’Harry Potter nous sorte une énergi

# RAG (Retrieval-Augmented Generation)

In [28]:
# Embedding retriever bound to the indexed store.
# NOTE(review): not referenced by any other visible cell; the RAG pipeline
# cell creates its own `retriever`. Confirm whether this is still needed.
in_memory_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

In [57]:
# Prompt template for the RAG pipeline. It has two placeholders:
#   - `documents`: iterated over, the `content` of each one is inserted under "Context:"
#   - `question`: inserted after "Question:"
# The instructions tell the model to answer only from the provided excerpts.
template = """
You are Bonbot, a virtual assistant that answers questions related to climate. The topics you cover are climate change, its impacts, and socio-economic news.

You must answer the question based on the excerpts of articles given in the context. You can only rely on the information contained in these excerpts.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

# Component that renders the template above into the final prompt string.
prompt_builder = PromptBuilder(template=template)

In [58]:
# LLM used to answer: temperature 0 and at most 250 generated tokens per reply.
# NOTE(review): no API key is passed explicitly; presumably the component reads
# OPENAI_API_KEY from the environment. Confirm against the Haystack docs.
generator = OpenAIGenerator(
    model="gpt-4o",
    generation_kwargs={"temperature": 0, "max_tokens": 250},
)

In [59]:
# Fresh component instances for this pipeline: a query embedder (same model
# name as at indexing time) and an embedding retriever over the indexed store.
embedder = SentenceTransformersTextEmbedder(model="BAAI/bge-small-en-v1.5")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Basic RAG: embed the question -> retrieve documents -> build the prompt -> generate.
# TODO change retriever for hybrid
basic_rag_pipeline = Pipeline()
for component_name, component in (
    ("text_embedder", embedder),
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", generator),
):
    basic_rag_pipeline.add_component(component_name, component)

# Wire the components together.
for sender, receiver in (
    ("text_embedder.embedding", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
):
    basic_rag_pipeline.connect(sender, receiver)

# Final hop, kept as the last expression so the cell displays the pipeline summary.
basic_rag_pipeline.connect("prompt_builder", "llm")

<haystack.core.pipeline.pipeline.Pipeline object at 0x17ff67b10>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [60]:
# The question is used twice: embedded for retrieval and inserted into the prompt.
question = "Jordan Bardella est-il un bon candidat pour l'écologie ?"

response = basic_rag_pipeline.run(
    {
        "text_embedder": {"text": question},
        "prompt_builder": {"question": question},
    }
)

# Blank lines separate the answer from the progress-bar output, then show the first reply.
print("\n\n")
print(response["llm"]["replies"][0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]





D'après les informations contenues dans les extraits, Jordan Bardella ne semble pas être un bon candidat pour l'écologie. Il est critiqué pour son manque de connaissance sur les sujets climatiques et pour ses déclarations erronées, comme confondre le CO2 et le C2O. De plus, son parti, le Rassemblement National, est décrit comme ayant des positions qui ne favorisent pas l'atteinte des objectifs climatiques, et leur programme est même considéré comme un grand bond en arrière en matière de lutte contre le changement climatique.


In [6]:
# Save the populated document store to disk so it can be reloaded without
# re-running the indexing pipeline.
# NOTE(review): this cell's execution count (6) shows it was run right after
# indexing even though it sits at the end of the notebook; consider moving it
# up so that a top-to-bottom run matches the recorded order.
document_store.save_to_disk("data/document_store")

In [7]:
# Reload the store from the file written by the save cell above.
# NOTE(review): `new_store` is not used by any other visible cell.
new_store = InMemoryDocumentStore.load_from_disk("data/document_store")