<a href="https://colab.research.google.com/github/FKENZOLS/RAG_haystack/blob/main/RAGhaystack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install farm-haystack[inference]

In [None]:
!pip install farm-haystack[colab,ocr,preprocessing,file-conversion,pdf]

In [13]:
from haystack.utils import convert_files_to_docs

doc_dir = "/content/drive/MyDrive/data"
all_docs = convert_files_to_docs(dir_path=doc_dir)


In [75]:
from haystack.nodes import PreProcessor
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length = 100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(all_docs)

print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

Preprocessing: 100%|██████████| 1/1 [00:01<00:00,  1.26s/docs]

n_files_input: 1
n_docs_output: 1905





# Document store

In [76]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(embedding_field = "embedding", embedding_dim = 384)


In [77]:
document_store.write_documents(docs)

In [78]:
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
   embedding_model="sentence-transformers/all-MiniLM-L6-v2",
   model_format="sentence_transformers",
   top_k = 10,
   max_seq_len= 512,
   batch_size=16,
)

In [79]:
document_store.update_embeddings(retriever)

Updating Embedding:   0%|          | 0/1905 [00:00<?, ? docs/s]

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:04, 2054.29 docs/s]


In [92]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser

rag_prompt = PromptTemplate(
    prompt="""Synthesize a comprehensive answer from the following text for the given question.
                             Provide a clear and concise response that summarizes the key points and information presented in the text.
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

prompt_node = PromptNode(model_name_or_path="TinyLlama/TinyLlama-1.1B-Chat-v1.0", default_prompt_template=rag_prompt)


In [93]:
from haystack.pipelines import Pipeline

pipe = Pipeline()
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])


In [98]:
output = pipe.run(query=" explain how to measure the radius of a black hole? ")

print(output["answers"][0].answer)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 

1. The radius of a black hole is determined by the area of its event horizon.

2. The event horizon is the boundary of the region where the black hole’s gravitational
force is strong enough to cause the matter to fall in.

3. The area of the event horizon is given by the formula:

A(r) = 4π√[(r2 + a2)2 − a2].

4. The radius
