In [None]:
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss,ocr]

In [None]:
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever

In [None]:

!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz
!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext/usr /local/bin

In [None]:
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
from haystack.utils import convert_files_to_dicts, fetch_archive_from_http

In [None]:
# Read the files in the input folder and convert them to Haystack document dicts.
DATA_DIR = "Data"
all_docs = convert_files_to_dicts(DATA_DIR)

INFO - haystack.utils.preprocessing -  Converting Data/AybarsManavHW2.docx
INFO - haystack.utils.preprocessing -  Converting Data/Textfile.txt


In [None]:
# Cleaning and splitting configuration for the converted files:
# split on words into chunks of 100, without cutting through a sentence,
# and strip empty lines / surplus whitespace (header/footer removal is off).
preprocessor_settings = dict(
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
)
preprocessor = PreProcessor(**preprocessor_settings)

In [None]:
# Run the cleaning/splitting step over the converted files.
docs = preprocessor.process(all_docs)

# Report how many input files went in and how many passages came out.
n_files_input, n_docs_output = len(all_docs), len(docs)
print(f"n_files_input: {n_files_input}\nn_docs_output: {n_docs_output}")

100%|██████████| 2/2 [00:00<00:00, 955.97docs/s]

n_files_input: 2
n_docs_output: 5





In [None]:
# Initialise the FAISS document store.
# "Flat" is the FAISS index-factory string passed through to the store;
# return_embedding=True asks the store to hand back embeddings with the documents.
FAISS_INDEX_FACTORY = "Flat"
document_store = FAISSDocumentStore(
    return_embedding=True,
    faiss_index_factory_str=FAISS_INDEX_FACTORY,
)

In [None]:
# Initialise the DPR retriever on top of the document store, using separate
# pretrained encoders for questions and for passages.
QUERY_ENCODER = "facebook/dpr-question_encoder-single-nq-base"
PASSAGE_ENCODER = "facebook/dpr-ctx_encoder-single-nq-base"

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model=QUERY_ENCODER,
    passage_embedding_model=PASSAGE_ENCODER,
    embed_title=True,
    use_gpu=True,
)

In [None]:
# Initialise the RAG generator that produces the answer text.
# Generation settings are grouped so they can be tuned in one place.
generation_settings = dict(
    top_k=1,
    num_beams=2,
    min_length=2,
    max_length=200,
)
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    embed_title=True,
    use_gpu=True,
    **generation_settings,
)

In [None]:
# Store the preprocessed passages first ...
document_store.write_documents(docs)
# ... then have the retriever compute embeddings for what is in the store.
document_store.update_embeddings(retriever=retriever)

In [None]:
from haystack.pipelines import GenerativeQAPipeline
from haystack.utils import print_answers

In [None]:
# Wire the retriever and the generator into a generative question-answering pipeline.
pipe = GenerativeQAPipeline(retriever=retriever, generator=generator)

# Ask a single question: the retriever is asked for 5 results, the generator for 1.
query = "what increasing the expected value and variance results in "
run_params = {
    "Retriever": {"top_k": 5},
    "Generator": {"top_k": 1},
}
res = pipe.run(query=query, params=run_params)



In [None]:
# Print the pipeline result; with details="minimum" the output below shows
# only the query and the generated answer text.
print_answers(res, details="minimum")


Query: what increasing the expected value and variance results in 
Answers:
[{'answer': ' increase in standard deviation'}]
