## Load a v0 QA pipeline
- Start by following the steps in the [basic QA pipeline tutorial](https://haystack.deepset.ai/tutorials/01_basic_qa_pipeline)
- Then expand with preprocessing lessons from the [preprocessing tutorial](https://haystack.deepset.ai/tutorials/08_preprocessing), the [PreProcessor usage docs](https://docs.haystack.deepset.ai/docs/preprocessor#usage), and the [optimization guide](https://docs.haystack.deepset.ai/docs/optimization)

In [1]:
# Load the pickled chapter list from disk
import pickle
# NOTE: unpickling can execute arbitrary code, so only point this at a
# pickle file that comes from a trusted source.
with open('../data/chapter_fmt_list.pkl', mode='rb') as f:
    chapters = pickle.load(f)

In [2]:
import logging

# Root logging: WARNING and above, in "LEVEL - logger.name -  message" form.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# Lower the threshold to INFO for the "haystack" logger only.
logging.getLogger("haystack").setLevel(logging.INFO)


In [6]:
from haystack import Document
# Build one Haystack Document per entry in `chapters`
# (entries are presumably dicts, since they go through Document.from_dict).
chapter_documents = list(map(Document.from_dict, chapters))

In [95]:
# Try a few different PreProcessor configurations, splitting into either passages (too long) or sentences. We'll probably need to tinker with this.
from haystack.nodes import PreProcessor

# Word-based splitter: split_length/split_overlap are counted in words here.
word_preprocessor = PreProcessor(
    # --- splitting ---
    split_by="word",
    split_length=100,
    split_overlap=20,
    split_respect_sentence_boundary=True,
    # --- text cleaning ---
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    # --- bookkeeping / display ---
    add_page_number=True,
    progress_bar=True,
)

# Sentence-based splitter: split_length/split_overlap are counted in sentences here.
sentence_preprocessor = PreProcessor(
    # --- splitting ---
    split_by="sentence",
    split_length=6,
    split_overlap=2,
    split_respect_sentence_boundary=False,
    # --- text cleaning ---
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    # --- bookkeeping / display ---
    add_page_number=True,
    progress_bar=True,
)
# TODO: add max_chars_check (or similar) once we move to a dense retrieval model

In [96]:
# Apply each preprocessor to the full list of chapter documents.
docs_word = word_preprocessor.process(documents=chapter_documents)
docs_sentence = sentence_preprocessor.process(documents=chapter_documents)

Preprocessing:   0%|          | 0/307 [00:00<?, ?docs/s]



Preprocessing:   0%|          | 0/307 [00:00<?, ?docs/s]

In [97]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory document store with BM25 support switched on.
document_store = InMemoryDocumentStore(
    use_bm25=True,
)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [98]:
# Clear the store first (kept around so this cell can be re-run while
# experimenting), then write the sentence-split documents with their metadata.
document_store.delete_documents()
document_store.write_documents(docs_sentence)

Updating BM25 representation...:   0%|          | 0/62501 [00:00<?, ? docs/s]

In [99]:
# Initialize a Retriever
from haystack.nodes import BM25Retriever

# BM25 retriever that reads from the document store populated above.
retriever = BM25Retriever(
    document_store=document_store,
)


In [100]:
# Initialize the Reader
from haystack.nodes import FARMReader

# Reader built on the "deepset/roberta-base-squad2" model, with GPU use requested.
reader = FARMReader(
    use_gpu=True,
    model_name_or_path="deepset/roberta-base-squad2",
)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [101]:
# Create a retriever-reader pipeline
from haystack.pipelines import ExtractiveQAPipeline

# Wire the reader and retriever together into an extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [102]:
# Ask some questions and get some answers
# Run one query: the Retriever node gets top_k=10 and the Reader node top_k=5.
prediction = pipe.run(
    query="Who is the Sable Prince?",
    params=dict(
        Retriever=dict(top_k=10),
        Reader=dict(top_k=5),
    ),
)



Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]

In [103]:
# Print out some answers
from pprint import pprint

# Dump the raw prediction (all Answer objects, including scores, offsets and meta).
pprint(
    prediction
)

{'answers': [<Answer {'answer': 'Percival Awarnach', 'type': 'extractive', 'score': 0.7275899052619934, 'context': 'ember of this constituency.  I will begin again.  My given name is Percival Awarnach of the ninety-nine titles, the first and foremost of which…”  The', 'offsets_in_document': [{'start': 153, 'end': 170}], 'offsets_in_context': [{'start': 67, 'end': 84}], 'document_id': '584851e29a4c266696c73ad046e4368e', 'meta': {'arc_title': 'In Absentia', 'pov': 'Lucy', 'wordcount': '13351', 'series_chapter_number': 253, 'arc_number': '21', 'extra_material': False, 'title': 'In Absentia - 21.4', 'chapter': '21.4', '_split_id': 202, 'page': 1}}>,
             <Answer {'answer': 'once a higher spirit', 'type': 'extractive', 'score': 0.43792739510536194, 'context': ' The Choir would become the Carmine Choir.  The Sable Prince was once a higher spirit.  Prince was the easiest title to give him.”  “Did you have anyt', 'offsets_in_document': [{'start': 265, 'end': 285}], 'offsets_in_context'

In [104]:
from haystack.utils import print_answers

# Compact view of the answers; `details` can be `minimum`, `medium`, or `all`.
print_answers(prediction, details="minimum")



Query: Who is the Sable Prince?
Answers:
[   {   'answer': 'Percival Awarnach',
        'context': 'ember of this constituency.  I will begin again.  My given '
                   'name is Percival Awarnach of the ninety-nine titles, the '
                   'first and foremost of which…”  The'},
    {   'answer': 'once a higher spirit',
        'context': ' The Choir would become the Carmine Choir.  The Sable '
                   'Prince was once a higher spirit.  Prince was the easiest '
                   'title to give him.”  “Did you have anyt'},
    {   'answer': 'John',
        'context': '  “Yeah,” Lucy said, quiet.  “Go, look after him, look '
                   'after yourself.”  John turned his head, looking at the '
                   'Sable Prince.  Then he strode away, paus'},
    {   'answer': 'Carmine',
        'context': 'er, writing on each side, giving him an internal map and '
                   'instructions.  An acorn.”  “He retains the right, '
                 