In [59]:
from haystack.telemetry import tutorial_running

tutorial_running(3)


In [60]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


In [61]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")


In [62]:
from haystack.utils import fetch_archive_from_http

doc_dir = r"C:\Users\Swach\Mane-Project\Data\text_files"


In [63]:
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor

indexing_pipeline = Pipeline()
text_converter = TextConverter()
preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=1000,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)


In [64]:
import os

indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])


In [65]:
files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
indexing_pipeline.run_batch(file_paths=files_to_index)


INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████| 41/41 [00:00<00:00, 255.88it/s]
Preprocessing: 100%|██████████| 41/41 [00:00<00:00, 276.30docs/s]


{'documents': [<Document: {'content': 'PATENT PORTFOLIO\nAUDIT AND\nLICENSING POLICY\nCONFIDENTIAL\n2\nPATENT PORTFOLIO AUDIT AND LICENSING POLICY\nConfidential | Aminata POUYE\n01\nCONTENT\nPORTFOLIO AUDIT\n02\nLICENSING OPPORTUNITIES\n03\nREASONS FOR A LICENSING POLICY\n3\nPATENT PORTFOLIO AUDIT AND LICENSING POLICY\nConfidential | Aminata POUYE\n01 REASONS FOR A LICENSING POLICY\n4\nPATENT PORTFOLIO AUDIT AND LICENSING POLICY\nConfidential | Aminata POUYE\nA. Why a licensing policy? (1/2)\n� Patents\nrepresent\nan\nimportant\ninvestment\nnotably\nfinancially\nfor\ncompanies, and are considered as intangible asset that are of importance\nin evaluating the value and strength of a company.\n� Furthermore, generally speaking, less than 10% of a patent portfolio\ncontribute towards revenues of a company.\nThus, there is a need to find returns on investments (ROI)\nopportunities.\nREASONS FOR A LICENSING POLICY\n5\nPATENT PORTFOLIO AUDIT AND LICENSING POLICY\nConfidential | Aminata POUYE\

In [66]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)


In [67]:
from haystack.nodes import TransformersReader
reader = TransformersReader("ahotrod/albert_xxlargev1_squad2_512")


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
Downloading config.json: 100%|██████████| 715/715 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 890M/890M [14:06<00:00, 1.05MB/s]
  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at ahotrod/albert_xxlargev1_squad2_512 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 

In [68]:
from haystack import Pipeline

querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])


In [69]:
prediction = querying_pipeline.run(
    query="Can you explain about General Data Protection Regulation?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)




In [70]:
from pprint import pprint

pprint(prediction)


{'answers': [<Answer {'answer': ' GDPR', 'type': 'extractive', 'score': 0.15732254087924957, 'context': 'bruary 4 2020\n\nThe general data protection regulation, better known as GDPR (Regulation (EU) 2016/679 of the European Parliament and of the Counc', 'offsets_in_document': [{'start': 104, 'end': 109}], 'offsets_in_context': [{'start': 70, 'end': 75}], 'document_ids': ['c5fc569c80e5b7e2613e51ca3de5cacd'], 'meta': {'_split_id': 0, '_split_overlap': []}}>,
             <Answer {'answer': ' information must be prior to the collection of personal data', 'type': 'extractive', 'score': 0.14568579196929932, 'context': ' information note by hand, drafting a clause in a contract, ...). This information must be prior to the collection of personal data for a new processing or when personal data, already collected and pro', 'offsets_in_document': [{'start': 1045, 'end': 1106}], 'offsets_in_context': [{'start': 70, 'end': 131}], 'document_ids': ['3d92c3c288d25ca75176ab16c559fd4e'], 'meta': {'_sp

In [71]:
from haystack.utils import print_answers

print_answers(prediction, details="minimum")  ## Choose from `minimum`, `medium` and `all`


'Query: Can you explain about General Data Protection Regulation?'
'Answers:'
[   {   'answer': ' GDPR',
        'context': 'bruary 4 2020\n'
                   '\n'
                   'The general data protection regulation, better known as '
                   'GDPR (Regulation (EU) 2016/679 of the European Parliament '
                   'and of the Counc'},
    {   'answer': ' information must be prior to the collection of personal '
                  'data',
        'context': ' information note by hand, drafting a clause in a '
                   'contract, ...). This information must be prior to the '
                   'collection of personal data for a new processing or when '
                   'personal data, already collected and pro'},
    {   'answer': ' information must be prior to the collection of personal '
                  'data for a new processing',
        'context': ' information note by hand, drafting a clause in a '
                   'contract, ...). This inf