## Notes
I used Haystack's awesome tutorial and adapted its code to my needs.
You can find the original Haystack code by deepset here: https://github.com/deepset-ai/haystack/



In [34]:
!pip install --upgrade pip # Install the latest master of Haystack
!pip install git+https://github.com/deepset-ai/haystack.git
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import EmbeddingRetriever
import pandas as pd
import requests
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q # start Elasticsearch server from source
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2
import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
! sleep 30 # wait until ES has started
from haystack.document_stores import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore( # init the DocumentStore
    host="localhost",
    username="",
    password="",
    index="document",
    embedding_field="question_emb", # here the embedding of our question is stored and that is used later for calculating our similarity to the incoming user question
    embedding_dim=384,
    excluded_meta_data=["question_emb"], # so that we don't return the huge embedding vectors in our search results
) 
retriever = EmbeddingRetriever( # we can use the EmbeddingRetriever for this purpose and specify a model that we use for the embeddings.
    document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2", use_gpu=True #instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).
)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-yf74ehys
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-yf74ehys
  Resolved https://github.com/deepset-ai/haystack.git to commit b6986ea25d7bbd06158dcb9b2f2b9187266accff
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/all-MiniLM-L6-v2
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find sentence-transformers/all-MiniLM-L6-v2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded sentence-transformers/all-MiniLM-L6-v2
INFO - haystack.modeling.data_handler.processor -  Initialized processor without tasks. Supply `metric` and `label_list` to the constructor for using the default task or add a custom task later via processor.add_task()
INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - h

In [35]:
doc_dir = "/content/"

# Read the FAQ sheet (it has no header row), name its first two columns and
# replace missing cells with empty strings.
df = (
    pd.read_excel(f"{doc_dir}/SearchAutofill.xlsm", header=None)
    .rename(columns={0: 'question', 1: 'answer'})
    .fillna(value="")
)

# Trim surrounding whitespace from every question.
df['question'] = [text.strip() for text in df['question']]

# Embed the FAQ questions with the retriever and keep the vectors next to them.
questions = df['question'].tolist()
df["question_emb"] = retriever.embed_queries(texts=questions)

# The question text is stored under the "content" column from here on.
df = df.rename(columns={'question': "content"})
print(df.head())

Inferencing Samples: 100%|██████████| 11/11 [00:02<00:00,  4.10 Batches/s]

                  content                answer  \
0  accruals and deferrals          Abgrenzungen   
1            depreciation        Abschreibungen   
2   reconciliation bridge         Abstimmbrücke   
3          reconciliation  Abstimmung, Abgleich   
4                variance            Abweichung   

                                                                      question_emb  
0  [-0.6159873008728027, -0.27220212088690865, 0.022060438990592957, -0.1667093...  
1  [-0.26343029737472534, 0.5312923192977905, 0.33144572377204895, -0.128729134...  
2  [-0.7546553611755371, 0.6393743753433228, -0.0718823770682017, -0.1193267405...  
3  [-0.7285127639770508, 0.9149800539016724, 0.15480560064315796, -0.0416017398...  
4  [0.8309152126312256, 0.3147714138031006, 0.16317422688007355, 0.251116096973...  





In [36]:
from haystack.pipelines import FAQPipeline
from haystack.utils import print_answers

# Turn each dataframe row into a dict and write the records to the document store.
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)

# Build the FAQ pipeline around the embedding retriever and ask a first question,
# requesting the three best matches.
pipe = FAQPipeline(retriever=retriever)
prediction = pipe.run(
    query="What does absorption costing mean in German?",
    params={"Retriever": {"top_k": 3}},
)
print_answers(prediction, details="medium")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 80.60 Batches/s]


Query: What does absorption costing mean in German?
Answers:
[   {   'answer': 'Vollkostenrechnung',
        'context': 'Vollkostenrechnung',
        'score': 0.6111562350718944},
    {   'answer': 'Deckungsbeitragsrechnung (einstufig)',
        'context': 'Deckungsbeitragsrechnung (einstufig)',
        'score': 0.5802788008848704},
    {   'answer': 'Standardkostenrechnung',
        'context': 'Standardkostenrechnung',
        'score': 0.5791112368966103}]





In [37]:
from haystack.utils import print_answers

# The documents are already indexed and `pipe` already exists from the cell that
# first builds the FAQ pipeline, so this cell only issues a new query rather than
# re-writing the same records and re-creating an identical pipeline.
prediction = pipe.run(
    query="What does absorption mean in German?",
    params={"Retriever": {"top_k": 3}},
)
print_answers(prediction, details="medium")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 51.53 Batches/s]


Query: What does absorption mean in German?
Answers:
[   {   'answer': 'Vollkostenrechnung',
        'context': 'Vollkostenrechnung',
        'score': 0.5663206497356694},
    {'answer': 'Tilgung', 'context': 'Tilgung', 'score': 0.5365138813353966},
    {'answer': 'Vermögen', 'context': 'Vermögen', 'score': 0.5360844069058252}]





In [38]:
from haystack.utils import print_answers

# Query the existing FAQ pipeline for the three best matches and print every
# available field of each answer.
prediction = pipe.run(
    query="What are accruals?",
    params={"Retriever": {"top_k": 3}},
)
print_answers(prediction, details="all")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 64.20 Batches/s]


Query: What are accruals?
Answers:
[   <Answer {'answer': 'Rückstellungen', 'type': 'other', 'score': 0.6129861828861322, 'context': 'Rückstellungen', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 14}], 'document_id': 'd8dd4582c851b70968837d0ddc24b010', 'meta': {'answer': 'Rückstellungen', 'query': 'accruals'}}>,
    <Answer {'answer': 'Rabatt', 'type': 'other', 'score': 0.5722656545690049, 'context': 'Rabatt', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 6}], 'document_id': '9bf6c46ce95bdab52a72ab1bf7333f8c', 'meta': {'answer': 'Rabatt', 'query': 'allowance'}}>,
    <Answer {'answer': 'Abgrenzungen', 'type': 'other', 'score': 0.5675899718102981, 'context': 'Abgrenzungen', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 12}], 'document_id': '6251dfe668dd12ac323b626bf66d010d', 'meta': {'answer': 'Abgrenzungen', 'query': 'accruals and deferrals'}}>]





In [39]:
from haystack.utils import print_answers

# Same lookup with a bare keyword as the query; print only the minimal answer fields.
prediction = pipe.run(
    query="accruals?",
    params={"Retriever": {"top_k": 3}},
)
print_answers(prediction, details="minimum")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 63.38 Batches/s]


Query: accruals?
Answers:
[   {'answer': 'Rückstellungen', 'context': 'Rückstellungen'},
    {'answer': 'Rabatt', 'context': 'Rabatt'},
    {'answer': 'Rückstellungen', 'context': 'Rückstellungen'}]





In [40]:
from haystack.utils import print_answers

# Single-keyword query asking for the ten best matches, printed with all fields.
prediction = pipe.run(
    query="absorption",
    params={"Retriever": {"top_k": 10}},
)
print_answers(prediction, details="all")

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 62.09 Batches/s]


Query: absorption
Answers:
[   <Answer {'answer': 'Vollkostenrechnung', 'type': 'other', 'score': 0.6366829641277999, 'context': 'Vollkostenrechnung', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 18}], 'document_id': '9b15365319e1aca795c3c1488e546b59', 'meta': {'answer': 'Vollkostenrechnung', 'query': 'absorption costing'}}>,
    <Answer {'answer': 'Vermögen', 'type': 'other', 'score': 0.5736674016988692, 'context': 'Vermögen', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 8}], 'document_id': '6880d5e9de935ca7ce70de479620a022', 'meta': {'answer': 'Vermögen', 'query': 'assets'}}>,
    <Answer {'answer': 'Eigenkapital', 'type': 'other', 'score': 0.5709296241155433, 'context': 'Eigenkapital', 'offsets_in_document': None, 'offsets_in_context': [{'start': 0, 'end': 12}], 'document_id': 'bbe5b89c14a41f56230b495a1bb0c4ae', 'meta': {'answer': 'Eigenkapital', 'query': 'equity'}}>,
    <Answer {'answer': 'Tilgung', 'type': 'other', 'score': 0


