Run only for the first time:

In [1]:
# Install Haystack from the GitHub master branch in your own environment
#!pip install git+https://github.com/deepset-ai/haystack.git

# If running on GPUs, e.g., DALMA
# Install the latest master of Haystack
#!pip install git+https://github.com/deepset-ai/haystack.git
#!pip install urllib3==1.25.4
#!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [3]:
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.retriever.dense import EmbeddingRetriever
from haystack.utils import print_answers
import pandas as pd
import numpy as np
import pickle


01/12/2021 18:49:02 - INFO - faiss -   Loading faiss.


In [80]:
# In-memory document store; similarity between embeddings is set to "cosine".
document_store = InMemoryDocumentStore(
    similarity="cosine",
)

In [81]:
# Identifier of the sentence-embedding model used to embed documents and queries.
model_path = "deepset/sentence_bert"

# Embedding-based retriever bound to the document store above; GPU use is disabled.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model=model_path,
    use_gpu=False,
)


01/13/2021 11:26:42 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
01/13/2021 11:26:42 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
01/13/2021 11:26:42 - INFO - farm.infer -   Could not find `deepset/sentence_bert` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
01/13/2021 11:26:52 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


In [92]:
# Build the corpus frame: "Context" becomes the indexed "text", "Utterance"
# becomes "answer", and "id_video" is kept as an extra column. The non-text
# columns are read back later from each retrieved hit's `meta`.
df = pd.read_csv("data/MargaritaCorpusKB_video_id.csv")
df = (
    df[["Context", "Utterance", "id_video"]]
    .rename(columns={"Context": "text", "Utterance": "answer"})
    # Keep only the first row for each distinct text, then for each distinct answer.
    .drop_duplicates(subset=['text'])
    .drop_duplicates(subset=['answer'])
    # Minimal cleaning: missing values become empty strings so that
    # .strip() below can be applied to every text.
    .fillna(value="")
    .assign(text=lambda frame: frame["text"].apply(lambda raw: raw.strip()))
)

# Drop the rows whose text is only "*".
index_drop = df.index[df["text"] == "*"]
df = df.drop(index_drop)

# One dict per row; these are the documents written to the store.
docs_to_index = df.to_dict(orient="records")

# Start from an empty store so that re-running this cell does not add the
# same documents twice.
document_store.delete_all_documents()
document_store.write_documents(docs_to_index)

# Embeddings are not precomputed here; the store is asked to compute them
# for the written documents using the retriever.
document_store.update_embeddings(retriever=retriever)

01/13/2021 18:45:44 - INFO - haystack.document_store.memory -   Updating embeddings for 352 docs ...
Inferencing Samples: 100%|██████████| 88/88 [03:15<00:00,  2.22s/ Batches]


In [93]:

# Sanity check: fetch the single best-matching document for one query.
# `embed_queries` is called with a list of texts and returns one embedding per
# text, so pass a one-element list and take the first vector. This gives
# `query_by_embedding` a single query vector rather than the whole result
# list wrapped in an array.
query_text = "How are you?"
query_embedding = np.array(
    retriever.embed_queries(texts=[query_text])[0]
)

response = document_store.query_by_embedding(
    query_embedding,
    top_k=1,
    return_embedding=False
)

# The answer and its video id are stored in the hit's `meta` dict.
best_hit = response[0]
print(best_hit.meta['answer'])
print(best_hit.meta['id_video'])

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.74 Batches/s]

Not too bad, thanks.
ca71d0c3f77e9c0b61f8d810617b3841





In [94]:

# Persist the populated document store to disk with pickle.
# The `with` block closes the file even if pickle.dump raises.
with open("faiss_indices/margarita.pkl", 'wb') as outfile:
    pickle.dump(document_store, outfile)

In [95]:
# Reload the pickled document store from disk.
# NOTE(security): pickle.load can execute arbitrary code, so only load files
# that were produced by a trusted source (e.g. the dump cell of this notebook).
# The `with` block closes the file even if pickle.load raises.
with open("faiss_indices/margarita.pkl", 'rb') as infile:
    new_document_store = pickle.load(infile)

In [96]:
# Same sanity check as before, but against the store reloaded from disk,
# to confirm that the pickled copy answers queries.
# `embed_queries` is called with a list of texts and returns one embedding per
# text, so pass a one-element list and take the first vector. This gives
# `query_by_embedding` a single query vector rather than the whole result
# list wrapped in an array.
query_text = "How are you?"
query_embedding = np.array(
    retriever.embed_queries(texts=[query_text])[0]
)
response = new_document_store.query_by_embedding(
    query_embedding,
    top_k=1,
    return_embedding=False
)

# The answer and its video id are stored in the hit's `meta` dict.
best_hit = response[0]
print(best_hit.meta['answer'])
print(best_hit.meta['id_video'])

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.76 Batches/s]

Not too bad, thanks.
ca71d0c3f77e9c0b61f8d810617b3841





### Dialogue Mgr can stop here

The cells below evaluate the retriever (hits@1 and hits@10) and are not needed by the Dialogue Manager.

In [87]:
%%capture --no-stdout --no-display

# Evaluate retrieval on the dialogue questions: a prediction counts as a hit
# when the retrieved answer appears among the annotated answers
# (columns BA1..BA6) of the rows holding the same question.
df_dial = pd.read_csv("data/DIALOGUES.csv")
df_dial = df_dial[df_dial['Experiment'] == 'TRAIN']
test_questions = df_dial['Q'].to_list()
annotation_cols = ['BA1', 'BA2', 'BA3', 'BA4', 'BA5', 'BA6']

# Embed all questions once; both passes below reuse these embeddings.
test_questions_emb = retriever.embed_queries(texts=test_questions)

# Pass 1: hits@1. Also record per-question hit flag, probability, score and
# predicted answer.
hits_at_1 = 0
hits, probs, scores, answers = [], [], [], []
for question, embedding in zip(test_questions, test_questions_emb):
    prediction = document_store.query_by_embedding(
        np.array(embedding),
        top_k=1,
        return_embedding=False
    )
    answer = prediction[0].meta['answer']
    # Annotated answers of every row whose question matches this one.
    accepted_answers = df_dial.loc[
        df_dial['Q'] == question, annotation_cols].values
    if answer in accepted_answers:
        hits_at_1 += 1
        hits.append(1)
    else:
        # A miss leaves the counter untouched; only the flag is recorded.
        hits.append(0)
    probs.append(prediction[0].probability)
    scores.append(prediction[0].score)
    answers.append(answer)

# Pass 2: hits@10. A hit if any of the top-10 answers is an annotated answer.
hits_at_k = 0
for question, embedding in zip(test_questions, test_questions_emb):
    predictions = document_store.query_by_embedding(
        np.array(embedding),
        top_k=10,
        return_embedding=False
    )
    pred_answers = [pred.meta['answer'] for pred in predictions]
    # Looked up once per question instead of once per candidate answer.
    accepted_answers = df_dial.loc[
        df_dial['Q'] == question, annotation_cols].values
    if any(pred_ans in accepted_answers for pred_ans in pred_answers):
        hits_at_k += 1

In [88]:
# deepset/sentence_bert on 404 unique q-a pairs
# Print hits@1, then hits@k, each as a fraction of the evaluated questions.
for hit_count in (hits_at_1, hits_at_k):
    print(hit_count / len(test_questions))

0.4411764705882353
0.6058823529411764
