Run only for the first time:

In [1]:
# Install the latest release of Haystack in your own environment 
#!pip install git+https://github.com/deepset-ai/haystack.git

# If running on GPUs, e.g., DALMA
# Install the latest master of Haystack
#!pip install git+https://github.com/deepset-ai/haystack.git
#!pip install urllib3==1.25.4
#!pip install torch==1.6.0+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
from haystack.document_store.faiss import FAISSDocumentStore
from haystack.retriever.dense import EmbeddingRetriever
from haystack.utils import print_answers
import pandas as pd
import numpy as np
import pickle


06/13/2021 09:57:18 - INFO - faiss -   Loading faiss.


In [3]:
# FAISS Document Store

document_store = FAISSDocumentStore(
    sql_url="postgresql:///margarita1234?client_encoding=utf8"
)

In [4]:
model_path = "deepset/sentence_bert"

retriever = EmbeddingRetriever(document_store=document_store, 
                               embedding_model=model_path, 
                               use_gpu=False)

06/13/2021 09:57:22 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
06/13/2021 09:57:22 - INFO - farm.utils -   Using device: CPU 
06/13/2021 09:57:22 - INFO - farm.utils -   Number of GPUs: 0
06/13/2021 09:57:22 - INFO - farm.utils -   Distributed Training: False
06/13/2021 09:57:22 - INFO - farm.utils -   Automatic Mixed Precision: None
06/13/2021 09:57:33 - INFO - farm.utils -   Using device: CPU 
06/13/2021 09:57:33 - INFO - farm.utils -   Number of GPUs: 0
06/13/2021 09:57:33 - INFO - farm.utils -   Distributed Training: False
06/13/2021 09:57:33 - INFO - farm.utils -   Automatic Mixed Precision: None


In [15]:
# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv("data/MargaritaCorpusKB_video_id.csv", encoding='utf-8')
df = df[["Context", "Utterance", "id_video"]]
df = df.rename(columns={"Context": "text", "Utterance": "answer"})
df.drop_duplicates(subset=['text'], inplace=True)
df.drop_duplicates(subset=['answer'], inplace=True)
# Minimal cleaning
df.fillna(value="", inplace=True)
df["text"] = df["text"].apply(lambda x: x.strip())
# Drop question that only have *
index_drop = df[df["text"] == "*"].index
df.drop(index_drop, inplace=True)

# Get embeddings for our questions from the FAQs
# questions = list(df["text"].values)
# df["embedding"] = retriever.embed_queries(texts=questions)

# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")

# # Delete existing documents in documents store
document_store.delete_all_documents()

# Write documents to document store
document_store.write_documents(docs_to_index)

# Add documents embeddings to index
document_store.update_embeddings(
    retriever=retriever
)

02/11/2021 13:55:09 - INFO - haystack.document_store.faiss -   Updating embeddings for 349 docs...

Inferencing Samples:   0%|          | 0/88 [00:00<?, ? Batches/s][A
Inferencing Samples:   1%|          | 1/88 [00:04<06:39,  4.59s/ Batches][A
Inferencing Samples:   2%|▏         | 2/88 [00:09<06:28,  4.52s/ Batches][A
Inferencing Samples:   3%|▎         | 3/88 [00:13<06:24,  4.53s/ Batches][A
Inferencing Samples:   5%|▍         | 4/88 [00:18<06:21,  4.54s/ Batches][A
Inferencing Samples:   6%|▌         | 5/88 [00:22<06:19,  4.57s/ Batches][A
Inferencing Samples:   7%|▋         | 6/88 [00:27<06:16,  4.59s/ Batches][A
Inferencing Samples:   8%|▊         | 7/88 [00:32<06:12,  4.60s/ Batches][A
Inferencing Samples:   9%|▉         | 8/88 [00:36<06:08,  4.60s/ Batches][A
Inferencing Samples:  10%|█         | 9/88 [00:41<06:05,  4.62s/ Batches][A
Inferencing Samples:  11%|█▏        | 10/88 [00:45<06:01,  4.64s/ Batches][A
Inferencing Samples:  12%|█▎        | 11/88 [00:50<05:57,  4

In [16]:
query_embedding = np.array(
    retriever.embed_queries(texts="How are you?")
)

response = document_store.query_by_embedding(
    query_embedding, 
    top_k=1, 
    return_embedding=False
)

print(response[0].meta['answer'])
print(response[0].meta['id_video'])

Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.20s/ Batches]

Pretty good, thank you!
77157368d489465a3af172497e80ed59





In [17]:
document_store.save("faiss_indices/margarita1234")
# outfile = open("faiss_indices/margarita.pkl", 'wb')
# pickle.dump(document_store, outfile)
# outfile.close()

In [5]:
# infile = open("faiss_indices/margarita.pkl",'rb')
# new_document_store = pickle.load(infile)

# infile.close()

new_document_store = FAISSDocumentStore.load(faiss_file_path="faiss_indices/margarita1234",
                                             sql_url='postgresql:///margarita1234?client_encoding=utf8')

In [6]:
model_path = "deepset/sentence_bert"

retriever = EmbeddingRetriever(document_store=new_document_store, 
                               embedding_model=model_path, 
                               use_gpu=False)

06/13/2021 09:57:40 - INFO - haystack.retriever.dense -   Init retriever using embeddings of model deepset/sentence_bert
06/13/2021 09:57:40 - INFO - farm.utils -   Using device: CPU 
06/13/2021 09:57:40 - INFO - farm.utils -   Number of GPUs: 0
06/13/2021 09:57:40 - INFO - farm.utils -   Distributed Training: False
06/13/2021 09:57:40 - INFO - farm.utils -   Automatic Mixed Precision: None
06/13/2021 09:57:51 - INFO - farm.utils -   Using device: CPU 
06/13/2021 09:57:51 - INFO - farm.utils -   Number of GPUs: 0
06/13/2021 09:57:51 - INFO - farm.utils -   Distributed Training: False
06/13/2021 09:57:51 - INFO - farm.utils -   Automatic Mixed Precision: None


In [7]:
query_embedding = np.array(
    retriever.embed_queries(texts="How are you?")
)
response = new_document_store.query_by_embedding(
    query_embedding, 
    top_k=1, 
    return_embedding=False
)

print(response[0].meta['answer'])
print(response[0].meta['id_video'])

Inferencing Samples: 100%|██████████| 1/1 [00:01<00:00,  1.25s/ Batches]

Pretty good, thank you!
77157368d489465a3af172497e80ed59





### Dialogue Mgr can stop here

Below is evaluation and further research

In [11]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database

sql_url = "postgresql:///ironman:kolomino@localhost:5432"

avatar_id = "avatar_id1334"

avatar_sql_url = sql_url.format(avatar_id)

engine = create_engine(sql_url)

if not database_exists("postgresql:///{}".format(avatar_id)):
    create_database("postgresql:///{}".format(avatar_id))

database_exists('postgresql:///margarita1234')  

True

In [62]:
%%capture --no-stdout --no-display

df_dial = pd.read_csv("data/DIALOGUES.csv", encoding='utf-8')
df_dial = df_dial[df_dial['Experiment'] == 'TEST']
df_dial
df_dial_test = df_dial.sample(frac=.5, random_state=1)
df_dial_test.reset_index()
df_dial_finetune = df_dial.drop(df_dial_test.index)

annotation_cols = ['BA1', 'BA2', 'BA3', 'BA4', 'BA5', 'BA6']

test_questions = df_dial_test['Q'].to_list()
test_questions_emb = retriever.embed_queries(texts=test_questions)

finetune_questions = df_dial_finetune['Q'].to_list()
finetune_questions_emb = retriever.embed_queries(texts=finetune_questions)

def hitsatk(k, document_store, test_questions, test_questions_emb, df_dial, annotation_cols, finder=None):
    
    hits_at_k = 0
    hits_at_k_itemized, probs, scores, answers = [], [], [], []
    
    for question, embedding in zip(test_questions, test_questions_emb):
        if finder == None:
            predictions = document_store.query_by_embedding(
                np.array(embedding), 
                top_k=k, 
                return_embedding=False
            )
            annotated_answers = df_dial[df_dial['Q'] == question][annotation_cols].values
            pred_answers = [pred.meta['answer'] for pred in predictions]
            probs.append(predictions[0].probability)
            scores.append(predictions[0].score)
            answers.append(pred_answers[0])
            
        else:
            predictions = finder.get_answers_via_similar_questions(
                question=question,
                top_k_retriever=k
            )
            annotated_answers = df_dial[df_dial['Q'] == embedding][annotation_cols].values
            if len(predictions["answers"]) == 0:
                pred_answers = ["NA"]
                probs.append(np.nan)
                scores.append(np.nan)
                answers.append("NA")
            else:
                pred_answers = [pred["answer"] for pred in predictions["answers"]]
                probs.append(predictions["answers"][0]["probability"])
                scores.append(predictions["answers"][0]["score"])
                answers.append(pred_answers[0])
            
        if any([pred_ans in annotated_answers for pred_ans in pred_answers]):
            hits_at_k += 1
            hits_at_k_itemized.append(1)
        else:
            hits_at_k += 0
            hits_at_k_itemized.append(0)
            
    return hits_at_k, hits_at_k_itemized, probs, scores, answers



In [None]:
hits_at_k, hits_at_k_itemized, probs, scores, answers = hitsatk(
    1, new_document_store, test_questions, test_questions, df_dial_test_emb, annotation_cols)

In [None]:
print("SR@1: ", hits_at_k/len(test_questions))

# All dialogues, hits @ 1: 0.15673981191222572
# Only PER dialogues, hits @ 1: 0.1569767441860465
# Only dialogues 6, 7 (2x PER, 2x UNI), hits @ 1: 0.19166666666666668

In [None]:
for k in [2, 5, 10, 20]:
    hits_at_k, _, _, _, _ = hitsatk(
        k, new_document_store, test_questions, test_questions_emb, df_dial_test, annotation_cols)

    print("SR@{}: ".format(k), hits_at_k/len(test_questions))

# All dialogues, hits @ 10: 0.32601880877742945
# Only PER dialogues, hits @ 10: 0.3313953488372093
# Only dialogues 6, 7 (2x PER, 2x UNI), hits @ 10: 0.44166666666666665

In [None]:
for k in [1, 2, 5, 10, 20]:
    hits_at_k, _, _, _, _ = hitsatk(
        k, new_document_store, finetune_questions, finetune_questions, df_dial_finetune, annotation_cols)

    print("SR@{}: ".format(k), hits_at_k/len(df_dial_finetune))

In [17]:
df_thresholds = pd.DataFrame(
{
    "question": test_questions,
    "answer": answers,
    "hit_at_1": hits_at_k_itemized,
    "prob": probs,
    "score": scores,
    "no_ans": df_dial_test.BA1.isna()
})

df_thresholds["combo_mult"] = df_thresholds["prob"] * df_thresholds["score"]

df_thresholds["combo_sum"] = df_thresholds["prob"] + df_thresholds["score"]

In [18]:
df_thresholds[["combo_mult", "combo_sum", "hit_at_1"]].groupby("hit_at_1").describe()

Unnamed: 0_level_0,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
hit_at_1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,136.0,129.479534,41.593143,50.888723,102.185536,127.499947,153.279563,268.043406,136.0,156.343572,39.948947,75.620083,130.812713,155.479375,179.754178,284.688806
1,24.0,158.622288,53.439649,82.381546,120.849069,157.822374,186.660895,263.67546,24.0,183.838438,49.963871,110.596163,149.069217,183.963621,210.496181,280.692296


In [19]:
df_thresholds.loc[df_thresholds["no_ans"]==True, "hit_at_1"] = 2

df_thresholds[["combo_mult", "combo_sum", "hit_at_1"]].groupby("hit_at_1").describe()

Unnamed: 0_level_0,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_mult,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum,combo_sum
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
hit_at_1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,115.0,130.004457,42.43408,50.888723,106.132912,128.769637,153.462122,268.043406,115.0,156.799805,40.801738,75.620083,134.731564,156.691425,179.92398,284.688806
1,24.0,158.622288,53.439649,82.381546,120.849069,157.822374,186.660895,263.67546,24.0,183.838438,49.963871,110.596163,149.069217,183.963621,210.496181,280.692296
2,21.0,126.604953,37.462739,77.11891,92.068428,120.260877,148.25561,224.495285,21.0,153.845151,35.713321,105.035724,120.610835,148.529077,175.071212,244.972127


hits@1 = 0.3911764705882353

hits@2 = 0.4588235294117647

hits@5 = 0.4970588235294118

hits@10 = 0.5617647058823529

hits@20 = 0.611764705882353

hits@100 = 0.7794117647058824

hits@200 = 0.8558823529411764

In [20]:
# for i in range(len(test_questions)):
#     if items_hits_at_k[i] == 0:
#         print(test_questions[i])

#### Add q-A Relevance

In [20]:
preds = pd.read_csv('/Users/amc/Documents/glue_data/Margarita_1_100_ratio/test_results_mrpc.txt', 
                    sep='\t', encoding='utf-8')['prediction'].values
valid_df2valid_preds = pd.read_csv('~/Documents/TOIA-NYUAD/research/data/test_dev2test_preds.tsv', sep='\t', encoding='utf-8')

valid_preds = pd.DataFrame(
    {'q': valid_df2valid_preds['#1 String'].values, 
     'A': valid_df2valid_preds['#2 String'].values, 
     'y_pred': preds})



In [21]:
def reranked_hitsatk(k, j, document_store, test_questions, test_questions_emb, df_dial, annotation_cols):
    result_items = []
    result_probs = []
    for question, embedding in zip(test_questions, test_questions_emb):
        predictions = document_store.query_by_embedding(
            np.array(embedding), 
            top_k=j, 
            return_embedding=False
        )
        pred_answers = [pred.meta['answer'] for pred in predictions]
        qq_probs = np.array([pred.probability for pred in predictions])
        qa_probs = np.array([valid_preds[(valid_preds['q']==question) &
                                (valid_preds['A']==pred_ans)]['y_pred'].values[0] for 
                    pred_ans in pred_answers])
        comb_probs = qq_probs * qa_probs
        sorted_probs = np.sort((comb_probs))[::-1][:k]
        sorted_indices = np.argsort((comb_probs))[::-1][:k]
        pred_answers_reranked = [pred_answers[i] for i in sorted_indices]
        if any([pred_ans in df_dial[
            df_dial['Q'] == question][annotation_cols].values 
                for pred_ans in pred_answers_reranked]):
            result_items.append(1)
        else:
            result_items.append(0)
        result_probs.append(sorted_probs[0])
    return result_items, result_probs

In [22]:
for k in [1, 2, 5, 10, 20]:
    rr_hits_at_k, rr_probs = reranked_hitsatk(
        k, 10, new_document_store, test_questions, test_questions_emb, df_dial_test, annotation_cols)

    print("SR@{}_10: ".format(k), sum(rr_hits_at_k)/len(test_questions))

SR@1_10:  0.20625
SR@2_10:  0.25
SR@5_10:  0.3125
SR@10_10:  0.3375
SR@20_10:  0.3375


In [23]:
for k in [1, 2, 5, 10, 20]:
    rr_hits_at_k, rr_probs = reranked_hitsatk(
        k, 349, new_document_store, test_questions, test_questions_emb, df_dial_test, annotation_cols)

    print("SR@{}_300: ".format(k), sum(rr_hits_at_k)/len(test_questions))

SR@1_300:  0.21875
SR@2_300:  0.28125
SR@5_300:  0.39375
SR@10_300:  0.46875
SR@20_300:  0.5375


hits@1_10 = 0.45294117647058824 | multiplying 0.4588235294117647 | weighted sum (.5 * qq) 0.45588235294117646

- test set = 0.24166666666666667 | 0.2833333333333333

hits@2_10 = 0.5088235294117647 | multiplying 0.49411764705882355

- test mult: 0.3333333333333333

hits@5_10 = 0.5323529411764706 | multiplying 0.5323529411764706

- test mult: 0.4 hit@10_10 -- 0.44166666666666665

hits@10_200 = 0.6558823529411765

This is basically scoring all and summing scores:

hits@1_300 = 0.5411764705882353 | multiplying 0.5352941176470588

- test set = 0.21666666666666667 | 0.24166666666666667

hits@10_300 = 0.6794117647058824 | multiplying 0.7176470588235294


#### Worth trying to filter with Q-A and re-ranking with Q-Q.

In [24]:

rr_hits_at_k, rr_probs = reranked_hitsatk(
    1, 10, new_document_store, test_questions, test_questions_emb, df_dial_test, annotation_cols)


In [25]:
df_rr_thr = pd.DataFrame(
{
    "question": test_questions,
    "answer": answers,
    "hit_at_k": rr_hits_at_k,
    "prob": rr_probs,
    "no_ans": df_dial_test.BA1.isna()
})

In [26]:
df_rr_thr.groupby("hit_at_k").describe()

Unnamed: 0_level_0,prob,prob,prob,prob,prob,prob,prob,prob
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
hit_at_k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,127.0,0.225602,0.348523,0.0,8e-06,0.000168,0.706131,0.915177
1,33.0,0.598176,0.38259,7e-06,0.004419,0.810861,0.849148,0.942493


In [27]:
df_rr_thr.loc[df_thresholds["no_ans"]==True, "hit_at_k"] = 2

df_rr_thr.groupby("hit_at_k").describe()

Unnamed: 0_level_0,prob,prob,prob,prob,prob,prob,prob,prob
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
hit_at_k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,106.0,0.231319,0.35301,0.0,8e-06,0.000147,0.708181,0.915177
1,33.0,0.598176,0.38259,7e-06,0.004419,0.810861,0.849148,0.942493
2,21.0,0.196747,0.331615,0.0,2.2e-05,0.000619,0.245588,0.834083


In [28]:
thr_sel = df_rr_thr.groupby("hit_at_k").describe()["prob"]["25%"][1]
thr_sel

0.004418585643219317

In [29]:
def isNaN(string):
    return string != string

def reranked_hitsatk_thr(k, j, document_store, test_questions, test_questions_emb, df_dial, thr):
    result_items = []
    result_probs = []
    for question, embedding in zip(test_questions, test_questions_emb):
        predictions = document_store.query_by_embedding(
            np.array(embedding), 
            top_k=j, 
            return_embedding=False
        )
        pred_answers = [pred.meta['answer'] for pred in predictions]
        qq_probs = np.array([pred.probability for pred in predictions])
        qa_probs = np.array([valid_preds[(valid_preds['q']==question) &
                                (valid_preds['A']==pred_ans)]['y_pred'].values[0] for 
                    pred_ans in pred_answers])
        comb_probs = qq_probs * qa_probs        
        sorted_probs = np.sort((comb_probs))[::-1][:k]
        sorted_indices = np.argsort((comb_probs))[::-1][:k]
        pred_answers_reranked = [pred_answers[i] for i, p in zip(sorted_indices, sorted_probs) if p >= thr]
        annotated_answers = df_dial[df_dial['Q'] == question][annotation_cols].values
        if any([pred_ans in annotated_answers for pred_ans in pred_answers_reranked]):
            result_items.append(1)
            result_probs.append(sorted_probs[0])
        elif (len(pred_answers_reranked) == 0) & (isNaN(annotated_answers[0][0])):
            result_items.append(1)
            result_probs.append(1)
        else:
            result_items.append(0)
            result_probs.append(max(comb_probs))
    return result_items, result_probs

In [30]:
rr_test_hits_at_k, rr_test_probs = reranked_hitsatk_thr(1, 10,
                                                        new_document_store, 
                                                        test_questions, 
                                                        test_questions_emb, 
                                                        df_dial_test, thr_sel)

print(sum(rr_test_hits_at_k)/len(test_questions))


0.23125


In [31]:
rr_finetune_hits_at_k, rr_finetune_probs = reranked_hitsatk_thr(1, 10, new_document_store, finetune_questions, finetune_questions_emb, df_dial_finetune, thr_sel)

print(sum(rr_finetune_hits_at_k)/len(finetune_questions))

0.24528301886792453


In [33]:
rr_finetune_hits_at_k, rr_finetune_probs = reranked_hitsatk_thr(1, 349, new_document_store, finetune_questions, finetune_questions_emb, df_dial_finetune, thr_sel)

print(sum(rr_finetune_hits_at_k)/len(finetune_questions))

0.2138364779874214


# Get dialogues in QA format For Fine Tuning

In [37]:
import json

squadlike_dict = {"version": "v1.1",
                  "data": [{"title": "Margarita_squadFormat", "paragraphs": []}]}

print(json.dumps(squadlike_dict, indent = 2)) 

{
  "version": "v1.1",
  "data": [
    {
      "title": "Margarita_squadFormat",
      "paragraphs": []
    }
  ]
}


In [38]:
i = 0

for text, answer in zip(df["text"], df["answer"]):
    squadlike_dict["data"][0]["paragraphs"].append(
        {"qas": [{
            "question": text,
            "id": 'id' + str(i),
            "answers": [{"text": answer, 
                         "answer_start": len(text) + 1}],
            "is_impossible": False}],
         "context": "{} {}".format(text, answer)})
    i += 1

In [39]:
print(json.dumps(squadlike_dict, indent = 2)) 

{
  "version": "v1.1",
  "data": [
    {
      "title": "Margarita_squadFormat",
      "paragraphs": [
        {
          "qas": [
            {
              "question": "Please dance!",
              "id": "id0",
              "answers": [
                {
                  "text": "Okay! (Dances)",
                  "answer_start": 14
                }
              ],
              "is_impossible": false
            }
          ],
          "context": "Please dance! Okay! (Dances)"
        },
        {
          "qas": [
            {
              "question": "Please play something!",
              "id": "id1",
              "answers": [
                {
                  "text": "Okay! (Plays ukulele)",
                  "answer_start": 23
                }
              ],
              "is_impossible": false
            }
          ],
          "context": "Please play something! Okay! (Plays ukulele)"
        },
        {
          "qas": [
            {
              "quest

In [40]:
# Set documents such that the whole doc is Q + A

from typing import List
from haystack import Document

titles = df.text.to_list()
texts = ["{} {}".format(a, b) for a, b in zip(titles, df.answer.to_list())]
documents: List[Document] = []
for title, text in zip(titles, texts):
    documents.append(
        Document(
            text=text,
            meta={
                "name": title or ""
            }
        )
    )

In [206]:
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Collecting ipywidgets
  Downloading ipywidgets-7.6.3-py2.py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 6.7 MB/s eta 0:00:01
Collecting jupyterlab-widgets>=1.0.0
  Downloading jupyterlab_widgets-1.0.0-py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 27.9 MB/s eta 0:00:01
Collecting widgetsnbextension~=3.5.0
  Using cached widgetsnbextension-3.5.1-py2.py3-none-any.whl (2.2 MB)


Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-7.6.3 jupyterlab-widgets-1.0.0 widgetsnbextension-3.5.1
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [208]:
from haystack.generator.transformers import RAGenerator
from haystack.retriever.dense import DensePassageRetriever
from haystack.reader.transformers import TransformersReader


# qa_document_store = FAISSDocumentStore(
#     sql_url="postgresql://ironman:kolomino@localhost:5432/squadformat?client_encoding=utf8",
#     faiss_index_factory_str="Flat",
#     return_embedding=True
# )

# qa_retriever = DensePassageRetriever(
#     document_store=qa_document_store,
#     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#     use_gpu=False,
#     embed_title=True,
# )

# qa_generator = RAGenerator(
#     model_name_or_path="facebook/rag-token-nq",
#     use_gpu=False,
#     top_k_answers=1,
#     max_length=200,
#     min_length=2,
#     embed_title=True,
#     num_beams=2,
# )

qa_reader = TransformersReader("deepset/roberta-base-squad2")


02/01/2021 17:01:28 - INFO - filelock -   Lock 140337376916432 acquired on /Users/amc/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.6060f348ba2b58d6d30b5324910152ffc512e7c3891ed13f22844f1a9b5c0d0f.lock
02/01/2021 17:01:29 - INFO - filelock -   Lock 140337376916432 released on /Users/amc/.cache/torch/transformers/f7d4b9379a9c487fa03ccf3d8e00058faa9d664cf01fc03409138246f48760da.6060f348ba2b58d6d30b5324910152ffc512e7c3891ed13f22844f1a9b5c0d0f.lock


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [42]:
qa_document_store.delete_all_documents()

qa_document_store.write_documents(documents)

qa_document_store.update_embeddings(
    retriever=qa_retriever
)

01/31/2021 17:35:38 - INFO - haystack.document_store.faiss -   Updating embeddings for 349 docs...
Creating Embeddings: 100%|██████████| 22/22 [01:23<00:00,  3.81s/ Batches]
01/31/2021 17:37:02 - INFO - haystack.document_store.faiss -   Indexing embeddings and updating vectors_ids...
100%|██████████| 1/1 [00:00<00:00,  7.73it/s]


In [43]:
QUESTIONS = [q for q, hit in zip(test_questions, rr_test_hits_at_k) if hit == 0]

In [None]:
# use finetune (I swapped the names)
# QUESTIONS = [q for q, hit in zip(finetune_questions, rr_finetune_hits_at_k) if hit == 0] 

In [45]:
# Now generate an answer for each question
keyword_search_queries = []
for question in QUESTIONS:
    # Retrieve related documents from retriever
    retriever_results = qa_retriever.retrieve(
        query=question
    )

    # Now generate answer from question and retrieved documents
    predicted_result = qa_generator.predict(
        query=question,
        documents=retriever_results,
        top_k=1
    )

    # Print you answer
    answers = predicted_result["answers"]
#     print(f'Generated answer is \'{answers[0]["answer"]}\' for the question = \'{question}\'')
    keyword_search_queries.append(answers[0]["answer"])

Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.95 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.69 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 13.20 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 13.02 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.07 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.55 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 10.41 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.74 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 10.56 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.13 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  9.37 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  9.59 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.81 Batches/s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00

In [46]:
# Now I fire up the ES doc store to use for BM25 queries using the keywords generated by the step before

# Recommended: Start Elasticsearch using Docker
! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
        
# wait until ES has started
! sleep 30

b9bacf69309bb402987d6930b111deb98c0191b5857477057be36dc70a02e899


In [47]:

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

es_document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                            index="document")


01/31/2021 17:59:13 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.401s]
01/31/2021 17:59:13 - INFO - elasticsearch -   PUT http://localhost:9200/label [status:200 request:0.123s]


In [140]:
from haystack.retriever.sparse import ElasticsearchRetriever

es_retriever = ElasticsearchRetriever(es_document_store)

# from haystack.retriever.sparse import TfidfRetriever

# es_retriever = TfidfRetriever(es_document_store)

In [126]:
df2 = df.rename(columns={"text": "question", "answer": "text"})

In [50]:
# Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df2.to_dict(orient="records")

es_document_store.delete_all_documents("document")

es_document_store.write_documents(docs_to_index)

01/31/2021 17:59:14 - INFO - elasticsearch -   POST http://localhost:9200/document/_delete_by_query [status:200 request:0.148s]
01/31/2021 17:59:15 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.491s]


In [141]:
from haystack import Finder

es_finder = Finder(reader=None, retriever=es_retriever)
prediction = es_finder.get_answers_via_similar_questions(question=keyword_search_queries[0], top_k_retriever=3)
print_answers(prediction, details="all")

            1. The 'Finder' class will be deprecated in the next Haystack release in 
            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
            including Retriever, Readers, and Generators.
            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
            2. The `question` parameter in search requests & results is renamed to `query`.
01/31/2021 23:46:21 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.007s]


{   'answers': [   {   'answer': "I studied music and economics. I'm a music "
                                 'major, economics minor and in music I do '
                                 'mostly composition and sound engineering.',
                       'context': "I studied music and economics. I'm a music "
                                  'major, economics minor and in music I do '
                                  'mostly composition and sound engineering.',
                       'document_id': '4d085040-2448-48d6-b397-61efa7f793d6',
                       'meta': {   'id_video': '73bcb9476c0c28cba5c7ddde802b7c63',
                                   'question': 'What do you study?'},
                       'offset_end': 125,
                       'offset_start': 0,
                       'probability': 0.6681556424183887,
                       'question': None,
                       'score': 5.5988407},
                   {   'answer': 'A little bit. My economics minor is h

In [157]:
# %%capture --no-stdout --no-display

# hits_at_1 = 0
# hits_at_k = 0
# hits, probs, scores, answers = [], [], [], []
# for query, question in zip(keyword_search_queries, QUESTIONS):
#     prediction = es_finder.get_answers_via_similar_questions(question=query, top_k_retriever=10);
#     if len(prediction["answers"]) == 0:
#         hits.append(0)
#         hits_at_k += 0
#     else:    
#     answer = prediction['answers'][0]['answer']
#     k_answers = [pred['answer'] for pred in prediction['answers']]
#     if answer in df_dial[df_dial['Q'] == question][['BA1', 'BA2', 'BA3', 'BA4', 'BA5', 'BA6']].values:
#         hits_at_1 += 1
#         hits.append(1)
#     else:
#         hits_at_1 += 0
#         hits.append(0)
#     probs.append(prediction['answers'][0]['probability'])
#     scores.append(prediction['answers'][0]['score'])
#     answers.append(answer)
#     if any([pred_ans in df_dial[df_dial['Q'] == question][['BA1', 'BA2', 'BA3', 'BA4', 'BA5', 'BA6']].values for pred_ans in k_answers]):
#         hits_at_k += 1
#     else:
#         hits_at_k += 0
        
        
        
hits_at_k, hits_at_k_itemized, probs, scores, answers = hitsatk(
    1, 
    es_document_store,
    keyword_search_queries, 
    QUESTIONS,
    df_dial, 
    annotation_cols, 
    finder=es_finder)



02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.007s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   P

02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
02/01/2021 09:52:28 - INFO - elasticsearch -   P

02/01/2021 09:52:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.007s]
02/01/2021 09:52:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
02/01/2021 09:52:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.009s]
02/01/2021 09:52:29 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]


In [158]:
print(hits_at_k/len(QUESTIONS))

0.056338028169014086


In [159]:
df_qa_nothit = pd.DataFrame(
{
    "question": QUESTIONS,
    "answer": answers,
    "hit_at_k": hits,
    "prob": probs
})

In [160]:
df_qa_nothit.groupby("hit_at_k").describe()

Unnamed: 0_level_0,prob,prob,prob,prob,prob,prob,prob,prob
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
hit_at_k,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,117.0,0.705296,0.07154,0.55907,0.652693,0.696313,0.741918,0.889022
1,8.0,0.729546,0.106643,0.629605,0.646989,0.677098,0.818107,0.900983


In [165]:
thr_qa = df_qa_nothit.groupby("hit_at_k").describe()["prob"]["min"][1]
thr_qa

0.6296047528376791

In [194]:
j = 10
k = 1
thr = thr_sel

result_items = []
result_probs = []
for question, embedding in zip(finetune_questions, finetune_questions_emb):
    predictions = document_store.query_by_embedding(
        np.array(embedding), 
        top_k=j, 
        return_embedding=False
    )
    pred_answers = [pred.meta['answer'] for pred in predictions]
    qq_probs = np.array([pred.probability for pred in predictions])
    qa_probs = np.array([valid_preds[(valid_preds['q']==question) &
                            (valid_preds['A']==pred_ans)]['y_pred'].values[0] for 
                pred_ans in pred_answers])
    comb_probs = qq_probs * qa_probs        
    sorted_probs = np.sort((comb_probs))[::-1][:k]
    sorted_indices = np.argsort((comb_probs))[::-1][:k]
    pred_answers_reranked = [pred_answers[i] for i, p in zip(sorted_indices, sorted_probs) if p >= thr]
#     print(question, "\n", pred_answers_reranked, "\n")
    if len(pred_answers_reranked) == 0:
        # Retrieve related documents from retriever
        qa_retriever_results = qa_retriever.retrieve(
            query=question
        )
        # Now generate answer from question and retrieved documents
        qa_predicted_result = qa_generator.predict(
            query=question,
            documents=qa_retriever_results,
            top_k=1
        )
        # Get you answer
        answers = qa_predicted_result["answers"]
        keywords_query = answers[0]["answer"]
        prediction = es_finder.get_answers_via_similar_questions(question=keywords_query, top_k_retriever=1)
        if len(prediction["answers"]) > 0:
            aa_prob = prediction["answers"][0]["probability"]
#             print(aa_prob, "\n")
            if aa_prob >= thr_qa:
                pred_answers_reranked.append(prediction["answers"][0]["answer"])
                comb_probs = [aa_prob]
#     print(pred_answers_reranked, "\n==========================================\n")

    annotated_answers = df_dial[df_dial['Q'] == question][annotation_cols].values
    if any([pred_ans in annotated_answers for pred_ans in pred_answers_reranked]):
        result_items.append(1)
        result_probs.append(sorted_probs[0])
    elif (len(pred_answers_reranked) == 0) & (isNaN(annotated_answers[0][0])):
        result_items.append(1)
        result_probs.append(1)
    else:
        result_items.append(0)
        result_probs.append(max(comb_probs))

Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.31 Batches/s]
02/01/2021 12:18:10 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.007s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.07 Batches/s]
02/01/2021 12:18:17 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.73 Batches/s]
02/01/2021 12:18:26 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 12.69 Batches/s]
02/01/2021 12:18:35 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 10.19 Batches/s]
02/01/2021 12:18:42 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.006s]
Creating Embeddings: 100%|██████████| 1/

Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 10.85 Batches/s]
02/01/2021 12:25:09 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 11.60 Batches/s]
02/01/2021 12:25:18 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  9.22 Batches/s]
02/01/2021 12:25:26 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  9.59 Batches/s]
02/01/2021 12:25:36 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.005s]
Creating Embeddings: 100%|██████████| 1/1 [00:00<00:00, 10.77 Batches/s]
02/01/2021 12:25:45 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.007s]
Creating Embeddings: 100%|██████████| 1/

In [195]:
print(sum(result_items)/len(finetune_questions))

# without thresholding it was 0.175
# 0.2125 on test questions

0.1949685534591195


In [196]:
df_qa_finetune = pd.DataFrame(
{
    "question": finetune_questions,
    "hit_at_k": result_items,
    "prob": result_probs
})

In [197]:
df_qa_finetune.to_csv("~/Documents/df_qa_finetune.csv")

In [187]:
rr_finetune = pd.DataFrame(
{
    "question": finetune_questions,
    "hit_at_k": rr_finetune_hits_at_k,
    "prob": rr_finetune_probs
})
rr_finetune.to_csv("~/Documents/rr_finetune.csv")