<a href="https://colab.research.google.com/github/ChenKua/xir/blob/main/Haystack_scifact_DensePassageRetriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the dataset

In [None]:
!pip install beir
!pip install tensorflow-text
!pip install farm-haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git

In [None]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval

from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch

import logging
import pathlib, os
import random

from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader

Dataset

In [None]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

# Download the zipped BEIR dataset into ./datasets, then read the corpus,
# the queries and the relevance judgements of its "test" split.
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")

loader = GenericDataLoader(data_folder=data_path)
corpus, queries, qrels = loader.load(split="test")

datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

  0%|          | 0/5183 [00:00<?, ?it/s]

In [None]:
import pandas as pd
# Tabulate the corpus mapping: each top-level key of `corpus` becomes one
# column, and the keys of the per-entry dicts (e.g. 'text') become the rows.
pd_corpus = pd.DataFrame.from_dict(corpus, orient="columns")

Logging

In [None]:
# Send INFO-and-above log records through the LoggingHandler imported from
# beir, prefixing every message with a timestamp.
log_config = {
    "format": '%(asctime)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
    "level": logging.INFO,
    "handlers": [LoggingHandler()],
}
logging.basicConfig(**log_config)

In [None]:
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.utils import fetch_archive_from_http

# Convert the corpus into Haystack documents.
#
# `pd_corpus` has one column per corpus entry, so iterating with `.items()`
# yields (column label, column Series) pairs: the label is the corpus key and
# the Series holds that entry's fields, indexed by field name.
#
# `DataFrame.iteritems()` was removed in pandas 2.0; `.items()` is the drop-in
# replacement and is also available on older pandas versions.
#
# NOTE(review): meta["name"] receives the corpus key (presumably the BEIR
# document id), not the entry's title. The retriever is later created with
# embed_title=True -- confirm that embedding this value is intended.
documents: List[Document] = [
    Document(content=fields['text'], meta={"name": doc_id or ""})
    for doc_id, fields in pd_corpus.items()
]

# FAISS-backed store using a "Flat" index; return_embedding=True asks the store
# to hand back the stored vectors together with the documents.
# (FAISSDocumentStore is already imported at the top of this cell.)
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)


In [None]:
# Empty the document store first, so documents left over from an earlier run
# of this notebook are not kept alongside the ones written below.
document_store.delete_documents()

# Write the Document objects built from the corpus into the document store.
document_store.write_documents(documents)


Writing Documents:   0%|          | 0/5183 [00:00<?, ?it/s]

# Haystack Retriever

*   BaseGraphRetriever(BaseComponent) **(note: graph retriever, not a `BaseRetriever` subclass)**
*   BaseRetriever(BaseComponent)
*   BM25Retriever(BaseRetriever)
*   FilterRetriever(BM25Retriever)
*   TfidfRetriever(BaseRetriever)
*   DensePassageRetriever(BaseRetriever)
*   TableTextRetriever(BaseRetriever)
*   EmbeddingRetriever(BaseRetriever)
*   Text2SparqlRetriever(BaseGraphRetriever) **(note: graph retriever, not a `BaseRetriever` subclass)**

See documentation at: https://github.com/deepset-ai/haystack/blob/master/docs/_src/api/api/retriever.md




In [None]:
# Dense Passage Retriever settings: separate pretrained encoders for queries
# and for passages, with their own maximum sequence lengths.
# NOTE(review): with embed_title=True the value stored under meta["name"] is
# presumably embedded together with the passage text -- confirm that the value
# written there when the documents were built is what should be embedded.
dpr_settings = dict(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
retriever = DensePassageRetriever(document_store=document_store, **dpr_settings)

# Update the embeddings held by the document store using this retriever.
document_store.update_embeddings(retriever)

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Reader loaded from the "deepset/roberta-base-squad2" checkpoint.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Extractive QA pipeline combining the reader with the dense retriever.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2 locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2


Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 2 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0  
INFO - haystack.modeling.infer -  /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \ 


In [None]:
# Run a single example query through the pipeline: the Retriever node is asked
# for its top 10 results and the Reader node for its top 5.
demo_query = 'Cardiac injury is common in critical cases of COVID-19.'
demo_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}

prediction = pipe.run(query=demo_query, params=demo_params)

# Print the answers with the "minimum" level of detail.
print_answers(prediction, details="minimum")


  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.83 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 21.87 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  3.17 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 14.14 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 18.88 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 13.49 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 33.71 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 28.21 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 20.62 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 40.04 Batches/s]


Query: Cardiac injury is common in critical cases of COVID-19.
Answers:
[   {   'answer': 'strategic-infarct dementia',
        'context': 'erioration from a lacunar infarct is thalamocortical '
                   'disconnection of white-matter tracts, in some instances '
                   'leading to "strategic-infarct dementia. "'},
    {   'answer': 'dementia',
        'context': 'vere verbal memory loss. Additional cognitive deficits '
                   'consistent with dementia occurred in four patients. A '
                   'right-sided infarct caused transient impa'},
    {   'answer': 'carvedilol',
        'context': 't splenic denervation 2 weeks prior to MCAO or received '
                   'injections of carvedilol, a pan adrenergic receptor '
                   'blocker, prazosin, an alpha1 receptor bloc'},
    {   'answer': 'the sympathetic nervous system',
        'context': 'is organ in stroke-induced neurodegeneration. Activation '
                   'of the sympa




In [None]:
# Show the relevance judgements stored for query id '1'. Judging by the
# recorded output, this is a dict keyed by document id whose values are
# presumably relevance labels.
qrels['1']

{'31715818': 1}