In [None]:
# Sanity check: confirm that a GPU is attached to this runtime.
# The retriever and the generator created later in this notebook are both
# constructed with `use_gpu=True`.
!nvidia-smi

Wed Aug 24 18:33:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Here are the packages and imports that we'll need:

In [None]:
# Mount Google Drive into the Colab filesystem. The dataset CSV and the saved
# document store / retriever paths used later in this notebook all live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack

# Install the latest master of Haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

In [None]:
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.utils import fetch_archive_from_http

In [None]:
# Build the FAISS-backed document store.
# - `sql_url` points the store's SQL side at a SQLite file on Google Drive.
# - `return_embedding=True` so the generator doesn't have to perform re-embedding.
# - `validate_index_sync=False`: presumably skips the SQL/FAISS consistency check
#   when the store is created -- TODO confirm against the Haystack docs.
document_store = FAISSDocumentStore(
    sql_url="sqlite:////content/drive/MyDrive/Saved_Models_Dir/faiss_document_store.db",
    faiss_index_factory_str="Flat",
    return_embedding=True,
    validate_index_sync=False,
)


In [None]:
# Dense Passage Retriever: one encoder for questions, one for passages.
# It is used to embed the stored documents, embed incoming questions and
# query the document store.
retriever = DensePassageRetriever(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    document_store=document_store,
    embed_title=True,
    use_gpu=True,
)

# RAG generator, configured with the same `embed_title` setting as the retriever.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    top_k=1,
    num_beams=2,
    min_length=2,
    max_length=200,
    embed_title=True,
    use_gpu=True,
)

In [None]:
# Read the dataset CSV from Google Drive and, as minimal cleaning, replace
# every missing value with an empty string.
df = pd.read_csv(
    "/content/drive/MyDrive/Anas New Dataset/selected_122235_data.csv",
    sep=",",
).fillna(value="")
df.head()

Unnamed: 0.1,Unnamed: 0,paper_id,title,abstract_x,body_text
0,0,d1aafb70c066a2068b02786f8929fd9c900897fb,Clinical features of culture-proven Mycoplasma pneumoniae infections at King...,OBJECTIVE: This retrospective chart review describes the epidemiology and cl...,Mycoplasma pneumoniae is a common cause of upper and lower respiratory tract...
1,1,03203ab50eb64271a9e825f94a1b1a6c46ea14b3,Recombination Every Day: Abundant Recombination in a Virus during a Single M...,Viral recombination can dramatically impact evolution and epidemiology. In v...,"As increasing numbers of full-length viral sequences become available, recom..."
2,2,d450fc8885843d48772df9a898552302f8c80b98,Draft versus finished sequence data for DNA and protein diagnostic signature...,"Sequencing pathogen genomes is costly, demanding careful allocation of limit...",Draft sequencing requires that the order of base pairs in cloned fragments o...
3,3,4ba79e54ecf81b30b56461a6aec2094eaf7b7f06,Relevance of human metapneumovirus in exacerbations of COPD,BACKGROUND AND METHODS: Human metapneumovirus (hMPV) is a recently discovere...,Respiratory viruses play an important role in exacerbations of COPD and this...
4,4,ccc36b04ad5c71de61967624f7f739e868d7c0a5,Development of a humanized monoclonal antibody with therapeutic potential ag...,Neutralization of West Nile virus (WNV) in vivo correlates with the developm...,Development of a humanized monoclonal antibody with therapeutic potential ag...


Dropping the unused `paper_id` and `abstract_x` columns

In [None]:
# Number of rows in the loaded dataframe
df.shape[0]

111539

In [None]:
# Remove the columns that are not used when building documents later on
# (only "title" and the body text are read).
# The result is reassigned and `errors="ignore"` is passed so this cell can be
# re-run safely: the original in-place drop raised a KeyError on a second run
# because the columns were already gone.
df = df.drop(columns=["paper_id", "abstract_x"], errors="ignore")
df.head()

Unnamed: 0.1,Unnamed: 0,title,body_text
0,0,Clinical features of culture-proven Mycoplasma pneumoniae infections at King...,Mycoplasma pneumoniae is a common cause of upper and lower respiratory tract...
1,1,Recombination Every Day: Abundant Recombination in a Virus during a Single M...,"As increasing numbers of full-length viral sequences become available, recom..."
2,2,Draft versus finished sequence data for DNA and protein diagnostic signature...,Draft sequencing requires that the order of base pairs in cloned fragments o...
3,3,Relevance of human metapneumovirus in exacerbations of COPD,Respiratory viruses play an important role in exacerbations of COPD and this...
4,4,Development of a humanized monoclonal antibody with therapeutic potential ag...,Development of a humanized monoclonal antibody with therapeutic potential ag...


Renaming The Columns

In [None]:
# The document-building cell reads the body from a column named "text",
# so rename "body_text" accordingly.
df = df.rename(columns={"body_text": "text"})
df.head()

Unnamed: 0.1,Unnamed: 0,title,text
0,0,Clinical features of culture-proven Mycoplasma pneumoniae infections at King...,Mycoplasma pneumoniae is a common cause of upper and lower respiratory tract...
1,1,Recombination Every Day: Abundant Recombination in a Virus during a Single M...,"As increasing numbers of full-length viral sequences become available, recom..."
2,2,Draft versus finished sequence data for DNA and protein diagnostic signature...,Draft sequencing requires that the order of base pairs in cloned fragments o...
3,3,Relevance of human metapneumovirus in exacerbations of COPD,Respiratory viruses play an important role in exacerbations of COPD and this...
4,4,Development of a humanized monoclonal antibody with therapeutic potential ag...,Development of a humanized monoclonal antibody with therapeutic potential ag...


In [None]:
# Row count after dropping and renaming columns
df.shape[0]

111539

Let's download a csv containing some sample text and preprocess the data.


We can cast our data into Haystack Document objects.
Alternatively, we can also just use dictionaries with "content" and "meta" fields.

In [None]:
# Pull the "title" and "text" columns out as plain lists, then pair them up
# into Haystack Document objects. The text becomes the document content and
# the title is stored under meta["name"] (a falsy title is stored as "").
titles, texts = (list(df[column].values) for column in ("title", "text"))
documents: List[Document] = [
    Document(content=body, meta={"name": heading or ""})
    for heading, body in zip(titles, texts)
]

Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator.
FAISS is chosen here since it is an optimized vector store.

We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`.
The `update_embeddings()` method uses the retriever to create an embedding for each document.


In [None]:
# Start from a clean slate: remove any documents already present in the store.
# The store is backed by a SQLite file on Google Drive, so data from earlier
# runs may still be there.
document_store.delete_documents()

# Write the Document objects built from the dataframe into the document store.
document_store.write_documents(documents)

# Use the retriever to create an embedding for each stored document and add
# those embeddings to the index.
document_store.update_embeddings(retriever=retriever)

Writing Documents:   0%|          | 0/111539 [00:00<?, ?it/s]

Updating Embedding:   0%|          | 0/111506 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/1520 [00:00<?, ? Docs/s]

Here are our OLD questions ----- generated on 21st July for newdataframe.csv

In [None]:
# Persist the retriever and the FAISS index to Google Drive. The load cell
# further down in this notebook reads back from these same two paths.
# NOTE(review): despite the `.pt` suffix, `retriever.save` presumably treats its
# argument as a directory rather than a single file -- confirm against the
# Haystack docs and consider a clearer name.
retriever.save("/content/drive/MyDrive/Saved_Models_Dir/retriever.pt")
document_store.save("/content/drive/MyDrive/Saved_Models_Dir/index.faiss")

In [None]:
# retriever.save("/content/drive/MyDrive/Dataset/retriever.pt")
# document_store.save("/content/drive/MyDrive/Dataset/faiss.index")

In [None]:
# Sample questions for exercising the retrieval pipeline.
QUESTIONS: List[str] = [
    "When did a new era of vaccine research begin?",
    "What is “Covid blindness”?",
    "What is First-strand cDNA?",
    "At what dpi did LDT3-A group showed the highest antibody level?",
    "What is welfare?",
    "How many people need end of life PC every year globally?",
    "Where were cells cultured?",
    "What is Vitamin D-dependent type 1A rickets?",
    "What is the prevalence of obesity in ICU?",
    "Where did the “mad cow” disease spread from?",
]

Now let's run our system!
The retriever will pick out a small subset of documents that it finds relevant.
These are used to condition the generator as it generates the answer.
What it should return then are novel text spans that form an answer to your question!

In [None]:
from haystack.utils import print_documents
from haystack.pipelines import DocumentSearchPipeline

# Retrieval-only pipeline built around the DPR retriever.
p_retrieval = DocumentSearchPipeline(retriever)

# Ask for a question interactively. A prompt is shown so the input box is
# self-explanatory, and surrounding whitespace is stripped so it does not
# end up in the query.
question = input("Enter your question: ").strip()

# Retrieve the top 2 documents for the question and print them, capping the
# displayed text of each document at 512 characters.
res = p_retrieval.run(query=question, params={"Retriever": {"top_k": 2}})
print_documents(res, max_text_len=512)

# To run every sample question instead of a single interactive one:
# for question in QUESTIONS:
#   res = p_retrieval.run(query=question, params={"Retriever": {"top_k": 2}})
#   print_documents(res, max_text_len=512)

What is “Covid blindness

Query: What is “Covid blindness

{   'content': 'The COVID pandemic has reached tens of millions of people '
               'around the world and has resulted in the deaths of several '
               'millions people [4] . Long-term complications are now '
               'described as the long COVID and affect many organs such as the '
               'heart, brain, kidney, pancreas and digestive system [8] . The '
               'post-acute COVID syndrome or long COVID syndrome [11] has been '
               'described at debated. Clinical signs can include fatigue, '
               'dyspnea, myalgia, diffuse pain, headaches, anxiety/depression '
               'and cognitive impairments (brain ...',
    'name': 'Long COVID: cognitive complaints (brain fog) and dysfunction of '
            'the cingulate cortex'}

{   'content': 'The symptoms of SARS-CoV-2 infection are not limited to the '
               'acute phase and may persist many months after the tes

In [None]:
# Reload the FAISS document store from the index path it was saved to earlier
# in this notebook.
new_document_store = FAISSDocumentStore.load("/content/drive/MyDrive/Saved_Models_Dir/index.faiss")
# Reload the saved retriever from Google Drive and attach it to the reloaded
# document store. The store must therefore be loaded first.

new_retriever = DensePassageRetriever.load("/content/drive/MyDrive/Saved_Models_Dir/retriever.pt", document_store=new_document_store)

In [None]:
from haystack.utils import print_documents
from haystack.pipelines import DocumentSearchPipeline

# Retrieval-only pipeline built around the retriever that was reloaded from
# Google Drive, to check that the saved artifacts work.
p_retrieval = DocumentSearchPipeline(new_retriever)

# Ask for a question interactively. A prompt is shown so the input box is
# self-explanatory, and surrounding whitespace is stripped so it does not
# end up in the query.
question = input("Enter your question: ").strip()

# Retrieve the top 2 documents for the question and print them, capping the
# displayed text of each document at 512 characters.
res = p_retrieval.run(query=question, params={"Retriever": {"top_k": 2}})
print_documents(res, max_text_len=512)

What is “Covid" blindness ?

Query: What is “Covid" blindness ?

{   'content': 'The COVID pandemic has reached tens of millions of people '
               'around the world and has resulted in the deaths of several '
               'millions people [4] . Long-term complications are now '
               'described as the long COVID and affect many organs such as the '
               'heart, brain, kidney, pancreas and digestive system [8] . The '
               'post-acute COVID syndrome or long COVID syndrome [11] has been '
               'described at debated. Clinical signs can include fatigue, '
               'dyspnea, myalgia, diffuse pain, headaches, anxiety/depression '
               'and cognitive impairments (brain ...',
    'name': 'Long COVID: cognitive complaints (brain fog) and dysfunction of '
            'the cingulate cortex'}

{   'content': 'Long COVID refers to a long-term multi-system disability '
               'syndrome seen in COVID-19 survivors. The US Center