# Semantic-Search-for-COVID-19-ResearchPapers

This work builds a semantic search engine using BERT to search a query against the dataset of research papers provided as part of Kaggle's CORD-19-research-challenge competition.

This work:
1.   first divides the dataset into paragraphs
2.   then uses BERT to embed the paragraphs of each paper with the bert-base-nli-mean-tokens pretrained model
3.   finally runs a query and returns the top 5 paragraphs along with their papers' title, abstract, and abstract_summary

In [None]:
!pip install -U sentence-transformers

In [None]:
from google.colab import files

# Install the Kaggle command-line client.
!pip install -q kaggle
# Prompt for a file upload; the steps below expect the uploaded file to be
# the Kaggle API token, named kaggle.json.
u = files.upload()
# Put the token in ~/.kaggle and restrict its permissions.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# NOTE(review): chmod targets /root/.kaggle while the copy targets ~/.kaggle;
# the two agree only when the notebook runs as root — confirm for your runtime.
!chmod 600 /root/.kaggle/kaggle.json
# Download the CORD-19 dataset archive and unpack it under /content.
!kaggle datasets download -d allen-institute-for-ai/CORD-19-research-challenge
!unzip  CORD-19-research-challenge.zip -d /content/CORD-19-research-challenge

## Data Processing

In [None]:
import glob
import json
import pandas as pd
from tqdm import tqdm
# Directory the dataset archive was unzipped into.
root_path = '/content/CORD-19-research-challenge/'
# Recursively collect every .json file below root_path. root_path already ends
# with '/', so the pattern contains a double slash.
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
# Number of JSON files found (shown as the cell output).
len(all_json)

In [None]:
# Load the dataset's metadata table; the identifier columns are read as
# strings so pandas does not convert them to numbers.
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
    'pubmed_id': str,
    'Microsoft Academic Paper ID': str,
    'doi': str
})
# Replace missing abstracts with '' *before* casting to str: casting NaN
# directly would store the literal text 'nan' as the paper's abstract.
meta_df['abstract'] = meta_df['abstract'].fillna('').astype(str)
meta_df.head()

In [None]:
# Show the path of the first JSON file found.
print(all_json[0])

# Parse it and list its top-level keys to see how a paper file is laid out.
with open(all_json[0]) as file:
    content = json.load(file)
print(content.keys())

In [None]:
class FileReader:
    """Load one paper JSON file and expose its id and body text.

    Attributes:
        paper_id: value of the file's top-level ``paper_id`` key.
        body_text: the ``text`` field of every ``body_text`` entry, joined
            with newlines (one entry per line).
    """

    def __init__(self, file_path, verbose=True):
        """Parse the JSON file at *file_path*.

        Args:
            file_path: path of a JSON file holding ``paper_id`` and
                ``body_text`` keys.
            verbose: when true (the default, matching the previous
                behaviour) print the paper id after loading. Pass ``False``
                to keep bulk loads quiet.

        Raises:
            KeyError: if ``paper_id``, ``body_text`` or an entry's ``text``
                key is missing.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        if verbose:
            print(self.paper_id)
        self.body_text = '\n'.join(
            entry['text'] for entry in content['body_text']
        )

    def __repr__(self):
        """Return the paper id followed by the first 200 characters of text."""
        return f'{self.paper_id}: {self.body_text[:200]}...'
# Smoke test: parse the first JSON file found and print its repr.
first_row = FileReader(all_json[0])
print(first_row)

In [None]:
def get_breaks(content, length):
    """Wrap *content* by inserting ``<br>`` between groups of words.

    Words are taken from ``content.split(' ')``. A new line is started when
    adding the next word would push the current line's character count
    (spaces are not counted) above *length*. A single word longer than
    *length* is kept whole on its own line.

    Args:
        content: text to wrap.
        length: maximum number of word characters per line.

    Returns:
        The wrapped text, with lines joined by ``<br>`` and no leading
        space or leading ``<br>``.

    Raises:
        AttributeError: if *content* has no ``split`` method (not a string).
    """
    lines = []
    current = []
    total_chars = 0
    for word in content.split(' '):
        total_chars += len(word)
        if total_chars > length and current:
            lines.append(' '.join(current))
            current = [word]
            # The wrapped word already occupies part of the new line.
            total_chars = len(word)
        else:
            current.append(word)
    lines.append(' '.join(current))
    return '<br>'.join(lines)

In [None]:
# Build one row per paper JSON file that has a matching metadata entry,
# combining the file's body text with the metadata's abstract, authors,
# title and journal.
dict_ = {'paper_id' : [], 'abstract' : [], 'body_text' : [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
# Report progress roughly every 10%; max() avoids a zero modulus when there
# are fewer than 10 files.
progress_step = max(1, len(all_json) // 10)
for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'                            Processing index : {idx} out of {len(all_json)}')
    content = FileReader(entry)

    # Metadata rows whose sha equals this paper's id; skip papers without any.
    meta_data = meta_df.loc[meta_df.sha == content.paper_id]
    if len(meta_data) == 0:
        continue
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append('\n'.join(meta_data.abstract))
    dict_['body_text'].append(content.body_text)

    # .values[0] picks the first matching row's value (a scalar) so that
    # string methods can be applied to it.
    raw_authors = meta_data['authors'].values[0]
    try:
        authors = raw_authors.split(';')
    except AttributeError:
        # Non-string value (e.g. a missing entry): store it unchanged.
        dict_['authors'].append(raw_authors)
    else:
        if len(authors) > 2:
            # Keep only the first two authors and mark the truncation.
            dict_['authors'].append('. '.join(authors[:2]) + "...")
        else:
            dict_['authors'].append('. '.join(authors))

    raw_title = meta_data['title'].values[0]
    try:
        # Wrap the title at 40 characters per line.
        dict_['title'].append(get_breaks(raw_title, 40))
    except (AttributeError, TypeError):
        # Non-string title (e.g. missing): store it unchanged.
        dict_['title'].append(raw_title)

    dict_['journal'].append(meta_data['journal'].values[0])

    first_abstract = meta_data.abstract.values[0]
    abstract_words = first_abstract.split(' ')
    if first_abstract.strip() in ('', 'nan'):
        # No abstract: either empty or the text 'nan' that str() produces
        # for a missing value.
        dict_['abstract_summary'].append('Not Provided.')
    elif len(abstract_words) > 100:
        # Long abstract: keep the first 100 words, wrapped at 40 characters.
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # Short abstract: keep it whole, wrapped at 40 characters.
        summary = get_breaks(' '.join(meta_data.abstract), 40)
        dict_['abstract_summary'].append(summary)


df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
print(df_covid.head())
print(f'df size : {len(df_covid)}')
# Drop papers that repeat both the abstract and the body text.
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)


In [None]:
# Print summary statistics for the two text columns.
print(df_covid['abstract'].describe(include = 'all'))
print(df_covid['body_text'].describe(include='all'))

In [None]:
# Drop every row that contains a missing value (in place), then print the
# frame's column/dtype summary.
df_covid.dropna(inplace=True)
df_covid.info()

### Data preprocessing

In [None]:
# Keep only the first 3000 papers. The explicit copy makes df_covid an
# independent frame, so the column assignments that follow do not operate on
# a slice of the original (which triggers pandas' SettingWithCopyWarning).
df_covid = df_covid.head(3000).copy()

In [None]:
import re

# Strip everything except ASCII letters, digits and whitespace, then
# lower-case. Whitespace (including the newlines that separate paragraphs in
# body_text) is preserved. The class uses A-Z: the former A-z range also let
# the characters [ \ ] ^ _ ` through. A raw string keeps \s a regex escape.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')

for column in ('body_text', 'abstract'):
    df_covid[column] = df_covid[column].apply(
        lambda x: _NON_ALNUM.sub('', x).lower()
    )

In [None]:
# Drop the columns the paragraph table does not need. "Unnamed: 0" exists
# only when df_covid was reloaded from a CSV, so ignore missing labels rather
# than raising KeyError.
text = df_covid.drop(["authors", "journal", "Unnamed: 0"], axis=1, errors="ignore")
# Renumber rows 0..n-1 (the old index is kept as an "index" column); the
# paragraph loop looks rows up by these consecutive positions.
text = text.reset_index()
text.head()

In [None]:
from tqdm import tqdm

# Convert the frame to a nested dict: column name -> {row index -> value}.
text_dict = text.to_dict()
# Number of papers (entries under the "paper_id" column).
len_text = len(text_dict["paper_id"])

In [None]:
# Parallel lists with one entry per body-text paragraph: the paragraph itself
# plus the metadata of the paper it came from.
paper_id_list, body_text_list = [], []
title_list, abstract_list, abstract_summary_list = [], [], []

for row in tqdm(range(len_text)):
    # body_text holds one paragraph per line.
    paragraphs = text_dict["body_text"][row].split("\n")
    n_paragraphs = len(paragraphs)

    body_text_list.extend(paragraphs)
    # Repeat the paper-level fields once for every paragraph.
    paper_id_list.extend([text_dict["paper_id"][row]] * n_paragraphs)
    title_list.extend([text_dict["title"][row]] * n_paragraphs)
    abstract_list.extend([text_dict["abstract"][row]] * n_paragraphs)
    abstract_summary_list.extend([text_dict["abstract_summary"][row]] * n_paragraphs)

In [None]:
# Paragraph-level table: the paragraph text is the index, the owning paper's
# fields are the columns.
df = pd.DataFrame({"paper_id":paper_id_list,"title":title_list,"abstract":abstract_list,"abstract_summary":abstract_summary_list},index=body_text_list)

In [None]:
from google.colab import drive
# Mount Google Drive at ./drive.
drive.mount('drive')
# Write the paragraph-level table locally, then copy it into the mounted
# Drive's datasets folder.
df.to_csv('covid_everything.csv')
!cp covid_everything.csv "drive/My Drive/datasets"

In [None]:
# Slim table: paragraph text as the index, only the owning paper_id as column.
df_sentences = pd.DataFrame({"paper_id":paper_id_list},index=body_text_list)
df_sentences.head()

In [None]:
# Write the paragraph -> paper_id table locally, then copy it to the datasets
# folder under the drive mount point.
df_sentences.to_csv('covid_sent.csv')
!cp covid_sent.csv "drive/My Drive/datasets"

## Preparing Data for Embedding

In [None]:
#IMPORT 'covid_everything.csv' FROM DRIVE INTO GOOGLE-COLAB:

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

#2.1 Get the file
downloaded = drive.CreateFile({'id':'1fTYcZZjXzMQmt-xjL2UMpFxlWcwWrIGB'})
downloaded.GetContentFile('covid_everything.csv')  # file name to be imported to colab

# for unzipping use
!unzip file_name


unzip:  cannot find or open file_name, file_name.zip or file_name.ZIP.


In [None]:
import pandas as pd

# Reload the paragraph-level table from covid_everything.csv.
df_covid = pd.read_csv('covid_everything.csv')
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks
# redundant — confirm before simplifying.
df = pd.DataFrame(df_covid)


In [None]:
# "Unnamed: 0" is the CSV's first, header-less column; per the output below
# it holds the paragraph text, so make it the index again.
df = df.set_index("Unnamed: 0")
df.head()

Unnamed: 0_level_0,paper_id,title,abstract,abstract_summary
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
respiratory syncytial virus rsv is an important pathogen of the human respiratory tract borchers et al 2013 rsv infection results in viral bronchiolitis in around 30 of infants who become infected and it can result in lifethreatening severe bronchiolitis and viral pneumonia smyth and openshaw 2006 rsv causes significant mortality in the developing world resulting in an estimated 200000 annual deaths in young children globally in addition to major morbidity 338 million episodes worldwide annually nair et al 2010 besides rsv is a leading cause of morbidity and mortality in elderly and immunocompromised individuals kwon et al 2017,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
despite the prevalence of rsv bronchiolitis there is no vaccine available and apart from supportive measures there is no specific effective treatment since routine use of bronchodilators or antiviral ribavirin has been proven to be of no significant benefit tregoning and schwarze 2010 turner et al 2014 therefore a clearer understanding of host defense factors that contribute to effective protection against rsv infection and disease is urgently needed and could provide substantial help to the development of novel therapeutic strategies,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
during the early course of most viral infections antiviral immunity is induced through pattern recognition receptors such as tolllike receptors tlr which stimulate the innate immune response tlr can trigger cytokine secretion dendritic cell maturation and antigen presentation which in turn can enhance the adaptive immune response uematsu and akira 2006 because of this ability to induce innate and adaptive responses tlr agonists have been explored as antiviral therapeutic agents tlr3 tlr4 tlr7 tlr8 and tlr9 agonists have been successfully applied to nonhuman primate models of dengue virus and hepatitis b virus hbv and in murine models for influenza and herpes simplex virus hsv boivin et al 2008 zhang et al 2009 boukhvalova et al 2010 sariol et al 2011 shinya et al 2011 tuvim et al 2012 lanford et al 2013 lucifora et al 2018 to et al 2019 particularly in the mouse model of rsv prophylactic treatment with a tlr3 agonist not only reduces viral replication in the lungs but also results in an amelioration of the clinical illness and a reduction in lung inflammation guerreroplata et al 2005,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
in order to provide further insight into the potential of tlr agonists to induce antiviral and immunomodulatory activities in rsv infections the aim of the present study was to test the antirsv activity of different tlr agonists in epithelial cells moreover we extended the testing for imiquimod tlr7 agonist in order to analyze its antiviral mode of action against rsv in vitro as well as its effect on the production of different cytokines in rsvinfected epithelial and macrophages finally we studied the activity of imiquimod against rsv infection in a murine model of pulmonary infection,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
lps tlr4 ligand from escherichia coli serotype 055b5 n62phenylisopropyl adenosine rpia dibutyryl camp dbcamp and forskolin were obtained from sigma,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...


In [None]:
# Import 'covid_sent.csv' from Google Drive into Colab.

# Fetch the file by its Drive file id and save it locally.
downloaded = drive.CreateFile({'id':'1FK3-pUpETV6qViCWx5TOIfO7LFsuGp_Q'})
downloaded.GetContentFile('covid_sent.csv')  # local file name in Colab

In [None]:
# Load the paragraph -> paper_id table and make the header-less first column
# ("Unnamed: 0", the paragraph text per the output below) the index.
df_sentences = pd.read_csv("covid_sent.csv")
df_sentences = df_sentences.set_index("Unnamed: 0")
df_sentences.head()

Unnamed: 0_level_0,paper_id
Unnamed: 0,Unnamed: 1_level_1
respiratory syncytial virus rsv is an important pathogen of the human respiratory tract borchers et al 2013 rsv infection results in viral bronchiolitis in around 30 of infants who become infected and it can result in lifethreatening severe bronchiolitis and viral pneumonia smyth and openshaw 2006 rsv causes significant mortality in the developing world resulting in an estimated 200000 annual deaths in young children globally in addition to major morbidity 338 million episodes worldwide annually nair et al 2010 besides rsv is a leading cause of morbidity and mortality in elderly and immunocompromised individuals kwon et al 2017,33d14ff5ca29615978c7925403da22600922f9b4
despite the prevalence of rsv bronchiolitis there is no vaccine available and apart from supportive measures there is no specific effective treatment since routine use of bronchodilators or antiviral ribavirin has been proven to be of no significant benefit tregoning and schwarze 2010 turner et al 2014 therefore a clearer understanding of host defense factors that contribute to effective protection against rsv infection and disease is urgently needed and could provide substantial help to the development of novel therapeutic strategies,33d14ff5ca29615978c7925403da22600922f9b4
during the early course of most viral infections antiviral immunity is induced through pattern recognition receptors such as tolllike receptors tlr which stimulate the innate immune response tlr can trigger cytokine secretion dendritic cell maturation and antigen presentation which in turn can enhance the adaptive immune response uematsu and akira 2006 because of this ability to induce innate and adaptive responses tlr agonists have been explored as antiviral therapeutic agents tlr3 tlr4 tlr7 tlr8 and tlr9 agonists have been successfully applied to nonhuman primate models of dengue virus and hepatitis b virus hbv and in murine models for influenza and herpes simplex virus hsv boivin et al 2008 zhang et al 2009 boukhvalova et al 2010 sariol et al 2011 shinya et al 2011 tuvim et al 2012 lanford et al 2013 lucifora et al 2018 to et al 2019 particularly in the mouse model of rsv prophylactic treatment with a tlr3 agonist not only reduces viral replication in the lungs but also results in an amelioration of the clinical illness and a reduction in lung inflammation guerreroplata et al 2005,33d14ff5ca29615978c7925403da22600922f9b4
in order to provide further insight into the potential of tlr agonists to induce antiviral and immunomodulatory activities in rsv infections the aim of the present study was to test the antirsv activity of different tlr agonists in epithelial cells moreover we extended the testing for imiquimod tlr7 agonist in order to analyze its antiviral mode of action against rsv in vitro as well as its effect on the production of different cytokines in rsvinfected epithelial and macrophages finally we studied the activity of imiquimod against rsv infection in a murine model of pulmonary infection,33d14ff5ca29615978c7925403da22600922f9b4
lps tlr4 ligand from escherichia coli serotype 055b5 n62phenylisopropyl adenosine rpia dibutyryl camp dbcamp and forskolin were obtained from sigma,33d14ff5ca29615978c7925403da22600922f9b4


In [None]:
# Map paragraph text (key) -> paper_id (value). Dict keys are unique, so a
# paragraph text that occurs more than once is kept only once.
df_sentences = df_sentences['paper_id'].to_dict()
# List of the paragraph texts, in dict order.
df_sentences_list = list(df_sentences.keys())
len(df_sentences_list)

95335

In [None]:
# Make sure every paragraph is a str (index values read from the CSV may not
# all be strings — presumably missing/empty paragraphs; verify).
df_sentences_list = [str(x) for x in df_sentences_list]

In [None]:
df.head()

Unnamed: 0_level_0,paper_id,title,abstract,abstract_summary
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
respiratory syncytial virus rsv is an important pathogen of the human respiratory tract borchers et al 2013 rsv infection results in viral bronchiolitis in around 30 of infants who become infected and it can result in lifethreatening severe bronchiolitis and viral pneumonia smyth and openshaw 2006 rsv causes significant mortality in the developing world resulting in an estimated 200000 annual deaths in young children globally in addition to major morbidity 338 million episodes worldwide annually nair et al 2010 besides rsv is a leading cause of morbidity and mortality in elderly and immunocompromised individuals kwon et al 2017,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
despite the prevalence of rsv bronchiolitis there is no vaccine available and apart from supportive measures there is no specific effective treatment since routine use of bronchodilators or antiviral ribavirin has been proven to be of no significant benefit tregoning and schwarze 2010 turner et al 2014 therefore a clearer understanding of host defense factors that contribute to effective protection against rsv infection and disease is urgently needed and could provide substantial help to the development of novel therapeutic strategies,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
during the early course of most viral infections antiviral immunity is induced through pattern recognition receptors such as tolllike receptors tlr which stimulate the innate immune response tlr can trigger cytokine secretion dendritic cell maturation and antigen presentation which in turn can enhance the adaptive immune response uematsu and akira 2006 because of this ability to induce innate and adaptive responses tlr agonists have been explored as antiviral therapeutic agents tlr3 tlr4 tlr7 tlr8 and tlr9 agonists have been successfully applied to nonhuman primate models of dengue virus and hepatitis b virus hbv and in murine models for influenza and herpes simplex virus hsv boivin et al 2008 zhang et al 2009 boukhvalova et al 2010 sariol et al 2011 shinya et al 2011 tuvim et al 2012 lanford et al 2013 lucifora et al 2018 to et al 2019 particularly in the mouse model of rsv prophylactic treatment with a tlr3 agonist not only reduces viral replication in the lungs but also results in an amelioration of the clinical illness and a reduction in lung inflammation guerreroplata et al 2005,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
in order to provide further insight into the potential of tlr agonists to induce antiviral and immunomodulatory activities in rsv infections the aim of the present study was to test the antirsv activity of different tlr agonists in epithelial cells moreover we extended the testing for imiquimod tlr7 agonist in order to analyze its antiviral mode of action against rsv in vitro as well as its effect on the production of different cytokines in rsvinfected epithelial and macrophages finally we studied the activity of imiquimod against rsv infection in a murine model of pulmonary infection,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...
lps tlr4 ligand from escherichia coli serotype 055b5 n62phenylisopropyl adenosine rpia dibutyryl camp dbcamp and forskolin were obtained from sigma,33d14ff5ca29615978c7925403da22600922f9b4,Imiquimod suppresses respiratory syncytial<br...,abstract respiratory syncytial virus rsv is a ...,Abstract Respiratory syncytial virus (RSV) is...


## BERT

In [None]:
from sentence_transformers import SentenceTransformer
import pickle as pkl  # required by pkl.load below
import scipy.spatial
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Query sentences:
queries = ['What has been published about medical care?',
           'Knowledge of the frequency, manifestations, and course of extrapulmonary manifestations of COVID-19, including, but not limited to, possible cardiomyopathy and cardiac arrest',
           'Use of AI in real-time health care delivery to evaluate interventions, risk factors, and outcomes in a way that could not be done manually',
           'Resources to support skilled nursing facilities and long term care facilities.',
           'Mobilization of surge medical staff to address shortages in overwhelmed communities .',
           'Age-adjusted mortality data for Acute Respiratory Distress Syndrome (ARDS) with/without other organ failure – particularly for viral etiologies .']
# Embed every query with the same model used for the corpus.
query_embeddings = embedder.encode(queries,show_progress_bar=True)


## corpus embedding

# The corpus embeddings are loaded from a pickle instead of being recomputed.
# To recompute them, run:
# corpus = df_sentences_list
# corpus_embeddings = embedder.encode(corpus, show_progress_bar=True)
from google.colab import drive
drive.mount('drive')
dr = "drive/My Drive/datasets/"
# SECURITY NOTE: unpickling executes code stored in the file; only load a
# corpus_embeddings.pkl that you created yourself.
with open(dr + "corpus_embeddings.pkl" , "rb") as file_:
  corpus_embeddings = pkl.load(file_)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=1.0, style=ProgressStyle(description_width=…




In [None]:
# Find the closest_n paragraphs in the corpus for every query and print the
# metadata of the paper each one belongs to.

closest_n = 5
# The paragraph texts the embeddings refer to. Row i of corpus_embeddings is
# assumed to be the embedding of df_sentences_list[i] — TODO confirm against
# how the pickle was produced.
corpus = df_sentences_list
print(f"\n Top {closest_n} similar senntences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus paragraph.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Positions of the closest_n smallest distances; a stable sort keeps the
    # corpus order among equal distances.
    nearest = distances.argsort(kind="stable")[:closest_n]
    print("\n\n=========================================================")
    print("==========================Query==============================")
    print("===",query,"=====")
    print("=========================================================")
    for rank, idx in enumerate(nearest, start=1):
        paragraph = corpus[idx]
        distance = distances[idx]
        # Similarity score = 1 - cosine distance.
        print(rank, ". Score :   ", "(Score : %.4f)"%(1-distance), "\n")
        # Rows of df whose index (the paragraph text) equals this paragraph.
        row_dict = df.loc[df.index == paragraph].to_dict()
        print("paper_id:  " , row_dict["paper_id"][paragraph] , "\n")
        print("Title:  " , row_dict["title"][paragraph] , "\n")
        print("Abstract:  " , row_dict["abstract"][paragraph] , "\n")
        print("Abstract_Summary:  " , row_dict["abstract_summary"][paragraph] , "\n")
        print("-------------------------------------------")


 Top 5 similar senntences in corpus:


=== What has been published about medical care? =====
1 . Score :    (Score : 0.7647) 

paper_id:   795b422b30014b724a0b505606cf02272ac0abff 

Title:    A Phase 3 Open-label, Randomized, Controlled<br>Study to Evaluate the Efficacy and Safety of<br>Intravenously Administered Ravulizumab Compared with Best<br>Supportive Care in Patients with COVID-19 Severe<br>Pneumonia, Acute Lung Injury, or Acute Respiratory<br>Distress Syndrome: A structured summary of a study<br>protocol for a randomised controlled trial 

Abstract:   objectives primary objective  to evaluate the effect of ravulizumab a longacting complement c5 inhibitor plus best supportive care bsc compared with bsc alone on the survival of patients with covid19 secondary objectives  number of days free of mechanical ventilation at day 29  duration of intensive care unit stay at day 29  change from baseline in sequential organ failure assessment sofa score at day 29  change from baseline in 

In [None]:
## to send the pickle file to drive

# from google.colab import drive
# drive.mount('drive')
# dr = "drive/My Drive/datasets/"
# import pickle as pkl
# with open(dr+"corpus_embeddings.pkl" , "wb") as file_:
#  pkl.dump(corpus_embeddings,file_)

RangeIndex(start=0, stop=96216, step=1)