<a href="https://colab.research.google.com/github/antonpolishko/colab-notebooks-sink/blob/master/task-ties/Covid_19_Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

taks-ties

https://trello.com/c/4NGSHn76



In [0]:
!pip install whoosh
import pandas as pd
import numpy as np
import os
from pathlib import Path, PurePath
from ipywidgets import interact
import ipywidgets as widgets
from collections import defaultdict
import requests
from requests.exceptions import HTTPError, ConnectionError

Collecting whoosh
[?25l  Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)
[K     |▊                               | 10kB 17.3MB/s eta 0:00:01[K     |█▍                              | 20kB 1.7MB/s eta 0:00:01[K     |██                              | 30kB 2.5MB/s eta 0:00:01[K     |██▉                             | 40kB 1.7MB/s eta 0:00:01[K     |███▌                            | 51kB 2.1MB/s eta 0:00:01[K     |████▏                           | 61kB 2.5MB/s eta 0:00:01[K     |█████                           | 71kB 2.9MB/s eta 0:00:01[K     |█████▋                          | 81kB 3.3MB/s eta 0:00:01[K     |██████▎                         | 92kB 3.6MB/s eta 0:00:01[K     |███████                         | 102kB 2.8MB/s eta 0:00:01[K     |███████▊                        | 112kB 2.8MB/s eta 0:00:01[K     |████████▍                       | 122kB 2.8MB/s eta 

In [0]:
#set data paths
from google.colab import drive
drive.mount('/content/drive')

drive_path=PurePath('/content/drive/My Drive')

input_dir = drive_path/'COVID-19'/'CORD-19-research-challenge-v5'

list(Path(input_dir).glob('*'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/comm_use_subset'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/noncomm_use_subset'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/biorxiv_medrxiv'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/custom_license'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/json_schema.txt'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/COVID.DATA.LIC.AGMT.pdf'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/metadata.readme'),
 PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5/metadata.csv')]

In [0]:
def set_column_width(ColumnWidth, MaxRows):
    pd.options.display.max_colwidth = ColumnWidth
    pd.options.display.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

interactive(children=(IntSlider(value=200, description='ColumnWidth', max=400, min=50, step=50), IntSlider(val…

In [0]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)

# Convert the doi to a url
def doi_url(d): 
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'

In [0]:
#read the metadata file into df
metadata_path = input_dir / 'metadata.csv'
metadata = pd.read_csv(metadata_path,
                               dtype={'publish_time': str, #to do: extract year from 'pubblish time' as int
                                      'authors':str,
                                      'title': str,
                                      'abstract':str,
                                      'doi': str},
                       parse_dates = ['publish_time']
                       )

#set the abstract to the paper title if it is null
metadata['abstract'] = metadata['abstract'] .fillna(metadata['title'])

#extract year from datetime
metadata['publish_year'] = pd.DatetimeIndex(metadata['publish_time']).year
metadata['publish_year'] = metadata['publish_year'].fillna(9999)

In [0]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45774 entries, 0 to 45773
Data columns (total 18 columns):
cord_uid                       45774 non-null object
sha                            31753 non-null object
source_x                       45774 non-null object
title                          45617 non-null object
doi                            42440 non-null object
pmcid                          26243 non-null object
pubmed_id                      34641 non-null float64
license                        45774 non-null object
abstract                       45766 non-null object
publish_time                   45765 non-null datetime64[ns]
authors                        43774 non-null object
journal                        41707 non-null object
Microsoft Academic Paper ID    964 non-null float64
WHO #Covidence                 1768 non-null object
has_full_text                  45774 non-null bool
full_text_file                 35558 non-null object
url                            45472 n

In [0]:
import whoosh
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC, NGRAMWORDS
from whoosh.analysis import StemmingAnalyzer,StandardAnalyzer, NgramFilter
from whoosh import index

In [0]:
#create schema for the index
schema = Schema(year=NUMERIC(stored=True),
                author=TEXT(stored=True),
                title=TEXT(analyzer=StandardAnalyzer(),stored=True),
                abstract=TEXT(analyzer=StandardAnalyzer(),stored=True),
                doi=TEXT(stored=True))

In [0]:
#to create an index in a dictionary (only need to run once)
if not os.path.exists(drive_path/'indexdir'):
    os.mkdir(drive_path/'indexdir')
ix = index.create_in(drive_path/'indexdir', schema)
#open an existing index object
ix = index.open_dir(drive_path/'indexdir')
#create a writer object to add documents to the index
writer = ix.writer()
#now we can add documents to the index
# ref_id = metadata['ref_id']
year = metadata['publish_year']
author = metadata['authors']
title = metadata['title']
abstract = metadata['abstract']
doi = metadata['doi']

for YEAR, AUTHOR, TITLE, ABSTRACT, DOI in zip(year,author,title,abstract,doi):

  writer.add_document(
                  year= YEAR,
                  author=str(AUTHOR),
                  title=str(TITLE),
                  abstract=str(ABSTRACT),
                  doi=str(DOI))

#close the writer and save the added documents in the index
#you should call the commit() function once you finish adding the documents otherwise you will cause an error-
#when you try to edit the index next time and open another writer. 
writer.commit()


In [0]:
# need to cancel writer if error or need to reset
writer.cancel()

In [0]:
ix = index.open_dir(drive_path/'indexdir')

In [0]:
# search in 'title' and 'abstract' fields
parser = MultifieldParser(["title", "abstract"], schema=schema)

# use quotation for phrases or words with hyphens e.g., "risk factors", "case fatality", "covid-19"
result=parser.parse('"covid-19" AND incubation') # use boolean operators in quotation
print(result)
#searcher object is used for searching the matched documents
#you can open the searcher using a with statement so the searcher is automatically closed when you’re done with it
#ix is the document index we created before
with ix.searcher() as searcher:
    results=searcher.search(result)#The Results object acts like a list of the matched documents
    print('Total Hits: {}\n'.format(len(results)))
    output_dict = defaultdict(list)
    for result in results:
      output_dict['title'].append(result['title'])
      output_dict['abstract'].append(result['abstract'])
      output_dict['publish_year'].append(result['year'])
      output_dict['authors'].append(result['author'])
      output_dict['doi'].append(result['doi'])

output_df = pd.DataFrame(output_dict)
output_df['doi'] = output_df['doi'].apply(lambda x: doi_url(x) if x !='nan' else x)  

((title:"covid 19" OR abstract:"covid 19") AND (title:incubation OR abstract:incubation))
Total Hits: 78



In [0]:
#GtLtPlugin() lets you use >, <, >=, <=, =>, or =< after a field specifier, 
#and translates the expression into the equivalent range:
parser.add_plugin(GtLtPlugin())
#Adds the ability to group arbitrary queries inside double quotes,
#to produce a query matching the individual sub-queries in sequence.
parser.add_plugin(SequencePlugin())
#IMPORTANT!!! Not like phrase query which specify the field outside the double quotation marks,
#you need to specify the field inside the double quotation marks for each subquery
#the query string below represents the query 'abstract:"(child OR childr*) ho*sehold"~3 AND title:tales'  

result=parser.parse('abstract:(sars OR "sars-cov-2" OR coronavirus* OR ncov OR "covid-19" OR mers OR "mers-cov") \
NOT abstract:(animal OR equine OR porcine OR calves OR dog*) \
AND abstract:incubation \
AND abstract:("symptom onset" OR characteristics OR exposure)\
AND year:>=2002') #sars outbreak
print ('Search Query: {}\n'.format(result))
with ix.searcher() as searcher:
    results=searcher.search(result, limit = None)#The Results object acts like a list of the matched documents.)
    print('Total Hits: {}\n'.format(len(results)))
    output_dict = defaultdict(list)
    for result in results:
      output_dict['title'].append(result['title'])
      output_dict['abstract'].append(result['abstract'])
      output_dict['publish_year'].append(result['year'])
      output_dict['authors'].append(result['author'])
      output_dict['doi'].append(result['doi'])
output_df = pd.DataFrame(output_dict)
output_df['publish_year'] = output_df['publish_year'].astype(int)
output_df['doi'] = output_df['doi'].apply(lambda x: doi_url(x) if x !='nan' else x)


Search Query: ((abstract:sars OR abstract:"sars cov" OR abstract:coronavirus* OR abstract:ncov OR abstract:"covid 19" OR abstract:mers OR abstract:"mers cov") AND NOT (abstract:animal OR abstract:equine OR abstract:porcine OR abstract:calves OR abstract:dog*) AND abstract:incubation AND (abstract:"symptom onset" OR abstract:characteristics OR abstract:exposure) AND year:[2002 TO ])

Total Hits: 71



In [0]:
output_df

Unnamed: 0,title,abstract,publish_year,authors,doi
0,The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application,"BACKGROUND: A novel human coronavirus, severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), was identified in China in December 2019. There is limited support for many of its key epidemio...",2020,"Lauer, Stephen A.; Grantz, Kyra H.; Bi, Qifang; Jones, Forrest K.; Zheng, Qulu; Meredith, Hannah R.; Azman, Andrew S.; Reich, Nicholas G.; Lessler, Justin",http://doi.org/10.7326/m20-0504
1,Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole,"Background: The ongoing outbreak of novel corona virus disease 2019 (COVID-19) in Wuhan, China, is arousing international concern. This study evaluated whether and when the infected but asymptomat...",2020,Wei Xia; Jiaqiang Liao; Chunhui Li; Yuanyuan Li; Xi Qian; Xiaojie Sun; Hongbo Xu; Gaga Mahai; Xin Zhao; Lisha Shi; Juan Liu; Ling Yu; Meng Wang; Qianqian Wang; Asmagvl Namat; Ying Li; Jingyu Qu; Q...,http://doi.org/10.1101/2020.03.06.20031955
2,"Epidemiological and clinical features of COVID-19 patients with and without pneumonia in Beijing, China","Background:SARS-CoV-2-caused coronavirus disease (COVID-19) is posinga large casualty. The features of COVID-19patients withand without pneumonia,SARS-CoV-2 transmissibility in asymptomatic carrie...",2020,Penghui Yang; Yibo Ding; Zhe Xu; Rui Pu; Ping Li; Jin Yan; Jiluo Liu; Fanping Meng; Lei Huang; Lei Shi; Tianjun Jiang; Enqiang Qin; Min Zhao; Dawei Zhang; Peng Zhao; Lingxiang Yu; Zhaohai Wang; Zh...,http://doi.org/10.1101/2020.02.28.20028068
3,The outbreak of SARS-CoV-2 pneumonia calls for viral vaccines,"The outbreak of 2019-novel coronavirus disease (COVID-19) that is caused by SARS-CoV-2 has spread rapidly in China, and has developed to be a Public Health Emergency of International Concern. Howe...",2020,"Shang, Weilong; Yang, Yi; Rao, Yifan; Rao, Xiancai",
4,MERS-CoV outbreak following a single patient exposure in an emergency room in South Korea: an epidemiological outbreak study,"Summary Background In 2015, a large outbreak of Middle East respiratory syndrome coronavirus (MERS-CoV) infection occurred following a single patient exposure in an emergency room at the Samsung M...",2016,"Cho, Sun Young; Kang, Ji-Man; Ha, Young Eun; Park, Ga Eun; Lee, Ji Yeon; Ko, Jae-Hoon; Lee, Ji Yong; Kim, Jong Min; Kang, Cheol-In; Jo, Ik Joon; Ryu, Jae Geum; Choi, Jong Rim; Kim, Seonwoo; Huh, H...",http://doi.org/10.1016/s0140-6736(16)30623-7
...,...,...,...,...,...
66,Identification and containment of an outbreak of SARS in a community hospital,"BACKGROUND: Severe acute respiratory syndrome (SARS) is continuing to spread around the world. All hospitals must be prepared to care for patients with SARS. Thus, it is important to understand th...",2003,"Dwosh, Hy A.; Hong, Harry H.L.; Austgarden, Douglas; Herman, Stanley; Schabas, Richard",
67,US Federal Travel Restrictions for Persons with Higher-Risk Exposures to Communicable Diseases of Public Health Concern,Published guidance recommends controlled movement for persons with higher-risk exposures (HREs) to communicable diseases of public health concern; US federal public health travel restrictions (PHT...,2017,"Vonnahme, Laura A.; Jungerman, M. Robynne; Gulati, Reena K.; Illig, Petra; Alvarado-Ramy, Francisco",http://doi.org/10.3201/eid2313.170386
68,Analysis of family cluster infection with Novel Coronavirus Pneumonia,"Objective To explore the regularity and characteristics of the transmission of Novel Coronavirus Pneumonia(NCP) in crowd, for provide a reference for pre-hospital first aid to identify and screen ...",2020,"ZHANG, Wei; TIAN, Sijia; WANG, Ying; CHEN, Hui; ZHANG, Jinjun",
69,Clinical features of 2019 novel coronavirus pneumonia in the early stage from a fever clinic in Beijing,Objective: To summarize and analyze the clinical and imaging characteristics of patients with 2019 novel coronavirus pneumonia in the early stage in Beijing. Methods: A retrospective analysis of c...,2020,"Zhang, M. Q.; Wang, X. H.; Chen, Y. L.; Zhao, K. L.; Cai, Y. Q.; An, C. L.; Lin, M. G.; Mu, X. D.",http://doi.org/10.3760/cma.j.issn.1001-0939.2020.0013


In [0]:
output_df.to_csv(drive_path/'COVID-19'/'TestSearchResults_Incubation_v5.csv', index = False)

In [0]:
result=parser.parse('abstract:(sars OR "sars-cov-2" OR coronavirus* OR ncov OR "covid-19" OR mers OR "mers-cov")\
AND abstract:(predictors OR "risk factors" OR characteristics) \
NOT abstract:(animal OR equine OR porcine OR calves OR dog*) \
OR title:(sars OR "sars-cov-2" OR coronavirus* OR ncov OR "covid-19") \
OR title:(predictors OR "riskfactors" OR characteristics)')
print (result)
with ix.searcher() as searcher:
    results=searcher.search(result, limit = None)#The Results object acts like a list of the matched documents.)
    print('Total Hits: {}\n'.format(len(results)))
    output_dict = defaultdict(list)
    for result in results:
      output_dict['title'].append(result['title'])
      output_dict['abstract'].append(result['abstract'])
      output_dict['publish_time'].append(result['date'])
      output_dict['authors'].append(result['author'])
      output_dict['doi'].append(result['doi'])
pd.DataFrame(output_dict)  

((abstract:sars OR abstract:"sars cov" OR abstract:coronavirus* OR abstract:ncov OR abstract:"covid 19" OR abstract:mers OR abstract:"mers cov") AND (abstract:predictors OR abstract:"risk factors" OR abstract:characteristics) AND (NOT (abstract:animal OR abstract:equine OR abstract:porcine OR abstract:calves OR abstract:dog*) OR title:sars OR title:"sars cov" OR title:coronavirus* OR title:ncov OR title:"covid 19" OR title:predictors OR title:riskfactors OR title:characteristics))
Total Hits: 644



Unnamed: 0,title,abstract,publish_time,authors,doi
0,"Novel coronavirus 2019-nCoV: prevalence, biological and clinical characteristics comparison with SARS-CoV and MERS-CoV",OBJECTIVE: Human infections with zoonotic coronavirus contain emerging and reemerging pathogenic characteristics which have raised great public health concern. This study aimed at investigating th...,2020,"Meo, S. A.; Alhowikan, A. M.; Al-Khlaiwi, T.; Meo, I. M.; Halepoto, D. M.; Iqbal, M.; Usmani, A. M.; Hajjar, W.; Ahmed, N.",10.26355/eurrev_202002_20379
1,"Bat-to-human: spike features determining ‘host jump’ of coronaviruses SARS-CoV, MERS-CoV, and beyond",Both severe acute respiratory syndrome coronavirus (SARS-CoV) and Middle East respiratory syndrome coronavirus (MERS-CoV) are zoonotic pathogens that crossed the species barriers to infect humans....,2015-08-31,"Lu, Guangwen; Wang, Qihui; Gao, George F.",10.1016/j.tim.2015.06.003
2,"The origin, transmission and clinical therapies on coronavirus disease 2019 (COVID-19) outbreak – an update on the status","An acute respiratory disease, caused by a novel coronavirus (SARS-CoV-2, previously known as 2019-nCoV), the coronavirus disease 2019 (COVID-19) has spread throughout China and received worldwide ...",2020-03-13,"Guo, Yan-Rong; Cao, Qing-Dong; Hong, Zhong-Si; Tan, Yuan-Yang; Chen, Shou-Deng; Jin, Hong-Jun; Tan, Kai-Sen; Wang, De-Yun; Yan, Yan",10.1186/s40779-020-00240-0
3,Liver injury during highly pathogenic human coronavirus infections,"The severe acute respiratory syndrome coronavirus 2 (SARS-Cov-2), the pathogen of 2019 novel coronavirus disease (COVID-19), has posed a serious threat to global public health. The WHO has declare...",2020-03-14,"Xu, Ling; Liu, Jia; Lu, Mengji; Yang, Dongliang; Zheng, Xin",10.1111/liv.14435
4,Imaging and clinical features of patients with 2019 novel coronavirus SARS-CoV-2,"The pneumonia caused by the 2019 novel coronavirus (SARS-CoV-2, also called 2019-nCoV) recently break out in Wuhan, China, and was named as COVID-19. With the spread of the disease, similar cases ...",2020-02-28,"Xu, Xi; Yu, Chengcheng; Qu, Jing; Zhang, Lieguang; Jiang, Songfeng; Huang, Deyang; Chen, Bihua; Zhang, Zhiping; Guan, Wanhua; Ling, Zhoukun; Jiang, Rui; Hu, Tianli; Ding, Yan; Lin, Lin; Gan, Qingx...",10.1007/s00259-020-04735-9
...,...,...,...,...,...
639,The Role of Viral Infection in Pulmonary Exacerbations of Bronchiectasis in Adults A Prospective Study,"BACKGROUND Although viral infections are a major cause of exacerbations in patients with chronic airway diseases, their roles in triggering bronchiectasis exacerbations in adults remain unclear. T...",2015-06-30,"Gao, Yong-hua; Guan, Wei-jie; Xu, Gang; Lin, Zhi-ya; Tang, Yan; Lin, Zhi-min; Gao, Yang; Li, Hui-min; Zhong, Nan-shan; Zhang, Guo-jun; Chen, Rong-chang",10.1378/chest.14-1961
640,Viral infection in community acquired pneumonia patients with fever: a prospective observational study,"BACKGROUND: Patients with community acquired pneumonia (CAP) caused by viruses can develop severe complications, which result in hospitalization and death. The purpose of this study was to analyse...",,"Tao, Ru-Jia; Luo, Xiao-Li; Xu, Wen; Mao, Bei; Dai, Ruo-Xuan; Li, Cheng-Wei; Yu, Li; Gu, Fen; Liang, Shuo; Lu, Hai-Wen; Chen, Ke-Bin; Bai, Jiu-Wu; Ji, Xiao-Bin; Gu, Shu-Yi; Sun, Xiao-Li; Dai, Fa-Hu...",10.21037/jtd.2018.06.33
641,Prevalence and correlates of influenza vaccination among non-institutionalized elderly people: An exploratory cross-sectional survey,Abstract Background Worldwide pandemics of influenza virus caused extensive morbidity and mortality around the world and influenza vaccination is the most effective method for preventing influenza...,2009-06-30,"Lau, Lam; Lau, Ying; Lau, Ying Hon",10.1016/j.ijnurstu.2008.12.006
642,"724. Neurologic Complications in Hospitalized Pediatric Patients with Influenza Infection, A Multicenter Retrospective Study in Korea",BACKGROUND: The aim of the study was to evaluate the incidence and characteristics of influenza associated neurologic complications (IANCs) in hospitalized pediatric patients in Korea. METHODS: We...,2018 Nov 26,"Choi, Gwang-Jun; Park, Ji Young; Choi, Joon-Sik; Kim, Bitna; Choi, Sae Rom; Kim, Dong Sub; Kang, Ji-Man; Lee, Jun Wha; Woo, Young-Jong; Lee, Jeehun; Kim, Yae-Jean",10.1093/ofid/ofy210.731


# References

Whoosh tutorial

https://drive.google.com/file/d/1fw7yK3orYw4kWFaO0WISZQy-P4WD8QHG/view?usp=sharing 


Browsing research papers with a BM25 search engine

https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine