# COVID-19 Open Research Dataset Challenge

https://www.youtube.com/watch?v=S6GVXk6kbcs

##### Import Libraries

In [6]:
!pip install rank_bm25 nltk

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Building wheels for collected packages: rank-bm25
  Building wheel for rank-bm25 (setup.py) ... [?25ldone
[?25h  Created wheel for rank-bm25: filename=rank_bm25-0.2-cp37-none-any.whl size=4163 sha256=2657ddad49320196843f207152609bd14baef5631c0deade72f1bf844333f1cd
  Stored in directory: /Users/alderik/Library/Caches/pip/wheels/6f/0c/1f/78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
Successfully built rank-bm25
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2


In [65]:
import heapq
import json
import os
import re

import ipywidgets as widgets
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from ipywidgets import interact
from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi
from requests.exceptions import HTTPError, ConnectionError
from tqdm import tqdm

In [1]:
#https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine?scriptVersionId=31027514

from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    """Apply the chosen pandas display limits and report the new settings.

    ColumnWidth -- value stored in the ``display.max_colwidth`` option.
    MaxRows     -- value stored in the ``display.max_rows`` option.
    """
    display_settings = (
        ('display.max_colwidth', ColumnWidth),
        ('display.max_rows', MaxRows),
    )
    for option_name, option_value in display_settings:
        pd.set_option(option_name, option_value)
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
# Build two integer sliders and hand their values to set_column_width.
# The trailing semicolon keeps the cell from echoing interact's return value.
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

interactive(children=(IntSlider(value=200, description='ColumnWidth', max=400, min=50, step=50), IntSlider(val…

##### Import Data

In [87]:
# Import metadata as one pipeline: read, filter, rename, trim.
metadata = (
    pd.read_csv("metadata.csv",
                dtype={'Microsoft Academic Paper ID': str, 'pubmed_id': str})
    # rows without a sha cannot be joined on paper_id later
    .dropna(subset=['sha'])
    .rename(columns={"sha": "paper_id", "source_x": "source"})
    # title and abstract are dropped here; the parsed JSON papers supply their own
    .drop(columns=['title', 'abstract'])
)

In [88]:
# Turn every DOI into a resolvable URL in a single vectorised pass. The previous
# loop rescanned the whole column once per row, which is quadratic in row count.
# NOTE: doi_url must already be defined in the kernel when this cell runs.
doi_as_text = metadata['doi'].astype(str)
# Missing DOIs stay NaN, and values that are already URLs are skipped so that
# re-running the cell does not prefix them a second time.
needs_url = metadata['doi'].notna() & ~doi_as_text.str.startswith('http')
metadata.loc[needs_url, 'doi'] = doi_as_text[needs_url].map(doi_url)

In [9]:
#import text from papers json objects
#https://www.youtube.com/watch?v=S6GVXk6kbcs
dirs = ['biorxiV_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset']

# One [paper_id, title, abstract, fulltext] record per JSON file.
docs = []
for d in dirs:
    print(d)
    for file in tqdm(os.listdir(f"{d}/{d}")):
        filepath = f"{d}/{d}/{file}"
        # The context manager closes each handle; a bare open() leaked one
        # file descriptor per paper.
        with open(filepath, 'rb') as fh:
            j = json.load(fh)
        title = j['metadata']['title']
        paper_id = j['paper_id']
        try:
            abstract = j['abstract'][0]['text']
        except (KeyError, IndexError, TypeError):
            # Missing or empty abstract section: fall back to an empty string
            # without hiding unrelated errors the way a bare except would.
            abstract = ''

        # join() builds the body in one pass instead of repeated concatenation.
        fulltext = ''.join(text['text'] for text in j['body_text'])
        docs.append([paper_id, title, abstract, fulltext])


  5%|▍         | 43/885 [00:00<00:01, 427.76it/s]

biorxiV_medrxiv


100%|██████████| 885/885 [00:01<00:00, 718.70it/s]
  1%|          | 60/9118 [00:00<00:15, 592.93it/s]

comm_use_subset


100%|██████████| 9118/9118 [00:16<00:00, 553.10it/s]
  0%|          | 0/16959 [00:00<?, ?it/s]

custom_license


100%|██████████| 16959/16959 [00:29<00:00, 568.21it/s]
  3%|▎         | 74/2353 [00:00<00:03, 736.15it/s]

noncomm_use_subset


100%|██████████| 2353/2353 [00:03<00:00, 668.94it/s]


In [95]:
# One row per entry of docs; the column order mirrors the order in which each
# record was appended to that list.
df = pd.DataFrame(docs, columns = ['paper_id','title', 'abstract', 'fulltext'])

In [96]:
# Join the parsed paper text with the metadata rows sharing the same paper_id
allpapers_df = df.merge(metadata, on="paper_id")
# After the string cast, a missing journal reads as the literal text 'nan'
allpapers_df['journal'] = allpapers_df['journal'].astype(str)
# A paper counts as peer reviewed when it names a journal; the ones without a
# journal are basically anything from the bioRxiv/medRxiv subset
peer_reviewed = allpapers_df['journal'].ne('nan')
allpapers_df.insert(loc=12, column="peer_reviewed", value=peer_reviewed, allow_duplicates=True)

In [97]:
# Split the corpus on whether a journal is recorded ('nan' marks a missing one)
has_journal = allpapers_df['journal'].ne('nan')
# papers that name a journal
journals_df = allpapers_df[has_journal]
# papers without a journal, treated as unpublished
unpublished_df = allpapers_df[~has_journal]

In [98]:
# Report three sizes: all merged papers, the papers that name a journal, and the
# papers without one. Note that the second figure counts papers, not distinct journals.
print(f'Total number of papers including journals {len(allpapers_df)} \n\nTotal number of journals {len(journals_df)} \n\nNumber of unpublsihed papers {len(unpublished_df)}')

Total number of papers including journals 27690 

Total number of journals 26796 

Number of unpublsihed papers 894


##### Python Object Oriented Programming
https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine?scriptVersionId=31027514

In [90]:
def get(url, timeout=6):
    """Fetch ``url`` and return the response body as text.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : int or float, default 6
        Passed straight to ``requests.get``.

    Returns
    -------
    str or None
        The response text, or ``None`` after printing a message when the
        connection fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # requests raises HTTPError only on request; without this call the
        # handler below could never run and error pages were returned as text.
        r.raise_for_status()
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError as err:
        # Read the response from the exception: the attribute is status_code
        # (there is no .status), and err.response is always bound here.
        response = err.response
        print('Got http error',
              getattr(response, 'status_code', None),
              getattr(response, 'text', ''))
    return None
        
# Convert the doi to a url
def doi_url(d):
    """Return a resolvable URL for the DOI string ``d``.

    * a value that is already an absolute http(s) URL is returned unchanged, so
      the conversion can be applied twice without producing
      ``http://doi.org/http://...``;
    * a value starting with ``doi.org`` only gets the scheme prepended;
    * anything else is treated as a bare DOI and placed under ``http://doi.org/``.
    """
    if d.startswith(('http://', 'https://')):
        return d
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'

class ResearchPapers:
    """Thin wrapper around a DataFrame holding one row per paper."""

    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.metadata)

    def __getitem__(self, item):
        """Wrap the row at position ``item`` (positional, via iloc) in a Paper."""
        selected_row = self.metadata.iloc[item]
        return Paper(selected_row)

    def _rewrap(self, frame):
        """Copy ``frame``, renumber its index from zero and wrap the result."""
        renumbered = frame.copy().reset_index(drop=True)
        return ResearchPapers(renumbered)

    def head(self, n):
        """First ``n`` papers as a new ResearchPapers."""
        return self._rewrap(self.metadata.head(n))

    def tail(self, n):
        """Last ``n`` papers as a new ResearchPapers."""
        return self._rewrap(self.metadata.tail(n))

    def abstracts(self):
        """The ``abstract`` column."""
        return self.metadata.abstract

    def titles(self):
        """The ``title`` column with missing values removed."""
        all_titles = self.metadata.title
        return all_titles.dropna()

    def texts(self):
        """The ``fulltext`` column."""
        return self.metadata.fulltext

    def _repr_html_(self):
        """Delegate notebook rendering to the wrapped frame."""
        return self.metadata._repr_html_()
    
    
class Paper:
    '''
    A single research paper
    '''

    def __init__(self, item):
        # Store the row as a one-column frame ('Value') indexed by field name,
        # with missing entries replaced by empty strings.
        as_frame = item.to_frame().fillna('')
        as_frame.columns = ['Value']
        self.paper = as_frame

    def _field(self, name):
        '''
        Value stored under the row label ``name``
        '''
        return self.paper.loc[name].values[0]

    def text(self):
        return self._field('fulltext')

    def abstract(self):
        return self._field('abstract')

    def title(self):
        return self._field('title')

    def doi(self):
        return self._field('doi')

    def peer_reviewed(self):
        return self._field('peer_reviewed')

    def journal(self):
        return self._field('journal')

    def authors(self, split=False):
        '''
        Authors of the paper.

        Returns an empty list when the field is empty, the stored value as-is
        when ``split`` is False, and otherwise a list of names.
        '''
        raw_authors = self._field('authors')
        if not raw_authors:
            return []
        if not split:
            return raw_authors
        if raw_authors.startswith('['):
            # list-like text such as "['A', 'B']": drop brackets, then quotes
            unbracketed = raw_authors.lstrip('[').rstrip(']')
            return [name.strip().replace("\'", "") for name in unbracketed.split("\',")]

        # Todo: Handle cases where author names are separated by ","
        return [name.strip() for name in raw_authors.split(';')]

    def _repr_html_(self):
        return self.paper._repr_html_()
    
# Wrap the merged text + metadata frame so individual rows can be read as Paper objects
papers = ResearchPapers(allpapers_df)

#### BM25 Query
https://pypi.org/project/rank-bm25/  
http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf

In [122]:
# Reach the corpus through the nltk package instead of the bare name
# `stopwords`: another cell in this notebook rebinds `stopwords` to a plain
# list, after which `stopwords.words(...)` fails if this cell is re-run.
english_stopwords = list(set(nltk.corpus.stopwords.words('english')))

def strip_characters(text):
    """Remove punctuation from ``text`` and turn slashes into spaces.

    Deleted outright: ``( ) : , ; . ? % > <``, the curly quotes ``’ ” “`` and
    the straight apostrophe. Each ``/`` becomes a single space so that the
    words on either side stay separate.

    The pattern is a raw-string character class; the earlier non-raw
    alternation relied on invalid escape sequences such as ``'\\('``, which
    newer Python versions warn about.
    """
    t = re.sub(r'[():,;.’”“?%><]', '', text)
    t = t.replace('/', ' ')
    return t.replace("'", '')

def clean(text):
    """Lower-case ``text``, then strip punctuation with strip_characters."""
    return strip_characters(text.lower())

def tokenize(text):
    """Split ``text`` into a list of distinct search terms.

    A token is kept when it is longer than one character, is not an English
    stop word, and passes the two numeric filters below. Duplicates are
    removed, so the order of the returned list is not meaningful.
    """
    # One set per call gives O(1) membership tests instead of scanning the
    # stop-word list for every token.
    stop_words = set(english_stopwords)
    words = nltk.word_tokenize(text)
    return list({word for word in words
                 if len(word) > 1
                 and word not in stop_words
                 # `!=` compares values; the earlier `is not 4` compared object
                 # identity with an int literal and raises a SyntaxWarning.
                 and not (word.isnumeric() and len(word) != 4)
                 # NOTE(review): this clause also rejects ordinary digit
                 # strings, so 4-digit tokens such as years are dropped despite
                 # the exception above -- confirm whether that is intended.
                 and (not word.isnumeric() or word.isalpha())})

def preprocess(text):
    """Normalise ``text`` with clean, then return its tokens from tokenize."""
    return tokenize(clean(text))

class SearchResults:
    """Holds the frame of hits from a search, optionally narrowed to chosen columns."""

    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        # Keep every column unless a non-empty selection was supplied
        self.results = data[columns] if columns else data

    def __len__(self):
        """Number of hits."""
        return len(self.results)

    def __getitem__(self, item):
        """Wrap the row labelled ``item`` (label-based, via loc) in a Paper."""
        hit_row = self.results.loc[item]
        return Paper(hit_row)

    def _repr_html_(self):
        """Delegate notebook rendering to the underlying frame."""
        return self.results._repr_html_()

# Default columns RankBM25Index copies from the corpus into its search results
SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi', 'peer_reviewed', 'journal','fulltext']
    
class RankBM25Index:
    """BM25 index built from each paper's abstract and title.

    Parameters
    ----------
    corpus : pd.DataFrame
        Must provide ``abstract`` and ``title`` columns plus every column
        named in ``columns``.
    columns : sequence of str
        Columns copied into the search results.
    """

    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        # Own list copy: accepts any sequence and never aliases the shared default.
        self.columns = list(columns)
        # A NaN in either column would make the concatenation NaN, and
        # preprocess would then fail calling .lower() on a float.
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.bm25 = BM25Okapi(self.index.terms.tolist())

    def search(self, search_string, n=4):
        """Return up to ``n`` best-scoring papers for ``search_string``.

        Rows whose score is not above zero are dropped, so fewer than ``n``
        hits may come back. The result is a SearchResults carrying the
        configured columns plus a ``Score`` column, best match first.
        """
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        # positions of the n highest scores, in descending order
        ind = np.argsort(doc_scores)[::-1][:n]
        # .copy() makes the hits an independent frame, so adding Score neither
        # touches the corpus nor triggers SettingWithCopyWarning.
        results = self.corpus.iloc[ind][self.columns].copy()
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])

# Build the index over every merged paper; this preprocesses abstract + title for each row
bm25_index = RankBM25Index(allpapers_df)

In [123]:
# Query the index with the default n=4, i.e. at most four hits
results = bm25_index.search('smoking')

In [124]:
# Number of hits returned; can be below n because non-positive scores are dropped
len(results)

4

In [125]:
# Display the hits; rendering goes through SearchResults._repr_html_
results

Unnamed: 0,title,abstract,doi,peer_reviewed,journal,fulltext,Score
0,"Eff ects of smoking and solid-fuel use on COPD, lung cancer, and tuberculosis in China: a time-based, multiple risk factor, modelling study","Background Chronic obstructive pulmonary disease (COPD), lung cancer, and tuberculosis are three leading causes of death in China, where prevalences of smoking and solid-fuel use are also high. We aimed to predict the eff ects of risk-factor tren...",http://doi.org/10.1016/S0140-6736(08)61345-8,True,The Lancet,"Chronic obstructive pulmonary disease (COPD), lung cancer, and tuberculosis are the second, sixth, and eighth leading causes of death in China, accounting for almost 2 million deaths in 2002 (20·5% of all deaths in China). 1 A half of Chinese men...",8.399632
1,Environmental factors and their regulation of immunity in multiple sclerosis,"Epidemiological and clinical studies have shown that environmental factors such as infections, smoking and vitamin D are associated with the risk of developing multiple sclerosis (MS). Some of these factors also play a role in the MS disease cour...",http://doi.org/10.1016/j.jns.2012.10.021,True,Journal of the Neurological Sciences,Multiple sclerosis (MS) is an inflammatory disease of the central nervous system (CNS) in which an interplay of genetic and environmental factors leads to the chronic activation of immune cells and to neuronal injury. Epidemiological studies have...,7.727815
2,"Bulk and single-cell transcriptomics identify tobacco-use disparity in lung gene expression of ACE2, the receptor of 2019-nCov","In current severe global emergency situation of 2019-nCov outbreak, it is imperative to identify vulnerable and susceptible groups for effective protection and care. Recently, studies found that 2019-nCov and SARS-nCov share the same receptor, AC...",http://doi.org/10.1101/2020.02.05.20020107,False,,"In the past two decades, pathogenic coronaviruses (CoVs) have caused epidemic infections, including the server acute respiratory syndrome (SARS)-CoV outbreak in 2003, the Middle East Respiratory Syndrome Coronavirus (MERS-CoV) outbreak in 2012 an...",7.097105
3,Brief Communication Psychological Responses among Humidifier Disinfectant Disaster Victims and Their Families,"To substantiate psychological symptoms following humidifier disinfectant (HD) disasters, counseling records of 26 victims and 92 family members of victims (45 were bereaved) were analyzed retrospectively. Among the victims, 34.6% had Clinical Glo...",http://doi.org/10.3346/jkms.2019.34.e29,True,J Korean Med Sci,"In Korea, several types of chemical disinfectants that had been widely used in humidifiers since 1994 were found to be associated with lung injury, including interstitial pneumonitis and widespread lung fibrosis, 1,2 collectively referred to as h...",6.872736


In [129]:
# Print the full body text of every hit, best match first
for i in range(len(results)):
    hit = results[i]
    print(hit.text())
    


Chronic obstructive pulmonary disease (COPD), lung cancer, and tuberculosis are the second, sixth, and eighth leading causes of death in China, accounting for almost 2 million deaths in 2002 (20·5% of all deaths in China). 1 A half of Chinese men smoke and more than 70% of Chinese households use solid fuels, such as wood, crop residues, and coal for heating and cooking. 2 Tobacco smoking and indoor air pollution from solid-fuel use are the most important global risk factors for COPD and lung cancer and account for a signifi cant proportion of deaths from these diseases in developing countries. 3, 4 Without interventions, the annual numbers of COPD and lung cancer deaths in China are predicted to double over the next 30 years. 1 Systematic reviews have concluded that smoking is also an independent risk factor for tuberculosis [5] [6] [7] and suggested a positive association between indoor air pollution and the disease. 5 Integrated programmes that incorporate multiple risk factor and th

In [130]:
import re
import nltk

# Get a list of stopwords from nltk
# NOTE(review): this rebinds the name `stopwords`, which the import cell binds to
# the nltk.corpus.stopwords object. Once this line has run, any later call of
# `stopwords.words(...)` fails because a list has no `.words` attribute.
stopwords = nltk.corpus.stopwords.words("english")

def clean_text(text):
    """Strip citation markers, collapse whitespace and drop ``{{...}}`` spans.

    The substitutions run in the listed order, so a space left behind by a
    removed ``{{...}}`` span is not collapsed again.
    """
    substitutions = (
        (r'\[[0-9]*\]', ' '),        # bracketed digits such as [12] (and a bare [])
        (r'\s+', ' '),               # any whitespace run -> one space
        (r'\{\{[\s\S]*?\}\}', ''),   # shortest {{ ... }} span, newlines included
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spchar_digs(text):
    """Replace every non-letter with a space, then collapse whitespace runs.

    Only the ASCII letters a-z and A-Z survive; digits, punctuation and any
    other character become separators.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return re.sub(r'\s+', ' ', letters_only)

def word_freq(formatted_text):
    """Return ``{word: relative frequency}`` for the non-stop-words of a text.

    Counts are divided by the highest count, so the most frequent word maps
    to 1.0. Keys are lower-cased: sent_scores looks tokens up in lower case,
    and the nltk stop-word list is lower case as well, so keeping the original
    capitalisation left capitalised words unmatched and let capitalised stop
    words through.

    An empty dict is returned when no countable word exists (empty text or
    stop words only); previously ``max()`` raised ValueError in that case.
    """
    stop_words = set(stopwords)  # O(1) lookups instead of scanning the list
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_text):
        word = word.lower()
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        return word_frequencies

    maximum_frequency = max(word_frequencies.values())
    # divide every count by the maximum frequency
    return {word: count / maximum_frequency
            for word, count in word_frequencies.items()}

def sent_scores(sentence_list, word_frequencies):
    """Score each sentence by summing the frequencies of the words it contains.

    Sentences with 60 or more space-separated pieces are ignored, and a
    sentence appears in the result only when at least one of its lower-cased
    tokens is a key of ``word_frequencies``. A sentence listed more than once
    keeps accumulating into the same entry.
    """
    sentence_scores = {}
    for sent in sentence_list:
        if len(sent.split(' ')) >= 60:  # only sentences under 60 words are scored
            continue
        running_total = sentence_scores.get(sent)
        for token in nltk.word_tokenize(sent.lower()):
            if token in word_frequencies:
                weight = word_frequencies[token]
                running_total = weight if running_total is None else running_total + weight
        if running_total is not None:
            sentence_scores[sent] = running_total

    return sentence_scores

def get_summary(dirty_text, num_sentences=7):
    '''
    Build an extractive summary from the highest scoring sentences.

    Input: text that is already filtered for desired keywords
    num_sentences: how many of the top sentences to keep (default 7, the
        value that used to be hard-coded)
    Output: the selected sentences joined by a blank line, or '' when the
        text contains no letters at all (that case used to raise ValueError
        while computing word frequencies)
    '''
    #cleans the text
    text = clean_text(dirty_text)
    #remove special chars and nums
    formatted_text = clean_spchar_digs(text)
    if not formatted_text.strip():
        # nothing to count, hence nothing to rank
        return ''
    #tokenize into sentences (on the text that still has its punctuation)
    sentence_list = nltk.sent_tokenize(text)
    #word frequencies
    word_frequencies = word_freq(formatted_text)
    #dictionary of sentences with sentence as key and word frequency score as value
    sentence_scores = sent_scores(sentence_list, word_frequencies)

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return '\n\n '.join(summary_sentences)

def run_nlp(fulltexts, searchlist):
    '''
    Summarise the sentences that mention both a keyword and a covid alias.

    Input: fulltexts - iterable of document strings
           searchlist - list of key words (matched case-insensitively)
    Output: summary built by get_summary from the matching sentences, or ''
            when no sentence matches (that case used to raise ValueError)
    '''
    # Alias matching is case-sensitive, so the spellings listed here affect the results.
    covid_alias = ['CoV', 'COVID', 'Covid', 'corona virus', 'coronavirus', 'Coronavirus', 'Corona virus']
    keywords = [term.lower() for term in searchlist]  # lower-case once, not per sentence

    # A dict used as an ordered set: de-duplicates while keeping first-seen
    # order, so the text handed to the summariser is identical on every run
    # (a set intersection would vary with string hashing).
    matching_sents = {}
    for text in fulltexts:
        for sentence in text.split('. '):
            if sentence in matching_sents:
                continue
            lowered = sentence.lower()
            has_keyword = any(term in lowered for term in keywords)
            has_alias = any(alias in sentence for alias in covid_alias)
            if has_keyword and has_alias:
                matching_sents[sentence] = None

    if not matching_sents:
        return ''

    # split('. ') removed the sentence terminators; put one back on each
    # sentence so that get_summary's sentence tokenizer can find the boundaries.
    terminated = []
    for sentence in matching_sents:
        sentence = sentence.strip()
        if sentence and not sentence.endswith(('.', '!', '?')):
            sentence += '.'
        terminated.append(sentence)

    desired_text = ' '.join(terminated)
    return get_summary(desired_text)

In [103]:
# tasks = [('What is known about transmission, incubation, and environmental stability?', 
#         'transmission incubation environment coronavirus'),
#         ('What do we know about COVID-19 risk factors?', 'risk factors'),
#         ('What do we know about virus genetics, origin, and evolution?', 'genetics origin evolution'),
#         ('What has been published about ethical and social science considerations','ethics ethical social'),
#         ('What do we know about diagnostics and surveillance?','diagnose diagnostic surveillance'),
#         ('What has been published about medical care?', 'medical care'),
#         ('What do we know about vaccines and therapeutics?', 'vaccines vaccine vaccinate therapeutic therapeutics')] 
# tasks = pd.DataFrame(tasks, columns=['Task', 'Keywords'])


# def show_task(Task):
#     print(Task)
#     keywords = tasks[tasks.Task == Task].Keywords.values[0]
#     search_results = bm25_index.search(keywords, n=200)
#     return search_results
    
# results = interact(show_task, Task = tasks.Task.tolist());

## Text Summarizer
Now that we have the queries working, we need to summarize the text.