# COVID-19 Open Research Dataset Challenge

https://www.youtube.com/watch?v=S6GVXk6kbcs

##### Import Libraries

In [6]:
!pip install rank_bm25 nltk

Collecting rank_bm25
  Downloading https://files.pythonhosted.org/packages/d2/e4/38d03d6d5e2deae8d2838b81d6ba2742475ced42045f5c46aeb00c5fb79c/rank_bm25-0.2.tar.gz
Building wheels for collected packages: rank-bm25
  Building wheel for rank-bm25 (setup.py) ... [?25ldone
[?25h  Created wheel for rank-bm25: filename=rank_bm25-0.2-cp37-none-any.whl size=4163 sha256=2657ddad49320196843f207152609bd14baef5631c0deade72f1bf844333f1cd
  Stored in directory: /Users/alderik/Library/Caches/pip/wheels/6f/0c/1f/78945dd6a5478bbcdb50d73ac96ae5af2ffcdfcd374fd9b1bf
Successfully built rank-bm25
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2


In [7]:
import os 
import pandas as pd
import json
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import heapq
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from requests.exceptions import HTTPError, ConnectionError
from ipywidgets import interact
import ipywidgets as widgets

In [1]:
#https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine?scriptVersionId=31027514

from ipywidgets import interact
import ipywidgets as widgets
import pandas as pd

def set_column_width(ColumnWidth, MaxRows):
    pd.options.display.max_colwidth = ColumnWidth
    pd.options.display.max_rows = MaxRows
    print('Set pandas dataframe column width to', ColumnWidth, 'and max rows to', MaxRows)
    
interact(set_column_width, 
         ColumnWidth=widgets.IntSlider(min=50, max=400, step=50, value=200),
         MaxRows=widgets.IntSlider(min=50, max=500, step=100, value=100));

interactive(children=(IntSlider(value=200, description='ColumnWidth', max=400, min=50, step=50), IntSlider(val…

##### Import Data

In [50]:
#Import metadata
metadata = pd.read_csv("metadata.csv", dtype={'Microsoft Academic Paper ID': str,
                                      'pubmed_id': str}) 
metadata = metadata.dropna(subset=['sha'])
metadata.rename(columns={"sha": "paper_id"}, inplace = True)
metadata.rename(columns={"source_x": "source"}, inplace = True)
metadata= metadata.drop(columns = ['title','abstract'])

In [51]:
for i in metadata['doi']:
    url = doi_url(str(i))
    metadata.loc[metadata['doi'] == i, 'doi'] = url

KeyboardInterrupt: 

In [None]:
metadata.head()

In [9]:
#import text from papers json objects
#https://www.youtube.com/watch?v=S6GVXk6kbcs
dirs = ['biorxiV_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset']

docs = []
for d in dirs:
    print(d)
    for file in tqdm(os.listdir(f"{d}/{d}")):
        filepath = f"{d}/{d}/{file}"
        j = json.load(open(filepath,'rb'))
        title = j['metadata']['title']
        paper_id = j['paper_id']
        try: 
            abstract = j['abstract'][0]['text']
        except:
            abstract = ''
            
        fulltext = ''
        for text in j['body_text']:
            fulltext += text['text']
        docs.append([paper_id,title, abstract, fulltext])


  5%|▍         | 43/885 [00:00<00:01, 427.76it/s]

biorxiV_medrxiv


100%|██████████| 885/885 [00:01<00:00, 718.70it/s]
  1%|          | 60/9118 [00:00<00:15, 592.93it/s]

comm_use_subset


100%|██████████| 9118/9118 [00:16<00:00, 553.10it/s]
  0%|          | 0/16959 [00:00<?, ?it/s]

custom_license


100%|██████████| 16959/16959 [00:29<00:00, 568.21it/s]
  3%|▎         | 74/2353 [00:00<00:03, 736.15it/s]

noncomm_use_subset


100%|██████████| 2353/2353 [00:03<00:00, 668.94it/s]


In [10]:
df = pd.DataFrame(docs, columns = ['paper_id','title', 'abstract', 'fulltext'])

In [11]:
#Join metadata with paper text on paper_id
allpapers_df = pd.merge(df, metadata, on="paper_id")
allpapers_df['journal'] = allpapers_df['journal'].astype(str)
peer_reviewed = allpapers_df['journal'] !='nan'
#Make column to say whether a paper was peer reviewed
#basically anything from bioRxiv/medRxiv subset 
allpapers_df.insert(12, "peer_reviewed", peer_reviewed, True) 

In [12]:
#Made data frame for all journal papers
journals_df = allpapers_df[allpapers_df['journal']!='nan']
#dataframe for unpublished papers
unpublished_df = allpapers_df[allpapers_df['journal']=='nan']

In [13]:
print(f'Total number of papers including journals {len(allpapers_df)} \n\nTotal number of journals {len(journals_df)} \n\nNumber of unpublsihed papers {len(unpublished_df)}')

Total number of papers including journals 27690 

Total number of journals 26796 

Number of unpublsihed papers 894


##### Python Object Oriented Programming
https://www.kaggle.com/dgunning/browsing-research-papers-with-a-bm25-search-engine?scriptVersionId=31027514

In [55]:
def get(url, timeout=6):
    try:
        r = requests.get(url, timeout=timeout)
        return r.text
    except ConnectionError:
        print(f'Cannot connect to {url}')
        print(f'Remember to turn Internet ON in the Kaggle notebook settings')
    except HTTPError:
        print('Got http error', r.status, r.text)
        
# Convert the doi to a url
def doi_url(d): 
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'

class ResearchPapers:
    
    def __init__(self, metadata: pd.DataFrame):
        self.metadata = metadata
        
    def __getitem__(self, item):
        return Paper(self.metadata.iloc[item])
    
    def __len__(self):
        return len(self.metadata)
    
    def head(self, n):
        return ResearchPapers(self.metadata.head(n).copy().reset_index(drop=True))
    
    def tail(self, n):
        return ResearchPapers(self.metadata.tail(n).copy().reset_index(drop=True))
    
    def abstracts(self):
        return self.metadata.abstract.dropna()
    
    def titles(self):
        return self.metadata.title.dropna()
        
    def _repr_html_(self):
        return self.metadata._repr_html_()
    
    
class Paper:
    
    '''
    A single research paper
    '''
    def __init__(self, item):
        self.paper = item.to_frame().fillna('')
        self.paper.columns = ['Value']
    
    def doi(self):
        return self.paper.loc['doi'].values[0]
    
    def text(self):
        return self.paper.loc['fulltext'].values[0]
    
    def abstract(self):
        return self.paper.loc['abstract'].values[0]
    
    def title(self):
        return self.paper.loc['title'].values[0]
    
    def link(self):
        return doi_url(self.paper.loc['doi'].values[0])
    
    def peer_reviewed(self):
        return self.paper.loc['peer_reviewed'].values[0]
    
    def journal(self):
        return self.paper.loc['journal'].values[0]
    
    def authors(self, split=False):
        '''
        Get a list of authors
        '''
        authors = self.paper.loc['authors'].values[0]
        if not authors:
            return []
        if not split:
            return authors
        if authors.startswith('['):
            authors = authors.lstrip('[').rstrip(']')
            return [a.strip().replace("\'", "") for a in authors.split("\',")]
        
        # Todo: Handle cases where author names are separated by ","
        return [a.strip() for a in authors.split(';')]
        
    def _repr_html_(self):
        return self.paper._repr_html_()
    
papers = ResearchPapers(metadata)

#### BM25 Query
https://pypi.org/project/rank-bm25/  
http://www.cs.otago.ac.nz/homepages/andrew/papers/2014-2.pdf

In [56]:
english_stopwords = list(set(stopwords.words('english')))

def strip_characters(text):
    t = re.sub('\(|\)|:|,|;|\.|’|”|“|\?|%|>|<', '', text)
    t = re.sub('/', ' ', t)
    t = t.replace("'",'')
    return t

def clean(text):
    t = text.lower()
    t = strip_characters(t)
    return t

def tokenize(text):
    words = nltk.word_tokenize(text)
    return list(set([word for word in words 
                     if len(word) > 1
                     and not word in english_stopwords
                     and not (word.isnumeric() and len(word) is not 4)
                     and (not word.isnumeric() or word.isalpha())] )
               )

def preprocess(text):
    t = clean(text)
    tokens = tokenize(t)
    return tokens

class SearchResults:
    
    def __init__(self, 
                 data: pd.DataFrame,
                 columns = None):
        self.results = data
        if columns:
            self.results = self.results[columns]
            
    def __getitem__(self, item):
        return Paper(self.results.loc[item])
    
    def __len__(self):
        return len(self.results)
        
    def _repr_html_(self):
        return self.results._repr_html_()

SEARCH_DISPLAY_COLUMNS = ['title', 'abstract', 'doi_link', 'peer_reviewed', 'publication']
    
class RankBM25Index:
    
    def __init__(self, corpus: pd.DataFrame, columns=SEARCH_DISPLAY_COLUMNS):
        self.corpus = corpus
        self.columns = columns
        raw_search_str = self.corpus.abstract + ' ' + self.corpus.title
        self.index = raw_search_str.apply(preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.bm25 = BM25Okapi(self.index.terms.tolist())
        
    def search(self, search_string, n=4):
        search_terms = preprocess(search_string)
        doc_scores = self.bm25.get_scores(search_terms)
        ind = np.argsort(doc_scores)[::-1][:n]
        results = self.corpus.iloc[ind][self.columns]
        results['Score'] = doc_scores[ind]
        results = results[results.Score > 0]
        return SearchResults(results.reset_index(), self.columns + ['Score'])
    
bm25_index = RankBM25Index(metadata.head(10000))

AttributeError: 'DataFrame' object has no attribute 'abstract'

In [None]:
results = bm25_index.search('cruise ship')