# What do we know about vaccines and therapeutics?
COVID-19 Open Research Dataset Challenge (CORD-19)

**What do we know about vaccines and therapeutics? What has been published concerning research and development and evaluation efforts of vaccines and therapeutics?**

Specifically, we want to know what the literature reports about:

* Effectiveness of drugs being developed and tried to treat COVID-19 patients.
* Clinical and bench trials to investigate less common viral inhibitors against COVID-19, such as naproxen, clarithromycin, and minocycline, that may exert effects on viral replication.
* Methods evaluating potential complication of Antibody-Dependent Enhancement (ADE) in vaccine recipients.
* Exploration of use of best animal models and their predictive value for a human vaccine.
* Capabilities to discover a therapeutic (not vaccine) for the disease, and clinical effectiveness studies to discover therapeutics, to include antiviral agents.
* Alternative models to aid decision makers in determining how to prioritize and distribute scarce, newly proven therapeutics as production ramps up. This could include identifying approaches for expanding production capacity to ensure equitable and timely distribution to populations in need.
* Efforts targeted at a universal coronavirus vaccine.
* Efforts to develop animal models and standardize challenge studies
* Efforts to develop prophylaxis clinical studies and prioritize in healthcare workers
* Approaches to evaluate risk for enhanced disease after vaccination
* Assays to evaluate vaccine immune response and process development for vaccines, alongside suitable animal models [in conjunction with therapeutics]

In [2]:
import os 
import pandas as pd
import json
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import heapq

import re
import nltk

In [3]:
# Get a list of stopwords from nltk
stopwords = nltk.corpus.stopwords.words("english")

# Dataset sub-directories; the files of each one are read from "<name>/<name>/".
dirs = ['biorxiV_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset']

# One [title, abstract, fulltext] row per paper file.
docs = []
for d in dirs:
    print(d)
    for file in tqdm(os.listdir(f"{d}/{d}")):
        filepath = f"{d}/{d}/{file}"
        # Use a context manager so each handle is closed as soon as it is
        # parsed instead of lingering until garbage collection.
        with open(filepath, 'rb') as fh:
            j = json.load(fh)
        title = j['metadata']['title']
        try:
            # Only the first abstract paragraph is kept.
            abstract = j['abstract'][0]['text']
        except (KeyError, IndexError):
            # The paper has no abstract section, or the section is empty.
            abstract = ''

        # Body paragraphs, each followed by a blank line.
        fulltext = ''.join(text['text'] + "\n\n" for text in j['body_text'])
        docs.append([title, abstract, fulltext])

  9%|▉         | 80/885 [00:00<00:01, 797.83it/s]

biorxiV_medrxiv


100%|██████████| 885/885 [00:01<00:00, 831.30it/s]
  1%|          | 50/9118 [00:00<00:18, 490.86it/s]

comm_use_subset


100%|██████████| 9118/9118 [00:15<00:00, 574.01it/s]
  0%|          | 0/16959 [00:00<?, ?it/s]

custom_license


100%|██████████| 16959/16959 [00:29<00:00, 571.26it/s]
  4%|▎         | 83/2353 [00:00<00:02, 821.69it/s]

noncomm_use_subset


100%|██████████| 2353/2353 [00:03<00:00, 644.07it/s]


In [4]:
# Tabulate the loaded papers, one row per document.
df = pd.DataFrame(data=docs, columns=['title', 'abstract', 'fulltext'])
# Map each title to its full text so a passage can be traced back to its paper.
# Papers that share a title collapse into one entry (the last one is kept).
L1 = df['title'].values
L2 = df['fulltext'].values
fulltext_dict = dict(zip(L1, L2))


In [None]:
def clean_text(text):
    """Return *text* with citation markers, extra whitespace and ``{{...}}`` spans removed.

    The substitutions run in order:

    1. bracketed numeric markers such as ``[12]`` (or an empty ``[]``)
       become a single space;
    2. every run of whitespace collapses to one space;
    3. each ``{{ ... }}`` span (shortest match) is deleted.

    DOI/URL links are not removed.
    """
    substitutions = (
        (r'\[[0-9]*\]', ' '),
        (r'\s+', ' '),
        (r'\{\{[\s\S]*?\}\}', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spchar_digs(text):
    """Return *text* with everything except ASCII letters turned into spaces.

    Digits, punctuation and any other non ``a-z``/``A-Z`` character are
    replaced by a space, after which runs of whitespace are squeezed down
    to a single space.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return re.sub(r'\s+', ' ', letters_only)

def word_freq(formatted_text):
    """Return the relative frequency of every non-stopword token in the text.

    Args:
        formatted_text: Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        A dict mapping each token to ``count / highest_count``, so the most
        frequent token maps to 1.0. An empty dict is returned when no token
        survives stopword filtering (previously this raised ``ValueError``
        from ``max()`` on an empty sequence).

    NOTE(review): tokens are compared with the stopword list and counted
    exactly as they appear (no case folding), while ``sent_scores`` looks
    tokens up in lower case, so keys containing capitals are never matched
    there. Left unchanged here; confirm whether that is intended.
    """
    # Build the set once: testing membership in the module-level list would
    # scan it for every token.
    stop_set = set(stopwords)

    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_text):
        if word not in stop_set:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Nothing to normalise; also avoids max() on an empty sequence.
    if not word_frequencies:
        return word_frequencies

    maximum_frequency = max(word_frequencies.values())
    return {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

def sent_scores(sentence_list, word_frequencies):
    """Score each sentence by summing the frequencies of the words it contains.

    Every sentence is lower-cased and tokenized with ``nltk.word_tokenize``;
    each token found in *word_frequencies* adds its value to that sentence's
    total. Sentences that split into 60 or more space-separated pieces are
    ignored, and sentences with no known token get no entry at all.

    Returns a dict mapping the original sentence string to its score.
    """
    totals = {}
    for sentence in sentence_list:
        # The length limit does not depend on the token, so test it up front.
        if len(sentence.split(' ')) >= 60:
            continue
        for token in nltk.word_tokenize(sentence.lower()):
            if token not in word_frequencies:
                continue
            if sentence in totals:
                totals[sentence] += word_frequencies[token]
            else:
                totals[sentence] = word_frequencies[token]

    return totals

def get_summary(dirty_text, num_sentences=7):
    """Build an extractive summary of *dirty_text*.

    The text is cleaned with ``clean_text`` and split into sentences; a
    letters-only copy (``clean_spchar_digs``) supplies the word frequencies
    used to score those sentences.

    Args:
        dirty_text: Raw text to summarise.
        num_sentences: Maximum number of top-scoring sentences to keep.
            Defaults to 7, the value that used to be hard-coded.

    Returns:
        The selected sentences, highest score first, joined by a blank line.
    """
    text = clean_text(dirty_text)
    formatted_text = clean_spchar_digs(text)

    sentence_list = nltk.sent_tokenize(text)

    word_frequencies = word_freq(formatted_text)
    sentence_scores = sent_scores(sentence_list, word_frequencies)

    summary_sentences = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    return '\n\n '.join(summary_sentences)

def get_improved_summary(searchlist, num_sentences=7):
    """Summarise the corpus sentences that mention a search term and COVID.

    A fragment (the text is split on ``'. '``) is kept when it contains at
    least one entry of *searchlist*, compared case-insensitively, and at
    least one of the COVID aliases, compared case-sensitively. The kept
    fragments are concatenated and summarised the same way ``get_summary``
    summarises a single text.

    Args:
        searchlist: Iterable of strings to look for.
        num_sentences: Maximum number of top-scoring sentences to keep.
            Defaults to 7, the value that used to be hard-coded.

    Returns:
        The selected sentences, highest score first, joined by a blank line,
        or an empty string when no fragment matches.
    """
    # The alias match is case-sensitive; which spellings are listed here
    # affects the results.
    covid_alias = ['CoV', 'COVID', 'Covid', 'corona virus', 'coronavirus', 'Coronavirus', 'Corona virus']
    # Lower-case the search terms once instead of once per fragment.
    search_terms = [term.lower() for term in searchlist]

    # A dict is used as an insertion-ordered set, so fragments stay in corpus
    # order and the result is the same on every run (intersecting two sets of
    # strings gave an order that could change between runs).
    matched = {}
    # NOTE(review): `full_text` is not defined in the part of the file visible
    # here; presumably it is a mapping whose keys are document texts. Confirm
    # it is not meant to be `fulltext_dict`, which is keyed by title.
    for text in full_text.keys():
        for sentence in text.split('. '):
            if sentence in matched:
                continue
            if not any(alias in sentence for alias in covid_alias):
                continue
            lowered = sentence.lower()  # .lower() changes the results dramatically
            if any(term in lowered for term in search_terms):
                matched[sentence] = None

    # Without matches there is nothing to score, and word_freq would fail on
    # the empty text.
    if not matched:
        return ''

    # Splitting on '. ' removed the sentence terminators; put one back so the
    # sentence tokenizer can separate the fragments again.
    pieces = []
    for fragment in matched:
        fragment = fragment.rstrip()
        if not fragment.endswith(('.', '!', '?')):
            fragment += '.'
        pieces.append(fragment)
    desired_text = ' '.join(pieces)

    text = clean_text(desired_text)
    formatted_text = clean_spchar_digs(text)

    sentence_list = nltk.sent_tokenize(text)

    word_frequencies = word_freq(formatted_text)
    sentence_scores = sent_scores(sentence_list, word_frequencies)

    summary_sentences = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    return '\n\n '.join(summary_sentences)