## Task 2 - CORD 19 Information Retrieval


### The report is in section Evaluation

In this notebook we preprocess the papers, create the correspondent BoW vectors for each of the papers and find relevant sentences according to a query from those papers.

In [108]:
# we implemented 2 tasks from kaggle, for the first one(what are the risk factor of covid-19) set task to 0
# for the second task(What is known about transmission, incubation, and environmental stability?) set to 1

task = 0


# NOTE THAT YOU ONLY NEED TO RUN STARTING WITH STEP 3
# STEP 1 AND STEP 2 WERE RUN BEFORE AND RESULTS ARE SAVED

### Step 1: Prepare a csv file containing all papers

__IMPORTANT:__ We provide you this particular csv file, therefore you do not need to run the following script.If you do not want to wait for a long time, please skip this step! 

In [None]:
'''
This script should be run in the task2 folder
This script creates a pd.DataFrame with paper_ids and text. The text is cleaned from its references and citations.
'''

import pandas as pd
import os
import json
import numpy as np
# Removes all cite spans and ref spans including the brackets
# INPUT: json_file['body_text'][i]as
# OUTPUT: plain text in a string without any references or annotations
def removeCitsRefs(json_paragraph):
    ranges = []
    places = []

    places.extend(json_paragraph['cite_spans'])
    places.extend(json_paragraph['ref_spans'])

    # Get ranges to delete
    for d in places:
        tmp = (d['start'], d['end'])
        ranges.append(tmp)
    text_list = list(json_paragraph['text'])
    for plc in ranges:
        for i in range(plc[0], plc[1]):
            text_list[i] = " "
    text = ''.join(text_list)
    return text


# INPUT: json_file['body_text']
# OUTPUT: Plain Text string
def getText(json_body):
    main_txt = ""
    for i in range(len(json_body)):
        tmp_txt = removeCitsRefs(json_body[i])
        tmp_txt += "\n"
        main_txt += tmp_txt
    return main_txt


# PaperId, Title + Body,
def getFileText(json_file):
    body = json_file['metadata']['title'] + " \n\n" + getText(json_file['body_text'])
    paper_id = json_file['paper_id']

    return [paper_id, body]


def createDataFrame(json_paths):
    files = []
    for j in range(len(json_paths)):
        # load json
        with open(json_paths[j], 'r') as f:
            article_json = json.load(f)
        files.append(getFileText(article_json))
    df = pd.DataFrame(files, columns=['paper_id', 'text'])
    return df


# Get data
# metadata = pd.read_csv("./data/metadata.csv", low_memory=False)
if task==1:
    path_csv = "./data/papers_all.csv"
else:
    path_csv = "./data/papers19-20.csv"
    
path_metadata = './data/metadata.csv'
    
#Get metadata
metadata = pd.read_csv(path_metadata, low_memory=False)
#Drop all rows that doesn't have sha identifier and publish time
to_drop = list(metadata[pd.isna(metadata['publish_time'])].index)
to_drop = to_drop + list(metadata[pd.isna(metadata['sha'])].index)
to_drop = np.array(to_drop)
to_drop = list(np.unique(to_drop))
metadata = metadata.drop(to_drop, axis=0)
print("Got metadata")

#Filter only rows with publish_time either 2019 or 2020
# p19 = metadata[metadata['publish_time'].str.contains('2019',regex=False)]
# p20 = metadata[metadata['publish_time'].str.contains('2020',regex=False)]
# metadata_papers = pd.concat([p19,p20],axis=0)
metadata_papers = metadata.copy()
print("Got metadata from 2019 and 2020")

files = pd.DataFrame(columns=["name", "path"])

#Get JSON Files' paths
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        files = files.append({'name': filename, 'path': os.path.join(dirname, filename)}, ignore_index=True)
print("Got file paths")

#Drop irrelevant files that are not json
idx = files[files.name.str.contains(".json") == False].index
files.drop(index=idx, inplace=True)
files.reset_index(inplace=True)
files.drop("index", axis=1, inplace=True)
print("Got all file paths")
#Erase extensions


def getName(string):
    for i in range(len(string)):
        if string[i] == ".":
            return string[:i]
files['file_name'] = files['name'].map(lambda file_name: getName(file_name))
print("Delete extensions")

# Mark papers from 2019 or 2020
files['sha_flag'] = files['file_name'].isin(list(metadata_papers['sha']))
path_files = list(files[files['sha_flag']]['path'])


print("Cleaned paths without .json extension")
df_text = createDataFrame(path_files)
print("Created Dataframe")

df_text.to_csv(path_csv,index=False)
print("Saved DataFrame to ",os.path.abspath(path_csv))


### Step 2: Compute BoW vectors of the papers

__IMPORTANT:__ Since it takes a lot of time to compute the BoW vectors we provide you the BoW vectors. If you do not want to wait for a lot of time please skip this step.

In [None]:
# -*- coding: utf-8 -*-
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora
import os
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Get data
if task ==1:
    PATH_CSV = os.path.abspath("./data/papers_all.csv")
    PATH_INDEX = './data/similarities.index'
    PATH_DICTIONARY = './data/dictionary_all.dict'
    PATH_TFIDF = './data/tfidf_all.pkl'
    PATH_PICKLE = './data/bow_corpus_all.pickle'
else:
    PATH_CSV = os.path.abspath("./data/papers19-20.csv")
    PATH_INDEX = './data/similarities.index'
    PATH_DICTIONARY = './data/dictionary19-20.dict'
    PATH_TFIDF = './data/tfidf_all.pkl'
    PATH_PICKLE = './data/bow_corpus19-20.pickle'
    
data = pd.read_csv(PATH_CSV)
print("Got csv")
papers = data['text'].values

# Tokenize
punctuation = ",.?!()-_\"\'\\\n\r\t;:+*<>@#§^$%&|/"
processed = [[w.lower() for w in word_tokenize(document)] for document in papers]
print("Tokenized")

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# Lemmatize
lemmatizer = WordNetLemmatizer()
processed = [[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in doc] for doc in processed]
print("Lemmatized")
# Filter
processed = [[w for w in doc if (w not in stopwords.words('english')) and (w not in punctuation)] for doc in processed]
print("Filtered")

# Compute frequency
frequency = defaultdict(int)
for document in processed:
    for token in document:
        frequency[token] += 1
print("Frequencies computed")
# Get only words with frequency >1
processed_corpus = [[w for w in document if frequency[w] > 1] for document in processed]

# Save it into dictionary
dictionary = corpora.Dictionary(processed_corpus)
dictionary.save(PATH_DICTIONARY)
print("Dictionary created and saved into ", PATH_DICTIONARY)
# Create BoW vectors
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
print("BoW vectors are created")

with open(PATH_PICKLE,"wb") as f :
    pickle.dump(bow_corpus,f)
print("DONE")

### Step 3: Find the relevant sentences

In [109]:
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora, similarities
from gensim import models
from nltk.corpus import wordnet
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import os
import pickle


def getBowVectors(papers, dictionary):
    # Tokenize
    punctuation = ",.?!()-_\"\'\\\n\r\t;:+*<>@#§^$%&|/"
    processed = [[w.lower() for w in word_tokenize(document)] for document in papers]


    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)


    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    processed = [[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in doc] for doc in processed]
    # Filter
    processed = [[w for w in doc if (w not in stopwords.words('english')) and (w not in punctuation)] for doc in processed]
    # Compute frequency
    frequency = defaultdict(int)
    for document in processed:
        for token in document:
            frequency[token] += 1
    # Get only words with frequency >1
    processed = [[w for w in document if frequency[w] > 1] for document in processed]
    bow_vectors = [dictionary.doc2bow(text) for text in processed]
    return bow_vectors



#Number of relevant papers to fetch from corpus
n_papers = 10
n_sentences = 10

#Get data
if task ==1:
    PATH_CSV = os.path.abspath("./data/papers_all.csv")
    PATH_INDEX = './data/similarities.index'
    PATH_DICTIONARY = './data/dictionary_all.dict'
    PATH_TFIDF = './data/tfidf_all.pkl'
    PATH_PICKLE = './data/bow_corpus_all.pickle'
else:
    PATH_CSV = os.path.abspath("./data/papers19-20.csv")
    PATH_INDEX = './data/similarities.index'
    PATH_DICTIONARY = './data/dictionary19-20.dict'
    PATH_TFIDF = './data/tfidf.pkl'
    PATH_SENTENCES = './data/query_results.csv'
    PATH_PICKLE = './data/bow_corpus19-20.pickle'

with open(PATH_PICKLE, "rb") as file:
    bow_corpus = pickle.load(file)


data = pd.read_csv(PATH_CSV)
print("Got csv")
papers = data['text']

# Load dictionary
dictionary = Dictionary.load(PATH_DICTIONARY)
print("Loaded dictionary")
b = np.array(bow_corpus)
print('shape of bow corpus', b.shape)
tfidf = models.TfidfModel(bow_corpus, id2word=dictionary, normalize=True, slope=0.25)
# Similarity Index
# Use especially similarity but not matrix similarity, since
index = similarities.Similarity(None, tfidf[bow_corpus], num_features=len(dictionary))
#df = pd.DataFrame([],columns=['paper_id','sentence'])


Got csv
Loaded dictionary
shape of bow corpus (36009,)


### Query

After you run all the cells above, you only need to run the following cells again in order to obtain the result for a query. The query sentence can be changed below(line 7)

In [114]:
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora, similarities
from gensim import models
from nltk.corpus import wordnet
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import os
import pickle

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def process_query(query):
    lemmatizer = WordNetLemmatizer()
    #print("Start: {}".format(query))
    query = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in query.split(' ') if w!='']
    #print("Lemmatized: {}".format(query))
    query = ' '.join(query)
    #print("Joined: {}".format(query))
    query = dictionary.doc2bow(query.lower().split(" "))
    query = tfidf[query]
    return query



### DO NOT FORGET TO CHANGE TASK VARIABLE AT THE BEGINNING OF THE NOTEBOOK 

### Task 1: queries
# query = "do risk factors of covid-19 change depending on the age of patient"
# query = "smoking pre-existing pulmonary disease"


### Task 2: queries
query = "range of incubation periods for the disease in humans"
# query = "transmission incubation seasonality"


bow_query = process_query(query)
print("Query procesed ")

#Get similar docs and sort them according to their value in descending order
similars = index[bow_query]
similars = sorted(enumerate(similars), key=lambda item: -item[1])

#Get  relevant paper_ids
ids = []

for i, s in enumerate(similars):
    if len(ids) >= n_papers:
        break
    ids.append(s[0])

#After you have the relevant paper ids
#Get relevant papers' bow vectors
relevant_papers = data.iloc[ids,:]
bow_papers = getBowVectors(relevant_papers['text'], dictionary)
c = np.array(bow_papers)
print('shape of relevant papers bow ', c.shape)


Query procesed 
shape of relevant papers bow  (10,)


In [115]:
df = pd.DataFrame([],columns=['paper_id', 'sentence', 'score'])


########### try 2 approaches
########### 1) get sentences containing words with high tfidf value(assuming most important in document is most related to query)
########### 2) analyse similarity of query and sentences one by one and return sentences having highest scores

OPTION1 = False

if OPTION1:
    #look deeper in each paper
    for i in range(len(bow_papers)):
        sentences = []
        #Extract most important keywords of the document
        keyword_ids = [s[0] for s in sorted(tfidf[bow_papers[i]],key=lambda tup: -tup[-1])]
        keyword_ids = keyword_ids[:10]
        keywords = []
        for j in keyword_ids:
            keywords.append(dictionary[j])

        # Get paper and split it into sentences
        tmp_paper_id = relevant_papers.iloc[i,0]
        tmp_paper = relevant_papers.iloc[i,1]
        tmp_paper = tmp_paper.split('.')
        #Find the sentences containing those words
        for sent in tmp_paper:
            for k in keywords:
                if k in sent:
                    sentences.append([tmp_paper_id, sent])
                    break
        df = df.append(pd.DataFrame(sentences,columns=['paper_id','sentence']),ignore_index=True)
    
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    pd.options.display.max_colwidth = 500
    df.to_csv(PATH_SENTENCES)
    print("Relevant sentences are saved into ", PATH_SENTENCES)
else:
    def sentence_similarity(sent1, sent2):
        magnitude1 = 0
        magnitude2 = 0
        for element in sent1:
            magnitude1 += element[1]**2
        for element in sent2:
            magnitude2 += element[1]**2
        magnitude1 = np.sqrt(magnitude1)
        magnitude2 = np.sqrt(magnitude2)

        if magnitude2 == 0:
            magnitude2 = 0.00000001
        if magnitude1 == 0:
            magnitude1 = 0.00000001

        dot_product = 0
        # compute scalar product
        for element1 in sent1:
            for element2 in sent2:
                if element1[0] == element2[0]:
                    # We add 0.15 as a bonus because tfidf scores depend on the sentence length
                    # sometimes a short sentence containing one similar word can have a high score
                    # we want to favor number of common words between sentence and query.
                    dot_product += element1[1] * element2[1] + 0.15 
        return dot_product / (magnitude1*magnitude2)


    for i in range(len(relevant_papers)):
        paper_id = relevant_papers.iloc[i, 0]
        paper_text = relevant_papers.iloc[i, 1]
        paper_sentences = paper_text.split('.')

        sentences_bow = [process_query(sent) for sent in paper_sentences]
        
        similarities = np.array([sentence_similarity(bow_query, sent) for sent in sentences_bow])
        sorted_indices = np.argsort(similarities)[::-1]
        #take best n_sentences
        paper_sentences = np.array(paper_sentences)
        most_similar_sentences = paper_sentences[sorted_indices[:n_sentences]]
        #print(most_similar_sentences[1])
        best_sentences = pd.DataFrame([], columns=['paper_id', 'sentence', 'score'])
        best_sentences['sentence'] = most_similar_sentences
        best_sentences['paper_id'] = paper_id
        best_sentences['score'] = similarities[sorted_indices[:n_sentences]]
        df = df.append(best_sentences, ignore_index=True)


    df.sort_values('score', axis=0, ascending=False, inplace=True)
    df = df[df['score'] != 0]
    pd.set_option("display.max_rows", None, "display.max_columns", None)
    pd.options.display.max_colwidth = 500
    df.to_csv(PATH_SENTENCES)
    print("Relevant sentences are saved into ", PATH_SENTENCES)



Relevant sentences are saved into  ./data/query_results.csv


Finally, run the cell below to see the results

In [116]:
df

Unnamed: 0,paper_id,sentence,score
80,d09c0f71b1a404a592d0dcad2c3409d43c375f99,\nThe incubation period of 2,1.2224
81,d09c0f71b1a404a592d0dcad2c3409d43c375f99,\nThe incubation period is essential in the control of infectious diseases,1.192823
20,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,"We suggest that these two genogroups, the noroviruses, be considered to have the same incubation period",1.163002
10,4589d4013cf69c396e0fdb67131022fc11119654,The mean incubation period of human infections was around 3,1.159445
21,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,We report the range of incubation periods such that an incubation period within that range would be consistent with the predictions of most investigators (i,1.122524
11,4589d4013cf69c396e0fdb67131022fc11119654,"4 days , similar to the incubation period for human infections with influenza A(H5N1) , and longer than the incubation period for human infections with seasonal influenza viruses",1.113978
22,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,Median incubation periods ranged from 1,1.081131
23,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,"Statements of the incubation period tend to be a single number (""The incubation period for rotavirus disease is approximately 2 days",1.074579
0,24de65541e25ca372d5fb686b733b45a1dd5c0b6,I first listed each possible incubation period for every patient for whom incubation period data were available,1.068867
70,6d3b3f4ab80a61c45f82c61c6c756cfc6ddf4bf2,The distribution of the incubation period is continuous;\n(A3),1.068587


## Evaluation of Query Results  

### Explanation of the new method

Since we already explained our model in detail on the previous submission. Here, a new method that is implemented in order to find relevant sentences after selecting relevant papers will be explained and the results will be shown and evaluated.

Previously, after finding relevant papers as explained in the report, the words with highest tfidf scores in the papers were chosen. Then, all sentences containing one of those words were selected as answer to query. Addtional to that the following method is implemented.

Each relevant paper is split in to its sentences. For each sentences the tfidf vector with respect to the corpus dictionary is computed. The same representation is also computed for the query. Later, cosine similarity between the query and all the sentences are computed. However, there is one trick we are doing while computing cosine similarity. Since tfidf depends on the length of sentences, shorter sentences containing only one or two common words with the query might receive higher scores even though they are not so relevant. To regularize this effect, a value of 0.15 is added to the score for every word that shows up in the query and in a particular sentence. The value is empirically decided.

Both of the methods are in the code and one can choose which one to you by changing the variable "OPTION1".
If it is false, the new method will be used.





### Task - What do we know about COVID-19 risk factors?

Below we interpret the results of the some queries :

Below you see some sentences retrieved by our method:    
**Query: "do risk factors of covid-19 change depending on the age of patient"**

- For instance, age is the strongest risk factor for CVD and the effect of aging on immune function may be equally important for COVID-19 suceptibility and severity

- \nThe median age of COVID-19 patients was 67 years old, which was significantly higher than that of H1N1 patients (52 years old, p<0	

- While no studies on COVID-19 have included patients with congenital heart disease, it stands to reason that patients with congenital heart disease could be considered at higher risk for complications from COVID-19	

- \nIn addition to the mechanisms by which COVID-19 can affect patients with CVD risk factors, it is also important to consider COVID-19 in the context of an especially vulnerable group of patients, such as individuals awaiting or post heart transplantation	

- One study of 204 patients with confirmed COVID-19 suggests 48	

**Evaluation:** Most of the chosen sentences are plausible and contains the information parallel to the query, e.g., the exemplary sentences includes the words age, risk and covid. 
However, as seen at the last sentence, it is not guaranteed to get  sentences that make sense.  
In the light of these sentences one can state that ... 

    age is an integral risk factor.     
    the median age of COVID-19 patients is 67 years old.   
    patients with congenital heart disease and CVD risk factors could be considered at higher risk for complications.  
    

**Query: "smoking pre-existing pulmonary disease"**  

- No statistical analysis for evaluating the association between the severity of the disease outcome and smoking status was conducted in that study	

- They concluded that advancing age and pre-existing diseases were the main risk factors leading to death and severe infection	

- A recent systematic review on COVID-19 and smoking including five studies found that smoking was most likely associated with the negative outcomes	

- identified that pre-existing disease and whether the patient is a healthcare professional or not are the main factors influencing recovery	

- We showed that COPD and smoking in COVID-19 is associated with greater disease severity and higher mortality	

**Evaluation:** From the sentences it is obvious that the papers are not decisive on effects of smoking on the patients, however, it is mentioned that pre-existing diseases are the main risk factors.

### Task - What is known about transmission, incubation, and environmental stability?

**Query: "transmission incubation seasonality"**

- It is important to note that the effects of seasonality were not statistically significant

- When grouped thus, the seasonality of the seasonal salmonellas (2) resembled that of the remaining strains (4), while the overall seasonality of serotypes that individually showed little evidence of seasonality were not obviously seasonal when combined	

- Seasonality may be explained by a mixture of factors including climate, social, behavioural, agricultural, environmental, stochastic changes in immune populations, and other drivers	

- To the best of our knowledge, there is a single study, which anecdotally examined seasonality and time series patterns of MERS-CoV to date	

- Even then, our analysis showed that, unlike the trend component, seasonality did not affect the series over time	

**Evaluation:** One should mark that the queries focus on seasonality rather than transmission and incubation.  
About the seasonality of the virus, we gathered obvious information. However, it is mentioned in some sentences that the effect of seasonality is not statistically significant.  


**Query: "range of incubation periods for the disease in humans"**

- The mean incubation period of human infections was around 3

- This finding indicates that with the given data set, an incubation period of 10 days is almost as likely to occur as an incubation period of 6 days

- \nIn previous studies, we found that the length of incubation period in patients infected by SARS and MERS coronaviruses was also significantly correlated with the severity of the disease but in the opposite direction, with a shorter incubation period for fatal cases	

- Few previous studies have investigated the hypothesis that the incubation period might be correlated with the severity of disease, although some studies have examined the correlation between infecting dose and severity of disease	

- A shorter incubation period could be indicative of a higher infective dose, leading to faster/greater pathogen replication, out-running adaptive immune responses or leading to a more aggressive and damaging inflammatory response, and thus leading to more severe disease	

**Evaluation:** On the incubation period contradictary hypothesis are made and no decisive prediction exists. The hypothesis state 3, 6 or 10 days of incubation period. The incubation period might be correlated with the severity of the disease, a short incubation period might indicate a higher infective dose and leading to faster pathogen replication. 

Overall the results suggest that the tool we created is a good start to explore a large amount of dataset and retrieve information. Yet, it should be noted, it is not guaranteed that the retrieved sentences are always plausible, there are cases where the sentences are somehow irrelevant to the query. Therefore, the presented method is in no means an automatic way to gather information, it still needs human intervention to be successfull on its task.




### Visualization

In [79]:
import cufflinks as cf
import plotly.offline as pyo
import plotly.graph_objs as go
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from sklearn.feature_extraction.text import CountVectorizer
data = pd.read_csv(PATH_CSV)

def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(data['text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df2.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 words in papers after removing stop words')


cells 64446
virus 53421
patients 48785
infection 48723
10 41185
using 40321
viral 39805
study 38185
cell 36838
used 35370
data 34705
protein 33512
disease 32532
cov 31935
cases 29732
human 26269
time 26171
viruses 25284
preprint 25027
health 24628


In [81]:
#data = pd.read_csv(PATH_CSV)

def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(data['text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns = ['text' , 'count'])
df4.groupby('text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 bigrams in review after removing stop words')


covid 19 18145
sars cov 16873
mers cov 10754
copyright holder 10511
holder preprint 10494
author funder 10218
peer reviewed 9822
https doi 9131
doi org 9075
org 10 8879
10 1101 8070
et al 7683
funder granted 6740
granted medrxiv 6740
medrxiv license 6740
license display 6726
preprint perpetuity 6725
display preprint 6721
preprint peer 6486
1101 2020 6031


In [98]:
## plot text length
counts = pd.DataFrame(data['text'].str.len())
print(counts.values.shape)


counts.iplot(
    kind='bar', yTitle='Counts', linecolor='black', title='Document Lengths', bins=10)


(5329, 1)
