In [42]:
import pandas as pd
import numpy as np
import os
from pathlib import Path, PurePath
from ipywidgets import interact
import ipywidgets as widgets
from collections import defaultdict

!pip install whoosh
import whoosh
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC, NGRAMWORDS
from whoosh.analysis import StemmingAnalyzer,StandardAnalyzer, NgramFilter
from whoosh import index



In [0]:
!pip install git+https://github.com/bmabey/pyLDAvis.git@master#egg=pyLDAvis

import pyLDAvis.sklearn

In [0]:
def setup_local_data():
  from google.colab import drive
  drive.mount('/content/drive')
  drive_path=PurePath('/content/drive/My Drive')
  input_dir = drive_path/'COVID-19'
  print(list(Path(input_dir).glob('*')))
  return input_dir

In [0]:
#read the metadata file into df
def read_metadata_csv(input_dir):
  metadata_path = input_dir / 'metadata.csv'
  metadata = pd.read_csv(metadata_path,
                         dtype={'publish_time': str, #to do: extract year from 'pubblish time' as int
                                'authors':str,
                                'title': str,
                                'abstract':str,
                                'doi': str}
                         )
  #set the abstract to the paper title if it is null
  metadata['abstract'] = metadata['abstract'] .fillna(metadata['title'])

  return metadata

In [0]:
def read_full_data_csv(input_dir):
  path = input_dir / 'covid_data_full_v5.csv'
  full_data = pd.read_csv(path)
  return full_data

In [0]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def get_count_vectorizer():
  vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
  return vectorizer

def get_lda_model(num_topics):
  lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online')
  return lda_model

In [0]:
#get hardcoded schema for the index
def get_search_schema():
  schema = Schema(date=TEXT(stored=True),
                  author=TEXT(stored=True),
                  title=TEXT(analyzer=StandardAnalyzer(),stored=True),
                  abstract=TEXT(analyzer=StandardAnalyzer(),stored=True),
                  doi=TEXT(stored=True))
  return schema

# creates an index in a dictionary (only need to run once)
# noop if it's already created
def create_search_index(drive_path, search_schema):
  if not os.path.exists(drive_path/'indexdir'):
      os.mkdir(drive_path/'indexdir')
  ix = index.create_in(drive_path/'indexdir', search_schema)
  #open an existing index object
  ix = index.open_dir(drive_path/'indexdir')
  return ix


def add_documents_to_index(ix, metadata):
  #create a writer object to add documents to the index
  writer = ix.writer()

  #now we can add documents to the index
  # ref_id = metadata['ref_id']
  date = metadata['publish_time']
  author = metadata['authors']
  title = metadata['title']
  abstract = metadata['abstract']
  doi = metadata['doi']

  for DATE, AUTHOR, TITLE, ABSTRACT, DOI in zip(date,author,title,abstract,doi):
    writer.add_document(date= str(DATE),
                        author=str(AUTHOR),
                        title=str(TITLE),
                        abstract=str(ABSTRACT),
                        doi=str(DOI))

  #close the writer and save the added documents in the index
  #you should call the commit() function once you finish adding the documents otherwise you will cause an error-
  #when you try to edit the index next time and open another writer. 
  writer.commit()

  # need to cancel writer if error or need to reset
  # writer.cancel()
  return

# get a multifield parser for the list of inptted fields
def get_multifield_parser(fields, search_schema):
  parser = MultifieldParser(fields, schema=search_schema)
  return parser

# this takes in a parser and query string to return the actual query that'll be sent to the searcher
def get_parser_query(parser, query):
  result = parser.parse(query) # use boolean operators in quotation
  print(result)
  return result

# Convert the doi to a url
def doi_url(d): 
    return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'
    
# this method takes in a search index and query to return a dataframe of results
# ix is the document index we created before
# query is the string found from the parser
def get_search_results(ix, query):
  #you can open the searcher using a with statement so the searcher is automatically closed when you’re done with it
  with ix.searcher() as searcher:
      results = searcher.search(query)#The Results object acts like a list of the matched documents
      print('Total Hits: {}\n'.format(len(results)))
      print(results)
      output_dict = defaultdict(list)
      num = 0
      for result in results:
        print(result)
        num = num + 1
        output_dict['title'].append(result['title'])
        output_dict['abstract'].append(result['abstract'])
        output_dict['publish_time'].append(result['date'])
        output_dict['authors'].append(result['author'])
        output_dict['doi'].append(result['doi'])
      print("Loop ran {} times.".format(num))

  print(len(output_dict['title']))
  output_df = pd.DataFrame(output_dict)
  print(output_df.shape)
  output_df['doi'] = output_df['doi'].apply(lambda x: doi_url(x) if x !='nan' else x)  
  return output_df

In [0]:
from ipywidgets import interact, Layout, HBox, VBox, Box
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from IPython.display import update_display

def get_new_text_box():
  textW = widgets.Textarea(
        value='',
        placeholder='Type something like "covid" or incubation',
        description='',
        disabled=False,
        layout=Layout(width='100%', height='50px')
    )
  return textW

def get_new_plus_button():
  button = widgets.Button(description="+")
  return button

def get_new_dropdown():
  dropdown = widgets.Dropdown(
      options=['AND', 'OR', 'NOT'],
      value='AND',
      description='Operator: ',
      disabled=False,
    )
  return dropdown

def dynamic_search_query(parser, ix):
  textW = widgets.Textarea(
        value='',
        placeholder='Type something like "covid" or incubation',
        description='',
        disabled=False,
        layout=Layout(width='100%', height='50px')
    )
  
  button = widgets.Button(description="+")
  search_rows_list = []
  search_rows_list.append( HBox([textW, button], layout=Layout(align_items='center')) )
  # vboxes = [
  #           VBox(search_rows_list, layout=Layout(align_items='center')),
  #           VBox([button], layout=Layout(object_position='bottom'))
  #           ]
  # display_handle = display(vboxes, display_id='disp')
  display_handle = display(VBox(search_rows_list, layout=Layout(align_items='center')), display_id='disp')
  # display(HBox([textW, button], layout=Layout(align_items='center')))

  #search_rows_list is a list of HBox objects
  # the first index will just be a text box and '+' button
  # subsequent rows will have operator, text box, and '+' button
  def on_button_clicked(b):
    global STORED_SEARCH_QUERY
    # global search_rows_list
    clear_output(wait=True)
    new_text_box = get_new_text_box()
    dropdown = get_new_dropdown()
    # new_plus_button = get_new_plus_button()
    search_rows_list.append( HBox([dropdown, new_text_box, button], layout=Layout(align_items='center')) )
    # print(len(search_rows_list))
    # display(VBox(search_rows_list, layout=Layout(align_items='center')))
    display_handle.update(VBox(search_rows_list, layout=Layout(align_items='center')))

    combined = ''
    for i in range(0, len(search_rows_list)-1): #we do len - 1 since newet row has no values
      row = search_rows_list[i]
      # print(row)
      # print(i)
      if i == 0:
        temp = combined + row.children[0].value
        combined = temp
      else:
        temp = combined + ' ' + row.children[0].value + ' ' + row.children[1].value
        combined = temp
    
    print("Current raw search query:\n" + combined)
    print("Current query from parser:")
    query = get_parser_query(parser, combined) #already prints in method
    STORED_SEARCH_QUERY = query
    # update_display(display_id)

  button.on_click(on_button_clicked)


# Main Code

In [50]:
local_dir = setup_local_data()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[PosixPath('/content/drive/My Drive/COVID-19/metadata.csv'), PosixPath('/content/drive/My Drive/COVID-19/covid_data_full_v5.json'), PosixPath('/content/drive/My Drive/COVID-19/indexdir')]


In [51]:
metadata = read_metadata_csv(local_dir)
print(metadata.info())
print(metadata.shape)
print(metadata.head(5))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45774 entries, 0 to 45773
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   cord_uid                     45774 non-null  object 
 1   sha                          31753 non-null  object 
 2   source_x                     45774 non-null  object 
 3   title                        45617 non-null  object 
 4   doi                          42440 non-null  object 
 5   pmcid                        26243 non-null  object 
 6   pubmed_id                    34641 non-null  float64
 7   license                      45774 non-null  object 
 8   abstract                     45766 non-null  object 
 9   publish_time                 45765 non-null  object 
 10  authors                      43774 non-null  object 
 11  journal                      41707 non-null  object 
 12  Microsoft Academic Paper ID  964 non-null    float64
 13  WHO #Covidence  

In [0]:
search_schema = get_search_schema()
ix = create_search_index(local_dir, search_schema)
add_documents_to_index(ix, metadata)

In [0]:
ix = index.open_dir(local_dir/'indexdir')

fields = ["title", "abstract"]
parser = get_multifield_parser(fields, search_schema)
STORED_SEARCH_QUERY = '' #this is stored as a global so the last search query from the parser can be used in later cells
dynamic_search_query(parser, ix)

VBox(children=(HBox(children=(Textarea(value='"covid-19"', layout=Layout(height='50px', width='100%'), placeho…

Current raw search query:
"covid-19" AND incubation
Current query from parser:
((title:"covid 19" OR abstract:"covid 19") AND (title:incubation OR abstract:incubation))


In [54]:
print(STORED_SEARCH_QUERY)

((title:"covid 19" OR abstract:"covid 19") AND (title:incubation OR abstract:incubation))


In [102]:
output = get_search_results(ix, STORED_SEARCH_QUERY)
print(output.shape)
output

Total Hits: 78

<Top 10 Results for And([Or([Phrase('title', ['covid', '19'], slop=1, boost=1.000000), Phrase('abstract', ['covid', '19'], slop=1, boost=1.000000)]), Or([Term('title', 'incubation'), Term('abstract', 'incubation')])]) runtime=0.03502632799973071>
<Hit {'abstract': 'Motivation: Wuhan pneumonia is an acute infectious disease caused by the 2019 novel coronavirus (COVID-19). It is being treated as a Class A infectious disease though it was classified as Class B according to the Infectious Disease Prevention Act of China. Accurate estimation of the incubation period of the coronavirus is essential to the prevention and control. However, it remains unclear about its exact incubation period though it is believed that symptoms of COVID-19 can appear in as few as 2 days or as long as 14 or even more after exposure. The accurate incubation period calculation requires original chain-of-infection data that may not be fully available in the Wuhan regions. In this study, we aim to ac

Unnamed: 0,title,abstract,publish_time,authors,doi
0,Estimate the incubation period of coronavirus ...,Motivation: Wuhan pneumonia is an acute infect...,2020-02-29,Henry Han,http://doi.org/10.1101/2020.02.24.20027474
1,A Chinese Case of COVID-19 Did Not Show Infect...,Controversy remains over whether the novel cor...,2020-03-02,"Bae, Jong-Myon",http://doi.org/10.3961/jpmph.20.048
2,Transmission of COVID-19 in the terminal stage...,Abstract We report a familial cluster of 2019 ...,2020-03-16,"Li, Peng; Fu, Ji-Bo; Li, Ke-Feng; Chen, Yan; W...",http://doi.org/10.1016/j.ijid.2020.03.027
3,The Incubation Period of Coronavirus Disease 2...,"BACKGROUND: A novel human coronavirus, severe ...",2020-03-10,"Lauer, Stephen A.; Grantz, Kyra H.; Bi, Qifang...",http://doi.org/10.7326/m20-0504
4,Estimating the distribution of the incubation ...,Objectives: Amid the continuing spread of the ...,2020-02-18,Char Leung,http://doi.org/10.1101/2020.02.13.20022822
5,Epidemiologic Characteristics of COVID-19 in G...,"At the end of 2019, a coronavirus disease 2019...",2020-03-06,Kaike Ping,http://doi.org/10.1101/2020.03.01.20028944
6,Estimation of incubation period distribution o...,Background: The current outbreak of coronaviru...,2020-03-10,Qin Jing; Chong You; Qiushi Lin; Taojun Hu; Sh...,http://doi.org/10.1101/2020.03.06.20032417
7,[Epidemiological analysis on a family cluster ...,Objective: To understand the possible transmis...,2020,"Qiu, Y. Y.; Wang, S. Q.; Wang, X. L.; Lu, W. X...",http://doi.org/10.3760/cma.j.cn112338-20200221...
8,Epidemiological analysis on a family cluster o...,Objective To understand the possible transmiss...,2020,"QIU, Yuanying; WANG, Songqiang; WANG, Xiaoli; ...",
9,Epidemiological characteristics of 1212 COVID-...,Based on publicly released data for 1212 patie...,2020-02-23,Pei Wang; Junan Lu; Yanyu Jin; Mengfan Zhu; Li...,http://doi.org/10.1101/2020.02.21.20026112


In [90]:
# data = pd.concat([metadata['title'], metadata['abstract']], axis=1)
# metadata = metadata.dropna(subset=['title', 'abstract'])
# data = metadata['title'] + ' ' + metadata['abstract'] #space ensures title_last_word and abstract_first_word are stored as separate words

# output = output.dropna(subset=['title', 'abstract'])
print(output.shape)
data = output['title'] + ' ' + output['abstract']

print(type(data))
print(len(data))
data = data.tolist()
print(data[0])

(10, 5)
<class 'pandas.core.series.Series'>
10
Estimate the incubation period of coronavirus 2019 (COVID-19) Motivation: Wuhan pneumonia is an acute infectious disease caused by the 2019 novel coronavirus (COVID-19). It is being treated as a Class A infectious disease though it was classified as Class B according to the Infectious Disease Prevention Act of China. Accurate estimation of the incubation period of the coronavirus is essential to the prevention and control. However, it remains unclear about its exact incubation period though it is believed that symptoms of COVID-19 can appear in as few as 2 days or as long as 14 or even more after exposure. The accurate incubation period calculation requires original chain-of-infection data that may not be fully available in the Wuhan regions. In this study, we aim to accurately calculate the incubation period of COVID-19 by taking advantage of the chain-of-infection data, which is well-documented and epidemiologically informative, outside 

In [0]:
count_vectorizer = get_count_vectorizer()
data_vectorized = count_vectorizer.fit_transform(data)

In [59]:
print(count_vectorizer.get_feature_names())
print(len(count_vectorizer.get_feature_names()))
lda_model = get_lda_model(num_topics=10)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

['analysis', 'cases', 'china', 'collected', 'control', 'coronavirus', 'data', 'days', 'disease', 'epidemiological', 'estimated', 'infection', 'methods', 'novel', 'period', 'periods', 'possible', 'results', 'spread', 'wuhan']
20
(10, 10)


In [60]:
print(lda_Z[0]) #we can see that this adds up to 1 since LDA is a summation of topic probabilities
print(lda_model.components_) 
print(lda_model.components_.shape) # this is num_components/topics by num_features, num_features comes from feature names of count vectorizer

[0.00250066 0.00250058 0.00250008 0.00250111 0.00250063 0.0025007
 0.97749526 0.00250008 0.00250082 0.00250008]
[[ 0.30619458  2.2050618   1.86891172  1.13214257  0.36082922  3.48732519
   1.07603373  0.28451145  1.07434196  0.42799543  1.05473556  1.90982817
   1.11622731  2.68618903  6.79127235  1.06508479  0.36834202  1.1324525
   1.15221187  0.3670752 ]
 [ 0.27596323  0.27505769  0.28621845  0.28986656  0.26242644  1.10398316
   0.33081843  0.33290873  0.29322442  2.70700141  0.3423138   0.27866432
   0.28329516  1.10430412  4.39946189  0.29476001  0.26843801  0.27977835
   0.28876665  0.27017695]
 [ 0.29336851  0.31659914  0.2785367   0.2702323   0.26631201  0.24527498
   0.28570375  0.30662175  0.27387818  0.24561147  0.28314497  0.28940707
   0.28435678  0.2518657   0.2372315   0.26774431  0.2605265   0.28708527
   0.28321949  0.30367904]
 [ 1.09576967  8.92353543  4.89341818  0.99755302  1.88327049  4.22396301
   0.99741647  6.64673451  4.24647908  0.28061151  1.81991726  2.643

In [61]:
avg_topic = lda_Z.mean(axis=0)
print(avg_topic)
print(avg_topic.shape)

[0.19043396 0.09610588 0.005197   0.19821268 0.09931433 0.1019717
 0.1026975  0.005197   0.19567295 0.00519699]
(10,)


In [62]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
        
print("LDA Model:")
print_topics(lda_model, count_vectorizer)

LDA Model:
Topic 0:
[('period', 6.791272354537003), ('coronavirus', 3.4873251937034135), ('novel', 2.6861890259043384), ('cases', 2.205061798334391), ('infection', 1.9098281718237424), ('china', 1.8689117181024524), ('spread', 1.1522118694350119), ('results', 1.132452497862127), ('collected', 1.1321425721695293), ('methods', 1.1162273089333186)]
Topic 1:
[('period', 4.399461886746125), ('epidemiological', 2.707001409783347), ('novel', 1.1043041164305767), ('coronavirus', 1.1039831627268377), ('estimated', 0.34231380191856464), ('days', 0.33290872960567564), ('data', 0.33081842684784396), ('periods', 0.2947600067102226), ('disease', 0.2932244160969522), ('collected', 0.2898665555199142)]
Topic 2:
[('cases', 0.31659913679786805), ('days', 0.306621745525995), ('wuhan', 0.30367904453606404), ('analysis', 0.2933685062199552), ('infection', 0.28940707404778715), ('results', 0.28708527470358347), ('data', 0.28570375040867846), ('methods', 0.28435678371334033), ('spread', 0.2832194922769728), 

In [64]:
!pip install --upgrade pandas

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, count_vectorizer, mds='tsne')
panel

ERROR! Session/line number was not unique in database. History logging moved to new session 59
Collecting pandas
[?25l  Downloading https://files.pythonhosted.org/packages/bb/71/8f53bdbcbc67c912b888b40def255767e475402e9df64050019149b1a943/pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0MB)
[K     |████████████████████████████████| 10.0MB 3.3MB/s 
[31mERROR: pyldavis 2.1.3 has requirement pandas<0.24.0a,>=0.17.0, but you'll have pandas 1.0.3 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 0.23.4
    Uninstalling pandas-0.23.4:
      Successfully uninstalled pandas-0.23.4
Successfully installed pandas-1.0.3


In [0]:
from scipy.spatial import distance

In [66]:
print(distance.cosine(avg_topic, lda_Z[0]))

# For one d arrays below, it doesn't make sense to transpose as 2d since calc is always 0
# Cosine score of 0 means 2 vectors are at 90degrees/orthagonal, have no match
# avg_topic = np.array(avg_topic).reshape(-1,1)
# print(avg_topic.shape)
# distances = distance.cdist(avg_topic, avg_topic, metric='cosine')
# print(distances)
# print(avg_topic)
# print(np.array(lda_Z[0]).reshape(-1, 1))
# example = distance.cdist(avg_topic, np.array(lda_Z[0]).reshape(-1, 1), metric='cosine')
# print(example)

0.7324319634802485


In [73]:
print(data_vectorized.shape)
metadata = metadata.dropna(subset=['title', 'abstract'])
metadata_condensed = metadata['title'] + ' ' + metadata['abstract'] #space ensures title_last_word and abstract_first_word are stored as separate words
metadata_condensed = metadata_condensed.tolist()
print(metadata_condensed[0])

metadata_condensed_vectorized = count_vectorizer.transform(metadata_condensed)
print(metadata_condensed_vectorized.shape)
lda_corpus = lda_model.transform(metadata_condensed_vectorized)

(10, 20)
SIANN: Strain Identification by Alignment to Near Neighbors Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species- and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible 

In [82]:
#for each entry in the corpus, add a column of cosine distance compared to the avg topic from search query
print(lda_corpus.shape)
for entry in lda_corpus[:5]:
  print(entry)

distances = []
for entry in lda_corpus:
  distances.append(distance.cosine(avg_topic, entry))
print(distances[0])
metadata['cosine_dist_from_topic'] = np.asarray(distances)
metadata.head(5)

(45617, 10)
[0.02500537 0.0250004  0.02500114 0.02500642 0.02500023 0.02500499
 0.02500407 0.02500084 0.77497529 0.02500124]
[0.03333719 0.0333337  0.03333409 0.69997358 0.03333357 0.03334198
 0.03334436 0.03333422 0.03333358 0.03333373]
[0.84998213 0.01666963 0.01666709 0.01666784 0.01666852 0.01667321
 0.01667071 0.01666701 0.01666679 0.01666707]
[0.02000122 0.02000026 0.02000057 0.02000119 0.02000651 0.0200027
 0.81998433 0.02000073 0.02000205 0.02000042]
[0.00769418 0.00769246 0.00769266 0.00769377 0.00769294 0.93076057
 0.00769451 0.0076926  0.00769369 0.00769262]
0.43781609758594153


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url,cosine_dist_from_topic
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727,0.437816
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889,0.403593
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866,0.475124
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476,0.683339
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389,0.721292


In [86]:
#then display top results from overall corpus
metadata_sorted = metadata.sort_values('cosine_dist_from_topic', ascending=False)
# metadata_sorted.head(5)
metadata_sorted[:20]['title']

344      Epidemiological characteristics of 1212 COVID-...
572      Estimation of incubation period distribution o...
385      Estimate the incubation period of coronavirus ...
2038     A Chinese Case of COVID-19 Did Not Show Infect...
7350     A comparison of smartphones to paper-based que...
9050     Utilizing Nontraditional Data Sources for Near...
34603    Spatial and temporal epidemiological analysis ...
23502    SOURCES OF DATA FOR IMPROVED SURVEILLANCE OF H...
10172    De novo assembly of highly polymorphic metagen...
12324    Evaluation of Data Exchange Process for Intero...
39950    Health-Care Data Collecting, Sharing, and Usin...
7371     Exploration of diarrhoea seasonality and its d...
13759    The use and reporting of airline passenger dat...
6858     Outcomes of Influenza A(H1N1)pdm09 Virus Infec...
1279     Analysis of factors associated with disease ou...
13887    Patterns of seasonal and pandemic influenza-as...
13140    Bayesian inference of transmission chains usin.

TODO:
* Get average topic of subset of corpus (start with arbitary random count, use search queries later.
* Try this on the full corpus of text
* Experiment with different count vecotorizer parameters like ngrams and lda parameters like learning_method
* Integrate pyldavis library for better visuation of topics with LDA
* Use LDA2vec

# References

* https://nlpforhackers.io/topic-modeling/