<a href="https://colab.research.google.com/github/antonpolishko/task-ties/blob/master/TIES_LDA_POC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Set-up Code

In [0]:
!pip install git+https://github.com/bmabey/pyLDAvis.git@master#egg=pyLDAvis -q
!pip install whoosh -q

import pyLDAvis.sklearn

import pandas as pd
import numpy as np
import os
from pathlib import Path, PurePath
from ipywidgets import interact
import ipywidgets as widgets
from collections import defaultdict
import json

import whoosh
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC, NGRAMWORDS
from whoosh.analysis import StemmingAnalyzer,StandardAnalyzer, NgramFilter
from whoosh import index

[K     |████████████████████████████████| 8.9MB 2.8MB/s 
[K     |████████████████████████████████| 552kB 37.4MB/s 
[?25h  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
[31mERROR: xarray 0.15.1 has requirement pandas>=0.25, but you'll have pandas 0.23.4 which is incompatible.[0m
[31mERROR: plotnine 0.6.0 has requirement pandas>=0.25.0, but you'll have pandas 0.23.4 which is incompatible.[0m
[31mERROR: mizani 0.6.0 has requirement pandas>=0.25.0, but you'll have pandas 0.23.4 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement pandas~=1.0.0; python_version >= "3.0", but you'll have pandas 0.23.4 which is incompatible.[0m
[K     |████████████████████████████████| 471kB 2.8MB/s 
[?25h

In [0]:
# set data paths, this requires local drive to have a folder calld "COVID-19" with the clean_metadata.csv file
# returns a string to the local path setup
def setup_local_data():
  from google.colab import drive
  drive.mount('/content/drive')
  drive_path=PurePath('/content/drive/My Drive')
  input_dir = drive_path/'COVID-19'
  print(list(Path(input_dir).glob('*')))
  return input_dir

In [0]:
#read the metadata file into df
def read_metadata_csv(input_dir):
  metadata_path = input_dir / 'clean_metadata.csv'
  metadata = pd.read_csv(metadata_path,
                         dtype={'cord_uid':str,
                                'sha':str,
                                'publish_time': str, 
                                'authors':str,
                                'title': str,
                                'abstract':str,
                                'url': str},
                         parse_dates = ['publish_time']
                         )
  #set the abstract to the paper title if it is null
  metadata['abstract'] = metadata['abstract'] .fillna(metadata['title'])
  metadata['publish_year'] = pd.DatetimeIndex(metadata['publish_time']).year
  metadata = metadata.drop('Unnamed: 0', axis=1)
  return metadata

In [0]:
def read_full_data_csv(input_dir):
  path = input_dir / 'covid_data_full_v5.csv'
  full_data = pd.read_csv(path)
  return full_data

In [0]:
def read_query_dictionary(input_dir):
  path = input_dir / 'covid_query_dictionary.json'
  with open(path) as f:
    query_dict = json.load(f)
  return query_dict

In [0]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def get_count_vectorizer():
  vectorizer = CountVectorizer(min_df =1, max_df=1, #min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
  return vectorizer

def get_lda_model(num_topics):
  lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online')
  return lda_model

  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [0]:
#get hardcoded schema for the index
def get_search_schema():
  schema = Schema(uid = TEXT(stored=True),
                  sha = TEXT(stored=True),
                  year=NUMERIC(stored=True),
                  author=TEXT(stored=True),
                  title=TEXT(analyzer=StandardAnalyzer(),stored=True),
                  abstract=TEXT(analyzer=StandardAnalyzer(),stored=True),
                  url=TEXT(stored=True))
  return schema

# creates an index in a dictionary (only need to run once)
# noop if it's already created
def create_search_index(drive_path, search_schema):
  if not os.path.exists(drive_path/'indexdir'):
      os.mkdir(drive_path/'indexdir')
  ix = index.create_in(drive_path/'indexdir', search_schema)
  # #open an existing index object
  # ix = index.open_dir(drive_path/'indexdir')
  return ix


def add_documents_to_index(ix, metadata):
  #cancel writer in case re-indexing is needed
  if 'writer' in locals():
    writer.cancel()
  #create a writer object to add documents to the index
  writer = ix.writer()

  #now we can add documents to the index
  uid = metadata['cord_uid']
  sha = metadata['sha']
  year = metadata['publish_year']
  author = metadata['authors']
  title = metadata['title']
  abstract = metadata['abstract']
  url = metadata['url']

  for UID, SHA, YEAR, AUTHOR, TITLE, ABSTRACT, URL in zip(uid, sha, year,author,title,abstract,url):
    writer.add_document(uid = str(UID),
                        sha= str(SHA),
                        year= int(YEAR),
                        author=str(AUTHOR),
                        title=str(TITLE),
                        abstract=str(ABSTRACT),
                        url=str(URL))

  #close the writer and save the added documents in the index
  #you should call the commit() function once you finish adding the documents otherwise you will cause an error-
  #when you try to edit the index next time and open another writer. 
  writer.commit()

  # need to cancel writer if error or need to reset
  # writer.cancel()
  return

# get a multifield parser for the list of inptted fields
def get_multifield_parser(fields, search_schema):
  parser = MultifieldParser(fields, schema=search_schema)
  parser.add_plugin(GtLtPlugin())
  parser.add_plugin(SequencePlugin())
  parser.add_plugin(PhrasePlugin())
  return parser

# this takes in a parser and query string to return the actual query that'll be sent to the searcher
def get_parser_query(parser, query):
  result = parser.parse(query) # use boolean operators in quotation
  print(result)
  return result

# Convert the doi to a url
# def doi_url(d): 
#     return f'http://{d}' if d.startswith('doi.org') else f'http://doi.org/{d}'
    
# this method takes in a search index and query to return a dataframe of results
# ix is the document index we created before
# query is the string found from the parser
def get_search_results(ix, query):
  #you can open the searcher using a with statement so the searcher is automatically closed when you’re done with it
  with ix.searcher() as searcher:
      results = searcher.search(query, limit=None)#The Results object acts like a list of the matched documents
      print('Total Hits: {}\n'.format(len(results)))
      # print(results)
      output_dict = defaultdict(list)
      num = 0
      for result in results:
        # print(result)
        num = num + 1
        output_dict['cord_uid'].append(result['uid'])
        output_dict['sha'].append(result['sha'])
        output_dict['bm25_score'].append(result.score)
        output_dict['title'].append(result['title'])
        output_dict['abstract'].append(result['abstract'])
        output_dict['publish_year'].append(result['year'])
        output_dict['authors'].append(result['author'])
        output_dict['url'].append(result['url'])
      print("Loop ran {} times.".format(num))

  # print(len(output_dict['title']))
  output_df = pd.DataFrame(output_dict)
  print(output_df.shape)
  # output_df['doi'] = output_df['doi'].apply(lambda x: doi_url(x) if x !='nan' else x)  
  return output_df

In [0]:
from ipywidgets import interact, Layout, HBox, VBox, Box
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
from IPython.display import update_display

def get_new_text_box():
  textW = widgets.Textarea(
        value='',
        placeholder='Type something like "covid" or incubation',
        description='',
        disabled=False,
        layout=Layout(width='100%', height='50px')
    )
  return textW

def get_new_plus_button():
  button = widgets.Button(description="+")
  return button

def get_new_dropdown():
  dropdown = widgets.Dropdown(
      options=['AND', 'OR', 'NOT'],
      value='AND',
      description='Operator: ',
      disabled=False,
    )
  return dropdown

def dynamic_search_query(parser, ix):
  textW = widgets.Textarea(
        value='',
        placeholder='Type something like "covid" or incubation',
        description='',
        disabled=False,
        layout=Layout(width='100%', height='50px')
    )
  
  button = widgets.Button(description="+")
  search_rows_list = []
  search_rows_list.append( HBox([textW, button], layout=Layout(align_items='center')) )
  # vboxes = [
  #           VBox(search_rows_list, layout=Layout(align_items='center')),
  #           VBox([button], layout=Layout(object_position='bottom'))
  #           ]
  # display_handle = display(vboxes, display_id='disp')
  display_handle = display(VBox(search_rows_list, layout=Layout(align_items='center')), display_id='disp')
  # display(HBox([textW, button], layout=Layout(align_items='center')))

  #search_rows_list is a list of HBox objects
  # the first index will just be a text box and '+' button
  # subsequent rows will have operator, text box, and '+' button
  def on_button_clicked(b):
    global STORED_SEARCH_QUERY
    # global search_rows_list
    clear_output(wait=True)
    new_text_box = get_new_text_box()
    dropdown = get_new_dropdown()
    # new_plus_button = get_new_plus_button()
    search_rows_list.append( HBox([dropdown, new_text_box, button], layout=Layout(align_items='center')) )
    # print(len(search_rows_list))
    # display(VBox(search_rows_list, layout=Layout(align_items='center')))
    display_handle.update(VBox(search_rows_list, layout=Layout(align_items='center')))

    combined = ''
    for i in range(0, len(search_rows_list)-1): #we do len - 1 since newet row has no values
      row = search_rows_list[i]
      # print(row)
      # print(i)
      if i == 0:
        temp = combined + row.children[0].value
        combined = temp
      else:
        temp = combined + ' ' + row.children[0].value + ' ' + row.children[1].value
        combined = temp
    
    print("Current raw search query:\n" + combined)
    print("Current query from parser:")
    query = get_parser_query(parser, combined) #already prints in method
    STORED_SEARCH_QUERY = query
    # update_display(display_id)

  button.on_click(on_button_clicked)


In [0]:
def get_query_dropdown():
  query_dictonary = read_query_dictionary(local_dir)
  dropdown = widgets.Dropdown(
      options=list(query_dictonary.keys()),
      value='What is the range of incubation periods?',
      description='Select a question:',
      disabled=False,
  )
  return dropdown
def precoded_search_query(parser, ix):
  query_dictonary = read_query_dictionary(local_dir)
  select_question = get_query_dropdown()
  def on_change(change):
    if change['name'] == 'value' and (change['new'] != change['old']): #### bug to fix 
      global STORED_PRECODED_QUERY
      precoded_query = query_dictonary[select_question.value]
      query = get_parser_query(parser, precoded_query) #already prints in method
      STORED_PRECODED_QUERY = query
  
  select_question.observe(on_change)
  display(select_question)

# Main Code

##Search Engine

In [0]:
local_dir = setup_local_data()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation.csv'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation.gsheet'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation_v5.csv'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation_v5_target.csv'), PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5.zip'), PosixPath('/content/drive/My Drive/COVID-19/data'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_0.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_1.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_2.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_3.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_4.json'), PosixPath('/content/drive/My Drive/C

In [0]:
metadata = read_metadata_csv(local_dir)
print(metadata.info())
print(metadata.shape)
print(metadata.head(5))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31745 entries, 0 to 31744
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   cord_uid                     31745 non-null  object        
 1   sha                          31744 non-null  object        
 2   source_x                     31745 non-null  object        
 3   title                        31710 non-null  object        
 4   doi                          31745 non-null  object        
 5   pmcid                        16636 non-null  object        
 6   pubmed_id                    24739 non-null  float64       
 7   license                      31745 non-null  object        
 8   abstract                     31744 non-null  object        
 9   publish_time                 31745 non-null  datetime64[ns]
 10  authors                      31191 non-null  object        
 11  journal                      30623 non-nu

In [0]:
search_schema = get_search_schema()
##uncomment this line if indexing database for the first time
# ix = create_search_index(local_dir, search_schema)
# add_documents_to_index(ix, metadata)

In [0]:
#read saved index from directory
ix = index.open_dir(local_dir/'indexdir')

#set default search fields to title OR abstract
fields = ["title", "abstract"] 
parser = get_multifield_parser(fields, search_schema)
STORED_SEARCH_QUERY = '' #this is stored as a global so the last search query from the parser can be used in later cells


##Search Instructions

#####The search engine searches in both title and abstract by default. To search keywords in title or abstract separtely, see example below:
#####**title:("covid-19") AND abstract:("incubation period" OR exposure)**
#####Notes: 
#####Enclose hyphenated words or phrases with quoatation
#####Enclose a group of words to search within a field with parenthesis")


###Manual dynamic search

In [0]:
#STORED_SEARCH_QUERY
dynamic_search_query(parser, ix)

VBox(children=(HBox(children=(Textarea(value='coronavirus', layout=Layout(height='50px', width='100%'), placeh…

Current raw search query:
coronavirus AND incubation
Current query from parser:
((title:coronavirus OR abstract:coronavirus) AND (title:incubation OR abstract:incubation))


In [0]:
print(STORED_SEARCH_QUERY)




###Select question from list

In [0]:
#STORED_PRECODED_QUERY
precoded_search_query(parser, ix)

Dropdown(description='Select a question:', options=('What is the range of incubation periods?', 'What is the r…

((abstract:coronavirus* OR abstract:"corona virus" OR abstract:"covid 19" OR abstract:"2019 ncov" OR abstract:ncov OR abstract:"sars cov") AND NOT (abstract:animal OR abstract:equine* OR abstract:porcine OR abstract:calves OR abstract:dog* OR abstract:canine* OR abstract:feline* OR abstract:bat* OR abstract:camel*) AND (abstract:elderly OR abstract:"older adults" OR abstract:senior* OR abstract:geriatric*))


In [0]:
print(STORED_PRECODED_QUERY)

((abstract:coronavirus* OR abstract:"corona virus" OR abstract:"covid 19" OR abstract:"2019 ncov" OR abstract:ncov OR abstract:"sars cov") AND NOT (abstract:animal OR abstract:equine* OR abstract:porcine OR abstract:calves OR abstract:dog* OR abstract:canine* OR abstract:feline* OR abstract:bat* OR abstract:camel*) AND (abstract:"personal protective equipment" OR abstract:mask OR abstract:facemask OR abstract:n95 OR abstract:n99 OR abstract:gown* OR abstract:gloves OR abstract:"face shield" OR abstract:"eye protection" OR abstract:goggles OR abstract:footwear) AND (abstract:"attack rate" OR abstract:"viral load" OR abstract:"secondary transmission" OR abstract:"nosocomial transmission"))


In [0]:
#plug in STORED_SEARCH_QUERY if using manual search
#plug in STORED_PRECODED_QUERY if using a precoded-query
output = get_search_results(ix, STORED_PRECODED_QUERY)
# print(output.shape)
# output

Total Hits: 58

Loop ran 58 times.
(58, 8)


In [0]:
# data = pd.concat([metadata['title'], metadata['abstract']], axis=1)
# metadata = metadata.dropna(subset=['title', 'abstract'])
# data = metadata['title'] + ' ' + metadata['abstract'] #space ensures title_last_word and abstract_first_word are stored as separate words

# output = output.dropna(subset=['title', 'abstract'])
print(output.shape)
data = output['title'] + ' ' + output['abstract']

print(type(data))
print(len(data))
data = data.tolist()
print(data[0])

(58, 8)


ImportError: ignored

In [0]:
output.head()

Unnamed: 0,cord_uid,sha,bm25_score,title,abstract,publish_year,authors,url
0,xfjexm5b,67ff35bb162142475e1d16e78e56dbcf59ed65bc,45.107886,Impact of self-imposed prevention measures and...,Background: With new cases of COVID-19 surging...,2020,Alexandra Teslya; Thi Mui Pham; Noortje E. God...,https://doi.org/10.1101/2020.03.12.20034827


In [0]:
#save to csv
if not os.path.exists(local_dir/'SearchEngineOutput'):
  os.mkdir(local_dir/'SearchEngineOutput')
output.to_csv(local_dir/'SearchEngineOutput'/'search_engine_duration_infectiousness.csv', index=False)

##Use topic model to find similar articles

In [0]:
#vectorize search output
count_vectorizer = get_count_vectorizer()
data_vectorized = count_vectorizer.fit_transform(data)

In [0]:
#train LDA using vectorized search output
print(count_vectorizer.get_feature_names())
print(len(count_vectorizer.get_feature_names()))
lda_model = get_lda_model(num_topics=10)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

['activity', 'affect', 'affected', 'attack', 'awareness', 'background', 'burden', 'buy', 'cases', 'containment', 'control', 'countries', 'covid-', 'delay', 'delaying', 'developed', 'diagnoses', 'diminish', 'disease', 'dissemination', 'distancing', 'early', 'early-initiated', 'effective', 'effectiveness', 'efficacy', 'epidemic', 'especially', 'evaluate', 'exceeds', 'fast', 'findings', 'government', 'government-imposed', 'handwashing', 'healthcare', 'impact', 'implementation', 'importance', 'increased', 'increasing', 'influenza', 'information', 'interpretation', 'intervention', 'interventions', 'keywords', 'large', 'mask-wearing', 'mathematical', 'measures', 'methods', 'mitigate', 'mitigating', 'model', 'moving', 'new', 'non-case-based', 'number', 'peak', 'phase', 'population', 'postpone', 'prediction', 'prepare', 'prevented', 'preventing', 'prevention', 'rapid', 'rate', 'reaction', 'reduce', 'sars-cov-', 'seasonal', 'self-imposed', 'short-term', 'significantly', 'slow', 'social', 'sprea

In [0]:
print(lda_Z[0]) #we can see that this adds up to 1 since LDA is a summation of topic probabilities
print(lda_model.components_) 
print(lda_model.components_.shape) # this is num_components/topics by num_features, num_features comes from feature names of count vectorizer

[6.17305559e-04 6.17305627e-04 6.17307909e-04 6.17306244e-04
 6.17305875e-04 6.17305679e-04 6.17306168e-04 6.17306229e-04
 9.94444244e-01 6.17307054e-04]
[[0.29329012 0.27306565 0.3089743  0.26320958 0.29082012 0.29125081
  0.28020646 0.27313928 0.26996816 0.29208697 0.27923722 0.25356407
  0.28991256 0.29327835 0.31162978 0.31195942 0.26142973 0.27638028
  0.2812661  0.25822519 0.23726487 0.27765377 0.25596306 0.26287333
  0.27953855 0.27348177 0.26982021 0.27956333 0.28141922 0.27755235
  0.2685208  0.26506778 0.27538361 0.24849985 0.2938771  0.2749859
  0.29915028 0.24361572 0.27806391 0.30281972 0.28891482 0.28353083
  0.27464767 0.24313674 0.28731214 0.2894859  0.26080701 0.26751695
  0.26075271 0.27460909 0.27659312 0.27583983 0.27873792 0.27215158
  0.2720165  0.24528604 0.26476565 0.2527197  0.23977534 0.27686315
  0.29987662 0.31768227 0.27419653 0.28846663 0.29561231 0.30149295
  0.28525188 0.26747028 0.2518046  0.26869794 0.28782225 0.26999371
  0.25328713 0.27540325 0.27855

In [0]:
#get average topic distribution of all search output
avg_topic = lda_Z.mean(axis=0)
print(avg_topic)
print(avg_topic.shape)

[6.17305559e-04 6.17305627e-04 6.17307909e-04 6.17306244e-04
 6.17305875e-04 6.17305679e-04 6.17306168e-04 6.17306229e-04
 9.94444244e-01 6.17307054e-04]
(10,)


In [0]:
#print topic components
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
        
print("LDA Model:")
print_topics(lda_model, count_vectorizer)

LDA Model:
Topic 0:
[('population', 0.3176822681316154), ('strategies', 0.3152047930401017), ('developed', 0.3119594232830865), ('delaying', 0.31162978379343687), ('affected', 0.3089743010846718), ('increased', 0.3028197152888651), ('prevented', 0.3014929511846381), ('phase', 0.2998766153653389), ('impact', 0.29915028179303815), ('prepare', 0.29561231358709983)]
Topic 1:
[('self-imposed', 0.3267476118704375), ('postpone', 0.3206839254698412), ('slow', 0.31517832978256743), ('social', 0.3107827894689542), ('importance', 0.30908886988533907), ('affect', 0.308198510003177), ('background', 0.3048900079910111), ('reduce', 0.30480388743182424), ('developed', 0.30415336621041533), ('phase', 0.3041301684489805)]
Topic 2:
[('measures', 0.349901474064611), ('distancing', 0.34254692838154044), ('countries', 0.32611609195248603), ('buy', 0.3192200221830686), ('effective', 0.3180265101953831), ('postpone', 0.31469320294227093), ('covid-', 0.3138798429846301), ('mathematical', 0.3134862114435917), (

In [0]:
# !pip install --upgrade pandas
# visualize topics using pyLDAvis
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, count_vectorizer, mds='tsne')
panel

In [0]:
from scipy.spatial import distance

In [0]:
print(distance.cosine(avg_topic, lda_Z[0]))

# For one d arrays below, it doesn't make sense to transpose as 2d since calc is always 0
# Cosine score of 0 means 2 vectors are at 90degrees/orthagonal, have no match
# avg_topic = np.array(avg_topic).reshape(-1,1)
# print(avg_topic.shape)
# distances = distance.cdist(avg_topic, avg_topic, metric='cosine')
# print(distances)
# print(avg_topic)
# print(np.array(lda_Z[0]).reshape(-1, 1))
# example = distance.cdist(avg_topic, np.array(lda_Z[0]).reshape(-1, 1), metric='cosine')
# print(example)

0.5916081819765387


In [0]:
#vectorize all title + abstract in metadata
print(data_vectorized.shape)
metadata = metadata.dropna(subset=['title', 'abstract'])
metadata_condensed = metadata['title'] + ' ' + metadata['abstract'] #space ensures title_last_word and abstract_first_word are stored as separate words
metadata_condensed = metadata_condensed.tolist()
print(metadata_condensed[0])

metadata_condensed_vectorized = count_vectorizer.transform(metadata_condensed)
print(metadata_condensed_vectorized.shape)
#derive topic distributions for each document in corpus
lda_corpus = lda_model.transform(metadata_condensed_vectorized)

(1, 90)
SIANN: Strain Identification by Alignment to Near Neighbors Next-generation sequencing is increasingly being used to study samples composed of mixtures of organisms, such as in clinical applications where the presence of a pathogen at very low abundance may be highly important. We present an analytical method (SIANN: Strain Identification by Alignment to Near Neighbors) specifically designed to rapidly detect a set of target organisms in mixed samples that achieves a high degree of species- and strain-specificity by aligning short sequence reads to the genomes of near neighbor organisms, as well as that of the target. Empirical benchmarking alongside the current state-of-the-art methods shows an extremely high Positive Predictive Value, even at very low abundances of the target organism in a mixed sample. SIANN is available as an Illumina BaseSpace app, as well as through Signature Science, LLC. SIANN results are presented in a streamlined report designed to be comprehensible t

In [0]:
#for each entry in the corpus, add a column of cosine distance compared to the avg topic from search query
print(lda_corpus.shape)
for entry in lda_corpus[:5]:
  print(entry)

distances = []
for entry in lda_corpus:
  distances.append(distance.cosine(avg_topic, entry))
print(distances[0])
metadata['cosine_dist_from_topic'] = np.asarray(distances)
metadata.head(5)

(31710, 10)
[0.03333558 0.0333356  0.0333362  0.03333659 0.0333358  0.03333608
 0.03333626 0.03333556 0.69997665 0.03333568]
[0.00909148 0.00909142 0.00909145 0.00909138 0.00909151 0.00909139
 0.00909151 0.00909152 0.91817683 0.00909152]
[0.05000694 0.05000281 0.05000484 0.05000449 0.05000403 0.05000548
 0.05000454 0.050004   0.54995775 0.05000515]
[0.01250062 0.01250068 0.01250067 0.01250088 0.01250066 0.01250084
 0.01250072 0.01250062 0.88749351 0.01250079]
[0.00476214 0.00476222 0.00476215 0.00476225 0.00476222 0.00476221
 0.00476222 0.00476221 0.95714013 0.00476224]
0.009791032576991876


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url,publish_year,cosine_dist_from_topic
0,vho70jcx,f056da9c64fbf00a4645ae326e8a4339d015d155,biorxiv,SIANN: Strain Identification by Alignment to N...,http://doi.org/10.1101/001727,,,biorxiv,Next-generation sequencing is increasingly bei...,2014-01-10,Samuel Minot; Stephen D Turner; Krista L Ternu...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/001727,2014,0.009791
1,i9tbix2v,daf32e013d325a6feb80e83d15aabc64a48fae33,biorxiv,Spatial epidemiology of networked metapopulati...,http://doi.org/10.1101/003889,,,biorxiv,An emerging disease is one infectious epidemic...,2014-06-04,Lin WANG; Xiang Li,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/003889,2014,0.000387
2,62gfisc6,f33c6d94b0efaa198f8f3f20e644625fa3fe10d2,biorxiv,Sequencing of the human IG light chain loci fr...,http://doi.org/10.1101/006866,,,biorxiv,Germline variation at immunoglobulin gene (IG)...,2014-07-03,Corey T Watson; Karyn Meltz Steinberg; Tina A ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/006866,2014,0.034759
3,058r9486,4da8a87e614373d56070ed272487451266dce919,biorxiv,Bayesian mixture analysis for metagenomic comm...,http://doi.org/10.1101/007476,,,biorxiv,Deep sequencing of clinical samples is now an ...,2014-07-25,Sofia Morfopoulou; Vincent Plagnol,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/007476,2014,0.000815
4,wich35l7,eccef80cfbe078235df22398f195d5db462d8000,biorxiv,Mapping a viral phylogeny onto outbreak trees ...,http://doi.org/10.1101/010389,,,biorxiv,Developing methods to reconstruct transmission...,2014-11-11,Stephen P Velsko; Jonathan E Allen,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/010389,2014,8.5e-05


In [0]:
#then display top results from overall corpus
metadata_sorted = metadata.sort_values('cosine_dist_from_topic', ascending=True) #ascending because smaller distance is better
# metadata_sorted.head(5)
metadata_sorted[:20]['title']

558      Impact of self-imposed prevention measures and...
7478     XXIV World Allergy Congress 2015: Seoul, Korea...
8828     36th International Symposium on Intensive Care...
30338    Early dynamics of transmission and control of ...
723      The Effectiveness of Social Distancing in Miti...
7649     A systematic review of community-based interve...
28038    Interventions to mitigate early spread of SARS...
663      Chinese Public Attention to COVID-19 Epidemic:...
30780    Feasibility of controlling COVID-19 outbreaks ...
366      Analysis of epidemiological characteristics of...
2900     Analysis of CDC social control measures using ...
12918    Are countries’ self-reported assessments of th...
196      A spatial model of CoVID-19 transmission in En...
644      Modeling and Forecasting Trend of COVID-19 Epi...
428      Prediction of New Coronavirus Infection Based ...
513      The Impact of School Closure for COVID-19 on t...
7691     Visual analytics of geo-social interaction pat.

In [0]:
metadata_sorted['query'] = 'How effective are personal protective equipment?'

In [0]:
#save to csv
if not os.path.exists(local_dir/'TopicModelOutput'):
  os.mkdir(local_dir/'TopicModelOutput')
metadata_sorted[:50].to_csv(local_dir/'TopicModelOutput'/'PPE_top50.csv', index=False)


In [0]:
frames = []
for f in list(Path(local_dir/'TopicModelOutput').glob('*')):
  frames.append(pd.read_csv(f))
AllOutput = pd.concat(frames).reset_index(drop=True)
AllOutput.info()
AllOutput = AllOutput[['cord_uid','sha','cosine_dist_from_topic','title','abstract','publish_year','authors','url','query']]
AllOutput.to_csv(local_dir/'PrelimResults'/'LDA_baseline_results.csv', index=False)
AllOutput.to_json(local_dir/'PrelimResults'/'LDA_baseline_results.json', orient='records')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   cord_uid                     250 non-null    object 
 1   sha                          250 non-null    object 
 2   source_x                     250 non-null    object 
 3   title                        250 non-null    object 
 4   doi                          250 non-null    object 
 5   pmcid                        134 non-null    object 
 6   pubmed_id                    184 non-null    float64
 7   license                      250 non-null    object 
 8   abstract                     250 non-null    object 
 9   publish_time                 250 non-null    object 
 10  authors                      247 non-null    object 
 11  journal                      217 non-null    object 
 12  Microsoft Academic Paper ID  12 non-null     float64
 13  WHO #Covidence      

TODO:
* Get average topic of subset of corpus (start with arbitary random count, use search queries later.
* Try this on the full corpus of text
* Experiment with different count vecotorizer parameters like ngrams and lda parameters like learning_method
* Integrate pyldavis library for better visuation of topics with LDA
* Use LDA2vec

# References

* https://nlpforhackers.io/topic-modeling/