<a href="https://colab.research.google.com/github/antonpolishko/task-ties/blob/master/SciBert_Faiss_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install faiss-cpu --no-cache -q
!pip install --upgrade git+https://github.com/zalandoresearch/flair.git -q
!pip install transformers -q 

[K     |████████████████████████████████| 7.2MB 1.4MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 983kB 1.4MB/s 
[K     |████████████████████████████████| 798kB 7.6MB/s 
[K     |████████████████████████████████| 573kB 8.7MB/s 
[K     |████████████████████████████████| 256kB 11.0MB/s 
[K     |████████████████████████████████| 1.0MB 10.7MB/s 
[K     |████████████████████████████████| 870kB 14.6MB/s 
[K     |████████████████████████████████| 3.7MB 16.6MB/s 
[?25h  Building wheel for flair (PEP 517) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Building wheel for segtok (setup.py) ... [?25l[?25hdone
  Building wheel for mpld3 (setup.py) ... [?25l[?25hdone
  Building wheel for sqlitedict (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l

In [0]:
import pandas as pd
import numpy as np
from pathlib import Path, PurePath
import faiss
from flair.data import Sentence
from flair.embeddings import BertEmbeddings,DocumentPoolEmbeddings
from transformers import *

## Similarity Search

In [0]:
# set data paths, this requires local drive to have a folder calld "COVID-19" with the metadata.csv file
# returns a string to the local path setup
def setup_local_data():
  from google.colab import drive
  drive.mount('/content/drive')
  drive_path=PurePath('/content/drive/My Drive')
  input_dir = drive_path/'COVID-19'
  print(list(Path(input_dir).glob('*')))
  return input_dir

In [0]:
#read the metadata file into df
def read_metadata_csv(input_dir):
    metadata_path = input_dir/ 'clean_metadata.csv'
    metadata = pd.read_csv(metadata_path, 
                         dtype={'title':str,
                                'abstract':str})
    #set the abstract to the paper title if it is null
    metadata['abstract'] = metadata['abstract'].fillna(metadata['title'])
    #remove if abstract is empty or contains only one word
    metadata = metadata.dropna(subset=['abstract'], axis = 0)
    metadata['number_tokens'] = metadata['abstract'].apply(lambda x: len(x.split()))
    metadata = metadata[metadata['number_tokens']>1].reset_index(drop=True)
    metadata = metadata.drop('Unnamed: 0', axis=1)
    return metadata

In [0]:
# read preprocessed SciBERT embeddings
def read_summary_data(input_dir):
  summary_path = input_dir/'AbstractSummaries' 
  summaries = pd.concat([pd.read_json(f) for f in Path(summary_path).glob('*')]).reset_index(drop=True)
  return summaries

def read_embeddings(input_dir):
  vector_path = input_dir/'AbstractEmbeddings' 
  embeddings = pd.concat([pd.read_json(f) for f in Path(vector_path).glob('*')]).reset_index(drop=True)
  return embeddings

In [0]:
def get_embeddings(text, model):
  sentence = Sentence(text)
  document_embedding = DocumentPoolEmbeddings([model],
                                             pooling= 'mean')
  document_embedding.embed(sentence)
  # now check out the embedded sentence.
  return sentence.get_embedding().data.numpy()

In [0]:
# read database to be indexed
local_dir = setup_local_data()
# metadata = read_metadata_csv(local_dir)
# summaries = read_summary_data(local_dir)
# embeddings = read_embeddings(local_dir)
# print(metadata.info())
# print(summaries.info())
# print(embeddings.info())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation.csv'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation.gsheet'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation_v5.csv'), PosixPath('/content/drive/My Drive/COVID-19/TestSearchResults_Incubation_v5_target.csv'), PosixPath('/content/drive/My Drive/COVID-19/CORD-19-research-challenge-v5.zip'), PosixPath('/content/drive/My Drive/COVID-19/data'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_0.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_1.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_2.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_3.json'), PosixPath('/content/drive/My Drive/COVID-19/covid_vectors_part_4.json'), PosixPath('/content/drive/My Drive/C

In [0]:
merged = metadata.merge(summaries, on=['cord_uid','sha']).merge(embeddings, on=['cord_uid','sha'])
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31647 entries, 0 to 31646
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   cord_uid                     31647 non-null  object 
 1   sha                          31646 non-null  object 
 2   source_x                     31647 non-null  object 
 3   title                        31647 non-null  object 
 4   doi                          31647 non-null  object 
 5   pmcid                        16634 non-null  object 
 6   pubmed_id                    24729 non-null  float64
 7   license                      31647 non-null  object 
 8   abstract                     31647 non-null  object 
 9   publish_time                 31647 non-null  object 
 10  authors                      31149 non-null  object 
 11  journal                      30525 non-null  object 
 12  Microsoft Academic Paper ID  357 non-null    float64
 13  WHO #Covidence  

In [0]:
merged[pd.isnull(merged['scibert_emb'])]['summary']

1079      
1120      
1228      
1235      
1258      
        ..
30994     
31008     
31194     
31274     
31557     
Name: summary, Length: 1154, dtype: object

In [0]:
merged[pd.isnull(merged['scibert_emb'])]['abstract']

1079                Fear of the novel coronavirus
1120                Gene Chip for Viral Discovery
1228     Emerging diseases threaten conservation.
1235                              The Big Picture
1258              Ecological Change: Life Lessons
                           ...                   
30994      The danger of stories in global health
31008                      Travel-related illness
31194                          [Figure: see text]
31274            Washing our hands of the problem
31557                 A Disease Around the Corner
Name: abstract, Length: 1154, dtype: object

In [0]:
for row in merged.loc[merged['scibert_emb'].isnull(), 'scibert_emb'].index:
  try:
    merged.at[row, 'scibert_emb'] = get_embeddings(merged.iloc[row]['abstract'], emb_model)
  except RuntimeError:
    #two articles have very long abstracts that exceeds bert's sequence length limit
    merged.at[row, 'scibert_emb'] = get_embeddings(merged.iloc[row]['abstract'][:512], emb_model) 

In [0]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31647 entries, 0 to 31646
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   cord_uid                     31647 non-null  object 
 1   sha                          31646 non-null  object 
 2   source_x                     31647 non-null  object 
 3   title                        31647 non-null  object 
 4   doi                          31647 non-null  object 
 5   pmcid                        16634 non-null  object 
 6   pubmed_id                    24729 non-null  float64
 7   license                      31647 non-null  object 
 8   abstract                     31647 non-null  object 
 9   publish_time                 31647 non-null  object 
 10  authors                      31149 non-null  object 
 11  journal                      30525 non-null  object 
 12  Microsoft Academic Paper ID  357 non-null    float64
 13  WHO #Covidence  

In [0]:
check_lens = [len(lst) for lst in merged['scibert_emb']]
print(set(check_lens))

{768}


In [0]:
# merged.to_json(local_dir/'metadata_scibert.json')

In [0]:
del merged

In [0]:
summary_embeddings = pd.read_json(local_dir/'metadata_scibert.json') 
summary_embeddings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31647 entries, 0 to 31646
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   cord_uid                     31647 non-null  object        
 1   sha                          31646 non-null  object        
 2   source_x                     31647 non-null  object        
 3   title                        31647 non-null  object        
 4   doi                          31647 non-null  object        
 5   pmcid                        16634 non-null  object        
 6   pubmed_id                    24729 non-null  float64       
 7   license                      31647 non-null  object        
 8   abstract                     31647 non-null  object        
 9   publish_time                 31647 non-null  datetime64[ns]
 10  authors                      31149 non-null  object        
 11  journal                      30525 non-nu

##Faiss similarity search

In [0]:
# create a matrix to store abstarct vectors 
xb = np.array([np.array(lst) for lst in summary_embeddings['scibert_emb']]).astype('float32')
print(xb.shape)
# assign dimension for the vector space
d = xb.shape[1] 

(31647, 768)


In [0]:
# create a matrix to store queries 
# (basically the same matrix since we will use a selected article to find similar)
xq = xb.copy()

In [0]:
# build the index
index = faiss.IndexFlatL2(d) #brute-force L2 index 
print(index.is_trained)
# add vectors to the index
index.add(xb)                  
print(index.ntotal)

True
31647


In [0]:
summary_embeddings.iloc[20:30]

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url,number_tokens,summary,scibert_emb
20,2cerplno,53442eacc3f233078507fa37b78267399e8c1e3b,biorxiv,Dysregulation of Long Non-coding RNA (lncRNA) ...,http://doi.org/10.1101/061788,,,biorxiv,ABSTRACTZika Virus (ZIKV) is a causative agent...,2016-07-01,Arunachalam Ramaiah; Deisy Contreras; Vineela ...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/061788,286,ABSTRACTZika Virus (ZIKV) is a causative agent...,"[0.0098563898, -0.25841644410000003, -0.283691..."
21,t3tm6f9p,c04e10858012bdd666d50c0d69c3d1e7224ccbea,biorxiv,Design: An assay based on single-polypeptide-c...,http://doi.org/10.1101/065250,,,biorxiv,AbstractBackgroundThe adenosine A2A receptor (...,2016-07-26,Toshio Kamiya; Takashi Masuko; Dasiel Oscar Bo...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/065250,273,AbstractBackgroundThe adenosine A2A receptor (...,"[0.4562368691, 0.10551598670000001, -0.0793895..."
22,57sp9d9l,5570cb9ff2905fda60cfd6e94f3c32774b765265,biorxiv,The genome of the crustacean Parhyale hawaiens...,http://doi.org/10.1101/065789,,,biorxiv,ABSTRACTThe amphipod crustacean Parhyale hawai...,2016-07-25,Damian Kao; Alvina G. Lai; Evangelia Stamataki...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/065789,150,ABSTRACTThe amphipod crustacean Parhyale hawai...,"[0.057457856800000005, -0.0752342641, -0.06209..."
23,tiyjmih7,45f184e736c4a857bc0e3df0d55962f743c1493a,biorxiv,Similar ratios of introns to intergenic sequen...,http://doi.org/10.1101/068627,,,biorxiv,AbstractOne central goal of genome biology is ...,2016-08-09,Warren R. Francis; Gert Wörheide,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/068627,184,AbstractOne central goal of genome biology is ...,"[0.09885469820000001, -0.1979083121, -0.022167..."
24,30kwl4rj,ffbf8ea9948d73572fd052a74afa01b19e6758a3,biorxiv,Planning horizon affects prophylactic decision...,http://doi.org/10.1101/069013,,,biorxiv,AbstractHuman behavior can change the spread o...,2016-08-12,Luis G. Nardin; Craig R. Miller; Benjamin J. R...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/069013,161,AbstractHuman behavior can change the spread o...,"[-0.0726864636, -0.062798582, -0.1507194936, -..."
25,1znuxoj5,fd3fc2c49f5cc27e4262261d0c9045911d65cb6e,biorxiv,XRN1 is a Species-Specific Virus Restriction F...,http://doi.org/10.1101/069799,,,biorxiv,"AbstractIn eukaryotes, the degradation of cell...",2016-08-16,Paul A. Rowley; Brandon Ho; Sarah Bushong; Arl...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/069799,278,"AbstractIn eukaryotes, the degradation of cell...","[0.3114904463, -0.07228434090000001, -0.057984..."
26,pxpujrhx,44a440cc1c135c938d2216ea672f8ef4f9c01296,biorxiv,Genome-wide Prediction of microRNAs in Zika vi...,http://doi.org/10.1101/070656,,,biorxiv,AbstractZika virus (ZIKV) is a member of the f...,2016-08-21,Juan Cristina; Natalia Echeverría; Fabiana Gam...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/070656,209,AbstractZika virus (ZIKV) is a member of the f...,"[0.37312006950000004, 0.240855068, 0.258061826..."
27,2j4z5rp8,9b5f5119bbfbded3245acc37859cefde967458e7,biorxiv,Containing Emerging Epidemics: a Quantitative ...,http://doi.org/10.1101/072652,,,biorxiv,ABSTRACTStrategies for containing an emerging ...,2016-08-31,Corey M Peak; Lauren M Childs; Yonatan H Grad;...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/072652,334,ABSTRACTStrategies for containing an emerging ...,"[0.3610961735, -0.395444721, -0.3517359197, 0...."
28,8elmjre9,8f4b98da50277e6c9516aef0227f5cca9af6b5e8,biorxiv,Identification of quercetin from fruits to imm...,http://doi.org/10.1101/074559,,,biorxiv,Zika virus is spread mainly by the bite of an ...,2016-09-11,Amrita Roy; Liangzhong Lim; Jianxing Song,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/074559,206,Zika virus is spread mainly by the bite of an ...,"[0.1410364658, -0.132661432, -0.1461878717, 0...."
29,l82aakrk,f9b108c052ded463773a9838a8157ac7d5ba6d1a,biorxiv,Unique properties of Zika NS2B-NS3pro complexe...,http://doi.org/10.1101/078113,,,biorxiv,ABSTRACTZika virus can be passed from a pregna...,2016-09-28,Amrita Roy; Liangzhong Lim; Shagun Srivastava;...,,,,True,biorxiv_medrxiv,https://doi.org/10.1101/078113,240,ABSTRACTZika virus can be passed from a pregna...,"[-0.0201610141, 0.0819289982, -0.2859653831, 0..."


In [0]:
# get selected article's index from df (not sure how selected article will be saved)
# Title: The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application
# ce8609a60724d457d5b5916d57a31dea0ffb831b
# Title: Duration of viral detection in throat and rectum of a patient with COVID-19
# 598d3eb737dfa7701ce8c29c86bc9f6589d8a581
# Title: Asymptomatic carrier state, acute respiratory disease, and pneumonia due to severe acute respiratory syndrome coronavirus 2 (SARSCoV-2): Facts and myths
# 89a8918f7e3044b89642aaa74defc7381abef482; 1f5c1597a84ed1d4f84c488cd19098a091a3d513
# Title: Stability and inactivation of SARS coronavirus
# 8a6f8fe47a3aa58e61e1eee0cba5af0037f38ee4
# Title: Feasibility of controlling COVID-19 outbreaks by isolation of cases and contacts
# a14b5655cb13ed64cb8cff7c806a7b58c858b8b7; 43064e9a5b81ad1ac0743c818cda48383c246c95
# Title:The use of masks and respirators to prevent transmission of influenza: a systematic review of the scientific evidence
# 963dfcc10563ef66cf366f73640ab9b3e84a9a55
# Title: Dispersal of Respiratory Droplets With Open vs Closed Oxygen Delivery Masks Implications for the Transmission of Severe Acute Respiratory Syndrome
# 80ce66bae70e5eb76387b05c25dd486e29c0087c
selected_index = summary_embeddings[summary_embeddings['sha'] == '80ce66bae70e5eb76387b05c25dd486e29c0087c'].index.astype(int)
# set number of most similar articles to return
k = 10
# retrieve selected article's embedding
xq = xb[selected_index]
# search top k similar articles and return a distance array (D) and an index array (I)
# D is L2 distance, to get cosine similarity, normalize xb and xq with faiss.normalize_L2
D, I = index.search(xq, k+1)   # actually search k+1 to get k articles additional to self

In [0]:
print(I) #the first item (not always first) is the selected article itself
print(D) #thus, distance is zero

[[  482 30953 27837 13343  6769 25668  5476 15654 28328  9650   206]]
[[ 0.       33.59949  33.73192  34.171967 34.337788 34.98459  35.34081
  35.434464 35.59046  35.65731  35.932854]]


In [0]:
# convert index array to a list
I_new = I[:][0].tolist()
# locate selected article from the list
query_ix = I_new.index(selected_index) # it's not always the first item so safer to save index separately
print(query_ix)
# remove selected article from the list
I_new.remove(selected_index)
print(I_new)
# use the index to remove selected article's distance to itself
D_new = D[:][0].tolist()
del D_new[query_ix]
print(D_new)

0
[2269, 21914, 30494, 12835, 5332, 6339, 5338, 1382, 9015, 21831]
[26.65274429321289, 31.02811050415039, 31.389015197753906, 31.467775344848633, 31.872941970825195, 32.15396499633789, 32.2318115234375, 32.47484588623047, 32.608551025390625, 32.835540771484375]


In [0]:
I[0]

array([  482, 30953, 27837, 13343,  6769, 25668,  5476, 15654, 28328,
        9650,   206])

In [0]:
# retrieve info for the top 10 simialr articles
top10_similar = summary_embeddings.iloc[I[0]].reset_index(drop=True) #I_new
# attach similarity scores
top10_similar['distance'] = D[0] #D_new
# view results
top10_similar

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,full_text_file,url,number_tokens,summary,scibert_emb,distance
0,53e1dyuz,80ce66bae70e5eb76387b05c25dd486e29c0087c,Elsevier,Dispersal of Respiratory Droplets With Open vs...,http://doi.org/10.1378/chest.125.3.1155,PMC7094599,15006983.0,els-covid,Nosocomial transmission of droplet-borne respi...,2004-03-31,"Somogyi, Ron; Vesely, Alex E.; Azami, Takafumi...",Chest,,,True,custom_license,https://doi.org/10.1378/chest.125.3.1155,99,Nosocomial transmission of droplet-borne respi...,"[0.1170018911, -0.4216496348, 0.0843923464, 0....",0.0
1,sl6gsjz4,b19a21295cdc5bb0be8ddc6688268147a0a6d510,PMC,Transmission of communicable respiratory infec...,http://doi.org/,PMC3004550,21197329.0,no-cc,BACKGROUND: Respiratory protection efficiency ...,2008-05-01,"Li, Yi; Guo, Yue Ping; Wong, Kwok Ching Thomas...",J Multidiscip Healthc,,,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,254,BACKGROUND: Respiratory protection efficiency ...,"[-0.2126090676, -0.2862428725, 0.0409490578, 0...",32.038342
2,cpaucz01,fadc39822e87cd5cafea86e56c173016a274d6cd,PMC,Infection control for SARS in a tertiary neona...,http://doi.org/10.1136/fn.88.5.f405,PMC1721604,12937045.0,unk,The Severe Acute Respiratory Syndrome (SARS) i...,2003-09-01,"Ng, P; So, K; Leung, T; Cheng, F; Lyon, D; Won...",Archives of Disease in Childhood - Fetal and N...,,,True,custom_license,http://europepmc.org/articles/pmc1721604?pdf=r...,148,The Severe Acute Respiratory Syndrome (SARS) i...,"[0.2468307167, -0.2908338308, 0.2803712189, 0....",36.113152
3,eqq7k4pz,bbb727ab78e299a712b0be35b84f6448fc9fea8f,Elsevier,Respiratory Hygiene in the Emergency Department,http://doi.org/10.1016/j.jen.2007.01.013,,,els-covid,The emergency department (ED) is an essential ...,2007-04-30,"Rothman, Richard E.; Irvin, Charlene B.; Moran...",Journal of Emergency Nursing,,,True,custom_license,https://doi.org/10.1016/j.jen.2007.01.013,268,The emergency department (ED) is an essential ...,"[0.1255655736, -0.0718322024, -0.2435592711, 0...",41.528198
4,qrnsmhws,c57bac4f38795dc13c7f91af3b758a0199ef3d2d,Elsevier,Respiratory Hygiene in the Emergency Department,http://doi.org/10.1016/j.annemergmed.2006.05.018,,17052558.0,els-covid,The emergency department (ED) is an essential ...,2006-11-30,"Rothman, Richard E.; Irvin, Charlene B.; Moran...",Annals of Emergency Medicine,,,True,custom_license,https://doi.org/10.1016/j.annemergmed.2006.05.018,268,The emergency department (ED) is an essential ...,"[0.1255655736, -0.0718322024, -0.2435592711, 0...",41.528198
5,scmta88m,709e51c08788944db48106970f38cebe2b5bd596,Elsevier,Role of viral bioaerosols in nosocomial infect...,http://doi.org/10.1016/j.jaerosci.2017.11.011,PMC7094610,,els-covid,Abstract The presence of patients with diverse...,2018-03-31,"Bing-Yuan, ; Zhang, Yun-Hui; Leung, Nancy H.L....",Journal of Aerosol Science,,,True,custom_license,https://doi.org/10.1016/j.jaerosci.2017.11.011,284,Abstract The presence of patients with diverse...,"[0.3153162003, -0.597124517, -0.1293283701, 0....",41.528847
6,jtu59u99,b1735d2bf2120775db46e1803d1c92023d22e60d,Elsevier,Neutrophils in respiratory syncytial virus inf...,http://doi.org/10.1016/j.jaci.2015.06.034,,26277597.0,els-covid,Lower respiratory tract infections by respirat...,2015-10-31,"Geerdink, Ruben J.; Pillay, Janesh; Meyaard, L...",Journal of Allergy and Clinical Immunology,,,True,custom_license,https://doi.org/10.1016/j.jaci.2015.06.034,193,Lower respiratory tract infections by respirat...,"[0.2818259895, -0.2612119913, -0.0667543039, 0...",41.627838
7,qqje5s04,a5153b2d372dcbb97abc94b5bfccc309fa01c7b7,Elsevier,The role of epidermal growth factor receptor (...,http://doi.org/10.1016/j.antiviral.2017.03.022,PMC5507769,28390872.0,els-covid,Abstract Many survivors of the 2003 outbreak o...,2017-07-31,"Venkataraman, Thiagarajan; Frieman, Matthew B.",Antiviral Research,,,True,custom_license,https://doi.org/10.1016/j.antiviral.2017.03.022,194,Abstract Many survivors of the 2003 outbreak o...,"[0.31591942910000004, -0.341525048, 0.09793578...",41.719566
8,x0im3m7v,fd51530ed73c17ccd16690c0bfa046be2f360f7f,Elsevier,A systematic risk-based strategy to select per...,http://doi.org/10.1016/j.ajic.2019.06.023,,31358421.0,els-covid,Background Personal protective equipment (PPE)...,2020-01-31,"Jones, Rachael M.; Bleasdale, Susan C.; Maita,...",American Journal of Infection Control,2964663000.0,#3682,True,custom_license,https://doi.org/10.1016/j.ajic.2019.06.023,232,Background Personal protective equipment (PPE)...,"[0.4809063375, -0.20011262600000002, -0.067928...",42.360649
9,6l87y6jg,24874af5c599f668e87df044bd03bb2743839af2,Elsevier,Enhancement of the infectivity of SARS-CoV in ...,http://doi.org/10.1016/j.antiviral.2006.03.001,,16621037.0,els-covid,Abstract Because of the conflicting data conce...,2006-08-31,"Barnard, Dale L.; Day, Craig W.; Bailey, Kevin...",Antiviral Research,,,True,custom_license,https://doi.org/10.1016/j.antiviral.2006.03.001,307,Prior infection and passive transfer of neutra...,"[0.2694769502, -0.6094094515, -0.0471549928000...",42.80267


In [0]:
top10_similar['title'].tolist()

['Dispersal of Respiratory Droplets With Open vs Closed Oxygen Delivery Masks Implications for the Transmission of Severe Acute Respiratory Syndrome',
 'Transmission of communicable respiratory infections and facemasks',
 'Infection control for SARS in a tertiary neonatal centre',
 'Respiratory Hygiene in the Emergency Department',
 'Respiratory Hygiene in the Emergency Department',
 'Role of viral bioaerosols in nosocomial infections and measures for prevention and control',
 'Neutrophils in respiratory syncytial virus infection: A target for asthma prevention',
 'The role of epidermal growth factor receptor (EGFR) signaling in SARS coronavirus-induced pulmonary fibrosis',
 'A systematic risk-based strategy to select personal protective equipment for infectious diseases',
 'Enhancement of the infectivity of SARS-CoV in BALB/c mice by IMP dehydrogenase inhibitors, including ribavirin',
 'Study of Particle Dispersion on One Bed Hospital using Computational Fluid Dynamics']

In [0]:
top10_similar.to_csv(local_dir/'FaissOutputs'/'ppe.csv', index=False)

## K-means clustering

In [0]:
# train kmeans to locate centroids in vector space
ncentroids = 10 #assuming there're 10 clusters 
niter = 20
verbose = True
d = xb.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(xb)

539.678955078125

In [0]:
kmeans.centroids.shape #(ncentroids, d)

(10, 200)

In [0]:
# locate the nearest centroid to each vector (abstract)
Dk, Ik = kmeans.index.search(xb, 1)

In [0]:
# reverse operation to find 20 nearest articles to the centroids (n=10)
index = faiss.IndexFlatL2(d)
index.add (xb)
D, I = index.search(kmeans.centroids, 20)

In [0]:
I.shape

(10, 20)

In [0]:
for i in range(I.shape[0]):
  print('Cluster {}:'.format(i+1))
  for j, title in enumerate(full_text.loc[I[i],'title']):
    print(j+1, title)
  print()


Cluster 1:
1 Inflammatory Lung Disease O.12.1 From SARS and avian flu to swine flu
2 Middle East Respiratory Syndrome
3 Host susceptibility to severe COVID-19 and establishment of a host risk score: findings of 487 cases outside Wuhan
4 Commentary Middle East respiratory syndrome coronavirus infection control: The missing piece?
5 TITLE PAGE Clinical features and laboratory inspection of novel coronavirus pneumonia (COVID-19) in Xiangyang, Hubei
6 
7 Clinical considerations for patients with diabetes in times of COVID-19 epidemic
8 Combining Clinical and Epidemiologic Features for Early Recognition of SARS PERSPECTIVE CLINICAL STUDIES
9 Deep learning Enables Accurate Diagnosis of Novel Coronavirus (COVID-19) with CT images
10 Estimation Of Direct Medical Costs Of Middle East Respiratory Syndrome Coronavirus Infection: A Single-Center Retrospective Chart Review Study
11 viruses Perspective Potential Maternal and Infant Outcomes from Coronavirus 2019-nCoV (SARS-CoV-2) Infecting Pregnant 