## This notebook will be used to merge virus terms and create a final, merged dictionary

In [2]:
import numpy as np
import pandas as pd
import nltk
import inflect
from pattern.text.en import singularize

In [7]:
# Files
orig_aitslab_file = "../data/AITSLAB_dictionaries/Virus_SARS-CoV-2.txt"

# This file has been taken from merged and processed data
processed_virus_file = "../data/processed_terms/ncbi_termsmerged_virus_sars_grm_lex.xlsx"

In [4]:
list_orig = []
with open(orig_aitslab_file) as f:
    for line in f.readlines():
        line = line.strip()
        list_orig.append(line)
footnote = list_orig[-5:]
list_orig = list_orig[:-5]
list_orig


['covid19 virus',
 'covid-19 virus',
 'covid 19 virus',
 'sarscov2',
 'sars-cov-2',
 'sars-cov2',
 'sarscov2 2019',
 '2019 sarscov2',
 '2019novel sarscov2',
 '2019new sarscov2',
 '2019 novel sarscov2',
 '2019 new sarscov2',
 'hcov 19',
 'hcov19',
 'sars cov 2',
 'sars cov 2 2019',
 '2019 sars cov 2',
 '2019novel sars cov 2',
 '2019new sars cov 2',
 '2019 novel sars cov 2',
 '2019 new sars cov 2',
 'new coronavirus',
 'new coronavirus 2019',
 '2019 new coronavirus',
 'new corona virus',
 'new corona virus 2019',
 '2019 new corona virus',
 'novel coronavirus',
 'novel coronavirus 2019',
 '2019 novel coronavirus',
 'novel corona virus',
 'novel corona virus 2019',
 '2019 novel corona virus',
 'wuhan virus',
 'wuhan virus 2019',
 '2019 wuhan virus',
 '2019novel wuhan virus',
 '2019new wuhan virus',
 '2019 novel wuhan virus',
 '2019 new wuhan virus',
 'wuhan pneumonia virus',
 'wuhan pneumonia virus 2019',
 '2019 wuhan pneumonia virus',
 '2019novel wuhan pneumonia virus',
 '2019new wuhan pn

In [5]:
footnote

['NCBI Taxonomy ID: 2697049',
 'NCBI:txid2697049',
 'txid2697049',
 'DiseasesDB:60833',
 'MeSH:M000679178']

In [8]:
df_processed = pd.read_excel(processed_virus_file)
df_processed

Unnamed: 0,TERM,INCLUDE
0,new severe acute respiratory syndrome coronavi...,y
1,China COV,y
2,coronaviruses,y
3,severe acute respiratory syndrome coronavirus ...,y
4,SARS-CoV-2 infected COV,n
...,...,...
161,2019-novel coronavirus (2019-nCoV) related cor...,n
162,SARS-CoV-2 (Severe acute respiratory syndrome ...,y
163,Novel Coronavirus causing COVID-19,y
164,COVID-19 cov,y


In [9]:
df_processed = df_processed[df_processed["INCLUDE"]=="y"]
df_processed

Unnamed: 0,TERM,INCLUDE
0,new severe acute respiratory syndrome coronavi...,y
1,China COV,y
2,coronaviruses,y
3,severe acute respiratory syndrome coronavirus ...,y
5,ncov,y
...,...,...
160,novel SARS-CoV-2 coronavirus,y
162,SARS-CoV-2 (Severe acute respiratory syndrome ...,y
163,Novel Coronavirus causing COVID-19,y
164,COVID-19 cov,y


In [10]:
new_terms = set(df_processed["TERM"].to_list())
new_terms

{'19 COV',
 '19 Coronavirus',
 '19-nCoV',
 '2019 COV',
 '2019 novel coronavirus disease COV',
 '2019-nCoV virus',
 '2019-nCoV, 2019 novel coronavirus',
 '2019-new coronavirus',
 'COVID-19 cov',
 'COVID-19 novel coronavirus',
 'COVID-19 viruses',
 'China COV',
 'China virus',
 'Chinese coronavirus',
 'CoV-19 virus',
 'CoV-2 virus',
 'CoV2',
 'CoV2 coronavirus',
 'Coronavirus Disease 2019 COV',
 'Coronavirus disease 2019 (SARS-CoV-2) virus',
 'HCoV-2',
 'Novel Coronavirus causing COVID-19',
 'SARS (Severe acute respiratory syndrome)-CoV-2',
 'SARS (severe acute respiratory syndrome)-CoV-2 virus',
 'SARS-CoV-19',
 'SARS-CoV-19 virus',
 'SARS-CoV-2 (2019-nCoV, COVID-19) coronavirus',
 'SARS-CoV-2 (COVID-19) coronavirus',
 'SARS-CoV-2 (COVID-19) virus',
 'SARS-CoV-2 (CoV-2) coronavirus',
 'SARS-CoV-2 (SC2) virus',
 'SARS-CoV-2 (Severe acute respiratory syndrome - coronavirus 2) virus',
 'SARS-CoV-2 (severe acute respiratory syndrome coronavirus 2) coronavirus',
 'SARS-CoV-2 COV',
 'SARS-CoV

In [11]:
merged_terms = set(list_orig).union(new_terms)
print(merged_terms)

{'virus that causes COVID-19 disease', '2019 new severe acute respiratory syndrome corona virus2', 'SARS-CoV-2019 virus', 'CoV-19 virus', 'human coronavirus 2', '2019new wuhan seafood market pneumonia virus', '2019 wuhan coronavirus', 'severe acute respiratory syndromerelated coronavirus 2 2019', '2019 wuhancorona virus', '2019 severe acute respiratory syndromerelated coronavirus2', 'novel coronavirus (SARS-CoV-2) that causes novel coronavirus pneumonia', '2019 novel wuhan seafood market pneumonia virus', 'acute respiratory syndrome coronavirus causing coronavirus disease 2019', '2019 new seafood market corona virus', '2019novel wuhan seafood market pneumonia virus', 'novel coronavirus that causes novel coronavirus disease 2019', '2019 wuhan virus', 'novel severe acute respiratory syndrome coronavirus 2', '2019 chinese coronavirus', 'seafood market coronavirus 2019', '2019 novel wuhan pneumonia virus', '2019novel chinese corona virus', 'new coronavirus that causes COVID-19', '2019 seve

In [13]:
len(merged_terms)

357

In [14]:
#Convert to singular but no use

set_plurals = set()
for terms in sorted(merged_terms):
    term_list = terms.split()
    for term in term_list:
        if not singularize(term) == term:
            set_plurals.update([term])
print(set_plurals)

{'coronaviruses', 'coronavirus', 'wuhancoronavirus', 'illness', 'SARS-coronavirus', 'pneumonia', 'Coronavirus', 'Virus', 'causes', 'viruses', 'infections', 'Syndrome-Coronavirus', 'SARS', 'infectious', '(SARS)-coronavirus', 'sars', 'virus'}


In [15]:
# create dictionary to remove them
dict_plurals = {"coronaviruses": "coronavirus", 
                "viruses": "virus",
                "infections": "infection",
                }

merged_terms_set_processed = set()
#for each term
for terms in sorted(merged_terms):
    txt = []
    #split the term into words
    for term in terms.split():
        #manually check plural
        if term in dict_plurals:
            term = dict_plurals[term]
        txt.append(term)
        
    
    merged_terms_set_processed.update([" ".join(txt)])
print(merged_terms_set_processed)     

{'virus that causes COVID-19 disease', '2019 new severe acute respiratory syndrome corona virus2', 'SARS-CoV-2019 virus', 'human coronavirus 2', 'severe acute respiratory syndromerelated coronavirus 2 2019', '2019 severe acute respiratory syndromerelated coronavirus2', '2019 wuhancorona virus', '2019 novel wuhan seafood market pneumonia virus', 'acute respiratory syndrome coronavirus causing coronavirus disease 2019', 'novel coronavirus that causes novel coronavirus disease 2019', '2019 wuhan virus', 'novel severe acute respiratory syndrome coronavirus 2', 'seafood market coronavirus 2019', '2019 novel wuhan pneumonia virus', '2019novel chinese corona virus', 'new coronavirus that causes COVID-19', '2019novel sars cov 2', '2019 novel coronavirus disease COV', 'novel coronavirus that causes coronavirus disease 2019', '2019 new sarscov2', 'new SARS-CoV-2 coronavirus', 'covid19 virus', 'Virus Causing COVID-19 Disease', '2019 seafood market coronavirus', 'seafood market virus 2019', 'sever

In [17]:
len(merged_terms_set_processed)

352

In [16]:
outfile_disease = "../results/virus_all_processed_merged.txt"
with open(outfile_disease, "w") as f:
    for term in sorted(merged_terms_set_processed):
        f.write(term + "\n")
        
    for note in footnote:
        f.write(note + "\n")


In [11]:
nltk.PorterStemmer("pneumonia caused by 2019new sars cov 2")

ValueError: Mode must be one of PorterStemmer.NLTK_EXTENSIONS, PorterStemmer.MARTIN_EXTENSIONS, or PorterStemmer.ORIGINAL_ALGORITHM

In [18]:
p = nltk.PorterStemmer()
p.stem("pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness")


'diseas'

In [23]:


getSingular("pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness")


'pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness'

'virus'

In [34]:
txt = "pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness"

x = [getSingular(word) for word in txt.split()]

{'infeCtious', 'diseases', 'Illness', 'Pneumonia', 'Virus', 'syndromes', 'infections', 'Coronaviruses', 'INFECTIONS', 'CoVs', 'illness', 'virus', 'pneumonia', 'SARS-coronavirus', 'illnesses', 'distress', 'ARDS', 'infectious', 'COVID-ARDS', 'Coronavirus', 'sars', 'coronavirus', 'wurs', 'Infections', 'wuhancoronavirus', 'SARS', 'disorders'}
