## This notebook will be used to merge disease terms and create a final, merged dictionary

In [25]:
import numpy as np
import pandas as pd
import nltk
import inflect
from pattern.text.en import singularize

In [2]:
# Files
orig_aitslab_file = "../data/AITSLAB_dictionaries/Disease_COVID-19.txt"

# This file has been taken from merged and processed data
processed_disease_file = "../data/processed_terms/ncbi_mergedterms_disease_covid_grm_lex.xlsx"

disease_terms_to_add = ["New coronavirus",
"Novel coronavirus",
"Chinese virus",
"China virus",
"Chinese coronavirus",
"China coronavirus",
"kung flu"
]

In [3]:
list_orig = []
with open(orig_aitslab_file) as f:
    for line in f.readlines():
        line = line.strip()
        list_orig.append(line)
footnote = list_orig[-9:]
list_orig = list_orig[:-9] + disease_terms_to_add
list_orig


['covid19',
 'covid 19',
 'ncp',
 'coronavirus disease 2019',
 'corona virus disease 2019',
 'coronavirus disease19',
 'corona virus disease19',
 '2019 novel coronavirus respiratory syndrome',
 '2019 novel corona virus respiratory syndrome',
 'wurs',
 'seafood market pneumonia',
 'severe acute respiratory syndrome type 2',
 'sars 2',
 'sars2',
 'sars 2019',
 '2019 sars',
 'severe acute respiratory syndrome 2019',
 'wuhan infection',
 'hubei infection',
 'seafood market infection',
 'covid19 virus infection',
 'covid 19 virus infection',
 'sarscov2 infection',
 'sarscov2 2019 infection',
 '2019 sarscov2 infection',
 '2019novel sarscov2 infection',
 '2019new sarscov2 infection',
 '2019 novel sarscov2 infection',
 '2019 new sarscov2 infection',
 'hcov 19 infection',
 'hcov19 infection',
 'sars cov 2 infection',
 'sars cov 2 2019 infection',
 '2019 sars cov 2 infection',
 '2019novel sars cov 2 infection',
 '2019new sars cov 2 infection',
 '2019 novel sars cov 2 infection',
 '2019 new sars 

In [4]:
footnote

['ICD10:U07.1',
 'ICD10: U07.1',
 'U07.1',
 'ICD11:RA01.0',
 'ICD11: RA01.0',
 'RA01.0',
 'wikidata:Q84263196',
 'DOID:0080600',
 'MeSH:C000657245']

In [5]:
df_processed = pd.read_excel(processed_disease_file)
df_processed

Unnamed: 0,TERM,INCLUDE
0,disease caused by SARS-CoV-2 virus,y
1,infectious respiratory disease caused by sever...,y
2,Wuhan Novel Coronavirus Infection,y
3,Coronavirus 2019 (COVID-19) Infection,y
4,presymptomatic COVID-19 infections,n
...,...,...
284,acute respiratory syndrome coronavirus 2 (SARS...,y
285,Disease due to 2019 Novel Coronavirus,y
286,infections caused by severe acute respiratory ...,y
287,severe acute respiratory syndrome coronavirus ...,y


In [6]:
df_processed = df_processed[df_processed["INCLUDE"]=="y"]
df_processed

Unnamed: 0,TERM,INCLUDE
0,disease caused by SARS-CoV-2 virus,y
1,infectious respiratory disease caused by sever...,y
2,Wuhan Novel Coronavirus Infection,y
3,Coronavirus 2019 (COVID-19) Infection,y
7,virus-2 (SARS-CoV-2) infection,y
...,...,...
284,acute respiratory syndrome coronavirus 2 (SARS...,y
285,Disease due to 2019 Novel Coronavirus,y
286,infections caused by severe acute respiratory ...,y
287,severe acute respiratory syndrome coronavirus ...,y


In [7]:
new_terms = set(df_processed["TERM"].to_list())
new_terms

{'19 coronavirus infection',
 '2019 coronavirus (COVID-19) infection',
 '2019 novel corona virus (2019-nCoV) disease',
 '2019 novel coronavirus (SARS-CoV2) infection',
 '2019, novel coronavirus pneumonia',
 '2019-nCoV acute respiratory disease',
 '2019-nCoV acute respiratory diseases',
 '2019-nCoV-associated infectious diseases',
 'ACUTE RESPIRATORY INFECTIONS FROM SARS-COV-2',
 'ARDS caused by SARS-CoV2',
 'ARDS due to SARS-CoV-2',
 'Acute Respiratory Infections Caused by 2019-nCoV',
 'Acute respiratory disease caused by 2019 novel coronavirus',
 'Acute respiratory distress syndrome (ARDS) caused by SARS-CoV-2',
 'COVID illness',
 'COVID related illness',
 'COVID-19 (Coronavirus) infection',
 'COVID-19 (SARS-CoV-2)-associated ARDS',
 'COVID-19 associated pneumonia',
 'COVID-19 illness',
 'COVID-19 illnesses',
 'COVID-19 infection pneumonia',
 'COVID-19 infectious diseases',
 'COVID-19 related respiratory illness',
 'COVID-19 respiratory illness',
 'COVID-19 respiratory infection',
 'C

In [8]:
merged_terms = set(list_orig).union(new_terms)
print(merged_terms)

{'novel coronavirus severe respiratory syndrome', 'seafood market syndrome', 'disease caused by 2019 new severe acute respiratory syndromerelated corona virus 2', 'disease due to 2019 sars coronavirus 2', 'hubei acute severe disorder', 'wuhan coronavirus severe respiratory disease', '2019novel wuhan virus severe disorder', '2019new chinese coronavirus acute severe disorder', '2019new sars cov 2 acute respiratory disease', '2019novel wuhan seafood market pneumonia virus acute severe respiratory infection', 'illness caused by 2019 corona virus', 'disease due to 2019 sarscov2', '2019novel sarscov2 syndrome', '2019novel wuhan virus severe syndrome', 'sars cov 2 2019 severe disorder', 'chinese coronavirus 2019 severe acute syndrome', 'infection from 2019 wuhan seafood market pneumonia virus', '2019novel sarscov2 disorder', 'wuhancorona virus severe acute disorder', 'syndrome caused by 2019 human coronavirus', '2019 new seafood market corona virus acute syndrome', 'virus responsible for covi

In [9]:
len(merged_terms)

13142

In [39]:
#Convert to singular but no use

set_plurals = set()
for terms in sorted(merged_terms):
    term_list = terms.split()
    for term in term_list:
        if not singularize(term) == term:
            set_plurals.update([term])
print(set_plurals)

{'infeCtious', 'diseases', 'Illness', 'Pneumonia', 'Virus', 'syndromes', 'infections', 'Coronaviruses', 'INFECTIONS', 'CoVs', 'illness', 'virus', 'pneumonia', 'SARS-coronavirus', 'illnesses', 'distress', 'ARDS', 'infectious', 'COVID-ARDS', 'Coronavirus', 'sars', 'coronavirus', 'wurs', 'Infections', 'wuhancoronavirus', 'SARS', 'disorders'}


In [47]:
# create dictionary to remove them
dict_plurals = {"diseases": "disease", 
                "syndromes": "syndrome",
                "infections": "infection",
                "INFECTIONS": "INFECTION",
                "illnesses": "illness",
                "Infections": "Infection",
                "disorders": "disorder"}

merged_terms_set_processed = set()
#for each term
for terms in sorted(merged_terms):
    txt = []
    #split the term into words
    for term in terms.split():
        #manually check plural
        if term in dict_plurals:
            term = dict_plurals[term]
        txt.append(term)
        
    
    merged_terms_set_processed.update([" ".join(txt)])
print(merged_terms_set_processed)     

{'novel coronavirus severe respiratory syndrome', 'seafood market syndrome', 'disease caused by 2019 new severe acute respiratory syndromerelated corona virus 2', 'disease due to 2019 sars coronavirus 2', 'hubei acute severe disorder', '2019new chinese coronavirus acute severe disorder', '2019novel wuhan virus severe disorder', 'wuhan coronavirus severe respiratory disease', '2019new sars cov 2 acute respiratory disease', '2019novel wuhan seafood market pneumonia virus acute severe respiratory infection', 'disease due to 2019 sarscov2', 'illness caused by 2019 corona virus', '2019novel sarscov2 syndrome', '2019novel wuhan virus severe syndrome', 'chinese coronavirus 2019 severe acute syndrome', 'infection from 2019 wuhan seafood market pneumonia virus', 'sars cov 2 2019 severe disorder', '2019novel sarscov2 disorder', 'syndrome caused by 2019 human coronavirus', 'wuhancorona virus severe acute disorder', '2019 new seafood market corona virus acute syndrome', '2019 seafood market corona

In [48]:
outfile_disease = "../results/disease_all_processed_merged.txt"
with open(outfile_disease, "w") as f:
    for term in sorted(merged_terms_set_processed):
        f.write(term + "\n")
        
    for note in footnote:
        f.write(note + "\n")


In [11]:
nltk.PorterStemmer("pneumonia caused by 2019new sars cov 2")

ValueError: Mode must be one of PorterStemmer.NLTK_EXTENSIONS, PorterStemmer.MARTIN_EXTENSIONS, or PorterStemmer.ORIGINAL_ALGORITHM

In [18]:
p = nltk.PorterStemmer()
p.stem("pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness")


'diseas'

In [23]:


getSingular("pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness")


'pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness'

'virus'

In [34]:
txt = "pneumonia pneumonia caused by 2019new sars cov 2 viruses disease and illness"

x = [getSingular(word) for word in txt.split()]

{'infeCtious', 'diseases', 'Illness', 'Pneumonia', 'Virus', 'syndromes', 'infections', 'Coronaviruses', 'INFECTIONS', 'CoVs', 'illness', 'virus', 'pneumonia', 'SARS-coronavirus', 'illnesses', 'distress', 'ARDS', 'infectious', 'COVID-ARDS', 'Coronavirus', 'sars', 'coronavirus', 'wurs', 'Infections', 'wuhancoronavirus', 'SARS', 'disorders'}
