In [None]:
import pandas as pd

# Load the anchor words for the annotation schema: one row per label, with the
# candidate words/phrases spread over the `word_*` columns.
results_path = '/Users/chkapsalis/Desktop/nlp_project/working/annot_schema/results.csv'
res = pd.read_csv(results_path, sep=',', header=0, index_col=False)
res.head()

Unnamed: 0,label,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10
0,MEDICAL-CONDITIONS,"""Acute respiratory distress syndrome""","""Pneumonia""","""Sepsis""","""ARDS""","""Cytokine storm""","""Multi-organ failure""","""Hypoxemia""","""Coagulopathy""","""Acute kidney injury""","""Cardiovascular complications"""
1,SYMPTOMS,"""1. Fever""","""2. Cough""","""3. Shortness of breath""","""4. Fatigue""","""5. Body aches""","""6. Loss of taste or smell""","""7. Sore throat""","""8. Congestion or runny nose""","""9. Nausea or vomiting""","""10. Diarrhea"""
2,TREATMENTS,"""Vaccine""","""Remdesivir""","""Dexamethasone""","""Convalescent plasma therapy""","""Monoclonal antibodies""","""Tocilizumab""","""Ivermectin""","""Favipiravir""","""Azithromycin""","""Baricitinib"""
3,EVENTS,"""1. Global pandemic declared",2. Travel restrictions implemented,3. Schools closed,4. Work from home orders issued,5. Mask mandates enforced,6. Social distancing guidelines introduced,7. Vaccination campaigns rolled out,8. Economic shutdowns initiated,9. Testing and contact tracing ramped up,"10. Healthcare systems overwhelmed"""
4,POLICIES,"""mask mandate""","""social distancing guidelines""","""stay-at-home orders""","""quarantine requirements""","""capacity limits for businesses""","""travel restrictions""","""mandatory testing for travelers""","""remote work policies""","""school closures""","""mass gathering bans"""


In [None]:
# Normalise every anchor-word column (all columns except `label`):
#   1. drop the literal double quotes left over from the raw CSV,
#   2. drop a leading list enumeration such as "3. " -- the pattern is anchored
#      to the start of the string so digits that belong to the phrase itself
#      (e.g. "covid-19", "type 2 diabetes") are preserved,
#   3. lower-case and trim surrounding whitespace.
for col in res.columns.difference(['label']):
    res[col] = (
        res[col]
        .str.replace('"', '', regex=False)
        .str.replace(r'^\s*\d+\.\s+', '', regex=True)
        .str.lower()
        .str.strip()
    )


In [None]:
# Display the first rows of `res` to check the result of the cleaning step above.
res.head()

Unnamed: 0,label,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,word_10
0,MEDICAL-CONDITIONS,acute respiratory distress syndrome,pneumonia,sepsis,ards,cytokine storm,multi-organ failure,hypoxemia,coagulopathy,acute kidney injury,cardiovascular complications
1,SYMPTOMS,fever,cough,shortness of breath,fatigue,body aches,loss of taste or smell,sore throat,congestion or runny nose,nausea or vomiting,diarrhea
2,TREATMENTS,vaccine,remdesivir,dexamethasone,convalescent plasma therapy,monoclonal antibodies,tocilizumab,ivermectin,favipiravir,azithromycin,baricitinib
3,EVENTS,global pandemic declared,travel restrictions implemented,schools closed,work from home orders issued,mask mandates enforced,social distancing guidelines introduced,vaccination campaigns rolled out,economic shutdowns initiated,testing and contact tracing ramped up,healthcare systems overwhelmed
4,POLICIES,mask mandate,social distancing guidelines,stay-at-home orders,quarantine requirements,capacity limits for businesses,travel restrictions,mandatory testing for travelers,remote work policies,school closures,mass gathering bans


In [None]:
# Write `res` to `results2.csv` (relative to the current working directory),
# comma-separated and without the DataFrame index.
res.to_csv('results2.csv', index=False, sep=',')

In [None]:
# Helper that converts a part-of-speech tag into the matching `wordnet` constant
# so the tag can be passed to other wordnet operations.
def get_wordnet_pos(tag):
    """Return the `wordnet` part-of-speech constant for a POS tag.

    The decision is made on the first letter of ``tag`` (tags such as 'JJ',
    'NN', 'NNS' are what `pos_tag` produces in this notebook):
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Unknown tag families are treated as nouns.
    return wordnet.NOUN


In [None]:
# Some of the anchor phrases are multi-word. Since all of our training tweets are
# covid-related, it is safe to assume that the individual words they contain will
# always pertain to pandemic-themed discussions/statements, so each word is
# expanded on its own.
from itertools import chain
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

# Maps each label of the annotation schema to the set of all wordnet synonyms of
# its GPT-generated anchor words. The original phrases/words are included too.
syn_sets = {label: set() for label in res['label'].unique()}

word_cols = res.columns.difference(['label'])
for label in res['label'].unique():
    # Anchor phrases of the first row carrying this label.
    phrases = res.loc[res['label'] == label, word_cols].iloc[0]
    for phrase in phrases:
        # Skip empty / missing cells: there is nothing to tokenize.
        if not isinstance(phrase, str) or not phrase:
            continue
        syn_sets[label].add(phrase)
        # Tag the whole phrase so every token is looked up with its OWN
        # part of speech (not with the tag of the phrase's first token).
        for token, tag in pos_tag(word_tokenize(phrase)):
            # Stop words ('of', 'or', ...) only add noise such as
            # 'or' -> 'oregon' / 'operating room'.
            if token.lower() in stop_word_set:
                continue
            synonyms = wordnet.synsets(token, pos=get_wordnet_pos(tag), lang='eng')
            # Flatten the per-synset lemma lists; the set removes duplicates.
            lemma_names = chain.from_iterable(syn.lemma_names() for syn in synonyms)
            syn_sets[label].update(
                name.lower().replace('_', ' ')
                for name in lemma_names
                if name.lower() not in stop_word_set
            )


['acute', 'respiratory', 'distress', 'syndrome']
[('acute', 'JJ'), ('respiratory', 'NN'), ('distress', 'NN'), ('syndrome', 'NN')]
{'acuate', 'incisive', 'intense', 'acute respiratory distress syndrome', 'sharp', 'knifelike', 'keen', 'needlelike', 'discriminating', 'penetrating', 'penetrative', 'acute', 'piercing'}
{'acute respiratory distress syndrome', 'respiratory'}
{'acute respiratory distress syndrome'}
{'acute respiratory distress syndrome'}
['cardiovascular', 'complications']
[('cardiovascular', 'JJ'), ('complications', 'NNS')]
{'cardiovascular', 'cardiovascular complications'}
{'cardiovascular complications'}
['pneumonia']
[('pneumonia', 'NN')]
{'pneumonia'}
['sepsis']
[('sepsis', 'NN')]
{'sepsis'}
['ards']
[('ards', 'NNS')]
{'ards', 'wet lung', 'adult respiratory distress syndrome', 'white lung'}
['cytokine', 'storm']
[('cytokine', 'NN'), ('storm', 'NN')]
{'cytokine', 'cytokine storm'}
{'storm', 'violent storm', 'tempest', 'cytokine storm'}
['multi-organ', 'failure']
[('multi-o

In [None]:
# Display the label -> synonym-set mapping built above.
syn_sets

{'MEDICAL-CONDITIONS': {'acuate',
  'acute',
  'acute kidney injury',
  'acute respiratory distress syndrome',
  'adult respiratory distress syndrome',
  'ards',
  'cardiovascular',
  'cardiovascular complications',
  'coagulopathy',
  'cytokine',
  'cytokine storm',
  'discriminating',
  'hypoxemia',
  'incisive',
  'intense',
  'keen',
  'knifelike',
  'multi-organ failure',
  'needlelike',
  'penetrating',
  'penetrative',
  'piercing',
  'pneumonia',
  'respiratory',
  'sepsis',
  'sharp',
  'storm',
  'tempest',
  'violent storm',
  'wet lung',
  'white lung'},
 'SYMPTOMS': {'abruptness',
  'ache',
  'aching',
  'appreciation',
  'aroma',
  'beaver state',
  'body',
  'body aches',
  'breath',
  'breather',
  'breathing place',
  'breathing space',
  'breathing spell',
  'breathing time',
  'brusqueness',
  'congestion',
  'congestion or runny nose',
  'consistence',
  'consistency',
  'cough',
  'coughing',
  'curtness',
  'dead body',
  'departure',
  'deprivation',
  'diarrhea'

In [None]:
# For every label, report how many synonyms were collected, followed by the
# synonyms themselves.
for label in res['label'].unique():
    label_synonyms = syn_sets[label]
    print(len(label_synonyms))
    print(label_synonyms)

31
{'wet lung', 'sepsis', 'adult respiratory distress syndrome', 'cytokine storm', 'cardiovascular complications', 'white lung', 'hypoxemia', 'acuate', 'cardiovascular', 'storm', 'intense', 'sharp', 'knifelike', 'keen', 'pneumonia', 'piercing', 'multi-organ failure', 'respiratory', 'incisive', 'violent storm', 'needlelike', 'discriminating', 'acute', 'cytokine', 'ards', 'acute respiratory distress syndrome', 'coagulopathy', 'penetrating', 'penetrative', 'tempest', 'acute kidney injury'}
115
{'discernment', 'cough', 'curtness', 'weariness', 'consistence', 'tasting', 'brusqueness', 'personnel casualty', 'departure', 'operating room', 'body', 'odor', 'organic structure', 'mouthful', 'shortness of breath', 'gustatory sensation', 'fatigue duty', 'olfactory perception', 'emesis', 'appreciation', 'olfactory property', 'diarrhea', 'pyrexia', 'red ink', 'look', 'smell', 'feel', 'nose', 'gustatory modality', 'trunk', 'breathing spell', 'nozzle', 'tiredness', 'consistency', 'ache', 'deprivation',

In [None]:
# Persist the label -> synonyms mapping to `all_relevant.csv`, one row per label
# (label first, then its synonyms; shorter rows are padded with empty cells).
# Sets of strings have no stable iteration order across kernel restarts (string
# hashing is randomised per process), so each set is sorted first to make the
# column order of the CSV reproducible.
sorted_syn_sets = {label: sorted(words) for label, words in syn_sets.items()}
pd.DataFrame.from_dict(data=sorted_syn_sets, orient='index').to_csv('all_relevant.csv', header=False)