## Setup

In [221]:
import pandas as pd
patient_history=pd.read_csv('./vaccine reaction/2021VAERSDATA.CSV', sep=',', encoding='ISO-8859-1')
patient_sympt=pd.read_csv('./vaccine reaction/2021VAERSSYMPTOMS.CSV', sep=',',encoding='ISO-8859-1')
patient_vac=pd.read_csv('./vaccine reaction/2021VAERSVAX.CSV', sep=',',encoding='ISO-8859-1')

  


In [222]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
pd.set_option("display.max_rows", 1000)

In [223]:
patient_history.head(2)

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,,...,,,,,2,01/01/2021,,Y,,Pcn and bee venom
1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,...,Patient residing at nursing facility. See pati...,Patient residing at nursing facility. See pati...,,,2,01/01/2021,,Y,,"""Dairy"""


## Data preprocessing

In [224]:
 # get the needed columns from patient history file
patient_history=patient_history.loc[: , ['VAERS_ID','AGE_YRS','SEX','CUR_ILL','HISTORY','ALLERGIES']]
 # get the needed columns from patient vaccine file
patient_vac=patient_vac.loc[:,['VAERS_ID','VAX_NAME']]
 # commbining reactions per patient 
patient_sympt['SYMPTOMS']= patient_sympt['SYMPTOM1'].fillna('')+','+ patient_sympt['SYMPTOM2'].fillna('')+','+ patient_sympt['SYMPTOM3'].fillna('')+','+ patient_sympt['SYMPTOM4'].fillna('')+','+ patient_sympt['SYMPTOM5'].fillna('')
patient_sympt=patient_sympt.groupby(by='VAERS_ID')['SYMPTOMS'].agg(lambda x: ','.join(x)).reset_index()


In [225]:
 # mergeing all extracted info above to a single dataframe 
#df=pd.merge(patient_history, patient_vac, left_on='VAERS_ID', right_on='VAERS_ID')
df=patient_history.merge( patient_vac , on='VAERS_ID')


In [226]:
df=pd.merge(df, patient_sympt,  left_on='VAERS_ID', right_on='VAERS_ID')
df=df.fillna(value="")
# prepare y values step1 
# make symptom columns into a list 
df['SYMPTOMS']=df['SYMPTOMS'].apply(lambda x : x.split(","))
# and than remove any empty values in the list. 
df['SYMPTOMS']=df['SYMPTOMS'].apply(lambda x : [i for i in x if i])
df=df.drop_duplicates(['VAERS_ID'])
df[df['VAERS_ID']==1015465]

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS
22431,1015465,75.0,F,U07.1 2019-nCoV acute respiratory disease (Pri...,U07.1 2019-nCoV acute respiratory disease (Pri...,"Iodinated Contrast- Oral and IV Dye, Penicilli...",COVID19 (COVID19 (MODERNA)),"[Heart rate abnormal, Pulse absent, Respirator..."


In [227]:
def convertAge(string_age):
    if string_age =="" :
        return "unknown_age"
    if float(string_age) <= 60 :
        return 'under_60'
    return "over_60"

def changeVaxName(vax_name):
    if 'MODERNA'in vax_name:
        return 'MODERNA'
    if 'PFIZER' in vax_name:
        return 'PFIZER'
    
    return 'JANSSEN'

In [228]:
# remove none coivd related vaccine samples 
df=df[df['VAX_NAME'].str.contains('|'.join(['MODERNA','JANSSEN','PFIZER-BIONTECH'])) & df['VAX_NAME'].isna()==False]
# put age in bucket under 60 or over
df['AGE_CAT']=df['AGE_YRS'].apply(lambda x : convertAge(x) )
# simmplify vaccine names 
df['VAX_NAME']=df['VAX_NAME'].apply(lambda x : changeVaxName(x))
df

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS,AGE_CAT
0,916600,33.0,F,,,Pcn and bee venom,MODERNA,"[Dysphagia, Epiglottitis]",under_60
1,916601,73.0,F,Patient residing at nursing facility. See pati...,Patient residing at nursing facility. See pati...,"""Dairy""",MODERNA,"[Anxiety, Dyspnoea]",over_60
2,916602,23.0,F,,,Shellfish,PFIZER,"[Chest discomfort, Dysphagia, Pain in extremit...",under_60
3,916603,58.0,F,kidney infection,"diverticulitis, mitral valve prolapse, osteoar...","Diclofenac, novacaine, lidocaine, pickles, tom...",MODERNA,"[Dizziness, Fatigue, Mobility decreased]",under_60
4,916604,47.0,F,Na,,Na,MODERNA,"[Injection site erythema, Injection site pruri...",under_60
...,...,...,...,...,...,...,...,...,...
34625,1113917,88.0,F,,Medical History/Concurrent Conditions: Aortic ...,,MODERNA,"[Dysphagia, Hernia, Obstruction]",over_60
34626,1113920,83.0,F,,Medical History/Concurrent Conditions: Atrial ...,,PFIZER,"[Arrhythmia, Asymptomatic COVID-19, Chills, Co...",over_60
34627,1113963,59.0,F,,Medical History/Concurrent Conditions: Type II...,,PFIZER,"[Haemophagocytic lymphohistiocytosis, SARS-CoV...",under_60
34628,1115045,57.0,M,,Medical History/Concurrent Conditions: No adve...,,MODERNA,[Death],under_60


In [229]:
# Create a list of remove words
remove_word_list = ['none', 'na']

def remove_words(strings):
    for words in remove_word_list:
        strings = strings.replace(words, '')
    return strings

In [230]:
# make the health_condition text lower case 
df['CUR_ILL']=df['CUR_ILL'].str.lower()
df['HISTORY']=df['HISTORY'].str.lower()
df['ALLERGIES']=df['ALLERGIES'].str.lower()
# remove list of words from columns

df['CUR_ILL']=df['CUR_ILL'].apply(lambda x : remove_words(x)  )
df['HISTORY']=df['HISTORY'].apply(lambda x : remove_words(x)  )
df['ALLERGIES']=df['ALLERGIES'].apply(lambda x : remove_words(x)  )

In [231]:
df

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS,AGE_CAT
0,916600,33.0,F,,,pcn and bee venom,MODERNA,"[Dysphagia, Epiglottitis]",under_60
1,916601,73.0,F,patient residing at nursing facility. see pati...,patient residing at nursing facility. see pati...,"""dairy""",MODERNA,"[Anxiety, Dyspnoea]",over_60
2,916602,23.0,F,,,shellfish,PFIZER,"[Chest discomfort, Dysphagia, Pain in extremit...",under_60
3,916603,58.0,F,kidney infection,"diverticulitis, mitral valve prolapse, osteoar...","diclofec, novacaine, lidocaine, pickles, tomat...",MODERNA,"[Dizziness, Fatigue, Mobility decreased]",under_60
4,916604,47.0,F,,,,MODERNA,"[Injection site erythema, Injection site pruri...",under_60
...,...,...,...,...,...,...,...,...,...
34625,1113917,88.0,F,,medical history/concurrent conditions: aortic ...,,MODERNA,"[Dysphagia, Hernia, Obstruction]",over_60
34626,1113920,83.0,F,,medical history/concurrent conditions: atrial ...,,PFIZER,"[Arrhythmia, Asymptomatic COVID-19, Chills, Co...",over_60
34627,1113963,59.0,F,,medical history/concurrent conditions: type ii...,,PFIZER,"[Haemophagocytic lymphohistiocytosis, SARS-CoV...",under_60
34628,1115045,57.0,M,,medical history/concurrent conditions: no adve...,,MODERNA,[Death],under_60


## Extract the needed columns for training

In [232]:
df_training = df.loc[:, ['AGE_CAT','SEX','VAX_NAME']]
# combining patients' current illness , medical history and, allergies and store in the "health condition column"
df['HEALTH_CONDITION'] = df['CUR_ILL']+","+ df['HISTORY']+","+df['ALLERGIES']


## NLTK PREPROCESSING

In [233]:
import nltk
import itertools
from string import punctuation
from nltk.corpus import stopwords

punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [234]:
stopwords = stopwords.words('english')


In [235]:
def remove_words_containning_numbers(word):
    if word.isalpha():
        return word
    return ""
    

In [236]:
# tokenize df_training['HEALTH_CONDITION']
df['HEALTH_CONDITION']=df['HEALTH_CONDITION'].apply(lambda x : nltk.word_tokenize(x))
allwords=set(itertools.chain.from_iterable(df.HEALTH_CONDITION))
len(allwords)



16363

In [237]:
# remove any punctuations from words 
df['HEALTH_CONDITION']=df['HEALTH_CONDITION'].apply(lambda x : [''.join(c for c in s if c not in punctuation) for s in x])
allwords=set(itertools.chain.from_iterable(df.HEALTH_CONDITION))
len(allwords)

14303

In [238]:
from textblob import TextBlob
def spell_correct (word):
    return str(TextBlob(word).correct())
    

In [239]:
# remove any words mix with number
df['HEALTH_CONDITION']=df['HEALTH_CONDITION'].apply(lambda x : [remove_words_containning_numbers(w) for w in x])
allwords=set(itertools.chain.from_iterable(df.HEALTH_CONDITION))
len(allwords)

11968

In [240]:
# remove any empty words
df['HEALTH_CONDITION']=df['HEALTH_CONDITION'].apply(lambda x : [string for string in x if string != ""])
allwords=set(itertools.chain.from_iterable(df.HEALTH_CONDITION))
len(allwords)

11967

In [241]:
# remove stop words
df['HEALTH_CONDITION']=df['HEALTH_CONDITION'].apply(lambda x: [words for words in x if words not in stopwords])
allwords=set(itertools.chain.from_iterable(df.HEALTH_CONDITION))
len(allwords)

11836

## auto correct spelling errors (run paralle), since it takes a long time to run, I saved the data processed dataframe
import multiprocessing
from autocorrect import Speller
import time
spell = Speller(lang='en')


import mypack.correc as co
import numpy as np
new_dff=df.loc[0:200,'HEALTH_CONDITION']
new_dff
start_time = time.time()
pool = multiprocessing.Pool()

new_dff=new_dff.apply(lambda x :  pool.map(co.correction, x ))
   
print("--- %s seconds ---" % (time.time() - start_time))

In [242]:
new_df=pd.read_csv('savedf4-7.csv', sep=',')
new_df.head(60)

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS,AGE_CAT,HEALTH_CONDITION
0,916600,33.0,F,,,pcn and bee venom,MODERNA,"['Dysphagia', 'Epiglottitis']",under_60,"['pen', 'bee', 'venom']"
1,916601,73.0,F,patient residing at nursing facility. see pati...,patient residing at nursing facility. see pati...,"""dairy""",MODERNA,"['Anxiety', 'Dyspnoea']",over_60,"['patient', 'residing', 'nursing', 'facility',..."
2,916602,23.0,F,,,shellfish,PFIZER,"['Chest discomfort', 'Dysphagia', 'Pain in ext...",under_60,['shellfish']
3,916603,58.0,F,kidney infection,"diverticulitis, mitral valve prolapse, osteoar...","diclofec, novacaine, lidocaine, pickles, tomat...",MODERNA,"['Dizziness', 'Fatigue', 'Mobility decreased']",under_60,"['kidney', 'infection', 'diverticulitis', 'mis..."
4,916604,47.0,F,,,,MODERNA,"['Injection site erythema', 'Injection site pr...",under_60,[]
5,916606,44.0,F,,,iodine (shellfish) has epipen,MODERNA,['Pharyngeal swelling'],under_60,"['iodine', 'shellfish', 'pipe']"
6,916607,50.0,M,,"high blood pressure, high cholesterol, sleep a...",penicillin,MODERNA,"['Abdominal pain', 'Chills', 'Sleep disorder']",under_60,"['high', 'blood', 'pressure', 'high', 'cholest..."
7,916608,33.0,M,,,,MODERNA,"['Diarrhoea', 'Nasal congestion']",under_60,[]
8,916609,71.0,F,,"hashimoto's thyroiditis, hypertension, depression","sulfa antibiotics, azithromycin, adhesive in ...",MODERNA,"['Vaccination site erythema', 'Vaccination sit...",over_60,"['hashimoto', 'thyroiditis', 'hypertension', '..."
9,916610,18.0,F,,,jackfruit,MODERNA,"['Rash', 'Urticaria']",under_60,['jackfruit']


In [243]:


import ast
new_df['HEALTH_CONDITION']=new_df['HEALTH_CONDITION'].apply(lambda x: ast.literal_eval(x))
new_df['HEALTH_CONDITION']=new_df['HEALTH_CONDITION'].apply(lambda x: [n.strip() for n in x])




In [244]:
new_df

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS,AGE_CAT,HEALTH_CONDITION
0,916600,33.0,F,,,pcn and bee venom,MODERNA,"['Dysphagia', 'Epiglottitis']",under_60,"[pen, bee, venom]"
1,916601,73.0,F,patient residing at nursing facility. see pati...,patient residing at nursing facility. see pati...,"""dairy""",MODERNA,"['Anxiety', 'Dyspnoea']",over_60,"[patient, residing, nursing, facility, see, pa..."
2,916602,23.0,F,,,shellfish,PFIZER,"['Chest discomfort', 'Dysphagia', 'Pain in ext...",under_60,[shellfish]
3,916603,58.0,F,kidney infection,"diverticulitis, mitral valve prolapse, osteoar...","diclofec, novacaine, lidocaine, pickles, tomat...",MODERNA,"['Dizziness', 'Fatigue', 'Mobility decreased']",under_60,"[kidney, infection, diverticulitis, mistral, v..."
4,916604,47.0,F,,,,MODERNA,"['Injection site erythema', 'Injection site pr...",under_60,[]
...,...,...,...,...,...,...,...,...,...,...
34116,1113917,88.0,F,,medical history/concurrent conditions: aortic ...,,MODERNA,"['Dysphagia', 'Hernia', 'Obstruction']",over_60,"[medical, historyconcurrent, condition, aortic..."
34117,1113920,83.0,F,,medical history/concurrent conditions: atrial ...,,PFIZER,"['Arrhythmia', 'Asymptomatic COVID-19', 'Chill...",over_60,"[medical, historyconcurrent, condition, atrial..."
34118,1113963,59.0,F,,medical history/concurrent conditions: type ii...,,PFIZER,"['Haemophagocytic lymphohistiocytosis', 'SARS-...",under_60,"[medical, historyconcurrent, condition, type, ..."
34119,1115045,57.0,M,,medical history/concurrent conditions: no adve...,,MODERNA,['Death'],under_60,"[medical, historyconcurrent, condition, advers..."


In [245]:
allwords=set(itertools.chain.from_iterable(new_df.HEALTH_CONDITION))
len(allwords)

9633

In [246]:
allwords_list=list(allwords)
ans=sorted(allwords_list)
print (ans)




In [274]:
def get_wordnet_pos(tag):
    x, y=tag[0]
    if y.startswith('J'):
        return wordnet.ADJ
    elif y.startswith('V'):
        return wordnet.VERB
    elif y.startswith('N'):
        return wordnet.NOUN
    elif y.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [267]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
st =PorterStemmer()
wnl = WordNetLemmatizer()
nltk.tag.pos_tag(['location'])

[('location', 'NN')]

In [275]:

new_df['HEALTH_CONDITION']=new_df['HEALTH_CONDITION'].apply(lambda x: [wnl.lemmatize(w, pos=get_wordnet_pos(pos_tag([w]))) for w in x ])



#allwords=set(itertools.chain.from_iterable(new_df.HEALTH_CONDITION))
#len(allwords)

from textblob import Word
# Lemmatize a word
def lamma(x):
    w = Word(x)
    return w.lemmatize()
      
lamma("walked")    

from textblob import TextBlob, Word
# Lemmatize a word
w = Word('ducks')
w.lemmatize()
new_df['HEALTH_CONDITION']=new_df['HEALTH_CONDITION'].apply(lambda x: [lamma(w) for w in x ])

In [276]:
allwords=set(itertools.chain.from_iterable(new_df.HEALTH_CONDITION))
len(allwords)

9084

In [277]:
allwords_list=list(allwords)
ans=sorted(allwords_list)
print (ans)


['aaa', 'aat', 'ab', 'abatacept', 'abd', 'abdomen', 'abdominoperineal', 'ability', 'ablation', 'able', 'abnormal', 'abnormality', 'abnormally', 'abo', 'aboutus', 'above', 'abovemigraines', 'abrasive', 'abrupt', 'abscess', 'abscesshematoma', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'absorption', 'abt', 'abuse', 'ac', 'aca', 'academia', 'academically', 'acalculia', 'acarbose', 'accelerate', 'accept', 'acceptable', 'access', 'accident', 'accompany', 'accord', 'accordingly', 'ace', 'acetabular', 'acetaminophen', 'acetaminophencodeine', 'acetaminophine', 'acetate', 'acetiminophen', 'acetomephine', 'acetometiphan', 'acetominiphen', 'acetonide', 'acetyl', 'acf', 'ach', 'achalasia', 'achelasia', 'acheswell', 'achieve', 'achieves', 'achrocidin', 'acid', 'acidity', 'acidosis', 'acitretin', 'acknowledge', 'acne', 'acquire', 'acre', 'acromioclavicular', 'across', 'acrylic', 'act', 'actemraprednisone', 'actenol', 'activate', 'activation', 'active', 'actively', 'activity', 'actonel',

In [None]:
simmilar_word= {'amox':'amoxicillin', 'admin':'admin' ,'acet':'acetaminophen','allerg':'allergy','ampic':"ampicilina",'asthma':'asthma',
               'ather':'Atherosclerosis','ator':'atorvastatin'}

In [None]:
remove_word_list = ['none', 'na','aaa', 'aat', 'ab','abd','acf', 'ach','after','ak', 'aka', 'aki', 'al', 'alan','alert', 'alex', 'alexa', 'alf','although',
                  'already', 'also', 'b', 'ba', ]

In [207]:
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

new_df['HEALTH_CONDITION']=new_df['HEALTH_CONDITION'].apply(lambda x: [st.stem(w) for w in x ])

In [208]:
allwords=set(itertools.chain.from_iterable(new_df.HEALTH_CONDITION))
print(len(allwords))
allwords_list=list(allwords)
ans=sorted(allwords_list)
print (ans)

7762
['aa', 'aat', 'ab', 'abataceiv', 'abd', 'abdom', 'abdominoperin', 'abl', 'abnorm', 'abo', 'about', 'abov', 'abovemigrain', 'abras', 'abrupt', 'abs', 'abscess', 'abscesshematom', 'absolv', 'absorb', 'abt', 'abus', 'ac', 'academ', 'acalcul', 'acarbos', 'acceiv', 'accel', 'access', 'accid', 'accompany', 'accord', 'acet', 'acetabul', 'acetaminoph', 'acetaminophencodein', 'acetaminophin', 'acetiminoph', 'acetomephin', 'acetometiph', 'acetominiph', 'acetonid', 'acetyl', 'acf', 'ach', 'achalas', 'achelas', 'acheswel', 'achiev', 'achrocidin', 'acid', 'acidos', 'acitretin', 'acknowledg', 'acn', 'acquir', 'acr', 'acromioclavicul', 'across', 'acryl', 'act', 'actemraprednison', 'actenol', 'actonel', 'acuminat', 'acut', 'acuteonchron', 'acuterec', 'acyclovir', 'ad', 'adalimumab', 'addadhd', 'adderal', 'addict', 'addit', 'adduc', 'adenocarcinom', 'adenoidectom', 'adenom', 'adenomyos', 'adenosin', 'adequ', 'adh', 'adhdad', 'adher', 'adhesivecontact', 'adjust', 'adjut', 'adl', 'adm', 'admin', 'ad

True

tensor([1., 2., 3.], device='cuda:0')

In [36]:
new_dff

0                                        [pen, bee, venom]
1        [patient, residing, nursing, facility, see, pa...
2                                              [shellfish]
3        [kidney, infection, diverticulitis, mistral, v...
4                                                       []
                               ...                        
34625    [medical, historyconcurrent, condition, aortic...
34626    [medical, historyconcurrent, condition, atrial...
34627    [medical, historyconcurrent, condition, type, ...
34628    [medical, historyconcurrent, condition, advers...
34629    [comment, list, nonencoded, patient, relevant,...
Name: HEALTH_CONDITION, Length: 34121, dtype: object

In [38]:
df['HEALTH_CONDITION'] = new_dff

In [39]:
df

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,CUR_ILL,HISTORY,ALLERGIES,VAX_NAME,SYMPTOMS,AGE_CAT,HEALTH_CONDITION
0,916600,33.0,F,,,pcn and bee venom,MODERNA,"[Dysphagia, Epiglottitis]",under_60,"[pen, bee, venom]"
1,916601,73.0,F,patient residing at nursing facility. see pati...,patient residing at nursing facility. see pati...,"""dairy""",MODERNA,"[Anxiety, Dyspnoea]",over_60,"[patient, residing, nursing, facility, see, pa..."
2,916602,23.0,F,,,shellfish,PFIZER,"[Chest discomfort, Dysphagia, Pain in extremit...",under_60,[shellfish]
3,916603,58.0,F,kidney infection,"diverticulitis, mitral valve prolapse, osteoar...","diclofec, novacaine, lidocaine, pickles, tomat...",MODERNA,"[Dizziness, Fatigue, Mobility decreased]",under_60,"[kidney, infection, diverticulitis, mistral, v..."
4,916604,47.0,F,,,,MODERNA,"[Injection site erythema, Injection site pruri...",under_60,[]
...,...,...,...,...,...,...,...,...,...,...
34625,1113917,88.0,F,,medical history/concurrent conditions: aortic ...,,MODERNA,"[Dysphagia, Hernia, Obstruction]",over_60,"[medical, historyconcurrent, condition, aortic..."
34626,1113920,83.0,F,,medical history/concurrent conditions: atrial ...,,PFIZER,"[Arrhythmia, Asymptomatic COVID-19, Chills, Co...",over_60,"[medical, historyconcurrent, condition, atrial..."
34627,1113963,59.0,F,,medical history/concurrent conditions: type ii...,,PFIZER,"[Haemophagocytic lymphohistiocytosis, SARS-CoV...",under_60,"[medical, historyconcurrent, condition, type, ..."
34628,1115045,57.0,M,,medical history/concurrent conditions: no adve...,,MODERNA,[Death],under_60,"[medical, historyconcurrent, condition, advers..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
# define stop words 
addition_stopwords= ['none','None','na','nkda','00', '000', '000mg', '001d', '002d', '008s', '00x0', '01', '010', '0107', '010d', '011', '012', '02', '0200', '021', '029', '02jan2021', '02xa', '03', '0350', '03jan2021', '04', '0400', '041d', '0430', '04jan2021', '04nov2020', '05', '053', '06', '06jan2021', '07', '070s', '08', '080d', '080s', '0845', '08jun', '09', '091', '09nov2020', '0p', '10', '100', '1000', '100mg', '101', '102', '103', '104', '106', '10mg', '10th', '10yrs', '11', '110', '111', '112', '113', '115', '116', '119', '11jan2021', '11th', '12', '120', '1200mg', '1201005', '120320', '120ml', '121', '123', '1245', '125', '125mg', '1273', '129', '12dec2020', '12h', '12mm', '12th', '12xa', '12xd', '13', '130', '135', '13645005', '13apr2016', '13jan2021', '13nov2020', '13th', '14', '140', '142', '142d', '143', '14feb2021', '14th', '15', '1500', '1500mg', '152', '155mg', '15mg', '15min', '15pm', '15th', '16', '160cm', '160mg', '161', '16years', '17', '170s', '17feb2021', '17th', '18', '180', '1800', '18jan2021', '18mm', '18th', '19', '191813001', '192020', '193031009', '1940', '1943', '1944', '1950', '1957', '1960', '1960s', '1965', '1969', '1972', '1973', '1974', '1976', '1977', '1980', '1980s', '1983', '1985', '1986', '1987', '1988', '1989', '1990', '1990s', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '19dec2020', '19feb2021', '19jan2021', '19nov2020', '19th', '1a', '1month', '1onth', '1st', '1week', '1wk', '1x', '20', '200', '2000', '2000s', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '200mg', '201', '2010', '2010h', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '202', '2020', '2021', '20210a', '2022', '203', '204', '205', '209', '20lb', '20mg', '20nov2020', '20s', '20th', '21', '210', '212', '215q', '21dec2020', '21st', '22', '220', '222a', '229', '22dec2020', '22ndhad', '22oct2020', '23', '23dec2020', '24', '24dec2020', '24h', '24th', '25', '250', '2510', '25jan2021', '25mg', '26', '267434003', '26th', '27', '272', '274769005', '279039007', '27dec2020', '27th', '28', '285', '286', '28dec2020', '28mg', '28th', '29', '2921', '293', '298', '299', '29iu', '29nov2020', '29th', '2b', '2cm', '2dm', '2l', '2nd', '2ndary', '2o13', '2weeks', '2x', '2yrs', '30', '300', '3019', '302d', '303', '307', '309', '30mg', '30min', '30mm', '30pm', '30th', '30xa', '30xs', '31', '310', '311', '3112', '315', '318', '319', '31dec2020', '31st', '32', '320', '321', '3222', '324', '33', '34', '345', '35', '354', '359', '36', '367399005', '37', '37am', '38', '39', '396275006', '3a', '3b', '3cm', '3kg', '3l', '3mg', '3mm', '3months', '3rd', '3weeks', '3x', '40', '400', '400047006', '401', '40mg', '40s', '41', '412a', '413', '417k', '42', '421326000', '422166005', '423263001', '42xd', '43', '430pm', '433144002', '44', '440', '443', '445d', '448', '45', '45mg', '46', '462', '465', '47', '48', '49', '491', '493', '4g', '4th', '4x', '4years', '50', '500', '500mg', '501d', '507', '50s', '51', '511', '511a', '512', '512a', '515d', '52', '521', '522', '53', '536', '5370000', '53mg', '54404000', '55', '550', '551', '555', '559', '56', '561', '562', '564', '569', '57', '571', '58', '585', '59', '5cm', '5ft', '5g', '5other', '5q', '5th', '5weeks', '5wks', '5x', '5x0s', '5x9d', '5y', '5yrs', '60', '609', '61', '62', '621', '622', '63', '630pm', '64', '640', '641', '65', '650mg', '66', '67', '671', '68', '68yo', '69', '6dec2020', '6g', '6mg', '6th', '6x', '70', '71', '710', '718', '719', '72', '723', '724', '724698009', '73', '73430006', '73438004', '74', '75', '750', '758', '75mg', '76', '77', '78', '780', '79', '791', '79k', '7th', '80', '800', '800mg', '802d', '80s', '81', '810', '811', '812d', '813', '815', '816', '818', '819', '81mg', '82', '82020', '821a', '828', '829', '83', '838', '84', '841', '85', '8517006', '8580', '86', '87', '875', '88', '89', '890', '891', '898', '899', '89xs', '8am', '8cm', '8h', '8th', '90', '901', '909', '90s', '91', '911', '912', '92', '920', '921', '922', '92814006', '929', '93', '94', '95', '951', '95mg', '96', '97', '98', '99', '991', '998', '9chronic', '9in', '9mm', '9th', 'a1', 'a1c', 'a3', 'a41', 'a46', 'age24', 'alpha1', 'alpha1antitrypsin', 'aox3', 'appendectomy1988', 'apr2020', 'b00', 'b12', 'b19', 'b27', 'b35', 'b91', 'b95', 'b95a', 'b96', 'bipolar1', 'bipolar2', 'bnt162b2', 'brca2', 'c18', 'c25', 'c3', 'c4', 'c44', 'c5', 'c50', 'c53', 'c6', 'c61', 'c677t', 'c7', 'c8', 'c82', 'c85', 'c91', 'cabgx3', 'cd4', 'ckd2', 'ckd3', 'ckd4', 'cov2', 'covid10', 'covid19', 'covid_19', 'd05', 'd12', 'd3', 'd46', 'd49', 'd50', 'd51', 'd53', 'd61', 'd63', 'd64', 'd68', 'd69', 'd72', 'dec2017', 'dec2019', 'dec2020', 'dg6', 'diabetes2', 'diagnosis1', 'dissection2016', 'dm1', 'dm11', 'dm2', 'dmt2', 'dose1', 'e00', 'e02', 'e03', 'e04', 'e06', 'e07', 'e08', 'e10', 'e11', 'e26', 'e27', 'e3', 'e43', 'e46', 'e53', 'e55', 'e56', 'e63', 'e66', 'e73', 'e78', 'e86', 'e87', 'e876', 'e88', 'ees400', 'ef15', 'el0140', 'en6203', 'f01', 'f02', 'f03', 'f05', 'f06', 'f10', 'f17', 'f20', 'f22', 'f25', 'f29', 'f31', 'f32', 'f33', 'f34', 'f39', 'f41', 'f43', 'f44', 'f48', 'f51', 'f73', 'f89', 'f90', 'f95', 'f98', 'feb2020', 'for2', 'fracture11', 'ft4', 'g2', 'g20', 'g25', 'g30', 'g31', 'g35', 'g3a', 'g40', 'g43', 'g44', 'g45', 'g47', 'g56', 'g60', 'g62', 'g6pd', 'g80', 'g81', 'g82', 'g89', 'g93', 'g95', 'h04', 'h1', 'h16', 'h1c', 'h1n1', 'h25', 'h26', 'h35', 'h40', 'h47', 'h52', 'h53', 'h5n1', 'h61', 'h91', 'hba1c', 'her2', 'hga1c', 'hgba1c', 'hsv1', 'hsv2', 'hypertensionr26', 'i05', 'i07', 'i10', 'i11', 'i13', 'i16', 'i21', 'i25', 'i26', 'i27', 'i42', 'i44', 'i45', 'i47', 'i48', 'i49', 'i50', 'i51', 'i63', 'i66', 'i67', 'i69', 'i70', 'i71', 'i72', 'i73', 'i82', 'i83', 'i87', 'i89', 'i95', 'icd10', 'icd9', 'if_covid_prior_vaccination', 'igg4', 'insulin16', 'j01', 'j02', 'j15', 'j18', 'j20', 'j30', 'j31', 'j40', 'j42', 'j43', 'j44', 'j45', 'j69', 'j96', 'j98', 'jak2', 'jan2020', 'jan2021', 'jul2018', 'jul2019', 'jul2020', 'jun2018', 'jun2020', 'k21', 'k29', 'k30', 'k42', 'k43', 'k44', 'k52', 'k55', 'k56', 'k57', 'k58', 'k59', 'k62', 'k63', 'k76', 'k80', 'k85', 'k92', 'known_allergies', 'l02', 'l03', 'l08', 'l1', 'l2', 'l21', 'l23', 'l29', 'l3', 'l30', 'l4', 'l5', 'l53', 'l5s1', 'l5s4', 'l70', 'l80', 'l89', 'l97', 'l98', 'm05', 'm06', 'm10', 'm13', 'm15', 'm17', 'm19', 'm1a', 'm20', 'm24', 'm25', 'm41', 'm43', 'm47', 'm48', 'm51', 'm53', 'm54', 'm62', 'm6us', 'm79', 'm81', 'm85', 'm86', 'mar2020', 'may2020', 'medical_history', 'mm3', 'mso4', 'n0', 'n02', 'n04', 'n147', 'n17', 'n18', 'n20', 'n28', 'n31', 'n32', 'n39', 'n40', 'n62', 'n63', 'n90', 'n92', 'n95', 'nov020', 'nov18', 'nov2019', 'nov2020', 'o2', 'oct2020', 'october2020', 'om1', 'on12', 'on2', 'other_medical_history', 'p2', 'pcv13', 'polyethylene_glycol', 'pt1cn1am0', 'pt2n2am0', 'q21', 'q24', 'q38', 'q44', 'q4hrs', 'q61', 'q66', 'r00', 'r05', 'r06', 'r09', 'r13', 'r14', 'r19', 'r20', 'r23', 'r25', 'r26', 'r27', 'r29', 'r30', 'r31', 'r33', 'r41', 'r42', 'r44', 'r45', 'r47', 'r48', 'r49', 'r50', 'r52', 'r53', 'r54', 'r55', 'r56', 'r60', 'r62', 'r63', 'r73', 'r74', 'r77', 'r78', 'r79', 'r80', 'r93', 'r94', 's06', 's1', 's22', 's28', 's32', 's42', 's60', 's61', 's63', 's70', 's72', 's80', 's81', 's82', 's90', 's91', 'sep2020', 'sm859', 'sm869', 'sp02', 'spo2', 'surgery4', 't1', 't10', 't11', 't12', 't1b2', 't1dm', 't2', 't2dm', 't3', 't3n0', 't4', 't45', 't50', 't7', 't8', 't81', 't9', 'testx2', 'tiax2', 'type1', 'type2', 'type2dm', 'u07', 'v00', 'v15', 'v2', 'w19', 'x1', 'x10', 'x2', 'x20', 'x3', 'x3mo', 'x4', 'x4v', 'x50', 'x5yd', 'x6', 'x7', 'x9', 'y65', 'z11', 'z16', 'z20', 'z23', 'z29', 'z41', 'z43', 'z45', 'z46', 'z48', 'z51', 'z66', 'z68', 'z72', 'z74', 'z79', 'z83', 'z85', 'z86', 'z87', 'z90', 'z91', 'z95', 'z96', 'z97', 'z98', 'z99']
stop_w=text.ENGLISH_STOP_WORDS.union(addition_stopwords)


In [None]:
MODERNA=df_training[df_training['VAX_NAME']=='MODERNA']
PFIZER=df_training[df_training['VAX_NAME']=='PFIZER']
JANSSEN=df_training[df_training['VAX_NAME']=='JANSSEN']

In [None]:
MODERNA

In [None]:
# make df_training data to sparse matrix
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_w)
tfidfs = tfidf_vectorizer.fit_transform(MODERNA['HEALTH_CONDITION'])
tfidfs = pd.DataFrame(tfidfs.toarray(), columns=tfidf_vectorizer.get_feature_names())
dummie_age_sex=pd.get_dummies(MODERNA[['SEX','AGE_CAT']],  prefix='', prefix_sep='').reset_index()
moderna_df=pd.concat([dummie_age_sex,tfidfs], axis=1)


In [None]:
moderna_df

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=200)
kmeans.fit(moderna_df)

In [None]:
kmeans.labels_

In [None]:
MODERNA_CLUSTERED=df[df['VAX_NAME']=='MODERNA'].copy()

MODERNA_CLUSTERED['CLUSTER'] = kmeans.labels_

## How many instance are belonging to each cluster

In [None]:
MODERNA_CLUSTERED.groupby(by='CLUSTER')['VAERS_ID'].agg('count').sort_values()

## what are in cluster 90 

In [None]:
sym_freq=MODERNA_CLUSTERED[MODERNA_CLUSTERED['CLUSTER']==90]
sym_freq

## Symptoms frequency in cluster 90 

In [None]:
from itertools import chain
from collections import Counter
pd.Series(Counter(chain.from_iterable(sym_freq.SYMPTOMS.dropna()))).sort_values(ascending=False)