In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from functools import partial
from math import log
from typing import Tuple
from pprint import pprint
import matplotlib.pyplot as plt


import gensim
from gensim.models import LsiModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

stemmer = SnowballStemmer("english")

pd.set_option('display.max_columns', 200)

unable to import 'smart_open.gcs', disabling that module


# Read Data and Preprocess

In [None]:
filename = '../2019VAERSData/2019VAERSDATA.csv'

# rawdata = pd.read_csv(filename, header=[0], error_bad_lines=False, encoding="ISO-8859-1")
rawdata = pd.read_csv(filename, header=[0], error_bad_lines=False, encoding="latin-1")

rawdata.columns = rawdata.columns.str.lower()


scrub = ['\x97', '\x96', '\x91', '\x80', '\xad']
for byte in scrub:
    rawdata['symptom_text'] = rawdata['symptom_text'].str.replace(byte, ' ')
    
rawdata['symptom_text'] = rawdata['symptom_text'].str.replace('ç', 'c').fillna('')

dropcols = ['died', 'er_visit', 'hospital', 'disable']


serious_bool = rawdata[dropcols].eq('Y').any(axis=1)
rawdata['serious'] = serious_bool.map({True: 'Y', False: 'N'})


data = rawdata.drop(columns=dropcols)
data = data.set_index('vaers_id')

In [None]:
data.head()

# Process Data/Tokenize

In [None]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
doc_sample = data['symptom_text'].iloc[55]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
processed_docs = data['symptom_text'].map(preprocess)

In [None]:
processed_docs = list()
for n, doc in enumerate(data['symptom_text']):
    processed_docs.append(preprocess(doc))

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=15, no_above=0.5)

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
bow_doc_50 = bow_corpus[50]
for i in range(len(bow_doc_50)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_50[i][0], 
                                               dictionary[bow_doc_50[i][0]], 
bow_doc_50[i][1]))

In [None]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# for doc in tfidf[bow_corpus]:
#     print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in doc])

In [None]:
flattened = [{dictionary[tok_id]: np.around(freq, decimals=2) for tok_id, freq in doc} for doc in tfidf[bow_corpus]]

tfidf_df = pd.DataFrame(flattened).fillna(0)

# SVD

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features= 1000, # keep top 1000 terms 
                             max_df = 0.5, 
                             smooth_idf=True)

X = vectorizer.fit_transform(tfidf_df)

X.shape # check shape of the document-term matrix

(3090, 1000)

In [114]:
# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=5, algorithm='randomized', n_iter=100, random_state=0)

svd_model.fit(X)

len(svd_model.components_)

5

In [115]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    print(' + '.join([f'{round(score, 3)}*{token}' for token, score in sorted_terms]))

Topic 0: 
0.101*neuriti + 0.097*meclizin + 0.084*resum + 0.083*polyneuropathi + 0.081*pharmacist + 0.081*rang + 0.077*pillow
Topic 1: 
0.124*telephon + 0.096*personnel + 0.083*pentacel + 0.079*rabavert + 0.075*paint + 0.075*print + 0.072*meantim
Topic 2: 
0.105*rotarix + 0.091*trade + 0.086*threaten + 0.075*throw + 0.075*prematur + 0.074*tragic + 0.074*qfaa
Topic 3: 
0.109*node + 0.109*maculopapular + 0.086*weird + 0.086*towel + 0.085*thursday + 0.084*needl + 0.082*thinner
Topic 4: 
0.116*typic + 0.083*prednison + 0.081*pot + 0.078*trade + 0.078*terribl + 0.076*prescript + 0.075*plaqu


In [109]:
sorted_terms

[('rare', 0.10040628668373605),
 ('moment', 0.09147790029546797),
 ('mistak', 0.09123940792274332),
 ('prepar', 0.08631633123178989),
 ('retin', 0.08618075544816545),
 ('tree', 0.0855027907312574),
 ('prove', 0.08446678783858576)]

# Example 2

In [25]:
# from sklearn.feature_extraction.text import CountVectorizer

# # cv = CountVectorizer(ngram_range=(1, 1))
# # X = cv.fit_transform(tfidf_df)

# U, s, Vh = np.linalg.svd(tfidf_df)

In [26]:
# terms = tfidf.columns[:10] #cv.vocabulary_
# for i, component in enumerate(Vh[:10]):
#     terms_components = zip(terms, component)
#     sorted_terms = sorted(terms_components, key=lambda x:x[1], reverse=True)[:10] # take features for topic
#     print("topic : ", i)
#     for term_socres in sorted_terms:
#         print(10*" ", term_socres[0])
#     print(50*'*')

# LSA

In [31]:
# doc_term_matrix = bow_corpus


def compute_coherence_values(dictionary, bow_corpus, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(bow_corpus, num_topics=num_topics, id2word=dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

def plot_graph(processed_docs, start, stop, step):
    model_list, coherence_values = compute_coherence_values(dictionary, bow_corpus, processed_docs,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

In [None]:
start, stop, step = (2, 10, 1)

plot_graph(processed_docs, start, stop, step)

# LDA

In [110]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.050*"report" + 0.044*"receiv" + 0.038*"unknown" + 0.031*"shingrix" + 0.026*"case" + 0.025*"medic" + 0.024*"date" + 0.021*"dose" + 0.014*"experi" + 0.013*"site"
Topic: 1 
Words: 0.071*"report" + 0.037*"dose" + 0.031*"medic" + 0.031*"date" + 0.026*"receiv" + 0.025*"unknown" + 0.021*"advers" + 0.019*"expir" + 0.016*"effect" + 0.015*"concomit"
Topic: 2 
Words: 0.025*"pain" + 0.022*"inject" + 0.019*"swell" + 0.017*"site" + 0.013*"fever" + 0.012*"day" + 0.011*"leav" + 0.011*"rash" + 0.010*"feel" + 0.010*"symptom"
Topic: 3 
Words: 0.043*"zoster" + 0.038*"medic" + 0.033*"shingl" + 0.029*"zostavax" + 0.028*"live" + 0.025*"condit" + 0.022*"injuri" + 0.021*"suffer" + 0.021*"loss" + 0.020*"result"
Topic: 4 
Words: 0.029*"pain" + 0.025*"influenza" + 0.012*"infect" + 0.009*"receiv" + 0.009*"unknown" + 0.009*"report" + 0.009*"shoulder" + 0.009*"leav" + 0.008*"signific" + 0.008*"virus"


In [111]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"rash" + 0.008*"pain" + 0.007*"state" + 0.007*"swell" + 0.007*"shoot" + 0.006*"inject" + 0.006*"go" + 0.006*"start" + 0.006*"leav" + 0.006*"give"
Topic: 1 Word: 0.008*"report" + 0.007*"pregnanc" + 0.007*"unknown" + 0.006*"date" + 0.005*"pain" + 0.005*"unspecifi" + 0.005*"medic" + 0.005*"influenza" + 0.005*"event" + 0.005*"subject"
Topic: 2 Word: 0.019*"swell" + 0.017*"inject" + 0.016*"fever" + 0.016*"site" + 0.016*"pain" + 0.015*"red" + 0.013*"headach" + 0.013*"ach" + 0.012*"chill" + 0.011*"sore"
Topic: 3 Word: 0.036*"shingrix" + 0.021*"report" + 0.020*"unknown" + 0.014*"receiv" + 0.014*"pain" + 0.012*"date" + 0.012*"inject" + 0.012*"site" + 0.010*"case" + 0.010*"experi"
Topic: 4 Word: 0.020*"medic" + 0.017*"error" + 0.013*"report" + 0.011*"advers" + 0.009*"dose" + 0.009*"expir" + 0.009*"holder" + 0.009*"receiv" + 0.008*"histori" + 0.008*"case"


In [11]:
# add columns age/sex/curr ill/num days to clustering
clusters = 5
model = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=1)
model.fit(merged)

# tfidf['cluster'] = model.labels_

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=5, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [30]:
for cluster_index in range(clusters):
    cluster = merged[(model.labels_ == cluster_index)]
#     cluster = tfidf[tfidf.cluster.eq(cluster_index)]

    tfidf = cluster[:, 5:]
    tfidf_mean = tfidf.mean(axis=0)
    tfidf_mean = np.squeeze(np.asarray(tfidf_mean))
    
    ave_age = cluster[:, 0].mean()
    male = 
    
    top_ten = tfidf_mean.argsort()[-10:][::-1]
    
    print(f"Cluster {cluster_index} (size={len(cluster)}):")
    print("-----------------------------------------")
    print(', '.join(ind_to_token[idx] for idx in top_ten))
    print()

Cluster 0 (size=14944):
-----------------------------------------
patienr, unknowingly, reportable, datasheet, receive, dosde, vaccince, medicaitons, shingritz, cascading

Cluster 1 (size=6794):
-----------------------------------------
vaccince, patienr, fetzima, givein, rarity, dawned, swellin, rednes, reactio, sitagliptin

Cluster 2 (size=6855):
-----------------------------------------
arlv, patienr, pail, injectio, dawned, sitagliptin, shingritz, swellin, rednes, rarity

Cluster 3 (size=11178):
-----------------------------------------
pail, arlv, injectio, patienr, dawned, fetzima, sitagliptin, headach, chileed, shingritz

Cluster 4 (size=4573):
-----------------------------------------
patienr, vaccince, arlv, pail, givein, injectio, sitagliptin, dawned, psychotropics, lefse



In [None]:
cluster[:, 0].mean()

In [31]:
cluster[:, 0].mean()

21.287338727312488

In [26]:
merged[:, 0].mean()

45.967172807380685

In [28]:
cluster

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

distance measure:  

https://scikit-learn.org/stable/modules/clustering.html#k-means  
within-cluster sum-of-squares

In [314]:
data.shape

(44344, 31)

In [315]:
data['cluster'] = model.labels_

In [316]:
data.head()

Unnamed: 0_level_0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DATEDIED,L_THREAT,HOSPDAYS,X_STAY,RECOVD,VAX_DATE,ONSET_DATE,NUMDAYS,LAB_DATA,V_ADMINBY,V_FUNDBY,OTHER_MEDS,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES,SERIOUS,cluster
VAERS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
794156,01/01/2019,,69.0,69.0,,F,,"Severe pain Lt. shoulder area, very localized ...",,,,,Y,10/29/2018,10/30/2018,1.0,,PVT,,,none,none,,,2,01/01/2019,,Y,,tetracycline,N,2
794157,01/01/2019,IL,68.0,68.0,,F,,"Much joint pain/aching, lack of energy, listle...",,,,,N,12/28/2018,12/29/2018,1.0,,PHM,,"Levothyroxine, sertraline, vitamin B complex, ...",,Hypothryoidism,Arm soreness at site of injection,,2,01/01/2019,,,,,N,1
794158,01/01/2019,MA,62.0,62.0,,F,,"Headache, Fever, Chills, Body Aches, Nausea la...",,,,,Y,12/30/2018,12/31/2018,1.0,,PHM,,"Fish Oil, vitamin D, magnesium.",,,,,2,01/01/2019,,,,,N,2
794159,01/01/2019,UT,5.0,5.0,,M,,"Site is swollen, red and warm to the touch. Pa...",,,,,Y,12/27/2018,12/28/2018,1.0,,PVT,,Unknown,Small red spot on his cheek at time of visit.,,,,2,01/01/2019,,,,Amoxicillin,N,1
794160,01/01/2019,TX,79.0,79.0,,F,,"FLU LIKE: CHILLS ACHE ALL OVER; STRONG PAIN, W...",,,,,N,12/28/2018,12/28/2018,0.0,NONE TODAY IS A HOLIDAY,PVT,,ATORVASTATIN; CELEBREX; XYZAL; D-3; ELIQUIS; A...,SINUSITIS; ATHROSCLEROSIS; DIABETES; HYPERCHOL...,SAME AS ITEM 11 ABOVE,,,2,01/01/2019,,Y,Y,ACE INHIBITORS,N,1


In [318]:
pd.crosstab(data.cluster, data.SERIOUS)

SERIOUS,N,Y
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5615,127
1,5180,70
2,23078,2002
3,2692,694
4,4801,85


In [317]:
data.pivot_table(index='cluster', columns='SERIOUS')

Unnamed: 0_level_0,AGE_YRS,AGE_YRS,CAGE_MO,CAGE_MO,CAGE_YR,CAGE_YR,FORM_VERS,FORM_VERS,HOSPDAYS,NUMDAYS,NUMDAYS
SERIOUS,N,Y,N,Y,N,Y,N,Y,Y,N,Y
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
0,20.800842,29.995897,0.329412,0.125,20.980066,34.583333,1.914337,1.96063,11.714286,43.760192,259.485714
1,53.331225,42.505156,0.338415,0.35,53.379281,43.727273,1.971622,1.642857,2.4375,8.63355,45.477612
2,45.248608,37.005634,0.320905,0.327778,45.369581,36.641509,1.961478,1.864136,6.200739,21.420956,28.88843
3,35.641392,66.529412,0.381116,,34.528746,62.333333,2.0,2.0,3.666667,70.727273,197.685185
4,65.462022,64.638158,0.36,,65.900091,68.027027,1.999792,2.0,7.115385,12.6,7.225806


In [269]:
vectorizer.vocabulary_['patient']

20332

In [276]:
tfidf.shape

(44344, 30822)

In [288]:
(model.labels_ == 3).argwhere(True)

AttributeError: 'numpy.ndarray' object has no attribute 'argwhere'

In [305]:
np.argwhere((model.labels_ == 1))

array([[    1],
       [    3],
       [    4],
       ...,
       [44316],
       [44325],
       [44329]])

In [311]:
rawdata.loc[44329].SYMPTOM_TEXT

'Redness and swelling at injection site - 3x2 inch area'

In [275]:
tfidf.mean(axis=0)

KeyboardInterrupt: 

In [218]:
tfidf_mean.shape

(30822,)

In [220]:
tfidf_mean.max()

0.252844294441899

In [221]:
tfidf_mean.argmax()

24680

In [222]:
tfidf_mean.argsort()[-10:][::-1]

array([24680, 20332, 28000, 20084, 22913, 23349, 10801, 23346, 12905,
        9118])

In [231]:
ind_to_word[9118]

'case'

In [211]:
c = cluster[:, 3635]

In [212]:
c.max()

0.0

In [213]:
c.min()

0.0

In [210]:
cluster.shape

(4406, 30822)

In [214]:
cluster

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.16143788, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [173]:
cluster[983]

<1x30822 sparse matrix of type '<class 'numpy.float64'>'
	with 163 stored elements in Compressed Sparse Row format>

In [139]:
sum([1 for text in rawdata['SYMPTOM_TEXT'].fillna('') if '2019434226' in text])

1

In [107]:
sum((model.labels_ == 8))

7089

In [46]:
data.loc[data.SERIOUS.eq('Y'), ['NUM_DAYS', 'SYMPTOM_TEXT', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'BIRTH_DEFECT', 'SERIOUS']].head(16)

Unnamed: 0_level_0,NUM_DAYS,SYMPTOM_TEXT,CUR_ILL,HISTORY,PRIOR_VAX,BIRTH_DEFECT,SERIOUS
VAERS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
794190,,Information has been received on 19-DEC-2018 r...,,,,,Y
794191,,Information has been received on 19-DEC-2018 r...,,,,,Y
794195,,"This is a literature case, initially received ...",,Medical History/Concurrent Conditions: Asthma,,,Y
794197,,As soon as I got the shots and was going to pu...,,Back and neck pain Depression High blood pres...,,,Y
794210,,"Tdap shot, tar stool, cramps in stomach, letha...",,,,,Y
794217,,Vomiting started the day after the vaccine. Pa...,none,none,,,Y
794268,,"Simple Febrile Seizure at home, lasting no mor...",,Macrocrania- monitoring,,,Y
794271,,Pt is complaining of left upper arm pain/shoul...,none listed,thyroid,,,Y
794291,,Information has been received from a lawyer re...,,,,,Y
794330,,Developed fever and diarrhea that evening. Mom...,,,,,Y


In [20]:
serious_bool.map({True: 'Y', False: 'N'})

0        N
1        N
2        N
3        N
4        N
        ..
44339    Y
44340    N
44341    N
44342    N
44343    N
Length: 44344, dtype: object