### In this notebook we will perform the word embedding & topic modeling & Cosine Similarity

***we merged the **three** chapters to perform the topic modeling, in order to perform cosine similarity to select which chapter the new input should go with.***

In [2]:
import pandas as pd
import numpy as np
import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

### Read the data and pickle file

In [4]:
df02 = pd.read_csv('chapters_4rows.csv')

In [5]:
# reading the stop words list with pickle
with open ('stop_words.ob', 'rb') as fp:
    stop_words = pickle.load(fp)

In [6]:
df02.columns

Index(['string_values'], dtype='object')

In [7]:
# Declare a list that is to be converted into a column
ch_no = ['cardiovascular', 'neurologic', 'renal']
 
# Using 'ch_no' as the column name
# and equating it to the list
df02['Ch_No'] = ch_no

In [8]:
df02

Unnamed: 0,string_values,Ch_No
0,introduction fetus barely cease end life defin...,cardiovascular
1,introduction communication network coordinate ...,neurologic
2,introduction kidney located retroperitoneally ...,renal


### Word Embedding

In [10]:
df02['string_values']

0    introduction fetus barely cease end life defin...
1    introduction communication network coordinate ...
2    introduction kidney located retroperitoneally ...
Name: string_values, dtype: object

In [11]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(stop_words=stop_words)

doc_word_cv = count_vectorizer.fit_transform(df02['string_values'])



In [12]:
pd.DataFrame(doc_word_cv.toarray(), index=df02['Ch_No'], columns = count_vectorizer.get_feature_names_out()).head()

Unnamed: 0_level_0,aaa,ab,abates,abbokinase,abdomen,abdomenjunction,abduc,aberrant,ability,ablation,...,yoga,york,younge,zazulia,zealand,zigzag,zinc,zone,zoster,μl
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cardiovascular,6,3,0,1,2,0,0,0,6,2,...,0,0,1,0,0,0,1,0,0,0
neurologic,0,0,0,0,0,0,1,1,12,0,...,1,1,0,1,1,1,0,4,2,0
renal,0,0,1,0,0,1,0,0,3,2,...,0,4,1,0,0,0,0,1,1,1


In [13]:
# Create a TfidfVectorizer for parsing/counting words
tfidf = TfidfVectorizer(stop_words=stop_words)

doc_word_tfidf = tfidf.fit_transform(df02['string_values'])



In [14]:
pd.DataFrame(doc_word_tfidf.toarray(), index=df02['Ch_No'], columns = tfidf.get_feature_names_out()).head()

Unnamed: 0_level_0,aaa,ab,abates,abbokinase,abdomen,abdomenjunction,abduc,aberrant,ability,ablation,...,yoga,york,younge,zazulia,zealand,zigzag,zinc,zone,zoster,μl
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cardiovascular,0.019473,0.009737,0.0,0.003246,0.006491,0.0,0.0,0.0,0.011501,0.004937,...,0.0,0.0,0.002468,0.0,0.0,0.0,0.003246,0.0,0.0,0.0
neurologic,0.0,0.0,0.0,0.0,0.0,0.0,0.003601,0.003601,0.02552,0.0,...,0.003601,0.002738,0.0,0.003601,0.003601,0.003601,0.0,0.010954,0.005477,0.0
renal,0.0,0.0,0.004275,0.0,0.0,0.004275,0.0,0.0,0.007575,0.006503,...,0.0,0.013006,0.003251,0.0,0.0,0.0,0.0,0.003251,0.003251,0.004275


### Topic Modeling: **LDA**

In [16]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word_cv)

In [17]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [18]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)

In [19]:
lda.print_topics(3)

[(0,
  '0.814*"ab" + 0.015*"aaa" + 0.015*"abates" + 0.000*"pick" + 0.000*"physiology" + 0.000*"physiotherapy" + 0.000*"physician" + 0.000*"picking" + 0.000*"pig" + 0.000*"pickle"'),
 (1,
  '0.788*"abates" + 0.003*"ab" + 0.002*"aaa" + 0.000*"pick" + 0.000*"physiology" + 0.000*"physiotherapy" + 0.000*"physician" + 0.000*"picking" + 0.000*"pig" + 0.000*"pickle"'),
 (2,
  '0.816*"aaa" + 0.023*"abates" + 0.017*"ab" + 0.000*"pick" + 0.000*"physiology" + 0.000*"physiotherapy" + 0.000*"physician" + 0.000*"picking" + 0.000*"pig" + 0.000*"pickle"')]

### Performing CorEx:

In [21]:
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

words = list(np.asarray(count_vectorizer.get_feature_names_out()))


In [22]:
topic_model = ct.Corex(n_hidden=3, words=words, seed=1)
topic_model.fit(doc_word_cv, words=words, docs=df02['string_values'])



<corextopic.corextopic.Corex at 0x219e40161d0>

In [23]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

0: abduc,numbered,ntilation,nt,nourishment,notepad,nostril,nortriptyline,nonirritating,noniodinated
1: aaa,significance,signify,initiating,signsand,inhibition,inhibit,silhouette,sighing,since
2: abates,nephrosclerosis,nephropathy,nephron,nephrologists,nephrolithiasis,nephrocalcinosis,nephritis,nephrectomy,necessitate


### Topic Modeling: LSA

In [25]:
lsa = TruncatedSVD(3)
doc_topic = lsa.fit_transform(doc_word_cv)
print(lsa.explained_variance_ratio_)

[0.10451465 0.5101165  0.38536885]


In [26]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ['component'+str(i) for i in range(3)],
             columns = count_vectorizer.get_feature_names_out())

print(topic_word)

              aaa     ab  abates  abbokinase  abdomen  abdomenjunction  abduc  \
component0  0.009  0.004   0.001       0.001    0.003            0.001  0.001   
component1 -0.011 -0.005   0.000      -0.002   -0.004            0.000  0.002   
component2 -0.005 -0.002   0.003      -0.001   -0.002            0.003 -0.001   

            aberrant  ability  ablation  ...   yoga   york  younge  zazulia  \
component0     0.001    0.023     0.004  ...  0.001  0.004   0.002    0.001   
component1     0.002    0.017    -0.003  ...  0.002  0.003  -0.002    0.002   
component2    -0.001   -0.005     0.005  ... -0.001  0.011   0.002   -0.001   

            zealand  zigzag   zinc   zone  zoster     μl  
component0    0.001   0.001  0.001  0.005   0.003  0.001  
component1    0.002   0.002 -0.002  0.009   0.005  0.000  
component2   -0.001  -0.001 -0.001 -0.000   0.001  0.003  

[3 rows x 5714 columns]


In [27]:
tem_list = [] 
def display_topics(model, feature_names, no_top_words, topic_names=None):
    
    for ix, topic in enumerate(model.components_):
        inner_tem_list = []
       
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
            
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        inner_tem_list.append(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        tem_list.append(inner_tem_list)

In [28]:
result1 = display_topics(lsa, count_vectorizer.get_feature_names_out(), 150)


Topic  0
artery, aneurysm, brain, muscle, valve, arrhythmia, defect, ventricle, seizure, aorta, kidney, bladder, motor, headache, stroke, leg, medication, vessel, intake, ecg, infarction, hypertrophy, case, hemorrhage, endocarditis, weakness, ischemia, dysfunction, encephalitis, catheterization, oxygen, occlusion, shock, page, onset, obstruction, arm, insufficiency, circulation, vomiting, rupture, dyspnea, bleeding, shunt, cord, pericarditis, virus, tract, wave, caused, vsd, abnormality, icp, size, diet, reveals, calculus, echocardiography, two, thrombosis, qrs, monitoring, prognosis, eye, status, csf, contraction, tachycardia, follow, attack, detect, sodium, sinus, protein, carefully, place, neck, atrium, must, possibly, nausea, rarely, temperature, stenosis, eventually, study, neuron, pacemaker, hypotension, fatigue, dialysis, state, meningitis, block, abscess, smoking, prevention, food, resistance, men, generally, po, line, calcium, become, three, pda, phase, occasionally, identify

In [29]:
tem_list
final_dic = {}
final_dic["Cardio"] = tem_list[0]
final_dic["Neuro"] = tem_list[1]
final_dic["Renal"] = tem_list[2]

In [30]:
final_dic

{'Cardio': ['artery, aneurysm, brain, muscle, valve, arrhythmia, defect, ventricle, seizure, aorta, kidney, bladder, motor, headache, stroke, leg, medication, vessel, intake, ecg, infarction, hypertrophy, case, hemorrhage, endocarditis, weakness, ischemia, dysfunction, encephalitis, catheterization, oxygen, occlusion, shock, page, onset, obstruction, arm, insufficiency, circulation, vomiting, rupture, dyspnea, bleeding, shunt, cord, pericarditis, virus, tract, wave, caused, vsd, abnormality, icp, size, diet, reveals, calculus, echocardiography, two, thrombosis, qrs, monitoring, prognosis, eye, status, csf, contraction, tachycardia, follow, attack, detect, sodium, sinus, protein, carefully, place, neck, atrium, must, possibly, nausea, rarely, temperature, stenosis, eventually, study, neuron, pacemaker, hypotension, fatigue, dialysis, state, meningitis, block, abscess, smoking, prevention, food, resistance, men, generally, po, line, calcium, become, three, pda, phase, occasionally, ident

In [31]:
tem_df = pd.DataFrame.from_dict(final_dic, orient ='index') 
tem_df

Unnamed: 0,0
Cardio,"artery, aneurysm, brain, muscle, valve, arrhyt..."
Neuro,"brain, seizure, muscle, motor, headache, encep..."
Renal,"bladder, kidney, calculus, dialysis, obstructi..."


In [32]:
# Declare a list that is to be converted into a column

 
# Using 'ch_no' as the column name
# and equating it to the list
tem_df['Disease_Name'] = ch_no

In [33]:
tem_df.columns

Index([0, 'Disease_Name'], dtype='object')

In [34]:
tem_df = tem_df.rename(columns={0: 'Description'})
tem_df

Unnamed: 0,Description,Disease_Name
Cardio,"artery, aneurysm, brain, muscle, valve, arrhyt...",cardiovascular
Neuro,"brain, seizure, muscle, motor, headache, encep...",neurologic
Renal,"bladder, kidney, calculus, dialysis, obstructi...",renal


In [35]:
tem_df.to_csv('diseases_with_description.csv', index=False)