In [1]:
# Import Required Libraries
import pandas as pd
from collections import Counter

# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import re
from nltk.stem import PorterStemmer
import spacy
from spacy.tokenizer import Tokenizer

In [2]:
# NLP utilities

class HandleTokens(object):
    """Static helpers for tokenizing documents and summarizing token counts.

    ``tokenize`` and ``combine_stopwords`` read a module-level ``tokenizer``
    and ``stopwords`` reads a module-level ``nlp``. Those names are looked up
    when the method is called, so they must exist before the methods run.
    """

    @staticmethod
    def tokenize(df_in):
        """Tokenize every document and keep the raw token text.

        Args:
            df_in: Iterable of documents handed to ``tokenizer.pipe``.

        Returns:
            list: One list of ``token.text`` values per document, in the
            order the documents were supplied.
        """
        return [
            [token.text for token in doc]
            for doc in tokenizer.pipe(df_in, batch_size=500)
        ]

    @staticmethod
    def count(docs):
        """Build a word-frequency table for already-tokenized documents.

        Args:
            docs: Sized collection of token lists, one list per document.

        Returns:
            pandas.DataFrame: One row per distinct word, sorted by ``rank``,
            with the columns ``word``, ``appears_in`` (number of documents
            containing the word), ``count`` (total occurrences), ``rank``
            (1 = most frequent, ties broken by first appearance),
            ``pct_total`` (share of all occurrences), ``cul_pct_total``
            (running sum of ``pct_total`` in rank order) and
            ``appears_in_pct`` (``appears_in`` divided by ``len(docs)``).
        """
        word_counts = Counter()
        appears_in = Counter()
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            # A set so each word is counted at most once per document.
            appears_in.update(set(doc))

        wc = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])
        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        wc['pct_total'] = wc['count'] / wc['count'].sum()

        # The cumulative share must be accumulated in rank order.
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        ac = pd.DataFrame(list(appears_in.items()), columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')
        wc['appears_in_pct'] = wc['appears_in'] / total_docs

        # The merge follows the row order of ``ac``, so sort by rank again.
        return wc.sort_values(by='rank')

    @staticmethod
    def stopwords(list_in):
        """Return ``nlp.Defaults.stop_words`` unioned with ``list_in``.

        Args:
            list_in: Iterable of extra stop words to add.

        Returns:
            The result of ``nlp.Defaults.stop_words.union(list_in)``.
        """
        return nlp.Defaults.stop_words.union(list_in)

    @staticmethod
    def combine_stopwords(dataframe_in, stopword_dict):
        """Tokenize documents, lowercase each token and drop stop words.

        Args:
            dataframe_in: Iterable of documents handed to ``tokenizer.pipe``.
            stopword_dict: Container tested with ``in``; a token is dropped
                when its lowercased text is a member.

        Returns:
            list: One list of lowercased, non-stop-word token texts per
            document (possibly empty), in input order.
        """
        tokens = []
        for doc in tokenizer.pipe(dataframe_in, batch_size=500):
            lowered = (token.text.lower() for token in doc)
            tokens.append([text for text in lowered if text not in stopword_dict])
        return tokens

## Data cleaning
The scraped question and answer data comes with HTML tags that need to be removed; we use the BeautifulSoup library to clean the data into a more usable format.

In [3]:
# Load the scraped question/answer data from the local JSON file.
df = pd.read_json('data.json')
df

Unnamed: 0,scraped_question,scraped_answer
0,[<strong>Q1. What organs are excised is an ova...,"[<div class=""su-spoiler-content su-u-clearfix ..."


In [4]:
# Row 0 of 'scraped_question' holds the list of raw question HTML strings.
questions = df['scraped_question'][0]
questions

['<strong>Q1. What organs are excised is an ovarian tumor is malignant?</strong>',
 '<strong>Q2.\xa0The spiral conical structure of the inner ear is the?,</strong>',
 '<strong>Q3. What is the term for a relationship in which two organisms occupy the same area and one organism benefits while the other is unharmed?</strong>',
 '<strong>Q4.\xa0Permission for treatment given with full knowledge of the risks is a/an?</strong>',
 '<strong>Q5.\xa0Which of the following is a method of high-level disinfection?</strong>',
 '<strong>Q6.\xa0The mumps may be diagnosed by finding inflammation in which of the following glands?</strong>',
 '<strong>Q7. How should the stretcher be oriented when necessary to use an elevator to transport a patient to the or?</strong>',
 '<strong>Q8. Which of the following solutions should be used to prep the donor site for a split-thickness skin graft?</strong>',
 '<strong>Q9. What type of procedure would involve the removal of teeth?</strong>',
 '<strong>Q10.\xa0Which o

In [5]:
# Row 0 of 'scraped_answer' holds the list of raw answer HTML strings; the
# [1:] slice drops its first element.
# NOTE(review): the reason for skipping element 0 is not visible here — confirm.
answers = df['scraped_answer'][0][1:]
answers

['<p>If a tumor of an ovary is found to be malignant, the surgeon will excise the other ovary, both fallopian tubes, and the uterus to ensure that all cancer cells have been removed.<br>\n</p>',
 '<div class="su-spoiler-content su-u-clearfix su-u-trim">\xa0<strong><span style="color: #008000;">Correct option: 2<br>\n</span></strong>Sol: The cochlea is a bony spiral canal in the ear.\n</div>',
 '<div class="su-spoiler-content su-u-clearfix su-u-trim">\xa0<strong><span style="color: #008000;">Correct option: 4<br>\n</span></strong>Sol: A myomectomy is a procedure performed for the removal of fibromyomas or fibroid tumors from the uterine wall.\n</div>',
 '<div class="su-spoiler-content su-u-clearfix su-u-trim">\xa0<strong><span style="color: #008000;">Correct option: 1<br>\n</span></strong>Sol: The patient should be placed in the elevator entering headfirst and exit feet first when being transported to the O.R. on the stretcher.\n</div>',
 '<div class="su-spoiler-content su-u-clearfix su

In [6]:
# Load the `en_core_web_lg` spaCy pipeline; HandleTokens.stopwords reads its
# default stop words through this global.
nlp = spacy.load("en_core_web_lg")

# Tokenizer built from the vocab alone; HandleTokens uses this global. The
# tokenized output further down shows punctuation staying attached to words
# (e.g. 'malignant,').
tokenizer = Tokenizer(nlp.vocab)

In [8]:
from bs4 import BeautifulSoup

def prettify_list(list_in, slice_in):
    """Strip scrape artefacts and HTML markup from every string in ``list_in``.

    Each item has non-breaking spaces, the literal words ``Correct`` and
    ``option:``, and newline characters removed (in that order). It is then
    parsed with BeautifulSoup's ``html.parser`` and reduced to its plain text.

    Args:
        list_in: Iterable of raw HTML strings.
        slice_in: Start index applied to the extracted text; everything
            before it is discarded (``0`` keeps the whole text).

    Returns:
        list: The cleaned text of each item, in input order.
    """
    # Fragments removed before parsing, applied in this order.
    junk = (u'\xa0', u'Correct', u'option:', u'\n')

    cleaned = []
    for raw in list_in:
        for fragment in junk:
            raw = raw.replace(fragment, u'')
        text = BeautifulSoup(raw, 'html.parser').get_text()
        cleaned.append(text[slice_in:])

    return cleaned

# Clean the raw question HTML; slice_in=0 keeps the full extracted text.
Q_list = prettify_list(questions, 0)
Q_list

['Q1. What organs are excised is an ovarian tumor is malignant?',
 'Q2.The spiral conical structure of the inner ear is the?,',
 'Q3. What is the term for a relationship in which two organisms occupy the same area and one organism benefits while the other is unharmed?',
 'Q4.Permission for treatment given with full knowledge of the risks is a/an?',
 'Q5.Which of the following is a method of high-level disinfection?',
 'Q6.The mumps may be diagnosed by finding inflammation in which of the following glands?',
 'Q7. How should the stretcher be oriented when necessary to use an elevator to transport a patient to the or?',
 'Q8. Which of the following solutions should be used to prep the donor site for a split-thickness skin graft?',
 'Q9. What type of procedure would involve the removal of teeth?',
 'Q10.Which of the following is a fenestrated drape?',
 'Q11. Compression of the heart from excessive fluid or blood buildup is called?',
 'Q12.Which is the first part of the small intestine?',


In [9]:
# The first answer is a plain <p> element without the leading option/"Sol:"
# prefix that the slice of 8 removes from the others, so it is cleaned
# unsliced and used in place of the sliced first entry.
a1 = prettify_list(answers, 0)[0]
ans_list = [a1, *prettify_list(answers, 8)[1:]]
ans_list

['If a tumor of an ovary is found to be malignant, the surgeon will excise the other ovary, both fallopian tubes, and the uterus to ensure that all cancer cells have been removed.',
 'The cochlea is a bony spiral canal in the ear.',
 'A myomectomy is a procedure performed for the removal of fibromyomas or fibroid tumors from the uterine wall.',
 'The patient should be placed in the elevator entering headfirst and exit feet first when being transported to the O.R. on the stretcher.',
 '2% glutaraldehyde is a type of high-level disinfectant solution; the device must be complete submerse for 20 minutes at room temperature in order to be disinfected',
 'The parotid glands are located inferior and anterior to the ears between the skin and masseter muscle. The parotid glands are attacked by the mumps virus.',
 'The patient should be placed in the elevator entering headfirst and exit feet first when being transported to the O.R. on the stretcher.',
 'The donor site should be scrubbed with a c

In [10]:
# Compile these lists into a dictionary for export, to be consumed by frontend applications.

dict_out = {'Questions': Q_list, 'Answers': ans_list}
dict_out

{'Questions': ['Q1. What organs are excised is an ovarian tumor is malignant?',
  'Q2.The spiral conical structure of the inner ear is the?,',
  'Q3. What is the term for a relationship in which two organisms occupy the same area and one organism benefits while the other is unharmed?',
  'Q4.Permission for treatment given with full knowledge of the risks is a/an?',
  'Q5.Which of the following is a method of high-level disinfection?',
  'Q6.The mumps may be diagnosed by finding inflammation in which of the following glands?',
  'Q7. How should the stretcher be oriented when necessary to use an elevator to transport a patient to the or?',
  'Q8. Which of the following solutions should be used to prep the donor site for a split-thickness skin graft?',
  'Q9. What type of procedure would involve the removal of teeth?',
  'Q10.Which of the following is a fenestrated drape?',
  'Q11. Compression of the heart from excessive fluid or blood buildup is called?',
  'Q12.Which is the first part o

## Topic Modeling 
Create an LDA topic model of the review text, then visualize the most common topics in the CST exam.

In [20]:
# Save lem state
class LemState(object):
    """Mutable holder for a single value exposed through the ``lem`` property.

    A new instance starts with the placeholder string ``"state"``.
    """

    def __init__(self):
        self._lem = "state"

    def _get_lem(self):
        """Return the currently stored value."""
        return self._lem

    def _set_lem(self, value):
        """Replace the stored value with ``value``."""
        self._lem = value

    lem = property(_get_lem, _set_lem)


# Notebook-wide instance.
LS = LemState()

        

  and should_run_async(code)


In [21]:
# HandleTokens only defines static methods; HT is a short handle for calling them.
HT = HandleTokens()
# Tokenize the cleaned answers: one list of token texts per answer.
token_list = HT.tokenize(ans_list)
token_list

  and should_run_async(code)


[['If',
  'a',
  'tumor',
  'of',
  'an',
  'ovary',
  'is',
  'found',
  'to',
  'be',
  'malignant,',
  'the',
  'surgeon',
  'will',
  'excise',
  'the',
  'other',
  'ovary,',
  'both',
  'fallopian',
  'tubes,',
  'and',
  'the',
  'uterus',
  'to',
  'ensure',
  'that',
  'all',
  'cancer',
  'cells',
  'have',
  'been',
  'removed.'],
 ['The', 'cochlea', 'is', 'a', 'bony', 'spiral', 'canal', 'in', 'the', 'ear.'],
 ['A',
  'myomectomy',
  'is',
  'a',
  'procedure',
  'performed',
  'for',
  'the',
  'removal',
  'of',
  'fibromyomas',
  'or',
  'fibroid',
  'tumors',
  'from',
  'the',
  'uterine',
  'wall.'],
 ['The',
  'patient',
  'should',
  'be',
  'placed',
  'in',
  'the',
  'elevator',
  'entering',
  'headfirst',
  'and',
  'exit',
  'feet',
  'first',
  'when',
  'being',
  'transported',
  'to',
  'the',
  'O.R.',
  'on',
  'the',
  'stretcher.'],
 ['2%',
  'glutaraldehyde',
  'is',
  'a',
  'type',
  'of',
  'high-level',
  'disinfectant',
  'solution;',
  'the',
  '

In [26]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim 

def pyldavis(list_in, num_topics=5, iterations=5, workers=4, random_state=None):
    """Fit an LDA topic model on ``list_in`` and return its pyLDAvis display.

    The documents are tokenized, lowercased and stop-word filtered with
    ``HT.combine_stopwords``, turned into a gensim bag-of-words corpus, and
    modelled with ``LdaMulticore``.

    Side effects: the filtered token lists are stored on ``LS.lem``
    (overwritten on every call), and ``pyLDAvis.enable_notebook()`` is called.

    Args:
        list_in: Iterable of document strings to model.
        num_topics: Number of LDA topics.
        iterations: ``iterations`` argument passed to ``LdaMulticore``.
        workers: Number of worker processes passed to ``LdaMulticore``.
        random_state: Optional seed passed to ``LdaMulticore``; ``None``
            keeps gensim's default unseeded behaviour.

    Returns:
        The object returned by ``pyLDAvis.display`` for the prepared model.
    """
    # Extra stop words added to the defaults. Tokens are lowercased before the
    # stop-word check, so only lowercase entries can match.
    mylist = ['medical','cause','o.r','o.r.','term','means','lower','blood', 'common','surgical','surgeon', 'called', 'patient', 'position', 'batteries','I','i', 'it', "it's", 'it.', 'the', 'this', "1", "i'm","i've", "got", "-", "come", '&']
    STOP_WORDS = HT.stopwords(mylist)

    # Model the documents passed in, not a notebook global, so that different
    # inputs (answers vs. questions) produce different models.
    LS.lem = HT.combine_stopwords(list_in, STOP_WORDS)

    # Learn the vocabulary of the data
    id2word = Dictionary(LS.lem)

    # Create a bag-of-words representation of the corpus
    corpus = [id2word.doc2bow(text) for text in LS.lem]

    # Create the LDA model
    lda = LdaMulticore(corpus=corpus,
                       id2word=id2word,
                       iterations=iterations,
                       workers=workers,
                       num_topics=num_topics,
                       random_state=random_state
                      )

    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
    return pyLDAvis.display(vis)
    


  and should_run_async(code)


In [27]:
# Build and display the LDA topic visualization for the cleaned answers.
pyldavis(ans_list)

  and should_run_async(code)


In [28]:
# Call the topic-model helper with the cleaned question list.
pyldavis(Q_list)

  and should_run_async(code)


The conjecture from the above visualizations could be that the majority of the questions on the CST exam are about tissue removal, which is often the goal of surgery: removing cancerous or non-functioning tissue. Another frequent term concerns steam sterilization, which is a newer technology that sterilizes equipment faster but at a much lower temperature, which means it requires more attention to do correctly compared to the classical dry sterilization method.

## Vector Representation