<a href="https://colab.research.google.com/github/AlexanderBelfort/MDPRKT/blob/main/K_DictPlusExtras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install spacy
!python -m spacy download en_core_web_md

In [194]:
class medicalCategory:
  gallstoneDisease = "Gallstone Disease"
  arthritis = "Arthritis"

train_x = ["Gallstones are hardened deposits of digestive fluid that can form in your gallbladder",\
           "People who experience symptoms from their gallstones usually require gallbladder removal surgery.",\
           "Arthritis is inflammation of one or more of your joints that causes pain and stiffness.",\
           "Although there is no cure for arthritis, there are many treatment options available to help manage pain and keep people staying active."]
train_y = [medicalCategory.gallstoneDisease, medicalCategory.gallstoneDisease, medicalCategory.arthritis, medicalCategory.arthritis]


**Fit vectorizer to transform text to bag-of-words vectors**

In [195]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)
train_x_vectors = vectorizer.fit_transform(train_x)

print(vectorizer.get_feature_names())
print(train_x_vectors.toarray())

['active', 'although', 'and', 'are', 'arthritis', 'available', 'can', 'causes', 'cure', 'deposits', 'digestive', 'experience', 'fluid', 'for', 'form', 'from', 'gallbladder', 'gallstones', 'hardened', 'help', 'in', 'inflammation', 'is', 'joints', 'keep', 'manage', 'many', 'more', 'no', 'of', 'one', 'options', 'or', 'pain', 'people', 'removal', 'require', 'staying', 'stiffness', 'surgery', 'symptoms', 'that', 'their', 'there', 'to', 'treatment', 'usually', 'who', 'your']
[[0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
  1 0 0 1 1 0 1 0 0 0 1 1 0]
 [0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 1 1 0 0
  0 0 1 0 0 1 0 0 0 0 0 0 1]
 [1 1 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0
  0 1 0 0 0 0 0 1 1 1 0 0 0]]



**Train SVM Model**

In [196]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

**Testing**

In [197]:
test_x1 = vectorizer.transform(['Is there a cure for arthritis?'])

clf_svm.predict(test_x1)


array(['Arthritis'], dtype='<U17')

In [198]:
test_x2 = vectorizer.transform(['I need gallstones surgery.'])

clf_svm.predict(test_x2)

array(['Gallstone Disease'], dtype='<U17')

**Cleaning data practice**

In [199]:
import nltk
from nltk.corpus import stopwords
import string
import re

# load doc into memory
def load_doc(filename):
  # open the file as read only
  file = open(filename, 'r')
  # read all text
  text = file.read()
  # close the file
  file.close()
  return text
  # turn a doc into clean tokens

def clean_doc(doc):

  # toLower
  doc = doc.lower()
  # split into tokens by white space
  tokens = doc.split()
  # prepare regex for char filtering
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))
  # remove punctuation from each word
  tokens = [re_punc.sub('', w) for w in tokens]
  # remove remaining tokens that are not alphabetic
  tokens = [word for word in tokens if word.isalpha()]
  # filter out stop words
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in tokens if not w in stop_words]
  # filter out short tokens
  tokens = [word for word in tokens if len(word) > 1]
  return tokens

# load the gallstones document
filename1 = '1gal.txt'
text = load_doc(filename1)
tokens1 = clean_doc(text)

print(tokens1)

filename2 = '2art.txt'
text = load_doc(filename2)
tokens2 = clean_doc(text)

print(tokens2)

filename2 = '3cgall.txt'
text = load_doc(filename2)
tokens2 = clean_doc(text)

print(tokens2)


['gallstones', 'hardened', 'deposits', 'digestive', 'fluid', 'form', 'gallbladder', 'gallbladder', 'small', 'pearshaped', 'organ', 'right', 'side', 'abdomen', 'beneath', 'liver', 'gallbladder', 'holds', 'digestive', 'fluid', 'called', 'bile', 'thats', 'released', 'small', 'intestine', 'gallstones', 'range', 'size', 'small', 'grain', 'sand', 'large', 'golf', 'ball', 'people', 'develop', 'one', 'gallstone', 'others', 'develop', 'many', 'gallstones', 'time', 'people', 'experience', 'symptoms', 'gallstones', 'usually', 'require', 'gallbladder', 'removal', 'surgery', 'gallstones', 'dont', 'cause', 'signs', 'symptoms', 'typically', 'dont', 'need', 'treatment']
['arthritis', 'inflammation', 'one', 'joints', 'causes', 'pain', 'stiffness', 'arthritis', 'mainly', 'adult', 'disease', 'forms', 'affect', 'children', 'many', 'types', 'arthritis', 'include', 'osteoarthritis', 'rheumatoid', 'arthritis', 'posttraumatic', 'arthritis', 'septic', 'arthritis', 'psoriatic', 'arthritis', 'conditions', 'diffe

**Create a vocabulary.**

In [203]:
from os import listdir
from collections import Counter

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
  # load doc
  
  doc = load_doc(filename)
  # clean doc
  tokens = clean_doc(doc)
  # update counts
  vocab.update(tokens)

# load all docs in a directory
def process_docs(directory, vocab):
  # walk through all files in the folder
  for filename in listdir(directory):
    # create the full path of the file to open
    path = directory + '/' + filename
    # add doc to vocab
    add_doc_to_vocab(path, vocab)
  
# define vocab
vocab = Counter()

# add all docs to vocab
# this is my words + katie's first file
process_docs('gallstones', vocab)
process_docs('arthritis', vocab)

# print the size of the vocab
print('Words in the vocab: ', len(vocab), '\n')

# print the top words in the vocab
prettyPrint = dict(vocab.most_common(50))
for k, v in prettyPrint.items():
  print (k, '---->', v)


Words in the vocab:  574 

would ----> 29
pain ----> 24
study ----> 21
okay ----> 18
need ----> 17
get ----> 16
people ----> 14
one ----> 14
take ----> 14
gallstones ----> 13
gallbladder ----> 13
go ----> 13
ibuprofen ----> 12
like ----> 11
operation ----> 11
quite ----> 11
part ----> 11
time ----> 10
surgery ----> 10
yeah ----> 10
taking ----> 10
happy ----> 10
information ----> 10
small ----> 9
liver ----> 9
things ----> 9
arthritis ----> 9
treatment ----> 8
name ----> 8
good ----> 8
know ----> 8
yes ----> 8
going ----> 8
right ----> 7
side ----> 7
fine ----> 7
absolutely ----> 7
put ----> 7
medical ----> 7
say ----> 7
research ----> 7
might ----> 7
symptoms ----> 6
surgeon ----> 6
obviously ----> 6
lot ----> 6
night ----> 6
sometimes ----> 6
keyhole ----> 6
patients ----> 6
