<a href="https://colab.research.google.com/github/BhavyaaaD/MedBot/blob/main/Medical_Specialty_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

In [None]:
!pip install nlpaug

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

#plot
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

## Data import and preprocessing

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): the relative "../input/..." path looks like the Kaggle dataset layout —
# it will need adjusting when the notebook is run elsewhere (e.g. Colab).
medical_data = pd.read_csv("../input/medical-specialty-classification/Train.csv")
# Show the loaded frame for a first visual inspection.
medical_data

In [None]:
# Percentage of missing values in each column, rounded to two decimals.
medical_data.isnull().mean().mul(100).round(2)

In [None]:
# Keep only the columns whose fraction of missing values is below 99%.
medical_data = medical_data.loc[:, medical_data.isnull().mean() < .99]
# Drop the row sitting at position 12 of the current index.
# NOTE(review): why this specific row is removed is not visible here — confirm and document.
medical_data = medical_data.drop(medical_data.index[12])

## medical_data.isnull().sum(axis=0)

In [None]:
# Percentage of missing values per column after the pruning step, two decimals.
medical_data.isnull().mean().mul(100).round(2)

In [None]:
# Discard every row that has no transcription text.
medical_data = medical_data.dropna(subset=['transcription'])

In [None]:
# Percentage of missing values per column once empty transcriptions are gone, two decimals.
medical_data.isnull().mean().mul(100).round(2)

In [None]:
medical_data['medical_specialty'].unique()

In [None]:
# Number of non-null transcriptions available for each medical specialty.
medical_data.groupby('medical_specialty')['transcription'].count().reset_index()


In [None]:
def _has_more_than_20_rows(group):
    """Return True when a specialty group holds more than 20 rows."""
    return len(group) > 20

# Keep only the specialties that pass the size check above.
medical_data = medical_data.groupby('medical_specialty').filter(_has_more_than_20_rows)


In [None]:
# Per-specialty number of transcriptions; reused later to size the augmentation.
label_count = (
    medical_data
    .groupby('medical_specialty')['transcription']
    .count()
    .reset_index()
)
label_count

In [None]:
# Only the text and its label are needed from here on.
medical_data = medical_data.loc[:, ['transcription', 'medical_specialty']]


In [None]:
# Bar chart of the number of samples per medical specialty.
plt.figure(figsize=(20,8))
x = medical_data.medical_specialty.value_counts()
# seaborn >= 0.12 accepts the data vectors only as keywords, so pass x/y by name.
sns.barplot(x=x.index, y=x.values)
#plt.tight_layout()
plt.xticks(
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large'
)
plt.gca().set_ylabel('samples')

## Data Augmentation

In [None]:
# Synonym-replacement augmenter from nlpaug, backed by WordNet.
# NOTE(review): aug_max=24 presumably caps how many words are replaced per text —
# confirm against the nlpaug documentation.
aug = naw.SynonymAug(aug_src='wordnet',aug_max= 24)

In [None]:
# Oversample the smaller specialties with synonym-augmented copies of their transcriptions
# so that every class ends up roughly as large as the biggest one.
# NOTE(review): this happens before the train/test split further down, so augmented variants
# of one transcription can end up on both sides of the split — consider augmenting the
# training portion only.
largest_class_size = label_count.transcription.max()
augmented_rows = []

# Visit every specialty (the former `len(label_count)-1` bound skipped the last one).
for specialty, class_size in zip(label_count.medical_specialty, label_count.transcription):
    # Number of extra copies to create for each original transcription of this class.
    aug_val = int(round(largest_class_size / class_size, 0)) - 1
    if aug_val < 1:
        # Already about as large as the biggest class: nothing to generate.
        continue
    filtered_data = medical_data.loc[medical_data['medical_specialty'] == specialty]
    for text in filtered_data.transcription:
        temps = aug.augment(text, n=aug_val)
        # Some nlpaug versions hand back a bare string for a single result; wrap it so the
        # loop below does not iterate over its characters.
        if isinstance(temps, str):
            temps = [temps]
        augmented_rows.extend(
            {'transcription': sent, 'medical_specialty': specialty} for sent in temps
        )

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call;
# build the new rows once and concatenate a single time instead.
medical_data = pd.concat(
    [medical_data, pd.DataFrame(augmented_rows, columns=['transcription', 'medical_specialty'])],
    ignore_index=True,
)


In [None]:
# Per-specialty transcription counts after augmentation.
medical_data.groupby('medical_specialty')['transcription'].count().reset_index()


In [None]:
medical_data

## Text Preprocessing

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
import regex as re
import string

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [None]:
def text_preprocessing(data):
    """Normalise a raw transcription into lowercase text without markup, punctuation or numbers.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop ``<...>`` tags;
    strip ASCII punctuation; turn newlines into spaces; drop every token containing a digit.

    Args:
        data: Text to clean. Non-string values are converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is not collapsed, so runs of spaces may remain.
    """
    data = str(data).lower()
    # Patterns are raw strings: "\[", "\S", "\w", "\d" are invalid escapes in plain literals.
    data = re.sub(r'\[.*?\]', '', data)
    data = re.sub(r'https?://\S+|www\.\S+', '', data)
    data = re.sub(r'<.*?>+', '', data)
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)
    # A newline becomes a space so the words on either side of it are not glued together.
    data = re.sub(r'\n', ' ', data)
    data = re.sub(r'\w*\d\w*', '', data)
    return data

In [None]:
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in STOPWORDS removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

In [None]:
# Clean every transcription with text_preprocessing.
medical_data["transcription"] = medical_data["transcription"].apply(text_preprocessing)


In [None]:
# Strip stop words from every cleaned transcription.
medical_data["transcription"] = medical_data["transcription"].apply(remove_stopwords)


In [None]:
# Number of words per transcription.
# split() without a separator returns [] for an empty string, so a transcription that became
# empty during cleaning counts 0 words (split(' ') reported 1 for it).
medical_data['tr_word_count']= medical_data['transcription'].apply(lambda x: len(str(x).split()))


In [None]:
# One-hot encode the specialty labels (one column per specialty).
# dtype is set explicitly: the default dummy dtype depends on the pandas version
# (boolean from pandas 2.0 on), and the network expects numeric targets.
Y = pd.get_dummies(medical_data['medical_specialty'], dtype='float32').values
print('Shape of label tensor:', Y.shape)

In [None]:
medical_data.hist(column = 'tr_word_count') #most of the transcription are in range of 400 words

In [None]:
# One token list per transcription (split on single spaces), the input format for Word2Vec.
documents = [text.split(' ') for text in medical_data['transcription']]

In [None]:
documents[10]

## Building the Word2Vec model

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train a Word2Vec model on the tokenised transcriptions.
# NOTE(review): `documents` is built from every row of medical_data, i.e. before the
# train/test split below, so the embeddings also see text that later lands in the test set.
# NOTE(review): no seed is passed and workers=4 — results are presumably not reproducible
# between runs; confirm against the gensim documentation.
model = Word2Vec(documents, #Word list
                   min_count=5, #Ignore all words with total frequency lower than this
                   workers=4, #Number of CPUs
                   vector_size=100,  #Embedding size
                   window=5 #Maximum Distance between current and predicted word
                  # iter=10   #Number of iterations over the text corpus
                  )

In [None]:
#How many words in the model and how many features
model.wv.vectors.shape

In [None]:
all_words = model.wv.index_to_key               #key_to_index
top_words = model.wv.index_to_key[0:5]
top_words

In [None]:
print('Vocabulary size: %d' % len(all_words))

In [None]:
# Look up the embedding vector of a single word.
# NOTE(review): this lookup presumably raises KeyError when 'mmode' is not in the trained
# vocabulary (e.g. it occurs fewer than min_count=5 times) — verify on the actual data.
vector = model.wv['mmode']
vector

In [None]:
# Most similar words
similar = model.wv.most_similar('anesthetic')
similar

## Saving the model

In [None]:
model.save("word2vec_Medical Specialty Classification")

## Train / validation / test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = medical_data['transcription']
y = Y


In [None]:
# Train / validation / test split, stratified on the labels and seeded with random_state=1.
# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify = y)

# ... then take 25% of the remaining 80% as validation, i.e. 60/20/20 overall.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1, stratify = y_train)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_val.shape)
print(y_val.shape)

In [None]:
# building tokenizer to form a token for each word in each transcription

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding

In [None]:
# Convert words to integer indices.

# Vocabulary cap handed to the tokenizer; a smaller vocabulary speeds up processing.
# NOTE(review): this rebinds `top_words`, which an earlier cell used for a list of words.
top_words = 10000 #reducing no of words to increase the processing speed

# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(num_words=top_words)
tokenizer.fit_on_texts(X_train)

In [None]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# pad sequenc to mmake each transcription of the same size
#taking 300  based on avg length of the transcription form histogram
padding_max_len = 300

X_train = pad_sequences(X_train, maxlen=padding_max_len, padding='post')
X_test = pad_sequences(X_test, maxlen=padding_max_len, padding='post')

In [None]:
# Building embedding matrix from above Word2Vec model(pre trained model)

word2vec_model = Word2Vec.load("word2vec_Medical Specialty Classification")  #loading the one created above

In [None]:

word2vec_model.wv.vectors.shape

In [None]:
#embedding length
embedding_vector_length = word2vec_model.wv.vectors.shape[1]
embedding_vector_length

## Building the embedding matrix

In [None]:
# Build the embedding matrix: row i receives the Word2Vec vector of the word whose
# tokenizer index is i. Rows of words unknown to the Word2Vec model stay all-zero.
embedding_matrix = np.zeros((top_words+1, embedding_vector_length))

# key_to_index is a dict (word -> position), so the membership test below is a hash lookup;
# testing against the index_to_key list would scan the whole vocabulary for every word.
w2v_vocab = word2vec_model.wv.key_to_index

# Walk the tokenizer vocabulary in index order and stop once past the vocabulary cap.
for word,i in sorted(tokenizer.word_index.items(), key = lambda x:x[1]):
  if i > top_words:
    break
  if word in w2v_vocab:
    embedding_matrix[i] = word2vec_model.wv[word]

In [None]:
embedding_matrix[tokenizer.word_index['procedure']]

## Create an embedding layer

In [None]:
# Frozen embedding layer initialised from the Word2Vec-based matrix.
# Its output is 3-dimensional: batch_size x padding_max_len x embedding_vector_length.
embedding_layer = Embedding(
    input_dim=top_words+1,
    output_dim=embedding_vector_length,
    weights=[embedding_matrix],
    input_length=padding_max_len,
    trainable=False,
)


## Using LSTM

In [None]:
# Using LSTM

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,BatchNormalization,Embedding,LSTM ,Bidirectional
from tensorflow.keras import regularizers

## Model 1

In [None]:
# Model 1: frozen Word2Vec embeddings -> single LSTM -> softmax over the specialties.
model1 = Sequential()

model1.add(embedding_layer)

#LSTM
model1.add(LSTM(32))

# Output layer: one unit per class, read from the one-hot labels rather than hard-coded,
# so the model stays valid if the class filtering above changes the number of specialties.
model1.add(Dense(units=y_train.shape[1],activation='softmax'))

model1.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None".
model1.summary()


In [None]:
# Train model1 for 30 epochs with batches of 200 samples.
# NOTE(review): validation_data is the test split; the X_val / y_val split created earlier
# is not used here, so the val_* metrics in the history are test-set metrics.
model_history = model1.fit(X_train, y_train, batch_size=200, epochs=30, validation_data=(X_test, y_test))

In [None]:
from matplotlib import pyplot as plt
# Accuracy per epoch for model1: training curve first, then the held-out curve.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

## Model 2

In [None]:
# Model 2: frozen Word2Vec embeddings -> bidirectional LSTM (64 units per direction) -> softmax.
model4 = Sequential()

model4.add(embedding_layer)

model4.add(Bidirectional(LSTM(64)))

# One output unit per class, read from the one-hot labels rather than hard-coded.
model4.add(Dense(units=y_train.shape[1],activation='softmax'))

model4.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None".
model4.summary()


In [None]:
# Train model4 for 10 epochs with batches of 100 samples; overwrites the previous model_history.
# NOTE(review): validation_data is the test split; the X_val / y_val split created earlier
# is not used here, so the val_* metrics in the history are test-set metrics.
model_history = model4.fit(X_train, y_train, batch_size=100, epochs=10, validation_data=(X_test, y_test))

In [None]:
from matplotlib import pyplot as plt
# Accuracy per epoch for model4: training curve first, then the held-out curve.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Loss per epoch for model4: training curve first, then the held-out curve.
for metric in ('loss', 'val_loss'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()


## Model 3

In [None]:
#improving above overfitted model:

# Model 3: same layout as the previous model with a smaller bidirectional LSTM (32 units per direction).
model4_1 = Sequential()

model4_1.add(embedding_layer)

model4_1.add(Bidirectional(LSTM(32)))

# One output unit per class, read from the one-hot labels rather than hard-coded.
model4_1.add(Dense(units=y_train.shape[1],activation='softmax'))

model4_1.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None".
model4_1.summary()


In [None]:
# Train model4_1 for 8 epochs with batches of 100 samples; overwrites the previous model_history.
# NOTE(review): validation_data is the test split; the X_val / y_val split created earlier
# is not used here, so the val_* metrics in the history are test-set metrics.
model_history = model4_1.fit(X_train, y_train, batch_size=100, epochs=8, validation_data=(X_test, y_test))

In [None]:
from matplotlib import pyplot as plt
# Accuracy per epoch for model4_1: training curve first, then the held-out curve.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Loss per epoch for model4_1: training curve first, then the held-out curve.
for metric in ('loss', 'val_loss'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

## Model 4

In [None]:
#improving above overfitted model:

# Model 4: bidirectional LSTM (32 units per direction) with input and recurrent dropout of 0.1.
model4_2 = Sequential()

model4_2.add(embedding_layer)

model4_2.add(Bidirectional(LSTM(32 , dropout=0.1, recurrent_dropout=0.1)))

# One output unit per class, read from the one-hot labels rather than hard-coded.
model4_2.add(Dense(units=y_train.shape[1],activation='softmax'))

model4_2.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None".
model4_2.summary()


In [None]:
# Train model4_2 for 10 epochs with batches of 100 samples; overwrites the previous model_history.
# NOTE(review): validation_data is the test split; the X_val / y_val split created earlier
# is not used here, so the val_* metrics in the history are test-set metrics.
model_history = model4_2.fit(X_train, y_train, batch_size=100, epochs=10, validation_data=(X_test, y_test))

In [None]:
from matplotlib import pyplot as plt
# Accuracy per epoch for model4_2: training curve first, then the held-out curve.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Loss per epoch for model4_2: training curve first, then the held-out curve.
for metric in ('loss', 'val_loss'):
    plt.plot(model_history.history[metric])
plt.legend(['train', 'test'], loc="lower right")
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

