In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy, textblob, string, xgboost
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [2]:
trainDF = pd.read_csv('https://query.data.world/s/2ok63wr6t3j57kkj5atfzsa7ylgej4')
trainDF.head()

Unnamed: 0,Reason,Classification
0,A 23-year-old white female presents with comp...,Allergy / Immunology
1,Consult for laparoscopic gastric bypass. Lapa...,Bariatrics
2,Consult for laparoscopic gastric bypass. Lapa...,Bariatrics
3,2-D M-Mode. Doppler. 2-D Echocardiogram - 1...,Cardiovascular / Pulmonary
4,2-D Echocardiogram 2-D Echocardiogram - 2 1. ...,Cardiovascular / Pulmonary


In [3]:
trainDF['Reason'] = trainDF['Reason'].str.replace('\d+','',regex=True)
trainDF['Reason'] = trainDF['Reason'].str.replace('-', '',regex=True)
trainDF['Reason'] = trainDF['Reason'].str.replace('/','',regex=True)
trainDF['Reason'] = trainDF['Reason'].str.replace('"','',regex=True)
trainDF['Reason'] = trainDF['Reason'].str.replace('[^\w\s]','',regex=True)
trainDF['Reason'] = trainDF['Reason'].str.replace('[^a-zA-z\s]','',regex=True)
trainDF['Reason'] = trainDF['Reason'].replace('>','',regex=True)
trainDF['Reason'] = trainDF['Reason'].replace('<','',regex=True)
trainDF.head()

Unnamed: 0,Reason,Classification
0,A yearold white female presents with complain...,Allergy / Immunology
1,Consult for laparoscopic gastric bypass Lapar...,Bariatrics
2,Consult for laparoscopic gastric bypass Lapar...,Bariatrics
3,D MMode Doppler D Echocardiogram D MMODE ...,Cardiovascular / Pulmonary
4,D Echocardiogram D Echocardiogram The lef...,Cardiovascular / Pulmonary


In [4]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['Reason'], trainDF['Classification'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [5]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['Reason'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [6]:
# word level tf-idf
#tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=5000)
tfidf_vect.fit(trainDF['Reason'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
#tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['Reason'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
#tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['Reason'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [7]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec',encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['Reason'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [8]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [9]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.003076923076923077


<strong>Original Results On Client Dataset</strong>
<p>SVM, N-Gram Vectors:  0.24623210248681235</p>

In [10]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy) 

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy) 

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.0020512820512820513
Xgb, WordLevel TF-IDF:  0.0020512820512820513
Xgb, CharLevel Vectors:  0.0041025641025641026


<strong>Original Results On Client Dataset</strong>
<p>
Xgb, Count Vectors:  0.9233232856066315<br>
Xgb, WordLevel TF-IDF:  0.9312358703843255<br>
Xgb, CharLevel Vectors:  0.9355689525244913<br>
</p>