<a href="https://colab.research.google.com/github/simied/mypackage/blob/master/Medical_Invoice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EDSA Medical Invoice Classification

#### In this notebook we will be classifying Medical Invoice NLP problem

In [0]:
#Data Libraries
import numpy as np
import pandas as pd 

#Visuales Libraries
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Scikit libraries 
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer

import nltk

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns=None

In [0]:
#Load data in pandas dataframes
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
sample = pd.read_csv('sample_solution.csv')

In [0]:
#Display The first 5 rows of dataframes
display(sample.head())
display(train.head())
display(test.head())

Unnamed: 0,ID,ItemCategory_New
0,84485,x-rays
1,84486,anti-inflammatory tablets
2,84487,catheterisation
3,84488,gowns
4,84489,gastro-intestinal treatments


Unnamed: 0,ID,ItemCategory,ItemCategory_New
0,0,Inj 1.50 x Metacam 5mg/ml Injection for Dogs a...,anti-inflammatory injection
1,1,Mini metabolic profile,blood test
2,2,Biochemistry profile A,blood test
3,3,Comfortan Injection (CD Sch2) 4.35pm dose,analgesic injections
4,4,Micralax Enema 5ml,non specific medications


Unnamed: 0,ID,ItemCategory
0,84485,Consultation - Repeat (HPC 10% Sales Discount)
1,84486,1 x Lab Lasercyte Haematology
2,84487,5x Metronidazole 200mg Tablets
3,84488,Subsequent Extra Large Radiograph
4,84489,30X SYNOQUIN MEDIUM TASTY TABLETS


In [0]:
#Dimesion dipsplay
train.shape

(84485, 3)

In [0]:
test.shape

(14909, 2)

## Exploratory Data Analysis

In [0]:
# finding missing values
train.isnull().sum()

ID                  0
ItemCategory        1
ItemCategory_New    0
dtype: int64

In [0]:
#fill in missing values using built in method
train=train.fillna(method ='bfill') 

In [0]:
train.isnull().sum()

ID                  0
ItemCategory        0
ItemCategory_New    0
dtype: int64

In [0]:
null = train[train.ItemCategory.isnull()]['ItemCategory']
null

Series([], Name: ItemCategory, dtype: object)

In [0]:
test.isnull().sum()

ID              0
ItemCategory    0
dtype: int64

In [0]:
df = pd.concat((train[['ItemCategory']] ,test[['ItemCategory']]),axis=0)

In [0]:
#df['ItemCategory'] = df['ItemCategory'].apply(lambda x: ' '.join(x.split("|||")))

In [0]:
display(df.head())
display(df.shape)

Unnamed: 0,ItemCategory
0,Inj 1.50 x Metacam 5mg/ml Injection for Dogs a...
1,Mini metabolic profile
2,Biochemistry profile A
3,Comfortan Injection (CD Sch2) 4.35pm dose
4,Micralax Enema 5ml


(99394, 1)

## Remove Noise

In [0]:
df['ItemCategory'] = df['ItemCategory'].str.replace(' mg','mg')
df['ItemCategory'] = df['ItemCategory'].str.replace(' x','x')
df['ItemCategory'] = df['ItemCategory'].str.replace(' ml','ml')
df['ItemCategory'] = df['ItemCategory'].str.replace(' :','')
#df['ItemCategory'] = df['ItemCategory'].str.replace(' percent','percent')

In [0]:
# make lower case
df['ItemCategory'] = df['ItemCategory'].str.lower()

In [0]:
# import string
# def remove_punctuation_numbers(post):
#     punc_numbers = string.punctuation + '1234567890'
#     return ''.join([l for l in post if l not in punc_numbers])
# df['ItemCategory']= df['ItemCategory'].apply(remove_punctuation_numbers)

In [0]:
df.head()

Unnamed: 0,ItemCategory
0,inj 1.50x metacam 5mg/ml injection for dogs an...
1,mini metabolic profile
2,biochemistry profile a
3,comfortan injection (cd sch2) 4.35pm dose
4,micralax enema 5ml


In [0]:
df.describe()

Unnamed: 0,ItemCategory
count,99394
unique,97173
top,consultation
freq,9


In [0]:
df['post_len'] = df['ItemCategory'].apply(len)

## Tokenizing

In [0]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [0]:
# we will use the TreeBankWordTokenizer since it is MUCH quicker than the word_tokenise function
tokeniser = TreebankWordTokenizer()
df['tokens'] = df['ItemCategory'].apply(tokeniser.tokenize)

In [0]:
df.head()

Unnamed: 0,ItemCategory,post_len,tokens
0,inj 1.50x metacam 5mg/ml injection for dogs an...,80,"[inj, 1.50x, metacam, 5mg/ml, injection, for, ..."
1,mini metabolic profile,22,"[mini, metabolic, profile]"
2,biochemistry profile a,22,"[biochemistry, profile, a]"
3,comfortan injection (cd sch2) 4.35pm dose,41,"[comfortan, injection, (, cd, sch2, ), 4.35pm,..."
4,micralax enema 5ml,18,"[micralax, enema, 5ml]"


## Lemmatizing

In [0]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def Item_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words] 

In [0]:
# lemmatize all words in dataframe
df['lemma'] = df['tokens'].apply(Item_lemma, args=(lemmatizer, ))

## StopWords

In [0]:
# Update stopwords with class names and other stop words without punctuation
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
stopset.update(['^'])


In [0]:
def remove_stop_words(tokens):
    return [t for t in tokens if t not in stopset]

df['new_lemma'] = df['lemma'].apply(remove_stop_words)

In [0]:
df.shape

(99394, 5)

In [0]:
df.head()

Unnamed: 0,ItemCategory,post_len,tokens,lemma,new_lemma
0,inj 1.50x metacam 5mg/ml injection for dogs an...,80,"[inj, 1.50x, metacam, 5mg/ml, injection, for, ...","[inj, 1.50x, metacam, 5mg/ml, injection, for, ...","[inj, 1.50x, metacam, 5mg/ml, injection, dog, ..."
1,mini metabolic profile,22,"[mini, metabolic, profile]","[mini, metabolic, profile]","[mini, metabolic, profile]"
2,biochemistry profile a,22,"[biochemistry, profile, a]","[biochemistry, profile, a]","[biochemistry, profile]"
3,comfortan injection (cd sch2) 4.35pm dose,41,"[comfortan, injection, (, cd, sch2, ), 4.35pm,...","[comfortan, injection, (, cd, sch2, ), 4.35pm,...","[comfortan, injection, (, cd, sch2, ), 4.35pm,..."
4,micralax enema 5ml,18,"[micralax, enema, 5ml]","[micralax, enema, 5ml]","[micralax, enema, 5ml]"


In [0]:
df['lemmatizing'] = [' '.join(line) for line in df['new_lemma']]
df.head()

Unnamed: 0,ItemCategory,post_len,tokens,lemma,new_lemma,lemmatizing
0,inj 1.50x metacam 5mg/ml injection for dogs an...,80,"[inj, 1.50x, metacam, 5mg/ml, injection, for, ...","[inj, 1.50x, metacam, 5mg/ml, injection, for, ...","[inj, 1.50x, metacam, 5mg/ml, injection, dog, ...",inj 1.50x metacam 5mg/ml injection dog cat 10m...
1,mini metabolic profile,22,"[mini, metabolic, profile]","[mini, metabolic, profile]","[mini, metabolic, profile]",mini metabolic profile
2,biochemistry profile a,22,"[biochemistry, profile, a]","[biochemistry, profile, a]","[biochemistry, profile]",biochemistry profile
3,comfortan injection (cd sch2) 4.35pm dose,41,"[comfortan, injection, (, cd, sch2, ), 4.35pm,...","[comfortan, injection, (, cd, sch2, ), 4.35pm,...","[comfortan, injection, (, cd, sch2, ), 4.35pm,...",comfortan injection ( cd sch2 ) 4.35pm dose
4,micralax enema 5ml,18,"[micralax, enema, 5ml]","[micralax, enema, 5ml]","[micralax, enema, 5ml]",micralax enema 5ml


## Vectorization

In [0]:
vect = CountVectorizer(lowercase=True,stop_words=stopset,max_features=6500)
X_count = vect.fit_transform(df.lemmatizing)

tfizer = TfidfTransformer()

tfizer.fit(X_count)
X_tfidfV = tfizer.fit_transform(X_count)

In [0]:
# tfidf = TfidfVectorizer(stop_words='english',max_features=7000)
# X_tfidfV = tfidf.fit_transform(df.lemmatizing)

## Trianing and Fitting

In [0]:
y = train.ItemCategory_New
X_tfidfV = X_tfidfV

In [0]:
X = X_tfidfV[:len(train)]
Test_data = X_tfidfV[len(train):]

In [0]:
display(X.shape)
display(y.shape)

(84485, 6500)

(84485,)

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Modelling

In [0]:
# Cross_val_score is the new class for today...
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification

# main ones to focus on for this sprint
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Covered in sprint 3
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Covered in sprint 4
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Neural Network!!
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import SVC 

from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [0]:
#clf = LogisticRegression(C=7.5,n_jobs= -1,penalty='l2')
clf = MLPClassifier(hidden_layer_sizes=(50),max_iter=1, warm_start=True)

In [0]:
# parameters = {'activation': ["relu", "Tanh"],'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ],'hidden_layer_sizes':np.arange(5, 15),'learning_rate': ["constant", "invscaling", "adaptive"], 'random_state':[0,1,2,3,4,5,6,7,8,9],'warm_start':(True,False)}
# clf = GridSearchCV(lm, parameters, n_jobs=-1)

# clf.fit(X_train, y_train)
# print(clf.best_params_)

In [0]:
#parameters = {'C':(0.5,1,2,3,4,5,6,6.5,7.5,10)}
#clf = GridSearchCV(lm, parameters)
clf.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 3), learning_rate='constant',
              learning_rate_init=0.001, max_iter=1, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=True)

In [0]:
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

                                       precision    recall  f1-score   support

      anaesthetic (general) induction       0.00      0.00      0.00       670
                 analgesic injections       0.00      0.00      0.00       444
                    analgesic tablets       0.00      0.00      0.00       233
                    anti emetic drugs       0.00      0.00      0.00       242
                anti-biotic injection       0.00      0.00      0.00       441
                  anti-biotic tablets       0.00      0.00      0.00       607
          anti-inflammatory injection       0.00      0.00      0.00       395
            anti-inflammatory tablets       0.00      0.00      0.00       409
                            bandaging       0.00      0.00      0.00       144
           blood pressure measurement       0.00      0.00      0.00        22
                   blood sampling fee       0.00      0.00      0.00       129
                           blood test       0.00   

In [0]:
print(f1_score(y_test,y_pred,average='micro'))
#0.8211516837308398

0.0854589572113393


In [0]:
sample.head()

Unnamed: 0,ID,ItemCategory_New
0,84485,x-rays
1,84486,anti-inflammatory tablets
2,84487,catheterisation
3,84488,gowns
4,84489,gastro-intestinal treatments


## Submission

In [0]:
prediction = clf.predict(Test_data)

In [0]:
sub = pd.DataFrame()

In [0]:
sub['ID'] = test.ID

In [0]:
sub['ItemCategory_New'] = prediction

In [0]:
sub.head()

Unnamed: 0,ID,ItemCategory_New
0,84485,consultation fee
1,84486,blood test
2,84487,anti-biotic tablets
3,84488,physiotherapy
4,84489,ultrasound


In [0]:
sub.to_csv('submission.csv',index=False)