 # Medical Text Classification using k-NN

In [18]:
# essential libraries needed for the program
import numpy as np
import scipy as sp
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict,Counter
import matplotlib.pyplot as plt
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ajith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# train_data
with open("train.dat",'r') as fh:
    lines= fh.readlines()


In [24]:
# test_data
with open("test.dat",'r') as fh:
    lines1= fh.readlines()


In [25]:
# breaks down sentences into words
tr_data=[l.split() for l in lines]

In [26]:
te_data=[l.split() for l in lines1]

In [27]:
# appends train data into a separate doc for processing and separates the label and data
Class=[]
docs=[]
for i in range(0,len(tr_data)):
    Class.append(tr_data[i][0])
    docs.append(tr_data[i][1:])

In [28]:

data_frame = pd.DataFrame()
data_frame['text']= docs[:]
data_frame['class']= Class[:]
Y= Counter(data_frame['class'])
print(Y)

Counter({'5': 4805, '1': 3163, '4': 3051, '3': 1925, '2': 1494})


In [29]:
# appends the test data into the doc created for the purpose of analysis
for i in range(0,len(te_data)):
    docs.append(te_data[i])
print(len(docs))

28880


In [30]:
# Pre-Processing steps:
# the function to remove the stopwords in the documents 
def stopWords_remover(docs):
    doc1=[]
    stopWords=set(stopwords.words('english'))
    for doc in docs:
        word1=[]
        for word in doc:
            if word not in stopWords:
                word1.append(word)
        doc1.append(word1)
    return doc1

In [31]:
#the function which removes the punctuations used in the sentences using Regular Expressions(re) library
def punctuation_remover(docs):
    doc1=[]
    for doc in docs:
        word1=[]
        for word in doc:
            word=re.sub(r'[^\w\s]','',word)
            if word != '':
                word1.append(word)
        doc1.append(word1)
    return doc1

In [32]:
#removes the words that are below the minlength set
def filterlen(docs,minlen):
    return[[t for t in d if len(t)>=minlen]for d in docs]


In [33]:
# the function lemmatizes the document using the WordNetLemmatizer() available in the nltk library
def lemmatize(docs):
    lemmatizer=WordNetLemmatizer()
    doc1=[]
    for doc in docs:
        word1=[]
        for word in doc:
            lemma=lemmatizer.lemmatize(word,'v')
            word1.append(lemma)
        doc1.append(word1)
    return doc1

In [34]:
#the function that changes all the words into lowercase and filters out any digits present in the document
def tolowerCase_and_filtering(docs):
    doc1=[]
    for doc in docs:
        word1=[]
        for word in doc:
            word1.append(word.lower())
            for char in word:
                if(not char.isalpha()):
                    word1.remove(word.lower())
                    break
        doc1.append(word1)
    return doc1
    

In [35]:
doc1= stopWords_remover(docs)

doc2= punctuation_remover(doc1)

doc3= filterlen(doc2,4)

doc4= lemmatize(doc3)

doc5= tolowerCase_and_filtering(doc4)         
            

In [36]:
# building up of the sparse matrix
from scipy.sparse import csr_matrix
def build_matrix(docs):
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  
    n = 0  

    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


In [37]:
def csr_info(mat, name="", non_empy=False):
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [39]:
#scales the matrix and normalizes its rows 
def csr_idf(mat, copy=False, **kargs):
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr

    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat


In [40]:
doc6 = build_matrix(doc5)
csr_info(doc6)
doc7 = csr_idf(doc6, copy=True)
doc8 = csr_l2normalize(doc7, copy=True)
print(doc8.shape)



 [nrows 28880, ncols 55021, nnz 1916480]
(28880, 55021)


In [41]:
# splitting up of the train and test data for the purpose of analysis
train=doc8[0:14438]
test=doc8[14438:]
train_Classes=Class[0:14438]
test_Classes=Class[14438:]

In [42]:
# the function that computes the cosine similarity between the test and train matrices
import math
def cosine_similarity(test,train):
    dot_prod=test.dot(train.T)
    sim=list(zip(dot_prod.indices,dot_prod.data))
    return sim
    


In [43]:
#the function that finds the nearest neighbour and classifies them using maximum vote method
def NN(test,sim,trainlabel,k=3):
    #sim=cosine_similarity(test,train)
    if len(sim) ==0:
        if np.random.rand()>0.5:
            return '+'
        else:
            return '-'
    sim.sort(key=lambda test:test[1], reverse= True)
    c=Counter(trainlabel[i[0]] for i in sim[:k] ).most_common(2)
    if len(c)<2 or c[0][1]>c[1][1]:
        return c[0][0]
    c=defaultdict(float)
    for i in sim[:k]:
         c[trainlabel[i[0]]] += i[1]
    sortedVotes=sorted(c.items(),key=lambda x:x[1], reverse= True)[0][0]
    return sortedVotes
    

In [44]:
#finding the prediction and copying them into a file
sim=[]
predictions=[]
file=open("Predictions7.dat","w+")
for i in test:
    sim=cosine_similarity(i,train)
    #print(sim)
    predictions=NN(i,sim,train_Classes,25)
    file.write(str(predictions) +"\n" )
file.close()