In [1]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import *
from sklearn.naive_bayes import GaussianNB


In [2]:
class NaiveBayes():
    """Naive Bayes classifier over binary "word present / word absent" features.

    After `learn`, the instance exposes:
      * labels        -- list of class labels, ordered as `value_counts()` returns them
      * labels_prob   -- dict mapping label -> smoothed prior probability
      * learned_prob  -- array of shape (n_labels, n_columns, 2) where
                         learned_prob[i, j, 0] = P(column j is zero     | label i)
                         learned_prob[i, j, 1] = P(column j is non-zero | label i)
    """

    def __init__(self):
        # Placeholders; nothing else in this class assigns or reads them.
        self.data_set = None #original dataset
        self.train_data_set = None
        self.test_data_set = None
        self.probabilities = dict()

    def probability(self, occurance, total, total_categories, laplacian = True):
        """Estimate a probability from counts.

        Args:
            occurance: number of times the event was observed (scalar or numpy array).
            total: number of trials.
            total_categories: number of distinct outcomes, used only for smoothing.
            laplacian: when True (default) apply add-one smoothing,
                (1 + occurance) / (total_categories + total); otherwise return
                the raw ratio occurance / total.

        Returns:
            The estimated probability (same shape as `occurance`).
        """
        if laplacian:
            alpha = 1
            return (alpha + occurance) / ((alpha * total_categories) + total)
        return occurance / total

    def learn(self, input, output):
        """Fit priors and per-column conditional probabilities.

        Args:
            input: feature matrix exposing `.toarray()` (one row per sample).
            output: pandas Series of labels aligned row-by-row with `input`.
        """
        self.trainX = input.toarray()
        self.trainY = output.array
        label_counts = output.value_counts()  # computed once so labels and counts stay aligned
        self.labels = list(label_counts.index)
        n_labels = len(self.labels)
        self.labels_prob = {
            label: self.probability(count, len(output), n_labels)
            for label, count in zip(self.labels, label_counts)
        }
        no_of_columns = self.trainX.shape[1]
        self.no_of_columns = no_of_columns
        # learned_prob[i, j, k]: probability that column j is absent (k=0) / present (k=1) given label i
        self.learned_prob = np.zeros((n_labels, no_of_columns, 2))
        train_labels = np.asarray(self.trainY)
        for i, cur_label in enumerate(self.labels):
            print("Learning probablities for", cur_label)
            indices = np.where(train_labels == cur_label)[0] # rows carrying label i
            total = len(indices) # total cases of label i
            # Count presence explicitly. Relying on the position of counts returned by
            # np.unique mislabels a column that holds a single distinct value (e.g. a
            # word present in every document of this label) as "always absent".
            present = np.count_nonzero(self.trainX[indices], axis=0)
            absent = total - present
            self.learned_prob[i, :, 0] = self.probability(absent, total, 2)
            self.learned_prob[i, :, 1] = self.probability(present, total, 2)

    def classify(self, inputs):
        """Predict a label for every row of `inputs`.

        Args:
            inputs: feature matrix exposing `.toarray()` with the same number of
                columns that was seen by `learn`.

        Returns:
            A list with one predicted label per input row. Ties are resolved in
            favour of the label that comes first in `self.labels`.
        """
        input_mat = inputs.toarray()
        # Binarise cell by cell: any non-zero count means "word present".
        presence = (input_mat > 0).astype(np.float64)
        absence = 1.0 - presence
        # Work in log space: a product of thousands of probabilities underflows to 0.0,
        # which would make every label tie. Smoothing in `learn` keeps all entries > 0.
        log_prob = np.log(self.learned_prob)
        # log_scores[i, j] = log P(label i) + sum over words of log P(word state | label i)
        log_scores = np.zeros((len(self.labels), input_mat.shape[0]))
        for i, label in enumerate(self.labels):
            print("computing probabilites for", label)
            log_scores[i] = (
                np.log(self.labels_prob[label])
                + presence @ log_prob[i, :, 1]
                + absence @ log_prob[i, :, 0]
            )
        print('prediction done')
        best = np.argmax(log_scores, axis=0) # first maximum wins, one index per row
        predicted_vals = [self.labels[k] for k in best]

        return predicted_vals

In [3]:
def load_file(fileName):
    """Read a comma-separated file into a DataFrame, skipping malformed rows.

    Args:
        fileName: path (or any source accepted by `pd.read_csv`) of the CSV file.
            The first line is used as the header and the content is decoded
            with the "unicode_escape" codec.

    Returns:
        pandas.DataFrame with the parsed rows.
    """
    read_options = dict(header=0, sep=",", encoding="unicode_escape")
    try:
        # `on_bad_lines` replaced `error_bad_lines` (deprecated in pandas 1.3, removed in 2.0).
        dataset = pd.read_csv(fileName, on_bad_lines="skip", **read_options)
    except TypeError:
        # Older pandas that does not know `on_bad_lines` yet.
        dataset = pd.read_csv(fileName, error_bad_lines=False, **read_options)
    return dataset

In [4]:
def preprocess(data):
    """Turn raw documents into a binary bag-of-words matrix.

    A new CountVectorizer (English stop words removed, binary indicators
    instead of counts) is fitted on `data` and immediately applied to it.
    The fitted vectorizer itself is not kept.

    Args:
        data: iterable of text documents.

    Returns:
        The document-term matrix produced by `fit_transform`.
    """
    vectorizer = CountVectorizer(stop_words='english', binary=True)
    return vectorizer.fit_transform(data)

In [5]:
def learn_model(data,target):
    """Create a NaiveBayes classifier and fit it on the given training set.

    Args:
        data: training feature matrix, passed to `NaiveBayes.learn`.
        target: training labels, passed to `NaiveBayes.learn`.

    Returns:
        The fitted NaiveBayes instance.
    """
    fitted_model = NaiveBayes()
    fitted_model.learn(data, target)
    return fitted_model

In [6]:
def classify(classifier, testdata):
    """Return the predictions of `classifier.classify` for `testdata`.

    Args:
        classifier: any object exposing a `classify(testdata)` method.
        testdata: the samples to label, forwarded unchanged.

    Returns:
        Whatever `classifier.classify` returns.
    """
    return classifier.classify(testdata)

In [7]:
def confusion_matrix(actual, predicted):
    """Build a confusion matrix of actual versus predicted labels.

    Rows and columns follow the label order of `actual.value_counts()`;
    entry [r, c] counts samples whose actual label is r and whose predicted
    label is c.

    Args:
        actual: pandas Series holding the true labels.
        predicted: sequence of predicted labels, indexable by position and
            aligned with `actual`.

    Returns:
        Square numpy float array with one row/column per label found in `actual`.
    """
    ordered_labels = list(actual.value_counts().index)
    size = len(ordered_labels)
    tally = np.zeros((size, size))
    for position, true_label in enumerate(actual.array):
        guess = predicted[position]
        row = ordered_labels.index(true_label)
        col = ordered_labels.index(guess)
        tally[row, col] += 1
    return tally
    

In [8]:
def precision(TP, FP, TN, FN):
    """Return TP / (TP + FP), or 0 when nothing was predicted positive.

    TN and FN are accepted for a uniform signature but are not used.
    """
    predicted_positive = TP + FP
    return TP / predicted_positive if predicted_positive != 0 else 0

In [9]:
def recall(TP, FP, TN, FN):
    """Return TP / (TP + FN), or 0 when there are no actual positives.

    FP and TN are accepted for a uniform signature but are not used.
    """
    actual_positive = TP + FN
    return TP / actual_positive if actual_positive != 0 else 0

In [10]:
def f1score(p, r):
    """Return the harmonic mean of precision `p` and recall `r`.

    Yields 0 when both inputs sum to zero, avoiding a division by zero.
    """
    denominator = r + p
    if denominator != 0:
        numerator = 2*(r * p)
        return numerator / denominator
    return 0

In [11]:
def evaluate(actual_class, predicted_class):
    """Print the confusion matrix and weighted precision / recall / F1 / accuracy.

    Per-label precision, recall and F1 are weighted by each label's share of
    `actual_class` and summed. Accuracy is the confusion-matrix diagonal
    divided by the number of predictions.

    Args:
        actual_class: pandas Series with the true labels.
        predicted_class: sequence of predicted labels aligned with `actual_class`.

    Returns:
        None. All results are printed.
    """
    mat = confusion_matrix(actual_class, predicted_class)
    precisions = []
    recalls = []
    f1 = []
    accurate = 0
    counts = actual_class.value_counts() #counts of labels in the test set.
    total_count = np.sum(counts)

    # str() so that non-string labels (e.g. integers) can be printed too
    print(' '.join(str(label) for label in counts.index))
    print(mat)

    for i in range(mat.shape[0]):
        # Positional access: `counts` is indexed by label, so `counts[i]` would be a
        # label lookup (wrong for integer labels, rejected by recent pandas otherwise).
        weight = counts.iloc[i] / total_count #frequency of label
        TP = mat[i,i]
        FP = np.sum(mat[:,i]) - TP
        FN = np.sum(mat[i,:]) - TP
        TN = np.sum(mat) - TP - FP - FN
        label_precision = precision(TP, FP, TN, FN)
        label_recall = recall(TP, FP, TN, FN)
        precisions.append(label_precision * weight) #storing weighted precision
        recalls.append(label_recall * weight)
        # F1 is computed from the unweighted precision/recall, then weighted.
        f1.append(f1score(label_precision, label_recall) * weight)
        accurate += TP

    n_predictions = len(predicted_class)
    accuracy = accurate / n_predictions if n_predictions else 0 # no predictions -> report 0

    print("The precision score is :", sum(precisions))
    print("The recall score is :", sum(recalls))
    print("The F1 score is :", sum(f1))
    print("The accuracy score is :", accuracy)
    

    
        
        
        

In [12]:
# Column names read from the CSV: features[0] is the text input and
# features[1] the target label; features[2] is not used in this cell.
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
# noinspection PyInterpreter
dataset = load_file("TextClassification_Data.csv")
# Missing text entries are replaced by a single space before vectorizing
# (presumably so the vectorizer only receives strings -- TODO confirm).
data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
# NOTE(review): the vectorizer is fitted on the full data set, before the
# train/test split below, so the vocabulary also covers the test documents.
word_vectors = preprocess(data)

# 60/40 train/test split with a fixed seed so the split is reproducible.
trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)

Loading data.....
preprocessing data.....


In [None]:
print("Learning model.....")

# Fit the NaiveBayes classifier on the training split.
model = learn_model(trainingX,trainingY)

Learning model.....
Learning probablities for PRESCRIPTION
Learning probablities for APPOINTMENTS
Learning probablities for MISCELLANEOUS
Learning probablities for ASK_A_DOCTOR
Learning probablities for LAB


In [None]:
print("Classifying test data......")      
# Predict one label per row of the held-out test split.
predictedY = classify(model, testX)

In [None]:
print("Evaluating results.....")
# Print the confusion matrix and the weighted metrics for the test split.
evaluate(testY,predictedY)