####  Naïve Bayes Classification for Text

In [1]:
import pandas as pd
import numpy as np


In [2]:
#import the training files to dataframes
words_df = pd.read_csv("terms.txt", header = None)
train_class_df = pd.read_csv("trainClasses.txt", header = None, sep='\t', names=['Classes'])
train_matrix_df = pd.read_csv("trainMatrix.txt", header = None, sep='\t')

In [3]:
# identify the total number of classes in the data
classes = train_class_df['Classes'].unique()

# number of classes
num_o_classes = classes.size

# initalize class dictionary
class_dicts = []

# var for vocab total
vocab_total = 0

class_counts = []
class_percents = []
total_count = 0

# total words per class
word_per_class = []

# identify the vocabulary total
vocab_total = words_df.shape[0]

#create dictionaries for each class identified
i = 0
for i in range(num_o_classes):
    temp = {}
    class_dicts.append(temp)
    i = i + 1
    
# find the total count per class, overall total count, and the percentages
for var in classes:
    temp = 0
    i = 0
    for i in range(train_class_df.shape[0]):
        if train_class_df.iat[i,0] == var:
            temp = temp + 1
        i = i + 1
    class_counts.append(temp)

# total count for all occurances across all classes
for var in classes:
    total_count = total_count + class_counts[var]

# find the percent of occurance for each class
for var in classes:
    class_percents.append(class_counts[var]/total_count)
    
# reshape the dataframe
train_matrix_df = train_matrix_df.T

# add the classes
train_matrix_df['Classes'] = pd.Series(train_class_df['Classes'])

# seperate out the classes
for var in classes:
    
    test_sum = 0 
    
    # temp dataframe for calculating the number for each class
    temp_df = train_matrix_df[(mask:=train_matrix_df['Classes'] == var)].copy()
    
    #drop the class column
    temp_df.drop(temp_df.columns[[-1]], axis=1, inplace=True)
    
    # sum each column which coresponds to a unique word
    sum_columns = temp_df.sum(axis=0)
    
    # sum the total for the class
    for tmp in sum_columns:
        test_sum = test_sum + tmp
    
    # append the total for this class
    word_per_class.append(test_sum)
    
    # calculate the probability for each word per class and store it in a dictionary with the word are the key
    for i in range(len(sum_columns)):
        # 1 is added to each word count and vocab is added to the denominator to help smooth the data
        prob = (sum_columns[i]+1)/(word_per_class[var]+vocab_total)
        class_dicts[var][words_df.iat[i,0]] = prob

In [4]:
word = 'program'
print('Word: ', word)
print('Class 0 prob: ',class_dicts[0]["program"])
print('Class 1 prob: ',class_dicts[1]["program"])

Word:  program
Class 0 prob:  0.0059020084988922385
Class 1 prob:  0.00011006952725138046


# Testing Section 

In [5]:
# read in the testing data
test_class_df = pd.read_csv("testClasses.txt", header = None, sep='\t', names=['Classes'])
test_matrix_df = pd.read_csv("testMatrix.txt", header = None, sep='\t')
test_words_df = pd.read_csv("terms.txt", header = None)

In [6]:
# method returns the word probability 
# x is the target word
def calc_word_prob(word_df, word_dict, matrix_df, x):
    i = 0
    total_prob = 1
    
    for i in range(matrix_df.shape[0]):
        temp = 0
        
        if matrix_df.iat[i,x] > 0:
            
            #log summation to prevetn underflow
            temp = matrix_df.iat[i,x]*(np.log(word_dict[word_df.iat[i,0]]))

            total_prob = total_prob + temp

        i = i + 1

    return total_prob

# predictor of class
def pred_class(class_probs, classes):
    pred_classes = []
    total_class_probs = 0
    
    #sum the total probs from all classes
    for var in class_probs:
        total_class_probs = total_class_probs + var
    
    # iterate through each prob per class and divide by the combined total probs
    for var in class_probs: 
        pred_classes.append(var/total_class_probs)
    
    # find where the location of the predicted class 
    temp = min(pred_classes)
    
    # return the the predicted class
    return pred_classes.index(temp), temp

# the main testing method
def testing(word_df, word_dict, matrix_df, classes):

    # list to hold the predicted classes
    pred_classes = []

    pred_class_temp = 0
    
    pred_prob = []
    
    j = 0
    
    # iterate through the columns (words) and store the predicted class in pred_classes
    for j in range(matrix_df.shape[1]): 
        
        class_temp = []
        
        # iterate through each class
        for var in classes:
            
            # calculate the probability for this word in each class
            class_temp.append(calc_word_prob(word_df, word_dict[var], matrix_df,j))
        
        # call pred_class to predict the class
        pred_class_temp, pred_prob_temp = pred_class(class_temp, classes)

        # append the results to the lists
        pred_classes.append(pred_class_temp)
        pred_prob.append(pred_prob_temp)

    return pred_classes, pred_prob

# evaluation method
def eval_test( class_df, test_results):
    perc_correct = 0
    number_o_tests = len(test_results)

    # iterate and compare the predicted results against the actual class
    for i in range(number_o_tests):
        if test_results[i] == class_df.iat[i,0]:
            perc_correct = perc_correct + 1
    
    return perc_correct/number_o_tests

# method to print out the results from the training section per class
def print_training(words, dicts):
    for word in words:
        print('\nWord: ', word)
        for i in range(len(class_dicts)):
            print('    Class {} prob: {}'.format(i, dicts[i][word]))

# Implement the NB assignment

In [7]:
# call the main testing method
hw_test, pred = testing(test_words_df, class_dicts, test_matrix_df, classes)

In [8]:
# write up item 1
# Evaluate the accuracy
result = eval_test(test_class_df, hw_test)
print("Accuracy: {}%".format(result*100))

Accuracy: 99.5%


In [9]:
# writeup item 2
# Print out the first 20 items that were predicted, their actual class, and probability for the class
for i in range(20):
    print("Document #{}: Predicted Class: {} <---> {} :Actual Class   Prob: {}".format(i+1, hw_test[i], test_class_df.iat[i,0], pred[i]))


Document #1: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.456874384605304
Document #2: Predicted Class: 0 <---> 0 :Actual Class   Prob: 0.4411267320705995
Document #3: Predicted Class: 0 <---> 0 :Actual Class   Prob: 0.4903021787093305
Document #4: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.45395539040586014
Document #5: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.44606888666595845
Document #6: Predicted Class: 0 <---> 0 :Actual Class   Prob: 0.435478753775912
Document #7: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.44901663880273196
Document #8: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.438903148373382
Document #9: Predicted Class: 0 <---> 0 :Actual Class   Prob: 0.47348748842393135
Document #10: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.45149395751438215
Document #11: Predicted Class: 0 <---> 0 :Actual Class   Prob: 0.4547064987711618
Document #12: Predicted Class: 1 <---> 1 :Actual Class   Prob: 0.4413564053047166
Document #13: Predicted

In [10]:
# Write up  item 3 
#trained model results for program, includ, match, game, plai, window, file, subject, and write

words = ['program', 'includ', 'match', 'game', 'plai', 'window', 'file', 'subject', 'write']
print_training(words, class_dicts)


Word:  program
    Class 0 prob: 0.0059020084988922385
    Class 1 prob: 0.00011006952725138046

Word:  includ
    Class 0 prob: 0.00526640758362692
    Class 1 prob: 0.0006971070059254096

Word:  match
    Class 0 prob: 0.00023608033995568952
    Class 1 prob: 0.00038524334537983157

Word:  game
    Class 0 prob: 0.00021792031380525189
    Class 1 prob: 0.012291097209737483

Word:  plai
    Class 0 prob: 0.0001271201830530636
    Class 1 prob: 0.00726458879859111

Word:  window
    Class 0 prob: 0.013057058802164675
    Class 1 prob: 5.503476362569023e-05

Word:  file
    Class 0 prob: 0.0067010496495114955
    Class 1 prob: 5.503476362569023e-05

Word:  subject
    Class 0 prob: 0.00984273417353721
    Class 1 prob: 0.007686521986388068

Word:  write
    Class 0 prob: 0.0037772854392910324
    Class 1 prob: 0.005081543174772064


In [11]:
# Use this section to print the results from teh training section for any particular word(s)
words = ['game'] # change the word as needed
print_training(words, class_dicts)


Word:  game
    Class 0 prob: 0.00021792031380525189
    Class 1 prob: 0.012291097209737483
