# Imports


In [63]:
import pandas as pd
import numpy as np

# Constants

In [64]:
TRAINING_DATA = r'SpamData\02_Training\train-data.txt'
TEST_DATA = r'SpamData\02_Training\test-data.txt'
WORD_LIST_FILE = 'SpamData/01_Processing/word-by-id.csv'

PROB_TOKEN_SPAM = r'SpamData\03_Testing\prob-spam.txt'
PROB_TOKEN_HAM = r'SpamData\03_Testing\prob-ham.txt'
PROB_TOKEN_ALL = r'SpamData\03_Testing\prob-all.txt'

TEST_FEATURE_MATRIX = r'SpamData\03_Testing\test-features.txt'
TEST_TARGET_MATRIX = r'SpamData\03_Testing\teast-target.txt'

VOCAB_SIZE=2500

# Loading Pre-Processed Data

In [65]:
train_data = np.loadtxt(TRAINING_DATA,delimiter=' ',dtype='int')
train_data[:5]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  2],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

In [66]:
test_data = np.loadtxt(TEST_DATA,delimiter=' ',dtype='int')
test_data[:5]

array([[8, 2, 1, 1],
       [8, 3, 1, 4],
       [8, 4, 1, 2],
       [8, 5, 1, 1],
       [8, 6, 1, 2]])

In [67]:
VOCAB = pd.read_csv(WORD_LIST_FILE)

In [68]:
VOCAB.head()

Unnamed: 0,WORD_ID,VOCAB_WORDS
0,0,http
1,1,use
2,2,list
3,3,email
4,4,get


# Creating Full Matrix from the Sparse Matrix

### Creating Empty DataFrame : example

In [69]:
indexes = np.unique(train_data)

In [70]:
cols = ['DOC_ID']+['WORD_ID']+list(range(VOCAB_SIZE))

In [71]:
len(cols)

2502

In [72]:
# indexes = np.unique(train_data)
# cols = ['DOC_ID']+['WORD_ID']+list(range(VOCAB_SIZE))
# full_train_matrix = pd.DataFrame(index=indexes,columns=cols)
# full_train_matrix.fillna(value=0,inplace=True)
# full_train_matrix.head()

### Defining Function to fill the empty Full Matrix

In [73]:
def create_full_matrix(data,VOCAB_SIZE,doc_index=0,word_index=1,class_index=2,freq_index=3):
    
    cols = ['DOC_ID']+['CLASS']+list(range(VOCAB_SIZE))
        
    indexes = np.unique(data[:,0])

    full_matrix = pd.DataFrame(index=indexes,columns=cols)
    full_matrix.fillna(value=0,inplace=True)
    
    for word in data:
        DOC_ID=word[doc_index]
        WORD_ID=word[word_index]
        CLASS=word[class_index]
        OCCURENCE=word[freq_index]
        
        full_matrix.at[DOC_ID,'DOC_ID']=DOC_ID
        full_matrix.at[DOC_ID,'CLASS']=CLASS
        full_matrix.at[DOC_ID,WORD_ID]=OCCURENCE
    full_matrix.set_index('DOC_ID',inplace=True)
    return full_matrix
        

In [74]:
%%time
full_train_matrix=create_full_matrix(data=train_data,VOCAB_SIZE=VOCAB_SIZE)

Wall time: 8.68 s


# Training Bayes Classifier

In [75]:
prob_spam = full_train_matrix[full_train_matrix.CLASS==1].shape[0]/full_train_matrix.shape[0]

In [76]:
prob_ham = full_train_matrix[full_train_matrix.CLASS!=1].shape[0]/full_train_matrix.shape[0]

In [77]:
total_word = full_train_matrix.loc[:,full_train_matrix.columns!='CLASS'].sum(axis=1)
total_word_count = total_word.sum()

In [78]:
spam_word = total_word[full_train_matrix.CLASS==1]
spam_word_count= spam_word.sum()

In [79]:
ham_word = total_word[full_train_matrix.CLASS==0]
ham_word_count= ham_word.sum()

## Spam and Ham tokens summed by word ID

In [80]:
full_train_features = full_train_matrix.loc[:,full_train_matrix.columns!='CLASS']

In [81]:
spam_sumby_id = full_train_features[full_train_matrix.CLASS==1].sum(axis=0) + 1

In [82]:
ham_sumby_id = full_train_features[full_train_matrix.CLASS==0].sum(axis=0) + 1

## P(Token|Spam) - Probablity that the Token occurs given that the eamil is spam

In [83]:
prob_token_spam = spam_sumby_id / (spam_word_count+VOCAB_SIZE)

In [84]:
prob_token_spam.sum()

1.0

## P(Token|Ham) - Probablity that the Token occurs given that the eamil is not spam

In [85]:
prob_token_ham = ham_sumby_id / (ham_word_count+VOCAB_SIZE)

In [86]:
prob_token_ham.sum()

0.9999999999999999

## P(Token) - Probablity that the Token occurs

##### Laplace Smoothing not required for all tokens since they are the most 2500 frequent words

In [87]:
total_sumby_id = full_train_features.sum(axis=0)

In [88]:
prob_token = total_sumby_id / (total_word_count)

In [89]:
prob_token.sum()

1.0000000000000002

# Preparing the Test Data

In [90]:
%%time
full_test_matrix=create_full_matrix(data=test_data,VOCAB_SIZE=VOCAB_SIZE)

Wall time: 4.14 s


In [91]:
X_test = full_test_matrix.loc[:,full_test_matrix.columns!='CLASS']
Y_test = full_test_matrix.loc[:,full_test_matrix.columns=='CLASS']

# Saving All the Data

In [92]:
np.savetxt(PROB_TOKEN_ALL,prob_token)
np.savetxt(PROB_TOKEN_SPAM,prob_token_spam)
np.savetxt(PROB_TOKEN_HAM,prob_token_ham)
np.savetxt(TEST_FEATURE_MATRIX,X_test)
np.savetxt(TEST_TARGET_MATRIX,Y_test)

# Testing the Bayes Classifier

In [93]:
X_test = np.array(X_test)
Y_test = np.array(Y_test)
Y_test = np.array(Y_test[:,0])

In [94]:
# email_spam_prob = (X_test[0].dot(prob_token_spam) * prob_spam) / (X_test[0].dot(prob_token))
# email_ham_prob = (X_test[0].dot(prob_token_ham) * prob_ham) / (X_test[0].dot(prob_token))

In [95]:
# target=[]
# for i in range(X_test.shape[0]):
#     email_spam_prob = (X_test[i].dot(prob_token_spam) * prob_spam) / (X_test[i].dot(prob_token))
#     email_ham_prob = (X_test[i].dot(prob_token_ham) * prob_ham) / (X_test[i].dot(prob_token))
#     if email_ham_prob>email_spam_prob:
#         target.append(0)
#     else:
#         target.append(1)

In [96]:
# result = {'Y_test':Y_test.flatten(),'Predicted':target}
# result_df=pd.DataFrame(result)

In [97]:
# result_df.to_csv('abc.csv')

In [98]:
joint_prob_spam_log = X_test.dot(np.log(prob_token_spam)-np.log(prob_token))+np.log(prob_spam)

In [99]:
joint_prob_ham_log = X_test.dot(np.log(prob_token_ham)-np.log(prob_token))+np.log(prob_ham)

In [100]:
joint_prob_spam_log[:5]

array([22.85949456,  2.06515163, 17.57530083, 18.05639752, 19.70818369])

In [101]:
joint_prob_ham_log[:5]

array([-59.28971636, -10.83968691, -33.3506051 , -58.22637041,
       -53.15302715])

In [102]:
predictions=joint_prob_spam_log>joint_prob_ham_log
print(predictions)

[ True  True  True ... False False False]


In [103]:
accuracy=(Y_test==predictions).sum()/Y_test.shape[0]

In [104]:
print(accuracy)

0.9779582366589327
