# Imports


In [1]:
import pandas as pd
import numpy as np

# Constants

In [72]:
TRAINING_DATA = r'SpamData\02_Training\train-data.txt'
TEST_DATA = r'SpamData\02_Training\test-data.txt'
WORD_LIST_FILE = 'SpamData/01_Processing/word-by-id.csv'

PROB_TOKEN_SPAM = r'SpamData\03_Testing\prob-spam.txt'
PROB_TOKEN_HAM = r'SpamData\03_Testing\prob-ham.txt'
PROB_TOKEN_ALL = r'SpamData\03_Testing\prob-all.txt'

TEST_FEATURE_MATRIX = r'SpamData\03_Testing\test-features.txt'
TEST_TARGET_MATRIX = r'SpamData\03_Testing\teast-target.txt'

VOCAB_SIZE=2500

# Loading Pre-Processed Data

In [3]:
train_data = np.loadtxt(TRAINING_DATA,delimiter=' ',dtype='int')

In [4]:
test_data = np.loadtxt(TEST_DATA,delimiter=' ',dtype='int')
test_data[:5]

array([[8, 2, 1, 1],
       [8, 3, 1, 4],
       [8, 4, 1, 2],
       [8, 5, 1, 1],
       [8, 6, 1, 2]])

In [5]:
VOCAB = pd.read_csv(WORD_LIST_FILE)

In [6]:
VOCAB.head()

Unnamed: 0,WORD_ID,VOCAB_WORDS
0,0,http
1,1,use
2,2,list
3,3,email
4,4,get


# Creating Full Matrix from the Sparse Matrix

### Creating Empty DataFrame : example

In [7]:
indexes = np.unique(train_data)

In [8]:
cols = ['DOC_ID']+['WORD_ID']+list(range(VOCAB_SIZE))

In [9]:
len(cols)

2502

In [10]:
full_train_matrix = pd.DataFrame(index=indexes,columns=cols)
full_train_matrix.fillna(value=0,inplace=True)

In [11]:
full_train_matrix.head()

Unnamed: 0,DOC_ID,WORD_ID,0,1,2,3,4,5,6,7,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Defining Function to fill the empty Full Matrix

In [21]:
def create_full_matrix(data,VOCAB_SIZE,doc_index=0,word_index=1,class_index=2,freq_index=3):
    
    cols = ['DOC_ID']+['CLASS']+list(range(VOCAB_SIZE))
        
    indexes = np.unique(data)

    full_matrix = pd.DataFrame(index=indexes,columns=cols)
    full_matrix.fillna(value=0,inplace=True)
    
    for word in data:
        DOC_ID=word[doc_index]
        WORD_ID=word[word_index]
        CLASS=word[class_index]
        OCCURENCE=word[freq_index]
        
        full_matrix.at[DOC_ID,'DOC_ID']=DOC_ID
        full_matrix.at[DOC_ID,'CLASS']=CLASS
        full_matrix.at[DOC_ID,WORD_ID]=OCCURENCE
    full_matrix.set_index('DOC_ID',inplace=True)
    return full_matrix
        

In [22]:
%%time
full_train_matrix=create_full_matrix(data=train_data,VOCAB_SIZE=VOCAB_SIZE)

Wall time: 9.47 s


# Training Bayes Classifier

In [30]:
prob_spam = full_train_matrix[full_train_matrix.CLASS==1].shape[0]/full_train_matrix.shape[0]

In [33]:
prob_ham = full_train_matrix[full_train_matrix.CLASS!=1].shape[0]/full_train_matrix.shape[0]

In [48]:
total_word = full_train_matrix.loc[:,full_train_matrix.columns!='CLASS'].sum(axis=1)
total_word_count = total_word.sum()

In [49]:
spam_word = total_word[full_train_matrix.CLASS==1]
spam_word_count= spam_word.sum()

In [50]:
ham_word = total_word[full_train_matrix.CLASS==0]
ham_word_count= ham_word.sum()

## Spam and Ham tokens summed by word ID

In [55]:
full_train_features = full_train_matrix.loc[:,full_train_matrix.columns!='CLASS']

In [59]:
spam_sumby_id = full_train_features[full_train_matrix.CLASS==1].sum(axis=0) + 1

In [60]:
ham_sumby_id = full_train_features[full_train_matrix.CLASS==0].sum(axis=0) + 1

## P(Token|Spam) - Probablity that the Token occurs given that the eamil is spam

In [61]:
prob_token_spam = spam_sumby_id / (spam_word_count+VOCAB_SIZE)

In [63]:
prob_token_spam.sum()

1.0

## P(Token|Ham) - Probablity that the Token occurs given that the eamil is not spam

In [64]:
prob_token_ham = ham_sumby_id / (ham_word_count+VOCAB_SIZE)

In [66]:
prob_token_ham.sum()

0.9999999999999999

## P(Token) - Probablity that the Token occurs

##### Laplace Smoothing not required for all tokens since they are the most 2500 frequent words

In [67]:
total_sumby_id = full_train_features.sum(axis=0)

In [68]:
prob_token = total_sumby_id / (total_word_count)

In [71]:
prob_token.sum()

1.0

# Preparing the Test Data

In [73]:
%%time
full_test_matrix=create_full_matrix(data=test_data,VOCAB_SIZE=VOCAB_SIZE)

Wall time: 5.06 s


In [75]:
X_test = full_test_matrix.loc[:,full_test_matrix.columns!='CLASS']
Y_test = full_test_matrix.loc[:,full_test_matrix.columns=='CLASS']

# Saving All the Data

In [77]:
np.savetxt(PROB_TOKEN_ALL,prob_token)
np.savetxt(PROB_TOKEN_SPAM,prob_token_spam)
np.savetxt(PROB_TOKEN_HAM,prob_token_ham)
np.savetxt(TEST_FEATURE_MATRIX,X_test)
np.savetxt(TEST_TARGET_MATRIX,Y_test)

# Testing the Bayes Classifier