# import packages and read data

In [1]:
import scipy.io as scio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = scio.loadmat('spamData.mat')
print(data)

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Mon Aug 16 16:33:53 2021', '__version__': '1.0', '__globals__': [], 'Xtrain': array([[  0.   ,   0.   ,   0.   , ...,   4.5  ,  34.   , 108.   ],
       [  0.   ,   0.   ,   0.71 , ...,   1.974,  34.   , 229.   ],
       [  0.   ,   0.   ,   0.   , ...,   1.   ,   1.   ,   5.   ],
       ...,
       [  0.   ,   0.   ,   0.   , ...,   1.884,   9.   ,  98.   ],
       [  0.   ,   0.   ,   0.7  , ...,   1.333,   4.   ,  16.   ],
       [  0.   ,   0.   ,   0.   , ...,   1.   ,   1.   ,   4.   ]]), 'Xtest': array([[1.500e-01, 0.000e+00, 6.300e-01, ..., 1.111e+00, 7.000e+00,
        3.890e+02],
       [0.000e+00, 0.000e+00, 7.000e-01, ..., 1.820e+00, 1.800e+01,
        3.040e+02],
       [5.800e-01, 0.000e+00, 1.900e-01, ..., 3.015e+00, 2.100e+01,
        1.900e+02],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 1.142e+00, 3.000e+00,
        4.000e+01],
       [0.000e+00, 0.000e+00, 3.100e-01, ..., 1.515e+00, 1

In [3]:
X_train = pd.DataFrame(data['Xtrain'])
X_test = pd.DataFrame(data['Xtest'])
y_train = pd.DataFrame(data['ytrain'])
y_test = pd.DataFrame(data['ytest'])

# Data Processing:
## Use log-transform: transform each feature using log(Xij + 0.1) (assume natural log)

In [4]:
X_train_log = np.log(X_train + 0.1)
X_test_log = np.log(X_test + 0.1)

## We need to use λml to estimate prior like Q1. And use maximum likelihood to estimate the class conditional mean and variance of each feature. Finally, the label of y is predicted by adding the predicted probabilities of y=c and all xi

In [5]:
class Gaussian_NaiveBayes_Classifier():

    '''      
        X_train(pd.DataFrame) : features of train data
        y_train(pd.DataFrame) : label of train data
        X_test(pd.DataFrame) : features of test data
        y_test(pd.DataFrame) : label of test data
    '''
    def __init__(self, X_train, y_train, X_test, y_test):      
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
    
    #calculate the piror probability of y = 1
    def calculate_lamdaML(self):
        return self.y_train.sum()[0] / self.y_train.size
    
    '''
        mean(float) : μ of the Gaussian Distribution
        variance(float) : σ2 of the Gaussian Distribution
        x(float) : input x ,return the p(x)
    '''
    #calculate the p(x) of Gaussian Distribution 
    def GaussianDistribution(self, mean, variance, x):
        return np.exp((-0.5*((x - mean)**2))/variance)/((2*np.pi*variance)**0.5)
    
    '''
        X(pd.DataFrame) : input X,column numbers = 1
    '''
    #calculate the mean of X
    def estimate_mean_ML(self, X):
        return X.sum() / X.size
    
    '''
        X(pd.DataFrame) : input X,column numbers = 1
    '''
    #calculate the variance of X
    def estimate_variance_ML(self, X):
        mean = self.estimate_mean_ML(X)
        return X.apply(lambda x:(x - mean)**2).sum() / X.size
    
    
    #calculate a matrix,len of row equals feature of X; 
    #Column means estimate mean when y=1, estimate variance when y=1,estimate mean when y=0, estimate variance when y=0 
    def calculate_Parameters_ForALLFeaturesAndClass(self):
        result = np.zeros((self.X_train.shape[1],4))
        for i in range(0,self.X_train.shape[1]):
            result[i][0] = self.estimate_mean_ML(self.X_train.loc[self.y_train[0]==1, i])
            result[i][1] = self.estimate_variance_ML(self.X_train.loc[self.y_train[0]==1, i])
            result[i][2] = self.estimate_mean_ML(self.X_train.loc[self.y_train[0]==0, i])
            result[i][3] = self.estimate_variance_ML(self.X_train.loc[self.y_train[0]==0, i])
        return result
    
    '''      
        X_to_predict(pd.DataFrame) : Data to predict
    '''
    #input features ,predict the lable.
    def predict_label(self, X_to_predict):
          
        predict = []
        piror_1 = np.log(self.calculate_lamdaML())
        piror_0 = np.log(1 - self.calculate_lamdaML())
        parameters_jc = self.calculate_Parameters_ForALLFeaturesAndClass()
        
        # loop through each row
        for row_idx in range(0, X_to_predict.shape[0]):
            row = X_to_predict.loc[row_idx,:]
            p1 = piror_1
            p0 = piror_0
            # loop through each feature
            for col_idx in range(0, X_to_predict.shape[1]):
                Xj = row[col_idx]
                p1 += np.log(self.GaussianDistribution(parameters_jc[col_idx][0], parameters_jc[col_idx][1], Xj))
                p0 += np.log(self.GaussianDistribution(parameters_jc[col_idx][2], parameters_jc[col_idx][3], Xj))
            if p1 > p0:
                predict.append(1)
            else:
                predict.append(0)
            
        return predict
        
     
    '''      
        pred(list) : Predict label of data
        true(np.ndarray) : True label of data
    '''
    #calculate the error rate(%) of the predict label and true label
    def calculate_error(self, pred, true):
        return (pred^true).sum() / len(pred) * 100
    
    #return the train error rate and test error rate
    def run(self):
        train_predict = self.predict_label(self.X_train)
        test_predict = self.predict_label(self.X_test)
        # score and return results
        return self.calculate_error(train_predict, self.y_train[0].values), \
               self.calculate_error(test_predict, self.y_test[0].values)

## Q2
## Training and testing error rates for the log-transformed data.

In [6]:
GNC = Gaussian_NaiveBayes_Classifier( X_train_log, y_train, X_test_log, y_test)
GNC.run()
print('train error rate is %f%% , test error rate is %f%%' %(GNC.run()[0], GNC.run()[1]))



train error rate is 16.574225% , test error rate is 16.015625%
