# Section 2. Logistic Regression Spam Classification

> Import Necessary Libraries

In [97]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from tqdm import tqdm
from IPython.display import Markdown as md

> Reading in the Data
>> Randomizing the Data

>> Splitting the data into X and Y vectors

In [98]:
# Read in the data
data = np.genfromtxt('spambase.data', delimiter=',')
dataMat = np.array(data)
# Set RNG with seed = 0
np.random.seed(0)
np.random.shuffle(dataMat)
# Splitting the data into X and Y vectors
X = dataMat[:, :-1]
Y =  np.reshape(dataMat[:, -1], (-1, 1))

> Train-Test Split on the data

In [99]:
# Split the training and testing sets in a 2:1 ratio
trainX, testX, trainY, testY = tts(X, Y, test_size=0.333, random_state=1, shuffle=False)

> Standardizing the Data using the training data
>> Take the mean and the standard deviation

In [100]:
mean = trainX.mean(axis=0)
std = trainX.std(axis=0, ddof=1)
####################################################################
trainX_std = (trainX - mean) / std
bias = np.ones((trainX_std.shape[0], 1))
TRAIN_X = np.append(bias, trainX_std , axis=1)

####################################################################
testX_std = (testX - mean) / std
bias = np.ones((testX_std.shape[0], 1))
TEST_X = np.append(bias, testX_std , axis=1)


> Perform Batch Gradient Descent Using the Sigmoid Function

In [101]:

HYPERPARAMETERS = {
  "eta" : 0.01,
  "term" : 2 ** (-23),
  "EPSILON" : 10**(-7),
  "n_iterations" : 1500,
}
def sigmoid(x, thetas):
      return 1 / (1 + np.exp(-x @ thetas))
def dLdtheta(x, y, g):
      return x.T @ (g - y)
def L(x, y, g):
      return -1 / TRAIN_X.shape[0] * y.T @ np.log(g + HYPERPARAMETERS["EPSILON"]) + (1 - y.T) @ np.log(1-g + HYPERPARAMETERS["EPSILON"]) 

thetas = np.random.uniform(-1, 1, (TRAIN_X.shape[1], 1))
prev_cost = 0
for i in tqdm(range(HYPERPARAMETERS["n_iterations"]), ascii=True, desc="Training Logistic Regression Spam Classification"):
  g = sigmoid(TRAIN_X, thetas)
  cost = L(TRAIN_X, trainY, g)
  gradient = dLdtheta(TRAIN_X, trainY, g)

  # update thetas by batch gradient descent
  thetas -= HYPERPARAMETERS["eta"] * gradient
  if np.abs(prev_cost - cost) < HYPERPARAMETERS["term"]:
    i = HYPERPARAMETERS["n_iterations"]
  prev_cost = cost



res = "$$y ="
for idx, theta in enumerate(thetas):
  # print(f'theta_{idx}: {theta[0]:0.4f}')
  if idx != 0:
    res += f' {theta[0]:=+0.4f}x_' + '{' + str(idx) + '}'
    if idx % 10 == 0:
          res += '\\\\'
  else:
      res += f'{theta[0]:= 0.4f}\\\\'
res += "$$"
  
md(res)

Training Logistic Regression Spam Classification: 100%|##########| 1500/1500 [00:06<00:00, 235.55it/s]


$$y =-8.7216\\ -0.3676x_{1} -0.0427x_{2} -0.2194x_{3} +7.4630x_{4} +0.8732x_{5} +0.7764x_{6} +1.8644x_{7} +0.6760x_{8} +0.4837x_{9} +0.1686x_{10}\\ -0.6482x_{11} -0.4498x_{12} -0.1772x_{13} +0.4653x_{14} +1.1342x_{15} +1.3754x_{16} +1.1540x_{17} +0.2235x_{18} -0.6838x_{19} +0.5617x_{20}\\ +0.3475x_{21} +0.1549x_{22} +1.5531x_{23} +0.4138x_{24} -3.3752x_{25} -1.2123x_{26} -21.7450x_{27} +0.9069x_{28} -1.5840x_{29} +0.0948x_{30}\\ -0.1250x_{31} -3.8789x_{32} -0.4879x_{33} +2.4980x_{34} -2.1108x_{35} +0.8383x_{36} +0.4946x_{37} -0.1948x_{38} -0.2009x_{39} +0.3971x_{40}\\ -8.5656x_{41} -3.1524x_{42} -0.1296x_{43} -1.4432x_{44} -1.2485x_{45} -1.7339x_{46} -0.2191x_{47} -0.6023x_{48} -0.1634x_{49} +0.0426x_{50}\\ +0.1286x_{51} +1.2250x_{52} +2.9533x_{53} +2.0048x_{54} -0.8513x_{55} +3.3819x_{56} +1.1628x_{57}$$

> Classification

In [102]:

spam_threshold = 0.50

yhat = sigmoid(TEST_X , thetas)
predictions = np.where(yhat >= spam_threshold, 1, 0)

TP = FP = TN = FN = 0
for prediction , truth in zip(predictions, testY):
    if prediction == truth:
        if truth == 1:
            TP += 1
        else:
            TN += 1
    else:
        if prediction == 1:
            FP += 1
        else:
            FN += 1
# print(TP, FP , TN , FN)
Precision =  TP / (TP + FP) 
Recall = TP / (TP + FN) 
F_1 = (2 * Precision * Recall) / (Precision + Recall) 
Accuracy = (TP + TN) / yhat.shape[0] 
# md("$$Precision = \\frac{TP}{TP + FP},\\ Recall = \\frac{TP}{TP + FN}, F_1  = \\frac{2 \\times Precision \\times Recall}{Precision + Recall}, Accuracy = \\frac{TP + TN}{Y.size}$$")
md(f"$$Precision = {Precision*100:0.4f}\%," + "\\hspace{5pt}" + f"Recall = {Recall*100:0.4f}\%,"+ "\\hspace{5pt}" + f"F_1  = {F_1*100:0.4f}\%," + "\\hspace{5pt}" + f"Accuracy = {Accuracy*100:0.4f}\%$$")

$$Precision = 92.3414\%,\hspace{5pt}Recall = 73.2639\%,\hspace{5pt}F_1  = 81.7038\%,\hspace{5pt}Accuracy = 87.6712\%$$

# Section 3: Naive Bayes Classifier

In [103]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from scipy.stats import norm
from IPython.display import Markdown as md

In [104]:
# Read in the data
data = np.genfromtxt('spambase.data', delimiter=',')
dataMat = np.array(data)
# Set RNG with seed = 0
np.random.seed(0)
np.random.shuffle(dataMat)
# Splitting the data into X and Y vectors
X = dataMat[:, :-1]
Y =  np.reshape(dataMat[:, -1], (-1, 1))

In [105]:
# Split the training and testing sets in a 2:1 ratio
trainX, testX, trainY, testY = tts(X, Y, test_size=0.33, random_state=1, shuffle=False)

In [106]:
mean = trainX.mean(axis=0)
std = trainX.std(axis=0, ddof=1)
####################################################################
trainX_std = (trainX - mean) / std
bias = np.ones((trainX_std.shape[0], 1))
TRAIN_X = np.append(bias, trainX_std , axis=1)

####################################################################
testX_std = (testX - mean) / std
bias = np.ones((testX_std.shape[0], 1))
TEST_X = np.append(bias, testX_std , axis=1)

In [107]:
spam_mask = np.asarray(np.where(trainY == 1, True, False)).reshape(-1)
non_spam_mask = np.invert(spam_mask)

spam_train = np.compress(spam_mask, trainX, axis=0)
non_spam_train = np.compress(non_spam_mask, trainX, axis=0)

spam_train_mean = np.mean(spam_train, axis=0)
non_spam_train_mean = np.mean(non_spam_train, axis=0)

spam_train_std = np.std(spam_train, axis=0, ddof=1)
non_spam_train_std = np.std(non_spam_train, axis=0, ddof=1)

spam_prior = spam_mask.shape[0] / trainY.shape[0]
non_spam_prior = non_spam_mask.shape[0] / trainY.shape[0]


In [108]:
TP = FP = TN = FN = 0
# adding the epsilon because there will be divide by zero errors
spam_norm = norm.pdf(testX, spam_train_mean, spam_train_std + np.finfo(float).eps) 
non_spam_norm = norm.pdf(testX, non_spam_train_mean, non_spam_train_std + np.finfo(float).eps) 


p_spam = np.nan_to_num(np.prod(spam_norm, axis=1) * spam_prior)
p_non_spam = np.nan_to_num(np.prod(non_spam_norm, axis=1) * non_spam_prior)

predictions = np.asarray(np.where(p_spam >= p_non_spam, 1, 0)).reshape(-1)
for prediction , truth in zip(predictions, testY):
    if prediction == truth:
        if truth == 1:
            TP += 1
        else:
            TN += 1
    else:
        if prediction == 1:
            FP += 1
        else:
            FN += 1

print(TP, FP , TN , FN)
Precision =  TP / (TP + FP) 
Recall = TP / (TP + FN) 
F_1 = (2 * Precision * Recall) / (Precision + Recall) 
Accuracy = (TP + TN) / yhat.shape[0] 
# md("$$Precision = \\frac{TP}{TP + FP},\\ Recall = \\frac{TP}{TP + FN}, F_1  = \\frac{2 \\times Precision \\times Recall}{Precision + Recall}, Accuracy = \\frac{TP + TN}{Y.size}$$")
md(f"$$Precision = {Precision*100:0.4f}\%," + "\\hspace{5pt}" + f"Recall = {Recall*100:0.4f}\%,"+ "\\hspace{5pt}" + f"F_1  = {F_1*100:0.4f}\%," + "\\hspace{5pt}" + f"Accuracy = {Accuracy*100:0.4f}\%$$")

561 337 609 12


$$Precision = 62.4722\%,\hspace{5pt}Recall = 97.9058\%,\hspace{5pt}F_1  = 76.2746\%,\hspace{5pt}Accuracy = 76.3209\%$$