In [1]:
!pip install numpy
!pip install pandas
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


# Dataset

1. Load the dataset with `ucimlrepo` (pandas, NumPy) and split it into training and test sets

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset id=94 (requires network access).
dataset = fetch_ucirepo(id=94)

x = dataset.data.features
y = dataset.data.targets

# Fixed random_state keeps the 80/20 split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Class priors estimated from the training targets (1 = spam, 0 = not spam).
# The targets are flattened first: masking a DataFrame with `y_train == 1`
# keeps every row (non-matching cells become NaN), which made both priors 1.0.
train_labels = y_train.to_numpy().ravel()
probSpam = float((train_labels == 1).mean())
probNotSpam = float((train_labels == 0).mean())

# Naive Bayes Implementation

In [3]:
def naiveBayes(x_train, x_test, y_train, y_test):
    """Fill the module-level `likelihoods` dict with per-feature likelihoods.

    For every column of `x_train`, `calculate_likelihood` is called and its two
    results are stored under the keys "spam" and "notSpam". `x_test` and
    `y_test` are accepted but not used. Returns None.
    """
    for feature in x_train.columns:
        given_spam, given_not_spam = calculate_likelihood(feature, x_train, y_train)
        likelihoods[feature] = dict(spam=given_spam, notSpam=given_not_spam)

def calculate_likelihood(label, x_train, y_train):
    """Estimate how likely a feature is to be present in each class.

    A feature counts as "present" in a sample when its value is > 0.

    Args:
        label: Column name of the feature in `x_train`.
        x_train: Training features, indexable by `label` (e.g. a DataFrame).
        y_train: Training targets aligned row-by-row with `x_train`; may be a
            one-column DataFrame, a Series, or an array. 1 marks spam and
            0 marks not-spam.

    Returns:
        Tuple `(spamProb, notSpamProb)`: the additively smoothed estimates of
        P(feature present | spam) and P(feature present | not spam).
    """
    smoothing = 1

    # Flatten the targets so every input type behaves the same. Boolean-masking
    # a one-column DataFrame (`y_train[y_train == 1]`) keeps all rows, which
    # made both class totals equal to the full training-set size.
    classes = np.asarray(y_train).ravel()
    featurePresent = np.asarray(x_train[label]) > 0

    isSpam = classes == 1
    isNotSpam = classes == 0

    spamTotal = int(isSpam.sum())
    notSpamTotal = int(isNotSpam.sum())
    labelSpamCount = int((featurePresent & isSpam).sum())
    labelSpamNotCount = int((featurePresent & isNotSpam).sum())

    # Additive (Laplace) smoothing on the raw counts. "Present / absent" is a
    # two-outcome event, hence `2 * smoothing` in the denominator, and each
    # class is normalised by its own total.
    spamProb = (labelSpamCount + smoothing) / (spamTotal + 2 * smoothing)
    notSpamProb = (labelSpamNotCount + smoothing) / (notSpamTotal + 2 * smoothing)

    return spamProb, notSpamProb

def naiveBayesClassifier(x_test, likelihoods, spamProb, notSpamProb):
    """Classify each row of `x_test` as spam (1) or not spam (0).

    Each row's score starts from the log prior of a class and adds the log
    likelihood of every feature whose value in that row is > 0. The row is
    labelled 1 when the spam score is strictly greater, otherwise 0.

    Args:
        x_test: DataFrame of features, one row per sample.
        likelihoods: Mapping `column -> {"spam": p, "notSpam": q}`; must
            contain an entry for every column of `x_test` (KeyError otherwise).
        spamProb: Prior probability of the spam class.
        notSpamProb: Prior probability of the not-spam class.

    Returns:
        List of ints (1 or 0), one per row of `x_test`, in row order.
    """
    columns = list(x_test.columns)
    spamLogLik = np.log([likelihoods[col]["spam"] for col in columns])
    notSpamLogLik = np.log([likelihoods[col]["notSpam"] for col in columns])
    spamLogPrior = np.log(spamProb)
    notSpamLogPrior = np.log(notSpamProb)

    present = x_test.to_numpy() > 0

    result = []
    for rowMask in present:
        # Scores are rebuilt from the priors for every row; carrying the
        # running totals across rows made each prediction depend on all the
        # rows before it. Boolean selection (rather than a dot product) keeps
        # absent features with zero likelihood from producing NaN.
        spamScore = spamLogPrior + spamLogLik[rowMask].sum()
        notSpamScore = notSpamLogPrior + notSpamLogLik[rowMask].sum()
        result.append(1 if spamScore > notSpamScore else 0)

    return result

# K Nearest Neighbors Implementation

In [50]:
def euclidean_distance(x_test, x_train):
    """Compute Euclidean distances between every test row and every training row.

    Args:
        x_test: 2-D numeric table (DataFrame or array), one row per test sample.
        x_train: 2-D numeric table with the same number of columns.

    Returns:
        NumPy array of shape (n_test, n_train); entry [i][j] is the distance
        between test row i and training row j, using all feature columns.
    """
    test_points = np.asarray(x_test, dtype=float)
    train_points = np.asarray(x_train, dtype=float)

    all_distances = np.empty((test_points.shape[0], train_points.shape[0]))
    # One test row at a time keeps the temporary array at (n_train, n_features)
    # instead of materialising an (n_test, n_train, n_features) cube.
    for row, point in enumerate(test_points):
        all_distances[row] = np.sqrt(((train_points - point) ** 2).sum(axis=1))
    return all_distances


def knn_predict(x_train, y_train, x_test, k=3):
    """Predict a label for every test row by majority vote of its k nearest neighbours.

    Args:
        x_train: 2-D numeric training features.
        y_train: Training labels aligned with `x_train` rows (one-column
            DataFrame, Series, or array).
        x_test: 2-D numeric test features with the same columns as `x_train`.
        k: Number of neighbours to consult; if it exceeds the number of
            training rows, all training rows vote.

    Returns:
        List with one predicted label per test row. Vote ties go to the
        smallest label value.

    Raises:
        ValueError: If `k` is smaller than 1, if there is no training data, or
            if `x_train` and `y_train` have different numbers of rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    labels = np.asarray(y_train).ravel()
    if labels.size == 0:
        raise ValueError("knn_predict needs at least one training sample")

    distances = euclidean_distance(x_test, x_train)
    if distances.shape[1] != labels.shape[0]:
        raise ValueError("x_train and y_train must have the same number of rows")

    y_pred = []
    for row in distances:
        # Stable sort => equally distant neighbours are taken in training order.
        nearest = np.argsort(row, kind="stable")[:k]
        values, counts = np.unique(labels[nearest], return_counts=True)
        y_pred.append(values[counts.argmax()].item())
    return y_pred

# Logistic Regression Implementation

In [53]:
def sigmoid(z):
    """Logistic sigmoid, 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity sigmoid(z) = exp(-log(1 + exp(-z))) with
    `np.logaddexp(0, -z)` for the inner logarithm, so large negative inputs
    return 0.0 instead of triggering an overflow warning in `exp`.

    Args:
        z: Scalar or NumPy array of logits.

    Returns:
        Value(s) in [0, 1] with the same shape as `z`.
    """
    return np.exp(-np.logaddexp(0, -z))

def cost_function(x, y, theta):
    """Mean binary cross-entropy of a logistic model.

    Mathematically equal to
    (-1 / m) * sum(y * log(h) + (1 - y) * log(1 - h)) with h = sigmoid(x @ theta),
    but evaluated from the logits as mean(log(1 + exp(z)) - y * z). That form
    stays finite when the sigmoid saturates, where log(h) or log(1 - h) would
    be log(0).

    Args:
        x: Feature matrix of shape (m, n).
        y: Targets in {0, 1}; array, Series, or one-column DataFrame with m rows.
        theta: Parameter vector of length n.

    Returns:
        The average cost as a NumPy float.

    Raises:
        ValueError: If `y` is empty.
    """
    targets = np.asarray(y, dtype=float).ravel()
    if targets.size == 0:
        raise ValueError("cost_function needs at least one sample")

    z = np.asarray(np.dot(x, theta), dtype=float).ravel()
    return np.mean(np.logaddexp(0, z) - targets * z)

def gradient_descent(x, y, theta, learning_rate, num_iterations):
    """Run batch gradient descent on the logistic-regression cost.

    Args:
        x: Feature matrix of shape (m, n).
        y: Targets; array, Series, or one-column DataFrame with m rows.
        theta: Initial parameter vector of length n. It is copied, so the
            caller's array is left untouched.
        learning_rate: Step size applied to the gradient.
        num_iterations: Number of update steps to perform.

    Returns:
        Tuple `(theta, costs)`: the updated parameters and a list holding the
        cost after each iteration.
    """
    features = np.asarray(x, dtype=float)
    # Flatten the targets: subtracting an (m, 1) DataFrame from the 1-D
    # prediction vector raises instead of producing per-sample residuals.
    targets = np.asarray(y, dtype=float).ravel()
    # Float copy so integer-typed or shared input arrays are never modified.
    theta = np.array(theta, dtype=float)

    m = len(targets)
    costs = []
    for _ in range(num_iterations):
        h = sigmoid(np.dot(features, theta))
        gradient = np.dot(features.T, h - targets) / m
        theta = theta - learning_rate * gradient
        costs.append(cost_function(features, targets, theta))
    return theta, costs

def logistic_regression_train(x_train, y_train, learning_rate=0.01, num_iterations=1000):
    """Fit logistic-regression parameters with gradient descent.

    A column of ones is prepended to the features so the first parameter acts
    as the intercept; parameters start at zero.

    Args:
        x_train: 2-D numeric training features (DataFrame or array).
        y_train: Training targets; array, Series, or one-column DataFrame.
        learning_rate: Step size passed to `gradient_descent`.
        num_iterations: Number of gradient-descent steps.

    Returns:
        Parameter vector of length `n_features + 1` (intercept first).
    """
    features = np.asarray(x_train, dtype=float)
    # Hand gradient descent a flat float vector; a one-column DataFrame target
    # cannot be subtracted from the 1-D prediction vector.
    targets = np.asarray(y_train, dtype=float).ravel()

    intercept = np.ones((features.shape[0], 1))
    design = np.concatenate((intercept, features), axis=1)
    theta = np.zeros(design.shape[1])
    theta, _ = gradient_descent(design, targets, theta, learning_rate, num_iterations)
    return theta

def logistic_regression_predict(x_test, theta):
    """Return the sigmoid of the linear model for every row of `x_test`.

    A leading column of ones is added to `x_test` before it is multiplied by
    `theta`, so `theta` is expected to carry the intercept as its first entry.
    The output is the raw sigmoid value per row, not a thresholded class label.
    """
    n_rows = x_test.shape[0]
    design = np.concatenate((np.ones((n_rows, 1)), x_test), axis=1)
    logits = np.dot(design, theta)
    return sigmoid(logits)

# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
def assessment(results, y_train):
    """Count false positives and true positives among the predictions.

    A prediction is "positive" when it equals 1.

    Args:
        results: Sequence of predicted labels, one per sample.
        y_train: Actual labels for the same samples (despite the name, any
            label set may be passed). Either a DataFrame with a "Class"
            column, or a Series / array of labels.

    Returns:
        Tuple `(falsePos, truePos)`: the number of positive predictions whose
        actual label is not 1, and the number whose actual label is 1.

    Raises:
        ValueError: If `results` and `y_train` differ in length.
    """
    if hasattr(y_train, "columns") and "Class" in y_train.columns:
        actual = y_train["Class"].to_numpy()
    else:
        actual = np.asarray(y_train).ravel()
    predicted = np.asarray(results).ravel()

    if predicted.shape[0] != actual.shape[0]:
        raise ValueError("results and y_train must have the same length")

    flagged = predicted == 1
    falsePos = int(np.sum(flagged & (actual != 1)))
    # Counted directly; `len(y_train) - falsePos` also included every negative
    # prediction and every missed positive.
    truePos = int(np.sum(flagged & (actual == 1)))
    return falsePos, truePos

# Flat vector of the true test labels, shared by all three evaluations.
y_true = y_test.to_numpy().ravel()

# --- Naive Bayes ---
# `naiveBayes` fills this module-level dict in place.
likelihoods = {}
naiveBayes(x_train, x_test, y_train, y_test)
predictions = naiveBayesClassifier(x_test, likelihoods, probSpam, probNotSpam)
accuracy = accuracy_score(y_true, predictions)
falsePos, truePos = assessment(predictions, y_test)
print("Naive Bayes Accuracy:", accuracy)
print("Naive Bayes False Positives", falsePos)
print("Naive Bayes True Positives", truePos)

# --- K Nearest Neighbors ---
k = 5
y_pred = knn_predict(x_train, y_train, x_test, k)
# Score the KNN output itself (not the Naive Bayes `predictions`).
accuracy = accuracy_score(y_true, y_pred)
falsePos, truePos = assessment(y_pred, y_test)
print("KNN Accuracy:", accuracy)
print("KNN False Positives", falsePos)
print("KNN True Positives", truePos)

# --- Logistic Regression ---
theta = logistic_regression_train(x_train, y_train)
probabilities = logistic_regression_predict(x_test, theta)
# The model returns sigmoid outputs; turn them into 0/1 labels at 0.5 before
# handing them to classification metrics.
predictions = (np.asarray(probabilities) >= 0.5).astype(int)
accuracy = accuracy_score(y_true, predictions)
falsePos, truePos = assessment(predictions, y_test)
print("LR Accuracy:", accuracy)
print("LR False Positives", falsePos)
print("LR True Positives", truePos)