In [1]:
#Solution by Christian Johnston, CPSC 4383, Fall 2023

#example and data provided by https://machinelearningmastery.com/standard-machine-learning-datasets/ example 4, Sonar Dataset

#The purpose of this example is to use a naive bayes theorem algorithm to predict whether a series of sonar readings
#indicate either a rock or a mine underneath the surface of the earth.
#this dataset (as indicated by the example website) is a binary-class problem, there are 2 classes that are indicated:
#R for rock, and M for mine.
#The dataset has 208 observations, with 60 input variables and 1 output variable.
#I will be using the provided k-fold cross-validation code with a k value of 10.

from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi

#Here is where I provide the data loading method.
def loadCSV(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csvReader = reader(file)
        for row in csvReader:
            if not row:
                continue
            dataset.append(row)
    return dataset
        
#Here are the helper functions for converting the dataset as well as the mathmatical operations for mean and stdev
def columnStringToFloat(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

def columnStringToInt(dataset, column):
    classValues = [row[column] for row in dataset]
    unique = set(classValues)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

def crossValidationSplit(dataset, nFolds):
    datasetSplit = list()
    datasetCopy = list(dataset)
    foldSize  = int(len(dataset) / nFolds)
    for _ in range(nFolds):
        fold = list()
        while len(fold) < foldSize:
            index = randrange(len(datasetCopy))
            fold.append(datasetCopy.pop(index))
        datasetSplit.append(fold)
    return datasetSplit

def accuracyMetric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluateAlgorithm(dataset, algorithm, nFolds, *args):
    folds = crossValidationSplit(dataset, nFolds)
    scores = list()
    for fold in folds:
        trainingSet = list(folds)
        trainingSet.remove(fold)
        trainingSet = sum(trainingSet, [])
        testSet = list()
        for row in fold:
            rowCopy = list(row)
            testSet.append(rowCopy)
            rowCopy[-1] = None
        predicted = algorithm(trainingSet, testSet, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracyMetric(actual, predicted)
        scores.append(accuracy)
    return scores

def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)


#following the tutorial provided by Dr. Milanova the first step is to Separate the data by class.
def separateByClass(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        classValue = vector[-1]
        if (classValue not in separated):
            separated[classValue] = list()
        separated[classValue].append(vector)
    return separated

#the next step is to summarize the dataset by calculating the mean, stdev, and count for each column vector.

def summarizeDataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

#next we split the dataset by class and calculate statistics for each row.
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = dict()
    for classValue, rows in separated.items():
        summaries[classValue] = summarizeDataset(rows)
    return summaries

#the next step is to calculate the gaussian PDF
def calculateProbability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

#the final step uses the training data to calculate probabilities for new data
def calculateClassProbabilities(summaries, row):
    totalRows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = summaries[classValue][0][2]/float(totalRows)
        for i in range(len(classSummaries) - 1):
            mean, stdev, count = classSummaries[i]
            probabilities[classValue] *= calculateProbability(row[i], mean, stdev)
    return probabilities

#now we have the prediction function that will predict the class for the given row
def predict(summaries, row):
    probabilities = calculateClassProbabilities(summaries, row)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

#here is the implementation of the naive bayes theorem
def naiveBayes(train, test):
    summarize = summarizeByClass(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return(predictions)

#here I am using the provided dataset to display scores and the mean accuracy of the data
seed(1)
filename = 'sonarDataset.csv'
dataset = loadCSV(filename)
for i in range(len(dataset[0])-1):
    columnStringToFloat(dataset, i)
columnStringToInt(dataset, len(dataset[0])-1)
nFolds = 10
scores = evaluateAlgorithm(dataset, naiveBayes, nFolds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

#now to make a prediction based on some arbitrary values for a new sonar reading (just a test)
filename = 'sonarDataset.csv'
dataset = loadCSV(filename)
for i in range(len(dataset[0])-1):
    columnStringToFloat(dataset, i)
columnStringToInt(dataset, len(dataset[0])-1)
model = summarizeByClass(dataset)
# here I am adding a randomized assortment of values to create a test row
from random import random
row = list()
for i in range(0, 59):
    row.append(random())
   
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Scores: [65.0, 80.0, 60.0, 70.0, 65.0, 60.0, 70.0, 75.0, 80.0, 75.0]
Mean Accuracy: 70.000%
Data=[0.035326046636964814, 0.9598933172242908, 0.4456685163423838, 0.506309313164471, 0.42666502702759324, 0.8322449044927935, 0.9769761560529061, 0.6307718366221402, 0.6950508916124665, 0.45084502714273167, 0.5238954398669781, 0.030700276754036526, 0.6749025775182691, 0.8033855134341553, 0.6598238917510797, 0.4262993787138545, 0.7374512500957098, 0.12568332230972723, 0.21213169303189394, 0.04744017352451846, 0.07072687788690923, 0.07644615553590373, 0.9171763132459553, 0.29787980574251793, 0.15820738983282634, 0.5649407226767994, 0.13039112842440792, 0.5607173210286936, 0.850526660963271, 0.5905839712874152, 0.21759033920020698, 0.9008129952123803, 0.460852490153291, 0.8279131566567798, 0.8698864279223623, 0.7800172694327171, 0.6229628138905298, 0.03742337508476401, 0.20040745546617267, 0.0990253627729546, 0.5733827030223086, 0.8965657460164287, 0.5914093121448057, 0.4923507504494802, 0.937953