<a href="https://colab.research.google.com/github/Charlee0616/Data-Mining/blob/main/NaiveBayesClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the penguins CSV from its GitHub URL; the parsed result is kept in `penguins`.
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [None]:
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi

In [None]:
def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` randomly drawn folds of equal size.

    Rows are drawn without replacement via ``randrange``, so the split is
    reproducible once the ``random`` module has been seeded. Every fold holds
    ``len(dataset) // n_folds`` rows; leftover rows that do not fill a whole
    fold are not placed in any fold.

    Args:
        dataset: Iterable of rows. It is copied into a list and never mutated.
        n_folds: Number of folds to build.

    Returns:
        A list containing exactly ``n_folds`` lists of rows.
    """
    dataset_split = []
    dataset_copy = list(dataset)
    fold_size = len(dataset_copy) // n_folds
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        # Register the fold once, after it is full. Appending inside the
        # while-loop would add the same fold once per drawn row.
        dataset_split.append(fold)
    return dataset_split

In [None]:
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel with
            ``actual``.

    Returns:
        A float between 0.0 and 100.0.
    """
    total = len(actual)
    hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
    return hits / float(total) * 100.0

In [None]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn is
    held out as the test set while all other folds are flattened into the
    training set. Test rows are copied and their last element is replaced by
    ``None`` so the algorithm cannot read the true label.

    Args:
        dataset: Rows to split; the last element of each row is the label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out_index, fold in enumerate(folds):
        # Build the training set from every fold except the held-out one,
        # selected by position, in a single linear pass.
        train_set = [
            row
            for fold_index, other_fold in enumerate(folds)
            if fold_index != held_out_index
            for row in other_fold
        ]
        # Copy each test row and hide its label from the algorithm.
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        # Predict and score once per fold, on the complete test set.
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The class label is the last element of each row.

    Args:
        dataset: Iterable of rows (indexable sequences).

    Returns:
        A dict mapping each class label to the list of rows carrying it, in
        their original order. An empty dataset yields an empty dict.
    """
    separated = {}
    # The grouping must run for every row, not only the final one.
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated

In [None]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [None]:
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1`` before the square root is taken.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    return sqrt(sum(squared_deviations) / float(len(numbers) - 1))

In [None]:
def summarize_dataset(dataset):
    """Summarise every feature column of ``dataset``.

    The last column is treated as the class label and is excluded before any
    statistics are computed, so a non-numeric label does not break ``mean`` or
    ``stdev``.

    Args:
        dataset: Sequence of equally long rows whose last element is the label.

    Returns:
        A list of ``(mean, stdev, count)`` tuples, one per feature column.
        An empty dataset yields an empty list.
    """
    # Transpose rows into columns, then drop the trailing label column.
    feature_columns = list(zip(*dataset))[:-1]
    return [
        (mean(column), stdev(column), len(column))
        for column in feature_columns
    ]

In [None]:
def summarize_by_class(dataset):
    """Return per-class column summaries for ``dataset``.

    Rows are grouped with ``separate_by_class`` and each group is passed to
    ``summarize_dataset``.

    Returns:
        A dict mapping each class label to that class's list of summaries.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [None]:
def calculate_probability(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    Args:
        x: Value to evaluate.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2*pi) * stdev)``.
    """
    squared_deviation = (x - mean) ** 2
    decay = exp(-(squared_deviation / (2 * stdev ** 2)))
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * decay


In [None]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class in ``summaries``.

    Each class score is the class prior (its row count divided by the total
    row count) multiplied by ``calculate_probability`` for every feature. The
    scores are not normalised to sum to one.

    Args:
        summaries: Dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence whose leading elements are the feature values, in the
            same order as the summaries.

    Returns:
        A dict mapping each class label to its score.
    """
    # The count stored with the first feature is the class's row count.
    total_rows = sum(
        class_summaries[0][2] for class_summaries in summaries.values()
    )
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        # The likelihoods must be folded in for every class, so this loop is
        # nested inside the class loop.
        for feature_index, summary in enumerate(class_summaries):
            feature_mean, feature_stdev, _ = summary
            score *= calculate_probability(
                row[feature_index], feature_mean, feature_stdev
            )
        probabilities[class_value] = score
    return probabilities

In [None]:
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Scores come from ``calculate_class_probabilities``. On a tie the class
    encountered first wins; with no classes the result is ``None``.
    """
    winner, winner_score = None, -1
    for label, score in calculate_class_probabilities(summaries, row).items():
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = label, score
    return winner

In [None]:
def naive_bayes(train, test):
    """Fit per-class summaries on ``train`` and predict a label for each test row.

    Args:
        train: Training rows, passed to ``summarize_by_class``.
        test: Rows to classify, each passed to ``predict``.

    Returns:
        A list of predicted labels in the order of ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

In [None]:
# Seed the random module so the randrange-based fold assignment is repeatable.
seed(1)
n_folds = 5
# NOTE(review): `penguins` is passed exactly as returned by pd.read_csv, while
# cross_validation_split calls list(dataset) and the other helpers index rows
# positionally with the class label as the last element. Confirm whether the
# data must first be converted to a list of numeric rows with the label last.
scores = evaluate_algorithm(penguins, naive_bayes, n_folds)
# Report the per-fold accuracies and their average as percentages.
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))