# Hayes-Roth Dataset

In [1]:
import random
import csv
import math
from math import log, pi, sqrt

#Load a CSV File
def load_csv(file_path):
    with open(file_path, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        dataset = []
        for row in csv_reader:
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, num_folds):
    dataset_split = []
    fold_size = len(dataset) // num_folds
    start = 0
    for i in range(num_folds):
        fold = []
        while len(fold) < fold_size and start < len(dataset):
            fold.append(dataset[start])
            start += 1
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual_value, predicted_value):
    correct = 0
    for i in range(len(actual_value)):
        if actual_value[i] == predicted_value[i]:
            correct += 1
    return correct / float(len(actual_value)) * 100.0

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(nums):
    return sum(nums) / float(len(nums))

# Calculate the standard deviation of a list of numbers
def stdev(nums):
    avg = mean(nums)
    variance = sum([(x - avg) ** 2 for x in nums]) / float(len(nums) - 1)
    return math.sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = {}
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Log probability distribution function for x
def calculate_probability(x, mean, stdev):
    log_constant = -0.5 * log(2 * pi) - log(stdev)
    log_exponent = -((x - mean)**2 / (2 * stdev**2))
    return log_constant + log_exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Calculate the probabilities of predicting each class for a given row
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for i in range(len(folds)):
        train_set = list(folds)
        train_set.remove(folds[i])
        train_set = sum(train_set, [])
        test_set = list()
        for row in folds[i]:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in folds[i]]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        NB_output = predict(summarize, row)
        predictions.append(NB_output)
    return(predictions)

# Test Naive Bayes on Hayes-Roth Dataset
import random
random.seed(1)
filename = 'hayes-roth.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0])-1)
model = summarize_by_class(dataset)
# define a new record
row = [1,2,4,3,3]

# Attribute Information:

# -- 1. name: distinct for each instance and represented numerically
# -- 2. hobby: nominal values ranging between 1 and 3
# -- 3. age: nominal values ranging between 1 and 4
# -- 4. educational level: nominal values ranging between 1 and 4
# -- 5. marital status: nominal values ranging between 1 and 4
# -- 6. class: nominal value between 1 and 3


# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
num_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, num_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Data=[1, 2, 4, 3, 3], Predicted: 2
Scores: [76.92307692307693, 61.53846153846154, 76.92307692307693, 76.92307692307693, 69.23076923076923, 69.23076923076923, 69.23076923076923, 53.84615384615385, 92.3076923076923, 76.92307692307693]
Mean Accuracy: 72.308%


# Car Evaluation Dataset

In [2]:
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
from math import log
import pandas as pd
import numpy as np

file_handler = open("car.csv", "r")
df = pd.read_csv(file_handler, sep = ",")
df.columns = ["buying", "maint", "doors", "persons","lug_boot","safety","condition"]
file_handler.close()
df.buying[df.buying == 'vhigh'] = 4
df.buying[df.buying == 'high'] = 3
df.buying[df.buying == 'med'] = 2
df.buying[df.buying == 'low'] = 1

df.maint[df.maint == 'vhigh'] = 4
df.maint[df.maint == 'high'] = 3
df.maint[df.maint == 'med'] = 2
df.maint[df.maint == 'low'] = 1

df.doors[df.doors == '5more'] = 5

df.persons[df.persons == 'more'] = 5

df.lug_boot[df.lug_boot == 'small'] = 1
df.lug_boot[df.lug_boot == 'med'] = 2
df.lug_boot[df.lug_boot == 'big'] = 3

df.safety[df.safety == 'low'] = 1
df.safety[df.safety == 'med'] = 2
df.safety[df.safety == 'high'] = 3

df.condition[df.condition == 'unacc'] = 1
df.condition[df.condition == 'acc'] = 2
df.condition[df.condition == 'good'] = 3
df.condition[df.condition == 'vgood'] = 4
df.head()
df.to_csv("car-modified.csv")
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,condition
0,4,4,2,2,1,2,1
1,4,4,2,2,1,3,1
2,4,4,2,2,2,1,1
3,4,4,2,2,2,2,1
4,4,4,2,2,2,3,1
...,...,...,...,...,...,...,...
1722,1,1,5,5,2,2,3
1723,1,1,5,5,2,3,4
1724,1,1,5,5,3,1,1
1725,1,1,5,5,3,2,3


In [3]:
# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset,n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set,[])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set,test_set,*args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual,predicted)
    scores.append(accuracy)
  return scores

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    if stdev == 0:
        if x == mean:
            return 1
        else:
            return 0
    else:
        exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
        return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return(predictions)

# Test Naive Bayes on Car Dataset
seed(1)
filename = 'car-modified.csv'
dataset = load_csv(filename)
dataset.remove(dataset[0])
dataset = [row[1:] for row in dataset]

for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
model = summarize_by_class(dataset)
# define a new record
row = [1,1,5,5,2,2]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Data=[1, 1, 5, 5, 2, 2], Predicted: 3
Scores: [85.46511627906976, 84.88372093023256, 84.88372093023256, 87.20930232558139, 77.90697674418605, 83.13953488372093, 89.53488372093024, 88.95348837209302, 80.23255813953489, 85.46511627906976]
Mean Accuracy: 84.767%


# Breast Cancer Dataset

In [4]:
import pandas as pd
import numpy as np
#We are reading the data in pandas and are assigning the column names for the data. Then we are replacing strings with numbers.
file_handler = open("breast-cancer.csv", "r")
df = pd.read_csv(file_handler, sep = ",")
df.columns = ["Class", "age", "menopause", "tumorsize","invnodes","nodecaps","degmalig","breast","breastquad","irradiat"]
file_handler.close()
df.Class[df.Class == 'no-recurrence-events'] = 0
df.Class[df.Class == 'recurrence-events'] = 1



df.age[df.age == '10-19'] = 1
df.age[df.age == '20-29'] = 2
df.age[df.age == '30-39'] = 3
df.age[df.age == '40-49'] = 4
df.age[df.age == '50-59'] = 5
df.age[df.age == '60-69'] = 6
df.age[df.age == '70-79'] = 7
df.age[df.age == '80-89'] = 8
df.age[df.age == '90-99'] = 9




df.menopause[df.menopause == 'lt40'] = 1
df.menopause[df.menopause == 'ge40'] = 2
df.menopause[df.menopause == 'premeno'] = 3



df.tumorsize[df.tumorsize == '0-4'] = 1
df.tumorsize[df.tumorsize == '5-9'] = 2
df.tumorsize[df.tumorsize == '10-14'] = 3
df.tumorsize[df.tumorsize == '15-19'] = 4
df.tumorsize[df.tumorsize == '20-24'] = 5
df.tumorsize[df.tumorsize == '25-29'] = 6
df.tumorsize[df.tumorsize == '30-34'] = 7
df.tumorsize[df.tumorsize == '35-39'] = 8
df.tumorsize[df.tumorsize == '40-44'] = 9
df.tumorsize[df.tumorsize == '45-49'] = 10
df.tumorsize[df.tumorsize == '50-54'] = 11
df.tumorsize[df.tumorsize == '55-59'] = 12




df.invnodes[df.invnodes == '0-2'] = 1
df.invnodes[df.invnodes == '3-5'] = 2
df.invnodes[df.invnodes == '6-8'] = 3
df.invnodes[df.invnodes == '9-11'] = 4
df.invnodes[df.invnodes == '12-14'] = 5
df.invnodes[df.invnodes == '15-17'] = 6
df.invnodes[df.invnodes == '18-20'] = 7
df.invnodes[df.invnodes == '21-23'] = 8
df.invnodes[df.invnodes == '24-26'] = 9
df.invnodes[df.invnodes == '27-29'] = 10
df.invnodes[df.invnodes == '30-32'] = 11
df.invnodes[df.invnodes == '33-35'] = 12
df.invnodes[df.invnodes == '36-39'] = 13



df.nodecaps[df.nodecaps == 'yes'] = 1
df.nodecaps[df.nodecaps == 'no'] = 0



df.breast[df.breast == 'left'] = 1
df.breast[df.breast == 'right'] = 2



df.breastquad[df.breastquad == 'left_up'] = 1
df.breastquad[df.breastquad == 'left_low'] = 2
df.breastquad[df.breastquad == 'right_up'] = 3
df.breastquad[df.breastquad == 'right_low'] = 4
df.breastquad[df.breastquad == 'central'] = 5



df.irradiat[df.irradiat == 'yes'] = 1
df.irradiat[df.irradiat == 'no'] = 0


df.head()
print(df.shape)
print(df.isin(['?']).sum())
df = df.replace('?', np.nan)
df = df.dropna()
df.to_csv("breast-cancer-modified.csv", index=True)
df


(285, 10)
Class         0
age           0
menopause     0
tumorsize     0
invnodes      0
nodecaps      8
degmalig      0
breast        0
breastquad    1
irradiat      0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Class[df.Class == 'no-recurrence-events'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Class[df.Class == 'recurrence-events'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.age[df.age == '10-19'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.age[df.age == '20-29'] = 2
A value is tr

Unnamed: 0,Class,age,menopause,tumorsize,invnodes,nodecaps,degmalig,breast,breastquad,irradiat
0,0,4,3,5,1,0.0,2,2,3.0,0
1,0,4,3,5,1,0.0,2,1,2.0,0
2,0,6,2,4,1,0.0,2,2,1.0,0
3,0,4,3,1,1,0.0,2,2,4.0,0
4,0,6,2,4,1,0.0,2,1,2.0,0
...,...,...,...,...,...,...,...,...,...,...
280,1,3,3,7,1,0.0,2,1,1.0,0
281,1,3,3,5,1,0.0,3,1,1.0,1
282,1,6,2,5,1,0.0,1,2,1.0,0
283,1,4,2,7,2,0.0,3,1,2.0,0


In [5]:
import random
import math
import csv
from math import log, pi, sqrt

#Load a CSV File
def load_csv(file_path):
    with open(file_path, 'r') as csv_file:
        csv_reader = csv.reader(csv_file)
        dataset = []
        for row in csv_reader:
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, num_folds):
    dataset_split = []
    fold_size = len(dataset) // num_folds
    start = 0
    for i in range(num_folds):
        fold = []
        while len(fold) < fold_size and start < len(dataset):
            fold.append(dataset[start])
            start += 1
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual_value, predicted_value):
    correct = 0
    for i in range(len(actual_value)):
        if actual_value[i] == predicted_value[i]:
            correct += 1
    return correct / float(len(actual_value)) * 100.0

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if vector[-1] not in separated:
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(nums):
    return sum(nums) / float(len(nums))

# Calculate the standard deviation of a list of numbers
def stdev(nums):
    avg = mean(nums)
    variance = sum([(x - avg) ** 2 for x in nums]) / float(len(nums) - 1)
    return math.sqrt(variance)

# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = {}
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Log probability distribution function for x
def calculate_probability(x, mean, stdev):
    if stdev <= 0:
        return float('-inf')  # return negative infinity for zero or negative stdev
    log_constant = -0.5 * log(2 * pi) - log(stdev)
    log_exponent = -((x - mean)**2 / (2 * stdev**2))
    return log_constant + log_exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Calculate the probabilities of predicting each class for a given row
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for i in range(len(folds)):
        train_set = list(folds)
        train_set.remove(folds[i])
        train_set = sum(train_set, [])
        test_set = list()
        for row in folds[i]:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in folds[i]]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        NB_output = predict(summarize, row)
        predictions.append(NB_output)
    return(predictions)

# Test Naive Bayes on Breast-Cancer Dataset
import random
random.seed(1)
filename = 'breast-cancer-modified.csv'
dataset = load_csv(filename)
dataset.remove(dataset[0])
dataset = [row[1:] for row in dataset]
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0])-1)
model = summarize_by_class(dataset)
# define a new record
row = [1,5,2,7,2,0.0,3,1,2.0]

# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))
num_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, num_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Data=[1, 5, 2, 7, 2, 0.0, 3, 1, 2.0], Predicted: 1
Scores: [100.0, 100.0, 100.0, 100.0, 88.88888888888889, 40.74074074074074, 51.85185185185185, 88.88888888888889, 44.44444444444444, 55.55555555555556]
Mean Accuracy: 77.037%
