In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
from collections import Counter
from sklearn import metrics
from random import randrange
from math import sqrt
from math import exp
from math import pi

# Read the tab-separated dataset (no header row) from the mounted drive.
file_path = "/content/drive/MyDrive/dm/Project2/project2_dataset2.txt"
df = pd.read_csv(file_path, sep='\t', header=None)

# Replace every string-valued column with integer codes so that all
# attributes are numeric before statistics are computed on them.
for col in range(df.shape[1]):
    if df[col].dtype == object and isinstance(df.iloc[0][col], str):
        encoded = pd.factorize(df.iloc[:, col])
        df[col] = encoded[0]

# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]
print(X)
print(Y)

# Preparing the dataset for the algorithm: a plain list of row lists.
data = df.values.tolist()
print(data)

Step 1: Separate By Class

In [None]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The label is taken from the last element of each row.

    Returns:
        dict mapping each label to the list of rows that carry it; rows
        keep the order in which they appear in ``dataset``.
    """
    groups = {}
    for position in range(len(dataset)):
        record = dataset[position]
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(record[-1], []).append(record)
    return groups

Step 2: Summarize the dataset

In [None]:
def mean(dataset):
  """Return the arithmetic mean of ``dataset`` along axis 0 (via ``np.mean``)."""
  return np.mean(dataset, axis=0)

def stddev(dataset):
  """Return the standard deviation of ``dataset`` along axis 0.

  Uses ``np.std`` with its default settings, i.e. the population form
  (divides by N, not N - 1).
  """
  return np.std(dataset, axis=0)

def summarize_dataset(dataset):
    """Compute per-column statistics for a list of rows.

    Each column is summarised as ``(mean, stddev, count)``. The summary of
    the last column (the class label) is discarded before returning.
    """
    summaries = []
    for column in zip(*dataset):
        column_stats = (mean(column), stddev(column), len(column))
        summaries.append(column_stats)
    # The final column holds the class label, not a feature.
    del summaries[-1]
    return summaries

Step 3: Summarize data by class

In [None]:
# Split dataset by class, then calculate statistics for each attribute column
def summarize_by_class(dataset):
    """Return ``{class_value: summarize_dataset(rows_of_that_class)}``."""
    return {
        class_value: summarize_dataset(class_rows)
        for class_value, class_rows in separate_by_class(dataset).items()
    }

# Show the per-class (mean, stddev, count) summary of every attribute.
summary = summarize_by_class(data)
for label, class_summary in summary.items():
    print(label)
    for row in class_summary:
        print(row)

Step 4: Gaussian Probability Density Function

In [None]:
import math
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: value at which the density is evaluated.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.

    Returns:
        The density of N(mean, stdev**2) at ``x``. When ``stdev`` is 0 the
        distribution is treated as a point mass at ``mean``: 1.0 is
        returned if ``x == mean`` and 0.0 otherwise.
    """
    if stdev == 0:
        # A zero spread would divide by zero: ZeroDivisionError with Python
        # floats, NaN with NumPy floats. Fall back to a point mass instead.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

Step 5: Class Probabilities

In [None]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Note: this redefinition shadows the same-named function from the
    Step 4 cell; it relies on ``exp``, ``sqrt`` and ``pi`` imported from
    ``math`` at the top of the notebook.

    Args:
        x: value at which the density is evaluated.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.

    Returns:
        The density of N(mean, stdev**2) at ``x``. When ``stdev`` is 0 the
        distribution is treated as a point mass at ``mean``: 1.0 is
        returned if ``x == mean`` and 0.0 otherwise.
    """
    if stdev == 0:
        # A zero spread would divide by zero: ZeroDivisionError with Python
        # floats, NaN with NumPy floats. Fall back to a point mass instead.
        return 1.0 if x == mean else 0.0
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    ``summaries`` maps each class value to a list of per-attribute
    ``(mean, stdev, count)`` tuples. For each class the score starts at the
    class prior (its row count divided by the total row count, both read
    from the first attribute's tuple) and is multiplied by the Gaussian
    density of each attribute value of ``row``.

    Returns:
        dict mapping class value to its (unnormalised) score.
    """
    total_rows = sum([per_class[0][2] for per_class in summaries.values()])
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for position, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[position], mu, sigma)
        probabilities[class_value] = score
    return probabilities

Step 6: Predict

In [None]:
def predict(summaries, row):
    """Return the class whose score for ``row`` is highest.

    Ties keep the class encountered first; ``None`` is returned when
    ``summaries`` yields no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    winner_score = -1
    for candidate, score in scores.items():
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not (score > winner_score):
            continue
        winner, winner_score = candidate, score
    return winner

# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Fit per-class summaries on ``train`` and predict a label for each row of ``test``."""
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

Step 7: 10-fold Cross Validation

In [None]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement using ``random.randrange``, so the
    result depends on the state of the ``random`` module (call
    ``random.seed`` beforehand for a reproducible split). ``dataset``
    itself is not modified.

    Args:
        dataset: sequence of rows.
        n_folds: number of folds to create.

    Returns:
        A list of ``n_folds`` folds, each a list of
        ``len(dataset) // n_folds`` rows. Rows left over when the size is
        not divisible by ``n_folds`` are not assigned to any fold.
    """
    dataset_split = []
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(remaining))
            fold.append(remaining.pop(index))
        # Must happen once per fold; appending after the loop kept only
        # the last fold.
        dataset_split.append(fold)
    return dataset_split


def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Run k-fold cross validation and collect labels and predictions.

    Each fold is used once as the test set while the rows of all other
    folds form the training set. Test rows are copied and their last
    element (the label) is set to ``None`` before being passed on.

    Args:
        dataset: sequence of rows whose last element is the class label.
        algorithm: callable invoked as ``algorithm(train_set, test_set,
            *args)``; expected to return one prediction per test row.
        n_folds: number of folds.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        ``(actual, predicted)``: two lists covering every fold, in fold
        order, holding the true labels and the corresponding predictions.
    """
    folds = cross_validation_split(dataset, n_folds)
    actual = []
    predicted = []
    for fold_index, fold in enumerate(folds):
        # Select the training folds by position: list.remove() matches by
        # value and could drop a different fold with equal contents.
        train_set = [
            row
            for other_index, other_fold in enumerate(folds)
            if other_index != fold_index
            for row in other_fold
        ]
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None  # hide the label from the algorithm
            test_set.append(row_copy)
        # Accumulate across folds; plain assignment kept only the last fold.
        predicted.extend(algorithm(train_set, test_set, *args))
        actual.extend(row[-1] for row in fold)
    return actual, predicted

Evaluation Metrics - Accuracy, Precision, Recall and F-1 measure

In [None]:
# Evaluate Naive Bayes on project2_dataset2 with 10-fold cross validation
n_folds = 10
# Pass the list-of-rows `data`, not the DataFrame `df`: the fold splitter
# calls list(dataset), which on a DataFrame yields column labels, not rows.
actual, predicted = evaluate_algorithm(data, naive_bayes, n_folds)

print (actual, predicted)

accuracy = metrics.accuracy_score(actual, predicted)
precision = metrics.precision_score(actual, predicted)
recall = metrics.recall_score(actual, predicted)
f1_measure = metrics.f1_score(actual, predicted)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-measure: ", f1_measure)
