In [1]:
def splitDataset(dataset, splitRatio, seed=None):
    """Randomly partition ``dataset`` into a training list and a test list.

    Args:
        dataset: Sequence of instances (rows). It is copied, never mutated.
        splitRatio: Fraction of the instances to move into the test set; the
            test size is ``int(len(dataset) * splitRatio)``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same seed always yields the same split and the global
            random state is left untouched. When ``None`` (the default) the
            module-level ``random`` generator is used, as before.

    Returns:
        ``[trainSet, testSet]`` -- two lists that together hold every instance
        of ``dataset`` exactly once.

    Raises:
        ValueError: If ``splitRatio`` asks for more instances than exist, since
            ``randrange`` is then called on an empty training list.
    """
    rng = random if seed is None else random.Random(seed)
    testSize = int(len(dataset) * splitRatio)
    trainSet = list(dataset)  # work on a copy so the caller's list is intact
    testSet = []
    while len(testSet) < testSize:
        # Move one randomly chosen instance from the training pool to the test set.
        index = rng.randrange(len(trainSet))
        testSet.append(trainSet.pop(index))
    return [trainSet, testSet]


In [2]:
import random, math
import statistics as st


In [3]:
def estimateProbability(x, mean, stdev):
    """Evaluate the Gaussian (normal) density with the given mean/stdev at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the normal distribution.
        stdev: Standard deviation of the normal distribution.

    Returns:
        The density value as a float. When ``stdev`` is exactly zero the
        distribution is treated as a point mass at ``mean``: ``1.0`` is
        returned if ``x == mean`` and ``0.0`` otherwise, instead of failing
        with a division by zero.
    """
    if stdev == 0:
        # Degenerate case: every training value was identical, so the usual
        # formula would divide by zero.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

In [4]:
def calculateClassProbabilities(summaries, testVector):
    """Score ``testVector`` against every class described in ``summaries``.

    Args:
        summaries: Mapping of class value -> list of ``(mean, stdev)`` pairs,
            one pair per attribute.
        testVector: Indexable instance; attribute ``i`` is read from position ``i``.

    Returns:
        Dict mapping each class value to the product of the per-attribute
        values returned by ``estimateProbability``. The product starts at 1,
        i.e. no class prior is multiplied in.
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            # Multiply in the density of the attribute at this position.
            likelihood *= estimateProbability(testVector[position], mu, sigma)
        scores[label] = likelihood
    return scores

In [5]:
def predict(summaries, testVector):
    """Return the class label with the highest score for ``testVector``.

    The per-class scores come from ``calculateClassProbabilities`` and are
    printed before the winner is chosen. On ties the label encountered first
    wins; ``None`` is returned when there are no classes at all.
    """
    scores = calculateClassProbabilities(summaries, testVector)
    print("All Probabilities\n")
    print(scores)
    winner = None
    top_score = -1
    for label in scores:
        score = scores[label]
        # Skip anything that does not strictly beat the current leader.
        if winner is not None and not (score > top_score):
            continue
        winner, top_score = label, score
    return winner

In [6]:
def perform_classification(summaries, testSet):
    """Run ``predict`` on every instance of ``testSet``, in order.

    Returns:
        List of predicted labels, one per test instance.
    """
    return [predict(summaries, instance) for instance in testSet]

In [7]:
def compute_mean_std(dataset):
    """Compute ``(mean, stdev)`` for every attribute column of ``dataset``.

    The last column of each row is taken to be the class label and is left
    out *before* any statistics are computed, so labels need not be numeric.

    Args:
        dataset: List of rows; each row is a sequence whose final element is
            the label.

    Returns:
        List of ``(mean, stdev)`` tuples, one per attribute column. An empty
        dataset yields an empty list.

    Raises:
        statistics.StatisticsError: If a column has fewer than two values,
            because the sample standard deviation is then undefined.
    """
    columns = list(zip(*dataset))  # transpose rows -> columns
    attribute_columns = columns[:-1]  # drop the label column
    return [(st.mean(column), st.stdev(column)) for column in attribute_columns]

In [8]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test instances whose label was predicted correctly.

    The true label is the last element of each row in ``testSet`` and is
    compared with the prediction at the same index.
    """
    total = len(testSet)
    hits = sum(1 for idx in range(total) if testSet[idx][-1] == predictions[idx])
    return (hits / float(total)) * 100.0

In [9]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their last element (the class label).

    Returns:
        Dict mapping each label to the list of rows carrying it, with rows
        kept in their original order.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups

In [10]:
def summarizeByClass(dataset):
    """Build per-class attribute statistics for ``dataset``.

    The rows are grouped with ``separateByClass`` and each group is reduced
    with ``compute_mean_std``. Both the grouped rows and the resulting
    summary are printed.

    Returns:
        Dict mapping each class value to its list of ``(mean, stdev)`` pairs.
    """
    grouped = separateByClass(dataset)
    print("separated by 0 and 1 class\n")
    print(grouped)
    # One list of (mean, stdev) pairs per class value.
    stats_by_class = {label: compute_mean_std(rows) for label, rows in grouped.items()}
    print("Summary of mean and standard deviataion")
    print(stats_by_class)
    return stats_by_class

In [11]:
def loadCsv(filename):
    """Read a purely numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        List of rows, each a list of floats in file order.

    Raises:
        ValueError: If any field cannot be converted with ``float``.
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even when a conversion fails;
    # newline="" is what the csv module expects from its input file.
    with open(filename, "r", newline="") as handle:
        return [[float(field) for field in row] for row in csv.reader(handle)]

In [12]:
import numpy as np
import pandas as pd
import csv
# Load 'data5.csv' through loadCsv; every field is converted to float there.
dataset = loadCsv('data5.csv');
print('Pima Indian Diabetes Dataset loaded...')
# Each element of `dataset` is one row (instance) of the file.
print('Total instances available :',len(dataset))

Pima Indian Diabetes Dataset loaded...
Total instances available : 768


In [13]:
# The last column of a row is the class label, so it is not counted as an attribute.
print('Total attributes present :',len(dataset[0])-1)
print("First Five instances of dataset:")
# Show rows 1..5 (1-based numbering for display only).
for i in range(5):
    print(i+1 , ':' , dataset[i])


Total attributes present : 8
First Five instances of dataset:
1 : [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
2 : [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
3 : [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]
4 : [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]
5 : [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]


In [14]:
# Fraction of the instances that splitDataset moves into the test set.
splitRatio = 0.2
# The split is random; splitDataset returns [training rows, test rows].
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('\nDataset is split into training and testing set.')
print('Training Set Size',len(trainingSet))
print('Testing Set Size',len(testSet))



Dataset is split into training and testing set.
Training Set Size 615
Testing Set Size 153


In [15]:
# Per-class (mean, stdev) of every attribute, computed from the training rows only.
# Note: summarizeByClass also prints the grouped rows and the summary.
summaries = summarizeByClass(trainingSet)


separated by 0 and 1 class

{1.0: [[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0], [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0], [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0], [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0], [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.232, 54.0, 1.0], [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.398, 59.0, 1.0], [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 1.0], [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0], [1.0, 115.0, 70.0, 30.0, 96.0, 34.6, 0.529, 32.0, 1.0], [7.0, 196.0, 90.0, 0.0, 0.0, 39.8, 0.451, 41.0, 1.0], [9.0, 119.0, 80.0, 35.0, 0.0, 29.0, 0.263, 29.0, 1.0], [11.0, 143.0, 94.0, 33.0, 146.0, 36.6, 0.254, 51.0, 1.0], [7.0, 147.0, 76.0, 0.0, 0.0, 39.4, 0.257, 43.0, 1.0], [3.0, 158.0, 76.0, 36.0, 245.0, 31.6, 0.851, 28.0, 1.0], [9.0, 102.0, 76.0, 37.0, 0.0, 32.9, 0.665, 46.0, 1.0], [2.0, 90.0, 68.0, 42.0, 0.0, 38.2, 0.503, 27.0, 1.0], [4.0, 111.0, 72.0, 47.0, 207.0, 37.1, 1.39, 56.0, 1.0], [9.0, 171.0, 110.0, 24.

In [16]:
# Predict a label for every test row; predict() prints the class scores for each row.
predictions = perform_classification(summaries, testSet)
print("predicted values\n")
print(predictions)


All Probabilities

{1.0: 9.68191438043664e-14, 0.0: 9.733803154277547e-14}
All Probabilities

{1.0: 5.471789784770153e-13, 0.0: 4.210710498938299e-12}
All Probabilities

{1.0: 8.762993149534879e-13, 0.0: 2.2219981791727073e-13}
All Probabilities

{1.0: 4.144706123679555e-13, 0.0: 4.149669973603987e-12}
All Probabilities

{1.0: 6.56005591630547e-14, 0.0: 2.7840554350653554e-12}
All Probabilities

{1.0: 1.5284370767195854e-14, 0.0: 7.235403748697247e-17}
All Probabilities

{1.0: 7.402524450814701e-14, 0.0: 2.7982651188756686e-12}
All Probabilities

{1.0: 1.176096913938118e-13, 0.0: 9.73298095484315e-13}
All Probabilities

{1.0: 2.2269024993879015e-13, 0.0: 1.1139848841810558e-14}
All Probabilities

{1.0: 2.1658275854192633e-17, 0.0: 4.0775433584814087e-16}
All Probabilities

{1.0: 9.152070883125073e-15, 0.0: 9.577181082508136e-16}
All Probabilities

{1.0: 2.6978784675272517e-18, 0.0: 1.1331193653187973e-18}
All Probabilities

{1.0: 1.2068103946419965e-13, 0.0: 2.2337490636316464e-14}
All

In [17]:
# Percentage of test rows whose last column (true label) equals the prediction.
accuracy = getAccuracy(testSet, predictions)
print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)


Accuracy of the Naive Baysian Classifier is : 72.54901960784314
