In [10]:
import csv
import random
import math
 
def loadCsv(filename):
    """Load a purely numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row; every entry is a list holding the
        row's cells converted to ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a cell cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even when a cell
    # fails to convert; newline="" lets the csv module do its own newline
    # handling, as its documentation recommends.
    with open(filename, "r", newline="") as handle:
        # Cells arrive as strings; convert them to numbers for processing.
        return [[float(cell) for cell in row] for row in csv.reader(handle)]
 
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Args:
        dataset: List of rows; it is copied, not modified.
        splitRatio: Fraction of the rows to move into the training part.

    Returns:
        A two-element list ``[trainSet, rest]`` where ``trainSet`` holds
        ``int(len(dataset) * splitRatio)`` randomly drawn rows and ``rest``
        holds the rows that were not drawn, in their original order.
    """
    wanted = int(len(dataset) * splitRatio)
    remaining = list(dataset)
    drawn = []
    for _ in range(wanted):
        # Draw a random position among the rows not picked yet and move
        # that row into the training part.
        position = random.randrange(len(remaining))
        drawn.append(remaining.pop(position))
    return [drawn, remaining]
 
def separateByClass(dataset):
    """Group the rows of ``dataset`` by their class label.

    The class label of a row is its last element.

    Args:
        dataset: List of rows (sequences whose last element is the label).

    Returns:
        A dict mapping each label to the list of rows carrying that label,
        in the order the rows appear in ``dataset``.
    """
    separated = {}
    for vector in dataset:
        # A plain dict (not defaultdict) keeps the printed/returned type
        # exactly as before.
        separated.setdefault(vector[-1], []).append(vector)
    # The header previously contained the invalid escape sequence "\S",
    # which printed a stray backslash and triggers a SyntaxWarning on
    # current Python versions.
    print("\n\n\n\nSeperated Values:")
    print(separated)
    return separated
 
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    count = float(len(numbers))
    return sum(numbers) / count
 
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1``, so a single-element input raises
    ``ZeroDivisionError``.
    """
    center = mean(numbers)
    squared_deviations = [pow(value - center, 2) for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)
 
def summarize(dataset):
    """Return ``(mean, stdev)`` for every column of ``dataset`` but the last.

    Args:
        dataset: List of equally long rows.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column, with the entry
        for the final column removed.
    """
    stats = []
    # zip(*dataset) transposes the rows so each iteration sees one column.
    for column in zip(*dataset):
        stats.append((mean(column), stdev(column)))
    # Drop the statistics of the last column (the position other functions
    # in this file read the class label from).
    del stats[-1]
    return stats
 
def summarizeByClass(dataset):
    """Compute per-class column statistics for ``dataset``.

    The rows are grouped with ``separateByClass`` and each group is reduced
    with ``summarize``. The resulting mapping is also printed.

    Returns:
        A dict mapping each class value to that class's list of
        ``(mean, stdev)`` tuples.
    """
    grouped = separateByClass(dataset)
    summaries = {label: summarize(rows) for label, rows in grouped.items()}
    print("\n\n\n\nDataset Summary")
    print(summaries)
    return summaries
 
def calculateProbability(x, mean, stdev):
    """Evaluate the normal (Gaussian) density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = math.pow(x - mean, 2)
    spread = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squared_distance / spread))
    normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normalizer * exponent
 
def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class, the Gaussian densities of the individual attributes
    (from ``calculateProbability``) are multiplied together. The resulting
    mapping is also printed.

    Args:
        summaries: Dict mapping class value to a list of ``(mean, stdev)``
            tuples, one per attribute.
        inputVector: Sequence whose i-th element is matched with the i-th
            ``(mean, stdev)`` tuple.

    Returns:
        A dict mapping each class value to the product of its attribute
        densities.
    """
    probabilities = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            # Multiply in the normal-distribution density of this attribute.
            likelihood *= calculateProbability(inputVector[position], mu, sigma)
        probabilities[label] = likelihood
    print("\n\n\n\nClass Probabilities")
    print(probabilities)
    return probabilities

def predict(summaries, inputVector):
    """Return the class whose score for ``inputVector`` is the highest.

    Scores come from ``calculateClassProbabilities``. On ties the class
    encountered first wins; ``None`` is returned when there are no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner = None
    top_score = -1
    for label, score in scores.items():
        # Skip any class that does not strictly beat the current leader;
        # the very first class is always accepted.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner
 
def getPredictions(summaries, testSet):
    """Predict a class for every row of ``testSet`` and print the results.

    Returns:
        A list holding the ``predict`` result for each row, in row order.
    """
    predictions = [predict(summaries, row) for row in testSet]
    print("\n\n\n\n\nAlgorithm Predictions : ")
    print(predictions)
    return predictions
 
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label matches its prediction.

    Args:
        testSet: List of rows; the last element of each row is the true
            class label.
        predictions: Sequence of predicted labels, aligned with ``testSet``.

    Returns:
        A float in ``[0.0, 100.0]``. An empty ``testSet`` yields ``0.0``
        rather than raising ``ZeroDivisionError``.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score (e.g. a split ratio that leaves no test rows).
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(total)) * 100.0
 
def main():
    """Run the Naive Bayes demo end to end and print a report.

    Loads the CSV file, prints a short description and up to five sample
    rows, splits the rows into training and test sets, builds the per-class
    summaries from the training set, predicts the test set and prints the
    resulting accuracy.

    Raises:
        IndexError: If the CSV file contains no rows.
    """
    filename = '../../Downloads/Dataset/Dataset/5th data.csv'
    splitRatio = 0.70
    dataset = loadCsv(filename)
    print('Pima Indian Diabetes Dataset loaded...')
    print('Total instances available :',len(dataset))
    print('Total attributes present :',len(dataset[0])-1)
    print("First Five instances of dataset:")
    print("The attributes of the Dataset are \n 1.Pregnancies 2.Glucose 3.Bloodpressure 4.Skin Thickness 5.Insulin 6.BMI 7.Diabetes 8.Age 9.Outcome")

    # Slice instead of indexing rows 0..4 so that a file with fewer than
    # five rows is previewed without raising IndexError.
    for number, row in enumerate(dataset[:5], start=1):
        print(number , ':' , row)

    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('\nDataset is split into training and testing set.')
    print('Training examples = {0} \nTesting examples = {1}'.format(len(trainingSet),len(testSet)))

    # Fit: per-class (mean, stdev) of every attribute.
    summaries = summarizeByClass(trainingSet)
    # Test the model on the held-out rows.
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))
 
# Run the demo only when this code is executed directly (script or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Pima Indian Diabetes Dataset loaded...
Total instances available : 768
Total attributes present : 8
First Five instances of dataset:
The attributes of the Dataset are 
 1.Pregnancies 2.Glucose 3.Bloodpressure 4.Skin Thickness 5.Insulin 6.BMI 7.Diabetes 8.Age 9.Outcome
1 : [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
2 : [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
3 : [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0]
4 : [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]
5 : [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]

Dataset is split into training and testing set.
Training examples = 537 
Testing examples = 231




\Seperated Values:
{0.0: [[3.0, 126.0, 88.0, 41.0, 235.0, 39.3, 0.704, 27.0, 0.0], [8.0, 194.0, 80.0, 0.0, 0.0, 26.1, 0.551, 67.0, 0.0], [0.0, 147.0, 85.0, 54.0, 0.0, 42.8, 0.375, 24.0, 0.0], [9.0, 72.0, 78.0, 25.0, 0.0, 31.6, 0.28, 38.0, 0.0], [2.0, 127.0, 46.0, 21.0, 335.0, 34.4, 0.176, 22.0, 0.0], [4.0, 131.0, 68.0, 21.0, 166.0, 33.1, 