In [301]:
import urllib

import csv
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

link = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# Download the CSV; closing() releases the connection even if reading fails.
with closing(urlopen(link)) as response:
    # On Python 3 the response yields bytes, which csv.reader rejects, so
    # decode those lines to text. Python 2 already yields str and is left as is.
    lines = [line if isinstance(line, str) else line.decode("utf-8")
             for line in response]

cr = csv.reader(lines)
# Parse every field as a float. Blank lines come back from csv.reader as empty
# rows; skip them so later code can rely on row[-1] being the class label.
dataset = [[float(x) for x in row] for row in cr if row]



In [302]:
import random
def splitDataset(dataset, splitRatio):
    """Randomly partition dataset into a training part and a remainder.

    int(len(dataset) * splitRatio) rows are drawn without replacement using
    the global `random` generator. The input list itself is not modified.

    Returns a two-element list: [training rows, remaining rows].
    """
    remaining = list(dataset)
    targetSize = int(len(dataset) * splitRatio)
    picked = []
    for _ in range(targetSize):
        # Move one randomly chosen row from the remainder into the training set.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]

In [303]:
def separateByClass(dataset):
    """Group rows by class label, taken from the last element of each row.

    Returns a dict mapping each label to the list of rows that carry it, in
    their original order. Rows are stored by reference, not copied.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped


In [304]:
import math
def mean(numbers):
    """Return the arithmetic mean of numbers as a float.

    Raises ZeroDivisionError when numbers is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count


In [305]:
def stdev(numbers):
    """Return the sample standard deviation of numbers (n - 1 denominator).

    Raises ZeroDivisionError when numbers has fewer than two elements.
    """
    center = mean(numbers)
    squaredDeviations = ((value - center) ** 2 for value in numbers)
    return math.sqrt(sum(squaredDeviations) / float(len(numbers) - 1))

In [306]:
def summarize(dataset):
    """Return a (mean, stdev) pair for every attribute column of dataset.

    Rows are transposed into columns and each column is summarized. The
    entry for the final column is then dropped, since that column holds the
    class label rather than an attribute.
    """
    perColumn = []
    for column in zip(*dataset):
        perColumn.append((mean(column), stdev(column)))
    # Discard the summary computed for the trailing class-label column.
    perColumn.pop()
    return perColumn


In [307]:
def summarizeByClass(dataset):
    """Compute attribute summaries separately for each class in dataset.

    Rows are grouped by class label with separateByClass(), and each group
    is passed to summarize().

    Returns a dict mapping each class label to the list that summarize()
    produces for that class's rows.
    """
    separated = separateByClass(dataset)
    # items() exists on both Python 2 and 3; iteritems() was removed in Python 3.
    return {
        classValue: summarize(instances)
        for classValue, instances in separated.items()
    }


In [308]:
import math
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at x.

    mean and stdev are the distribution's mean and standard deviation.
    Raises ZeroDivisionError when stdev is 0.
    """
    squaredDeviation = math.pow(x - mean, 2)
    twiceVariance = 2 * math.pow(stdev, 2)
    exponent = math.exp(-(squaredDeviation / twiceVariance))
    normalizer = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normalizer * exponent

In [309]:
def calculateClassProbabilities(summaries, inputVector):
    """Compute, for every class, the product of per-attribute densities.

    summaries maps each class label to a list of (mean, stdev) pairs, one
    per attribute; attribute i is read from inputVector[i]. Elements of
    inputVector beyond the summarized attributes are ignored.

    Returns a dict mapping each class label to the product of
    calculateProbability() over its attributes (1 when a class has no
    attribute summaries).
    """
    probabilities = {}
    # items() exists on both Python 2 and 3; iteritems() was removed in Python 3.
    for classValue, classSummaries in summaries.items():
        probability = 1
        # attrMean/attrStdev avoid shadowing the mean()/stdev() helper functions.
        for i, (attrMean, attrStdev) in enumerate(classSummaries):
            probability *= calculateProbability(inputVector[i], attrMean, attrStdev)
        probabilities[classValue] = probability
    return probabilities

In [310]:
def predict(summaries, inputVector):
    """Return the class label whose computed probability is largest.

    Probabilities come from calculateClassProbabilities(). On a tie the
    class encountered first is kept. Returns None when there are no classes.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    # items() exists on both Python 2 and 3; iteritems() was removed in Python 3.
    for classValue, probability in probabilities.items():
        # The first class is always accepted; later ones must be strictly better.
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [311]:
def getPredictions(summaries, testSet):
    """Return the label predict() assigns to each row of testSet, in order."""
    return [predict(summaries, row) for row in testSet]

In [312]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    predictions[i] is compared with testSet[i][-1]. Raises ZeroDivisionError
    when testSet is empty.
    """
    hits = sum(
        1 for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    return hits / float(len(testSet)) * 100.0

In [313]:
# Fraction of rows used for training. The notebook never defined this name;
# 0.67 reproduces the 514/254 split recorded in the output below for 768 rows.
# NOTE(review): confirm this is the ratio originally intended.
splitRatio = 0.67
trainingSet, testSet = splitDataset(dataset, splitRatio)
# Shallow copies: the outer lists are new but the row objects are shared.
train = list(trainingSet)
test = list(testSet)
# Call .format() on the string, not on print(): print() returns None on Python 3.
print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print('Accuracy: {0}%'.format(accuracy))

Split 768 rows into train=514 and test=254 rows
Accuracy: 75.9842519685%


In [314]:


# Placeholders; both names are rebound once labelextraction() has run.
features_train, labels_train = [], []
def labelextraction(set):
    """Split rows into feature lists and their class labels.

    The last element of each row is taken as its label; the preceding
    elements are its features. The input rows are NOT modified: each feature
    list is a new list, so rows shared with other collections keep their
    label and calling this function repeatedly gives the same result.

    The parameter name shadows the builtin `set`; it is kept so existing
    keyword callers continue to work.

    Returns a (data, labels) tuple of two parallel lists.
    """
    labels = []
    data = []
    for row in set:
        labels.append(row[-1])
        # Copy the features instead of deleting the label in place.
        data.append(list(row[:-1]))
    return data, labels

# Separate each partition into its feature rows and its class labels.
(features_train, labels_train) = labelextraction(train)
(features_test, labels_test) = labelextraction(test)


In [315]:
import numpy as np
# Convert the feature rows and label lists to NumPy arrays for scikit-learn.
features_train, labels_train = np.array(features_train), np.array(labels_train)
features_test, labels_test = np.array(features_test), np.array(labels_test)

In [316]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# Fit scikit-learn's Gaussian naive Bayes on the same split for comparison
# with the hand-written implementation above.
clf = GaussianNB()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
# accuracy_score returns a fraction by default, so it is scaled by 100 when
# printed. Note that `accuracy` held a percentage in the earlier cell.
accuracy = accuracy_score(labels_test, pred)
# Call .format() on the string, not on print(): print() returns None on Python 3.
print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(train), len(test)))
print('Accuracy: {0}%'.format(accuracy * 100))

Split 768 rows into train=514 and test=254 rows
Accuracy: 77.5590551181%
