In [1]:
import numpy as np
import math
import csv

def read_data(filename):
    """Load a CSV file, separating the header row from the data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(metadata, traindata)`` tuple. ``metadata`` is the first row of
        the file (the attribute names) and ``traindata`` is a list holding
        every remaining row, each row being a list of strings.

    Raises:
        StopIteration: If the file contains no rows at all.
    """
    # newline='' hands line-ending handling to the csv module; without it,
    # universal-newline translation rewrites "\r\n" inside quoted fields.
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile)
        metadata = next(datareader)
        traindata = list(datareader)
    return (metadata, traindata)

def splitDataset(dataset, splitRatio):
    """Split ``dataset`` into a leading training part and a trailing test part.

    The first ``int(len(dataset) * splitRatio)`` rows become the training
    set and the remaining rows the test set. Row order is preserved and
    ``dataset`` itself is not modified.

    Args:
        dataset: The rows to split.
        splitRatio: Fraction of the rows to place in the training set.

    Returns:
        A two-element list ``[trainSet, testSet]`` of newly created lists.

    Raises:
        IndexError: If ``splitRatio`` asks for more training rows than
            ``dataset`` contains.
    """
    rows = list(dataset)
    trainSize = int(len(rows) * splitRatio)
    if trainSize > len(rows):
        # IndexError is kept on purpose: it is what popping from the
        # exhausted list used to raise for an oversized ratio.
        raise IndexError(
            "splitRatio %r requests %d training rows but only %d are available"
            % (splitRatio, trainSize, len(rows))
        )
    # A non-positive size means an empty training set; clamp it so the slice
    # below does not interpret a negative value as "count from the end".
    trainSize = max(trainSize, 0)
    # One slice replaces the repeated pop(0), which was quadratic.
    return [rows[:trainSize], rows[trainSize:]]

def classify(data, test):
    """Classify ``test`` rows with a counting-based Naive Bayes and print a report.

    Class priors and per-attribute conditional frequencies are estimated from
    ``data`` by plain counting (no smoothing). Each row of ``test`` is then
    labelled 'yes' or 'no' and compared with the label in its own last
    column to compute the accuracy.

    Args:
        data: 2-D NumPy array of training rows. The last column is the
            target; only rows labelled 'yes' or 'no' are counted.
        test: 2-D NumPy array of rows to classify, with the same column
            layout as ``data`` (its last column is the expected label).

    Returns:
        None. The data sizes, class priors, one line per test instance
        (index, prediction, expected label) and the accuracy percentage are
        written to standard output.
    """
    total_size = data.shape[0]
    test_size = test.shape[0]
    print("\n")
    print("Training data size:", total_size)
    print("Test data size:", test_size)

    # Boolean row masks per class, computed once and reused for every count.
    labels = data[:, -1]
    yes_rows = labels == 'yes'
    no_rows = labels == 'no'
    countYes = int(np.count_nonzero(yes_rows))
    countNo = int(np.count_nonzero(no_rows))

    # Guard the priors so an empty training array does not divide by zero.
    probYes = countYes / total_size if total_size else 0.0
    probNo = countNo / total_size if total_size else 0.0

    print("\n")
    print("Target count probability")
    print('Yes', "\t", countYes, "\t", probYes)
    print('No', "\t", countNo, "\t", probNo)

    n_features = test.shape[1] - 1
    accuracy = 0
    print("\n")
    print("Instance prediction target")

    for t in range(test_size):
        # Start from the class prior and multiply in one conditional
        # frequency per attribute.
        probno = probNo
        probyes = probYes
        for k in range(n_features):
            matches = data[:, k] == test[t, k]
            count0 = int(np.count_nonzero(matches & no_rows))
            count1 = int(np.count_nonzero(matches & yes_rows))
            # A class absent from the training data has prior 0, so its
            # score is 0 whatever the likelihood; use 0.0 instead of
            # dividing by a zero class count.
            probno *= (count0 / countNo) if countNo else 0.0
            probyes *= (count1 / countYes) if countYes else 0.0

        # Ties go to 'yes'.
        if probno > probyes:
            predict = 'no'
        else:
            predict = 'yes'

        expected = test[t, -1]
        print(t + 1, "\t", predict, "\t ", expected)
        if predict == expected:
            accuracy += 1

    # With no test rows there is nothing to score; report 0.0 rather than
    # dividing by zero.
    final_accuracy = (accuracy / test_size) * 100 if test_size else 0.0
    print("Accuracy:", final_accuracy, "%")
    return

# Driver: load the dataset, split it, echo both parts, then classify.

# The first CSV row is returned as the attribute names, the rest as data rows.
metadata, traindata = read_data("3-dataset.csv")
print("The attributes names of training data: ", metadata)
# The leading 60% of the rows are used for training; the remainder is held
# out for testing (the split is positional, not random).
splitRatio = 0.6
trainingSet, testSet = splitDataset(traindata, splitRatio)
# classify() indexes its inputs as 2-D NumPy arrays, so convert the row lists.
training = np.array(trainingSet)
print("\nThe training data set are:")
# Iterates the plain Python list, so each row prints as a list literal.
for x in trainingSet:
    print(x)

testing = np.array(testSet)
print("\nThe test data set are:")
# Iterates the NumPy array, so each row prints in NumPy's array format.
for x in testing:
    print(x)

# Prints the priors, the per-instance predictions and the accuracy.
classify(training, testing)


The attributes names of training data:  ['weather', 'temperature', 'humidity', 'wind', 'play football']

The training data set are:
['sunny', 'hot', 'high', 'weak', 'no']
['sunny', 'hot', 'high', 'strong', 'no']
['cloudy', 'hot', 'high', 'weak', 'yes']
['rain', 'mild', 'high', 'weak', 'yes']
['rain', 'cool', 'normal', 'weak', 'yes']
['rain', 'cool', 'normal', 'strong', 'no']
['cloudy', 'cool', 'normal', 'strong', 'yes']
['sunny', 'mild', 'high', 'weak', 'no']

The test data set are:
['sunny' 'cool' 'normal' 'weak' 'yes']
['rain' 'mild' 'normal' 'weak' 'yes']
['sunny' 'mild' 'normal' 'strong' 'yes']
['cloudy' 'mild' 'high' 'strong' 'yes']
['cloudy' 'hot' 'normal' 'weak' 'yes']
['rain' 'mild' 'high' 'strong' 'no']


Training data size: 8
Test data size: 6


Target count probability
Yes 	 4 	 0.5
No 	 4 	 0.5


Instance prediction target
1 	 no 	  yes
2 	 yes 	  yes
3 	 no 	  yes
4 	 yes 	  yes
5 	 yes 	  yes
6 	 no 	  no
Accuracy: 66.66666666666666 %
