In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [9]:
import random
import math
%matplotlib inline

In [6]:
# Load the weather data set; the path is relative to the current working directory.
df = pd.read_csv("./weather.csv")

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Class
0,0,0,0,0,0
1,0,0,0,1,0
2,1,0,0,0,1
3,2,1,0,0,1
4,2,2,1,0,1


In [10]:
# Work on the raw NumPy array; the functions in this notebook read the class label from the last column (row[-1]).
dataset = df.values

In [11]:
def separateByClass(dataset):
    """Group the rows of ``dataset`` by class label.

    The last element of each row (``row[-1]``) is used as the label.

    Args:
        dataset: Indexable collection of rows that supports ``len()``.

    Returns:
        dict mapping each label to the list of rows carrying that label,
        with rows kept in their original order.
    """
    groups = {}
    for position in range(len(dataset)):
        row = dataset[position]
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

In [18]:
# Show the data set rows grouped by their class label (last column).
separateByClass(dataset)

{0: [array([0, 0, 0, 0, 0], dtype=int64),
  array([0, 0, 0, 1, 0], dtype=int64),
  array([2, 2, 1, 1, 0], dtype=int64),
  array([0, 1, 0, 0, 0], dtype=int64),
  array([2, 1, 0, 1, 0], dtype=int64)],
 1: [array([1, 0, 0, 0, 1], dtype=int64),
  array([2, 1, 0, 0, 1], dtype=int64),
  array([2, 2, 1, 0, 1], dtype=int64),
  array([1, 2, 1, 1, 1], dtype=int64),
  array([0, 2, 1, 0, 1], dtype=int64),
  array([2, 1, 1, 0, 1], dtype=int64),
  array([0, 1, 1, 1, 1], dtype=int64),
  array([1, 1, 0, 1, 1], dtype=int64),
  array([1, 0, 1, 0, 1], dtype=int64)]}

In [12]:
def mean(numbers):
    """Return the arithmetic average of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by ``n - 1``.

    Raises:
        ZeroDivisionError: if ``numbers`` has fewer than two elements.
    """
    center = mean(numbers)
    squaredDeviations = 0
    for value in numbers:
        squaredDeviations += pow(value - center, 2)
    return math.sqrt(squaredDeviations / float(len(numbers) - 1))
def summarize(dataset):
    """Return ``(mean, standard deviation)`` for each feature column of ``dataset``.

    Rows are transposed into columns, one summary is computed per column, and
    the summary of the final column is discarded because that column holds the
    class label rather than a feature.

    Note: ``np.std`` is called with its default ``ddof=0`` (population form),
    which is not the ``n - 1`` divisor used by this notebook's ``stdev()``.

    Args:
        dataset: Iterable of equally long rows of numeric values.

    Returns:
        list of ``(mean, std)`` tuples, one per feature column.

    Raises:
        IndexError: if ``dataset`` yields no columns.
    """
    stats = []
    for column in zip(*dataset):
        stats.append((np.mean(column), np.std(column)))
    # Drop the entry computed for the class-label column.
    stats.pop()
    return stats

In [13]:
# Per-feature (mean, std) over the whole data set, ignoring class membership.
summarize(dataset)

[(1.0, 0.8451542547285166),
 (1.0, 0.7559289460184544),
 (0.5, 0.5),
 (0.42857142857142855, 0.49487165930539345)]

In [16]:
def summarizeByClass(dataset):
    """Compute per-feature summaries separately for every class.

    The data set is first split with ``separateByClass()``; each resulting
    group of rows is then passed to ``summarize()``.

    Args:
        dataset: The full data set, with the class label in the last column.

    Returns:
        dict mapping each class label to the list of ``(mean, std)`` tuples
        that ``summarize()`` returns for that class's rows.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [17]:
# Per-class, per-feature (mean, std) summaries used later for prediction.
summarizeByClass(dataset)

{0: [(0.8, 0.9797958971132713),
  (0.8, 0.7483314773547883),
  (0.2, 0.4000000000000001),
  (0.6, 0.48989794855663565)],
 1: [(1.1111111111111112, 0.7370277311900889),
  (1.1111111111111112, 0.7370277311900889),
  (0.6666666666666666, 0.4714045207910317),
  (0.3333333333333333, 0.4714045207910317)]}

In [19]:
# calculateProbability() returns the probability corresponding to a value, based on a Gaussian distribution with a specific mean and standard deviation
import math
def calculateProbability(x, mean, stdev):
    """Return the Gaussian density of ``x`` for the given ``mean`` and ``stdev``.

    Args:
        x: The observed feature value.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.

    Returns:
        The value of the normal probability density function at ``x``. This is
        a density, so it may exceed 1 for small ``stdev``.

        When ``stdev`` is 0 the distribution is treated as a point mass at
        ``mean``: 1.0 is returned if ``x`` equals ``mean`` and 0.0 otherwise.
    """
    if stdev == 0:
        # A zero-variance feature used to return 0 unconditionally, which made
        # the whole class impossible even when x matched the constant value.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent



In [20]:
# Example
x = 5
# Deliberately not named `mean` / `stdev`: those names are functions defined
# earlier in this notebook, and rebinding them to numbers would break them.
exampleMean = 5
exampleStdev = 2
probability = calculateProbability(x, exampleMean, exampleStdev)
print('Probability of belonging to class:', probability)

Probability of belonging to class: 0.19947114020071635


In [21]:
def calculateClassProbabilities(summaries, inputVector):
    """Score ``inputVector`` against every class in ``summaries``.

    For each class, the Gaussian densities of the individual features are
    multiplied together. The products are not normalized, so they do not sum
    to 1 across classes.

    Args:
        summaries: dict mapping class label to a list of ``(mean, stdev)``
            pairs, one per feature.
        inputVector: Sequence of feature values; element ``i`` is scored
            against the ``i``-th pair of each class.

    Returns:
        dict mapping each class label to the product of its feature densities.
    """
    scores = {}
    for label, featureStats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(featureStats):
            likelihood *= calculateProbability(inputVector[position], mu, sigma)
        scores[label] = likelihood
    return scores

In [22]:
# Example: a single feature and two classes.
# Each class maps to a list of (mean, stdev) pairs, one pair per feature.
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1]
# Each class receives the product of its per-feature Gaussian densities.
probabilities = calculateClassProbabilities(summaries, inputVector)
print('Probabilities for each class:', probabilities)


Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}


Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}

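For the given input 1.1, the Gaussian likelihood under class 0 is about 0.78, while under class 1 it is only about 0.000063.
These values are unnormalized densities rather than probabilities (they do not sum to 1), but their relative size shows that class 0 is by far the better fit.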

In [23]:
def predict(summaries, inputVector):
    """Return the class label with the highest score for ``inputVector``.

    Scores come from ``calculateClassProbabilities()``. When several classes
    share the highest score, the one encountered first wins.

    Args:
        summaries: dict mapping class label to per-feature ``(mean, stdev)`` pairs.
        inputVector: Sequence of feature values to classify.

    Returns:
        The winning class label, or ``None`` if ``summaries`` has no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner = None
    winnerScore = -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not strictly beat the current leader.
        if winner is not None and not (score > winnerScore):
            continue
        winner = label
        winnerScore = score
    return winner

In [24]:
# Example: 1.1 lies close to class 'A' (mean 1, stdev 0.5) and far from class 'B' (mean 20, stdev 5.0).
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [1.1]
label = predict(summaries, inputVector)
print('Prediction:', label)

Prediction: A


In [27]:
# Example: the same summaries, but 18.1 lies close to class 'B' (mean 20, stdev 5.0).
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [18.1]
label = predict(summaries, inputVector)
print('Prediction:', label)

Prediction: B


In [25]:
def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Args:
        summaries: dict mapping class label to per-feature ``(mean, stdev)`` pairs.
        testSet: Indexable collection of input rows that supports ``len()``.

    Returns:
        list with one ``predict()`` result per row, in row order.
    """
    return [
        predict(summaries, testSet[position])
        for position in range(len(testSet))
    ]



In [26]:
# Example: two rows to classify. The trailing '?' is a placeholder for the
# unknown label; only the first element is scored because each class has a
# single (mean, stdev) pair.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
print('Prediction:', predictions)

Prediction: ['A', 'B']


In [28]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose known label matches the prediction.

    The known label of each row is its last element (``row[-1]``).

    Args:
        testSet: Indexable collection of rows that supports ``len()``.
        predictions: Indexable collection of predicted labels, aligned with
            ``testSet`` by position.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``testSet`` yields
        0.0 rather than raising ``ZeroDivisionError``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = 0
    for x in range(total):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(total)) * 100.0

# Example: two of the three rows carry the label 'a', so predicting 'a' for
# every row is correct two times out of three.
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print('Accuracy:', accuracy)

Accuracy: 66.66666666666666


In [29]:
def splitDataset(dataset, splitRatio, seed=None):
    """Randomly divide ``dataset`` into training and testing subsets.

    Rows are drawn without replacement until the training set holds
    ``int(len(dataset) * splitRatio)`` rows; the rows left over form the test
    set. ``dataset`` itself is not modified.

    Args:
        dataset: Sized iterable of rows.
        splitRatio: Fraction of rows to place in the training set.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used.

    Returns:
        ``[trainSet, testSet]`` as a two-element list of lists.

    Raises:
        ValueError: if ``splitRatio`` asks for more rows than ``dataset`` has.
    """
    remaining = list(dataset)
    trainSize = int(len(remaining) * splitRatio)
    if trainSize > len(remaining):
        # Fail up front instead of partway through with randrange's
        # "empty range" error once every row has been drawn.
        raise ValueError("splitRatio must not exceed 1.0; got %r" % (splitRatio,))
    rng = random if seed is None else random.Random(seed)
    trainSet = []
    while len(trainSet) < trainSize:
        index = rng.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]

In [33]:
def main(csvPath="./weather.csv", inputVector=(2, 0, 0, 1)):
    """Fit the per-class summaries on a CSV file and print one prediction.

    Args:
        csvPath: Path of the CSV file to load; the last column is used as the
            class label. Defaults to the weather data set used in this notebook.
        inputVector: Feature values of the single row to classify.

    Prints:
        The list of predicted labels (one element) for ``inputVector``.
    """
    df = pd.read_csv(csvPath)
    dataset = df.values
    summaries = summarizeByClass(dataset)
    predictions = getPredictions(summaries, [list(inputVector)])
    print(predictions)
 


In [34]:
# Run the end-to-end example: load the CSV, summarize by class, classify one row.
main()

[0]
