In [1]:
import csv
import random
from math import sqrt
from math import exp
from math import pi

In [2]:
def loadDataset(filename, split, trainingSet=None, testSet=None, content_header=None):
    """Read a CSV file and randomly partition its rows into train and test lists.

    Args:
        filename: Path of the CSV file to read.
        split: Threshold compared against ``random.random()`` for every row;
            a row goes to the training list when the draw is below it.
        trainingSet: Optional list that receives the training rows (appended
            in place). A fresh list is created when omitted.
        testSet: Optional list that receives the test rows (appended in
            place). A fresh list is created when omitted.
        content_header: Sequence of column names. Only its length is used:
            columns ``1 .. len(content_header) - 2`` of each row are converted
            to ``float``; column 0 and the final column stay as strings.

    Returns:
        A ``(trainingSet, testSet)`` tuple. When lists were passed in, the
        very same objects are returned.

    Raises:
        ValueError: If a cell in a converted column is not a valid float.
        IndexError: If a row has fewer columns than ``content_header`` implies.
    """
    # Default to None rather than []: a mutable default is created once and
    # would silently accumulate rows across calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    if content_header is None:
        content_header = []

    # newline='' is what the csv module expects; the with-block closes the file.
    with open(filename, 'r', newline='') as handle:
        dataset = list(csv.reader(handle))

    # NOTE(review): the final row of the file is never used (len - 1). This is
    # kept as-is because it may be deliberate (e.g. an incomplete last record)
    # -- confirm against the data files.
    for x in range(len(dataset) - 1):
        for y in range(1, len(content_header) - 1):
            dataset[x][y] = float(dataset[x][y])
        if random.random() < split:
            trainingSet.append(dataset[x])
        else:
            testSet.append(dataset[x])

    return trainingSet, testSet

In [3]:
# Group the rows of a dataset by class label and return the groups as a dict
def separate_by_class(dataset):
    """Bucket rows by their last element.

    Args:
        dataset: Indexable sequence of rows; the last element of each row is
            treated as its class label and must be hashable.

    Returns:
        Dict mapping each label to the list of rows carrying it, in input
        order. The rows are the original objects, not copies.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups

In [4]:
# Arithmetic mean of a collection of numbers
def mean(numbers):
    """Return the average of ``numbers`` as a float.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / float(count)

In [5]:
# Sample standard deviation (n - 1 in the denominator) of a collection of numbers
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    Args:
        numbers: Sized collection of numeric values.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements (the
            ``n - 1`` denominator is zero, or the mean itself cannot be
            computed for an empty collection).
    """
    center = mean(numbers)
    squared_deviation_total = sum((value - center) ** 2 for value in numbers)
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(squared_deviation_total / degrees_of_freedom)

In [6]:
def summarize_dataset(dataset):
    """Compute per-column statistics for every feature column of ``dataset``.

    The last element of each row is treated as the class label and is left
    out of the statistics. The rows themselves are NOT modified: the label is
    sliced off a view of each row instead of being popped, so the caller's
    data keeps its labels and the function can safely be called repeatedly on
    the same rows.

    Args:
        dataset: Iterable of rows (sequences) whose final element is the label.

    Returns:
        List with one ``(mean, stdev, count)`` tuple per feature column, in
        column order. Empty when ``dataset`` is empty.

    Raises:
        ZeroDivisionError: Propagated from ``stdev`` when a column has fewer
            than two values.
    """
    # row[:-1] drops the label without touching the caller's row objects.
    feature_columns = zip(*(row[:-1] for row in dataset))
    summaries = [(mean(column), stdev(column), len(column)) for column in feature_columns]
    return summaries

In [7]:
def separate_by_class(dataset):
    """Bucket rows by their last element (the class label).

    This cell repeats the ``separate_by_class`` definition from an earlier
    cell with equivalent logic; running it simply rebinds the name.

    Args:
        dataset: Indexable sequence of rows; each row's last element is its
            label and must be hashable.

    Returns:
        Dict mapping each label to the list of the original row objects that
        carry it, in input order.
    """
    by_label = dict()
    for row in dataset:
        label = row[-1]
        if label in by_label:
            by_label[label].append(row)
        else:
            by_label[label] = [row]
    return by_label

In [8]:
# Split the dataset by class, then compute column statistics within each class
def summarize_by_class(dataset):
    """Build the per-class summaries used by the classifier.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        Dict mapping each class label to whatever ``summarize_dataset``
        returns for the rows of that class.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

In [9]:
# Gaussian probability density at x for the given mean and standard deviation
def calculate_probability(x, mean, stdev):
    """Evaluate the normal density N(mean, stdev**2) at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = (x - mean) ** 2
    spread = 2 * stdev ** 2
    decay = exp(-(squared_distance / spread))
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * decay

In [10]:
# Score every class for a given row: class prior times the per-feature Gaussian densities
def calculate_class_probabilities(summaries, row):
    """Compute an (unnormalised) score per class for ``row``.

    Args:
        summaries: Dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature column.
        row: Sequence of feature values; ``row[i]`` is matched with the i-th
            summary tuple. Extra trailing elements are ignored.

    Returns:
        Dict mapping each class label to its prior (the class's share of the
        total row count) multiplied by the Gaussian density of every feature.
        The values are not normalised to sum to one.
    """
    # The count stored with the first column is used as the class's row count.
    total_rows = sum(summaries[label][0][2] for label in summaries)
    scores = {}
    for label, column_stats in summaries.items():
        score = column_stats[0][2] / float(total_rows)
        for position, (mu, sigma, _count) in enumerate(column_stats):
            score *= calculate_probability(row[position], mu, sigma)
        scores[label] = score
    return scores

In [11]:
# Pick the class with the highest score for a given row
def predict(summaries, row):
    """Return the label whose score from ``calculate_class_probabilities`` is largest.

    Args:
        summaries: Per-class summaries, passed through unchanged.
        row: The row to classify, passed through unchanged.

    Returns:
        The winning class label, or ``None`` when there are no classes. On a
        tie the label encountered first is kept.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    top_score = -1
    for label, score in scores.items():
        # Skip anything that does not strictly beat the current leader.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

In [12]:
# Naive Bayes: fit per-class summaries on the training rows, then classify the test rows
def naive_bayes(train, test):
    """Train on ``train`` and return one predicted label per row of ``test``.

    Args:
        train: Training rows, handed to ``summarize_by_class``.
        test: Iterable of rows to classify.

    Returns:
        List of predicted labels, in the same order as ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

In [13]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label matches the prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is the true
            label.
        predictions: Sequence of predicted labels, aligned with ``testSet``
            by position.

    Returns:
        Accuracy as a float in the range 0.0 to 100.0. An empty ``testSet``
        yields 0.0 instead of raising ``ZeroDivisionError`` -- a random split
        of a small file can legitimately leave the test set empty.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        return 0.0
    correct = sum(
        1 for index in range(total) if testSet[index][-1] == predictions[index]
    )
    return (correct / float(total)) * 100.0

In [14]:
def predictFor(filename, stockname, split, seed=None):
    """Train and evaluate the Naive Bayes classifier on one CSV file and print a report.

    Args:
        filename: Path of the CSV file, forwarded to ``loadDataset``.
        stockname: Label printed in the report header.
        split: Train/test threshold forwarded to ``loadDataset``.
        seed: Optional value passed to ``random.seed`` before the data is
            split, so the partition (and therefore the reported accuracy) can
            be reproduced. When ``None`` the global random state is left
            untouched, which matches the previous behaviour.

    Returns:
        None. The train/test/total row counts and the accuracy are printed.
    """
    # Column names of the input file; loadDataset uses the length of this
    # list to decide which columns to convert to float.
    iv = ["date", "open", "high", "low", "yesterday closing adj", "state change"]

    if seed is not None:
        random.seed(seed)

    trainingSet = []
    testSet = []
    loadDataset(filename, split, trainingSet, testSet, iv)

    # Remove the leading date column so that only the numeric features and
    # the trailing class label remain in each row.
    for row in trainingSet:
        row.pop(0)
    for row in testSet:
        row.pop(0)

    predictions = naive_bayes(trainingSet, testSet)
    acc = getAccuracy(testSet, predictions)
    totalCount = len(trainingSet) + len(testSet)

    print("Predicting for ", stockname)
    print("Train: " + repr(len(trainingSet)))
    print("Test: " + repr(len(testSet)))
    print("Total: " + repr(totalCount))
    print("Accuracy: ",acc)

In [15]:
# Threshold for the random train/test assignment: a row goes to the training
# set when random.random() draws below this value, so the realised set sizes
# vary from run to run. Later cells reuse this variable.
split = 0.90
predictFor('amtd.csv', 'AMTD', split)

Predicting for  AMTD
Train: 3462
Test: 384
Total: 3846
Accuracy:  54.947916666666664


In [16]:
# Same evaluation on yahoo.csv; relies on `split` defined in an earlier cell.
predictFor('yahoo.csv', 'YHOO', split)

Predicting for  YHOO
Train: 3463
Test: 383
Total: 3846
Accuracy:  54.56919060052219


In [17]:
# Same evaluation on twtr.csv; relies on `split` defined in an earlier cell.
predictFor('twtr.csv', 'TWTR', split)

Predicting for  TWTR
Train: 769
Test: 93
Total: 862
Accuracy:  52.68817204301075


In [18]:
# Same evaluation on sbux.csv; relies on `split` defined in an earlier cell.
predictFor('sbux.csv', 'SBUX', split)

Predicting for  SBUX
Train: 3478
Test: 368
Total: 3846
Accuracy:  51.358695652173914


In [19]:
# Same evaluation on disney.csv; relies on `split` defined in an earlier cell.
predictFor('disney.csv', 'DIS', split)

Predicting for  DIS
Train: 3462
Test: 384
Total: 3846
Accuracy:  49.47916666666667


In [20]:
# Same evaluation on amazon.csv; relies on `split` defined in an earlier cell.
predictFor('amazon.csv', 'AMZN', split)

Predicting for  AMZN
Train: 3465
Test: 381
Total: 3846
Accuracy:  51.44356955380578
