In [1]:
import csv
import random

In [2]:
def loadDataset(filename, split1, trainingSet=None, testSet=None, content_header=None):
    """Read a CSV file and randomly partition its rows into training and test lists.

    All rows of ``filename`` are read. For every row except the last one,
    columns ``1`` through ``len(content_header) - 2`` are converted to
    ``float`` in place; the row is then appended to ``trainingSet`` when
    ``random.random() < split1`` and to ``testSet`` otherwise.

    Args:
        filename: Path of the CSV file to read.
        split1: Probability with which a row is sent to the training set.
        trainingSet: List that receives the training rows. A fresh list is
            created when omitted.
        testSet: List that receives the test rows. A fresh list is created
            when omitted.
        content_header: Sequence of column names; only its length is used,
            to decide how many columns are converted to ``float``. When
            omitted no column is converted.

    Returns:
        The ``(trainingSet, testSet)`` pair. These are the same list objects
        that were passed in, so callers that rely on in-place filling and
        ignore the return value keep working.

    Raises:
        ValueError: If a cell that has to be numeric cannot be parsed.
        IndexError: If a row has fewer columns than ``content_header`` implies.
    """
    # Defaults are created per call; a literal ``[]`` default would be shared
    # by every call and silently accumulate rows from earlier files.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    if content_header is None:
        content_header = []

    # ``newline=''`` is what the csv module expects, and the context manager
    # guarantees the handle is closed even if parsing fails.
    with open(filename, 'r', newline='') as handle:
        dataset = list(csv.reader(handle))

    numeric_stop = len(content_header) - 1
    # NOTE(review): the final row of the file is never processed because the
    # range stops at len(dataset) - 1. Kept as-is; confirm this is intended.
    for x in range(len(dataset) - 1):
        row = dataset[x]
        for y in range(1, numeric_stop):
            row[y] = float(row[y])
        if random.random() < split1:
            trainingSet.append(row)
        else:
            testSet.append(row)
    return trainingSet, testSet

In [3]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [4]:
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Compute the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of row lists (the sides of the split). The class
            label of a row is its last element.
        classes: Iterable of the class labels to score.

    Returns:
        The sum over all non-empty groups of ``(1 - sum(p_c ** 2))`` weighted
        by the group's share of the total row count, where ``p_c`` is the
        fraction of rows in the group labelled ``c``.
    """
    # Total number of rows across every side of the split.
    total_rows = float(sum(len(bucket) for bucket in groups))
    weighted = 0.0
    for bucket in groups:
        bucket_rows = float(len(bucket))
        # An empty side contributes nothing and would divide by zero below.
        if bucket_rows == 0:
            continue
        purity = 0.0
        for label in classes:
            share = [record[-1] for record in bucket].count(label) / bucket_rows
            purity += share * share
        # Impurity of this side, scaled by how many rows it holds.
        impurity = 1.0 - purity
        weight = bucket_rows / total_rows
        weighted += impurity * weight
    return weighted

In [5]:
# Select the best split point for a dataset
def get_split(dataset):
    """Search every column/value pair for the split with the lowest Gini index.

    Each value found in each non-label column is tried as a threshold. The
    first candidate with the strictly lowest Gini index wins.

    Args:
        dataset: Non-empty list of rows whose last element is the class label.

    Returns:
        A dict with keys ``'index'`` (column of the chosen split),
        ``'value'`` (threshold) and ``'groups'`` (the ``(left, right)`` row
        lists produced by ``test_split`` for that choice).
    """
    labels = list(set(record[-1] for record in dataset))
    # Sentinel values, replaced by the first candidate that is evaluated.
    best = {'index': 999, 'value': 999, 'groups': None}
    lowest_gini = 999
    feature_count = len(dataset[0]) - 1  # the last column is the label
    for column in range(feature_count):
        for record in dataset:
            threshold = record[column]
            candidate = test_split(column, threshold, dataset)
            score = gini_index(candidate, labels)
            if score < lowest_gini:
                lowest_gini = score
                best = {'index': column, 'value': threshold, 'groups': candidate}
    return best

In [6]:
# Create a terminal node value
def to_terminal(group):
    """Return the most frequent class label among the rows of ``group``.

    The class label of a row is its last element. When several labels are
    equally frequent, the one that appears first in ``group`` is returned, so
    the result does not depend on set/hash ordering and is reproducible
    between interpreter runs.

    Args:
        group: Non-empty iterable of rows.

    Returns:
        The majority class label.

    Raises:
        ValueError: If ``group`` is empty.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from collections import Counter

    # One pass over the rows instead of one ``list.count`` scan per label.
    tally = Counter(row[-1] for row in group)
    # Iterating a Counter follows first-insertion order and max() keeps the
    # first maximum it meets, which gives the deterministic tie-break.
    return max(tally, key=tally.get)

In [7]:
def split(node, max_depth, min_size, depth):
    """Recursively grow the subtree below ``node`` in place.

    ``node`` must carry a ``'groups'`` entry holding its ``(left, right)``
    row lists. That entry is removed, and ``node['left']`` and
    ``node['right']`` are set to either a class label (leaf) or a nested
    node dict produced by ``get_split``.

    Args:
        node: Node dict as returned by ``get_split``.
        max_depth: Depth at which both children are forced to be leaves.
        min_size: A side with at most this many rows becomes a leaf.
        depth: Depth of ``node`` itself.

    Returns:
        None. ``node`` is modified in place.
    """
    groups = node['groups']
    left_rows, right_rows = groups
    del node['groups']

    # One side is empty: nothing was separated, so both children share one leaf.
    if not (left_rows and right_rows):
        leaf = to_terminal(left_rows + right_rows)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: stop growing and label both sides.
    if depth >= max_depth:
        left_leaf = to_terminal(left_rows)
        right_leaf = to_terminal(right_rows)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return

    # The left branch is grown completely before the right one.
    for side, rows in (('left', left_rows), ('right', right_rows)):
        if len(rows) > min_size:
            node[side] = get_split(rows)
            split(node[side], max_depth, min_size, depth + 1)
        else:
            node[side] = to_terminal(rows)

In [8]:
def build_tree(train, max_depth, min_size):
    """Build a decision tree from the training rows.

    Args:
        train: Training rows; the last element of each row is the class label.
        max_depth: Forwarded to ``split`` as the depth limit.
        min_size: Forwarded to ``split`` as the leaf-size threshold.

    Returns:
        The root node dict, fully grown by ``split`` starting at depth 1.
    """
    tree = get_split(train)
    split(tree, max_depth, min_size, depth=1)
    return tree

In [9]:
def predict(node, row):
    """Walk the tree from ``node`` down to a leaf and return that leaf's value.

    At every node the row goes left when ``row[node['index']]`` is strictly
    less than ``node['value']`` and right otherwise. A child that is a dict
    is another node; anything else is treated as the prediction.

    Args:
        node: Node dict with ``'index'``, ``'value'``, ``'left'`` and ``'right'``.
        row: Indexable sequence of feature values.

    Returns:
        The value stored at the leaf that ``row`` reaches.
    """
    current = node
    while True:
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child

In [10]:
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    """Fit a tree on ``train`` and predict a label for every row of ``test``.

    Args:
        train: Training rows, passed to ``build_tree``.
        test: Rows to classify.
        max_depth: Passed to ``build_tree``.
        min_size: Passed to ``build_tree``.

    Returns:
        A list with one prediction per row of ``test``, in the same order.
    """
    fitted = build_tree(train, max_depth, min_size)
    return [predict(fitted, sample) for sample in test]

In [11]:
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label matches the prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is the true label.
        predictions: Sequence of predicted labels, index-aligned with ``testSet``.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``testSet``
        yields 0.0 instead of raising ``ZeroDivisionError``; a random
        train/test split can leave the test set empty.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        return 0.0
    # Index-based on purpose: a too-short ``predictions`` must fail loudly
    # rather than be silently truncated as ``zip`` would do.
    correct = sum(1 for x in range(total) if testSet[x][-1] == predictions[x])
    return (correct / float(total)) * 100.0

In [12]:
def predictFor(filename, stockname, split1, max_depth=5, min_size=10, seed=None):
    """Train and evaluate a decision tree on one CSV file and print a summary.

    The file is loaded and randomly split by ``loadDataset``, the first
    column of every row is dropped, a tree is fitted on the training rows
    and its accuracy on the test rows is printed together with the row counts.

    Args:
        filename: Path of the CSV file to load.
        stockname: Label printed in the summary.
        split1: Probability with which a row goes to the training set.
        max_depth: Depth limit of the tree (default 5, the value that was
            previously hard-coded).
        min_size: Leaf-size threshold of the tree (default 10, the value
            that was previously hard-coded).
        seed: Optional seed for the ``random`` module. When given, the
            train/test split is reproducible; when ``None`` the global
            random state is left untouched.

    Returns:
        None. Results are printed.

    Raises:
        IndexError: If the random split leaves the training set empty
            (``get_split`` indexes the first training row).
    """
    if seed is not None:
        random.seed(seed)

    # Column layout handed to the loader, which uses only its length to decide
    # which columns to convert to float. The last column is the label compared
    # by getAccuracy.
    iv = ["date", "open", "high", "low", "yesterday closing adj", "state change"]
    trainingSet = []
    testSet = []
    loadDataset(filename, split1, trainingSet, testSet, iv)

    # Drop the first column of every row so only the float-converted columns
    # and the label remain.
    for row in trainingSet:
        row.pop(0)
    for row in testSet:
        row.pop(0)

    predictions = decision_tree(trainingSet, testSet, max_depth, min_size)
    acc = getAccuracy(testSet, predictions)
    totalCount = len(trainingSet) + len(testSet)
    print("Predicting for ", stockname)
    print("Train: " + repr(len(trainingSet)))
    print("Test: " + repr(len(testSet)))
    print("Total: " + repr(totalCount))
    print("Accuracy: ",acc)

In [13]:
# Probability that a row is placed in the training set; the remaining rows
# form the test set. Every predictFor call in this notebook uses this value.
split1 = 0.90
# Fit and score a tree on amtd.csv. The train/test split is random, so the
# printed counts and accuracy can differ from one run to the next.
predictFor('amtd.csv', 'AMTD', split1)

Predicting for  AMTD
Train: 3432
Test: 414
Total: 3846
Accuracy:  50.72463768115942


In [14]:
# Same experiment on yahoo.csv, using the split fraction defined in split1.
predictFor('yahoo.csv', 'YHOO', split1)

Predicting for  YHOO
Train: 3475
Test: 371
Total: 3846
Accuracy:  55.79514824797843


In [15]:
# Same experiment on twtr.csv, using the split fraction defined in split1.
predictFor('twtr.csv', 'TWTR', split1)

Predicting for  TWTR
Train: 782
Test: 80
Total: 862
Accuracy:  56.25


In [19]:
# Same experiment on sbux.csv, using the split fraction defined in split1.
predictFor('sbux.csv', 'SBUX', split1)

Predicting for  SBUX
Train: 3476
Test: 370
Total: 3846
Accuracy:  52.43243243243243


In [17]:
# Same experiment on disney.csv, using the split fraction defined in split1.
predictFor('disney.csv', 'DIS', split1)

Predicting for  DIS
Train: 3490
Test: 356
Total: 3846
Accuracy:  53.65168539325843


In [21]:
# Same experiment on amazon.csv, using the split fraction defined in split1.
predictFor('amazon.csv', 'AMZN', split1)

Predicting for  AMZN
Train: 3467
Test: 379
Total: 3846
Accuracy:  52.242744063324544
