In [1]:
import csv
import random
from math import sqrt
from random import randrange

In [2]:
def loadDataset(filename, split1, trainingSet=None, testSet=None, content_header=None):
    """Read a CSV file and randomly distribute its rows into train and test lists.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    split1 : float
        A row goes to ``trainingSet`` when ``random.random() < split1`` and to
        ``testSet`` otherwise.
    trainingSet, testSet : list, optional
        Lists that receive the rows (appended in place).  A fresh list is
        created for each one that is omitted, so separate calls never share
        state through the defaults.
    content_header : list, optional
        Column names; only its length is used.  Columns ``1`` through
        ``len(content_header) - 2`` of every processed row are converted to
        ``float``; all other columns keep the string read from the file.

    Returns
    -------
    tuple
        ``(trainingSet, testSet)`` -- the same list objects that were filled.

    Raises
    ------
    ValueError
        If a cell that should be numeric cannot be parsed by ``float``.
    """
    # Compare against None (not truthiness): callers pass in empty lists that
    # must be filled in place.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    if content_header is None:
        content_header = []

    # Context manager closes the file; newline='' is what the csv module
    # documents for files handed to csv.reader.
    with open(filename, 'r', newline='') as handle:
        dataset = list(csv.reader(handle))

    numeric_stop = len(content_header) - 1
    # NOTE(review): the last row read from the file is never assigned to
    # either list (the loop has always stopped one short). Kept as-is --
    # confirm whether that row is meant to be excluded.
    for row in dataset[:-1]:
        for col in range(1, numeric_stop):
            row[col] = float(row[col])
        if random.random() < split1:
            trainingSet.append(row)
        else:
            testSet.append(row)
    return trainingSet, testSet

In [3]:
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [4]:
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Return the size-weighted Gini impurity of a candidate split.

    Parameters
    ----------
    groups : iterable of lists of rows
        The partitions produced by a split; the class label is the last
        element of each row.
    classes : iterable
        The class labels to score against.

    Returns
    -------
    float
        Sum over the non-empty groups of ``(1 - sum(p_c ** 2)) * weight``,
        where ``p_c`` is the share of class ``c`` in the group and ``weight``
        is the group's share of all rows.  ``0.0`` when every group is empty.
    """
    # count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        # Extract the labels once per group instead of once per class.
        labels = [row[-1] for row in group]
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

In [5]:
# Select the best split point for a dataset
def get_split(dataset, n_features):
    """Pick the lowest-Gini split among a random subset of feature columns.

    Parameters
    ----------
    dataset : list of rows
        Non-empty; the last element of each row is the class label and every
        other position is a candidate feature column.
    n_features : int
        How many distinct feature columns to sample.  Requests larger than
        the number of feature columns are capped at that number.

    Returns
    -------
    dict
        ``{'index': column, 'value': threshold, 'groups': (left, right)}``
        for the best split found.  If no candidate was evaluated (for example
        ``n_features <= 0``) the initial placeholders ``999, 999, None`` are
        returned unchanged.
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None

    n_columns = len(dataset[0]) - 1
    # The rejection sampling below can only ever collect n_columns distinct
    # indices, so asking for more would loop forever; cap the request.
    # (With no feature columns at all, randrange(0) still raises ValueError.)
    wanted = min(n_features, n_columns) if n_columns > 0 else n_features

    features = list()
    while len(features) < wanted:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)

    # Try every observed value of every chosen column as a threshold.
    for index in features:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

In [6]:
# Create a terminal node value
def to_terminal(group):
    """Return the most frequent class label in ``group``.

    The label is the last element of each row.  When several labels are tied
    for the highest count, the one that appears first in ``group`` wins, so
    the result does not depend on set iteration order.

    Raises ``ValueError`` if ``group`` is empty.
    """
    tally = {}
    for row in group:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # dicts iterate in insertion order and max() keeps the first maximum,
    # which is what makes the tie-break deterministic.
    return max(tally, key=tally.get)

In [7]:
def split(node, max_depth, min_size, n_features, depth):
    """Grow the tree below ``node`` in place.

    ``node`` must carry a ``'groups'`` entry holding a ``(left, right)`` pair
    of row lists; that entry is removed and replaced by ``'left'`` and
    ``'right'`` entries, each either a terminal value from ``to_terminal`` or
    a child node from ``get_split`` that is then split recursively.

    A side becomes terminal when the other side is empty, when ``depth`` has
    reached ``max_depth``, or when it holds at most ``min_size`` rows.
    Returns ``None``.
    """
    lower, upper = node['groups']
    del node['groups']

    # One side is empty: there was no real split, so both children share a leaf.
    if not (lower and upper):
        node['left'] = node['right'] = to_terminal(lower + upper)
        return

    # Depth budget exhausted: close both sides off as leaves.
    if depth >= max_depth:
        lower_leaf = to_terminal(lower)
        upper_leaf = to_terminal(upper)
        node['left'] = lower_leaf
        node['right'] = upper_leaf
        return

    # Left subtree is fully built before the right one is started.
    for side, rows in (('left', lower), ('right', upper)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            node[side] = get_split(rows, n_features)
            split(node[side], max_depth, min_size, n_features, depth + 1)

In [8]:
def build_tree(train, max_depth, min_size, n_features):
    """Build one decision tree from ``train`` and return its root node.

    The root comes from ``get_split``; ``split`` then expands it in place,
    starting at depth 1.
    """
    tree = get_split(train, n_features)
    split(tree, max_depth, min_size, n_features, depth=1)
    return tree

In [9]:
def predict(node, row):
    """Walk a tree from ``node`` and return the terminal value reached for ``row``.

    At each node the walk goes to ``'left'`` when ``row[node['index']]`` is
    strictly less than ``node['value']`` and to ``'right'`` otherwise.  A child
    that is a ``dict`` is another node; anything else is the prediction.
    """
    current = node
    while True:
        branch = 'left' if row[current['index']] < current['value'] else 'right'
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child

In [10]:
# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    """Draw ``round(len(dataset) * ratio)`` rows from ``dataset`` with replacement.

    Each draw picks an index with ``randrange(len(dataset))``; the returned
    list holds references to the chosen rows, not copies.
    """
    target = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(target)]

In [11]:
# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """Return the majority vote of ``trees`` for ``row``.

    Every tree is queried with ``predict``.  When several predictions are tied
    for the most votes, the one produced by the earliest tree in ``trees``
    wins, so the result does not depend on set iteration order.

    Raises ``ValueError`` if ``trees`` is empty.
    """
    votes = {}
    for tree in trees:
        guess = predict(tree, row)
        votes[guess] = votes.get(guess, 0) + 1
    # dicts iterate in insertion order and max() keeps the first maximum,
    # which is what makes the tie-break deterministic.
    return max(votes, key=votes.get)

In [12]:
# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train ``n_trees`` bagged trees on ``train`` and predict every row of ``test``.

    Each tree is built by ``build_tree`` from its own ``subsample`` of
    ``train`` (``sample_size`` is the subsample ratio).  Returns a list with
    one ``bagging_predict`` result per test row, in the order of ``test``.
    """
    forest = [
        build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in test]

In [13]:
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label equals the matching prediction.

    Parameters
    ----------
    testSet : sequence of rows
        The label is the last element of each row.
    predictions : sequence
        ``predictions[i]`` is compared with ``testSet[i][-1]``.

    Returns
    -------
    float
        A value between ``0.0`` and ``100.0``.  An empty ``testSet`` yields
        ``0.0`` rather than a division by zero.

    Raises
    ------
    IndexError
        If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        # Nothing to score; avoid ZeroDivisionError.
        return 0.0
    correct = sum(1 for x in range(total) if testSet[x][-1] == predictions[x])
    return (correct / float(total)) * 100.0

In [14]:
def predictFor(filename, stockname, split1, max_depth=10, min_size=5,
               sample_size=1.0, n_trees=10, seed=None):
    """Load one stock CSV, train a random forest on it and print the accuracy.

    Parameters
    ----------
    filename : str
        CSV file passed to ``loadDataset``.
    stockname : str
        Label used in the printed report.
    split1 : float
        Train/test threshold passed to ``loadDataset``.
    max_depth, min_size, sample_size, n_trees : optional
        Forest hyperparameters forwarded to ``random_forest``; the defaults
        are the values that used to be hard-coded here.
    seed : optional
        When not ``None``, passed to ``random.seed`` before any data is
        loaded so the random split and bootstrap draws are repeatable.

    Prints the stock name, the train/test/total row counts and the accuracy.
    Returns ``None``.
    """
    if seed is not None:
        random.seed(seed)

    # Only the length of this list is used by loadDataset: it decides which
    # columns are converted to float.
    iv = ["date", "open", "high", "low", "yesterday closing adj", "state change"]
    trainingSet = []
    testSet = []
    loadDataset(filename, split1, trainingSet, testSet, iv)

    # Drop the first column of every row so it is not offered as a feature.
    for row in trainingSet:
        row.pop(0)
    for row in testSet:
        row.pop(0)

    # Number of columns tried per split: integer square root of the feature count.
    n_features = int(sqrt(len(trainingSet[0]) - 1))
    predictions = random_forest(trainingSet, testSet, max_depth, min_size,
                                sample_size, n_trees, n_features)
    acc = getAccuracy(testSet, predictions)
    totalCount = len(trainingSet) + len(testSet)

    print("Predicting for ", stockname)
    print("Train: " + repr(len(trainingSet)))
    print("Test: " + repr(len(testSet)))
    print("Total: " + repr(totalCount))
    print("Accuracy: ",acc)

In [15]:
# Threshold handed to loadDataset: a row joins the training set when
# random.random() < split1, otherwise the test set.
split1 = 0.90
# NOTE(review): no random.seed call is visible in this notebook, so the split
# and the accuracy printed below presumably change from run to run.
predictFor('amtd.csv', 'AMTD', split1)

Predicting for  AMTD
Train: 3465
Test: 381
Total: 3846
Accuracy:  61.942257217847775


In [16]:
# Same pipeline for YHOO; relies on split1 assigned in the AMTD cell.
predictFor('yahoo.csv', 'YHOO', split1)

Predicting for  YHOO
Train: 3467
Test: 379
Total: 3846
Accuracy:  60.15831134564644


In [17]:
# Same pipeline for TWTR; relies on split1 assigned in the AMTD cell.
predictFor('twtr.csv', 'TWTR', split1)

Predicting for  TWTR
Train: 762
Test: 100
Total: 862
Accuracy:  72.0


In [18]:
# Same pipeline for SBUX; relies on split1 assigned in the AMTD cell.
predictFor('sbux.csv', 'SBUX', split1)

Predicting for  SBUX
Train: 3449
Test: 397
Total: 3846
Accuracy:  58.94206549118388


In [19]:
# Same pipeline for DIS; relies on split1 assigned in the AMTD cell.
predictFor('disney.csv', 'DIS', split1)

Predicting for  DIS
Train: 3448
Test: 398
Total: 3846
Accuracy:  54.773869346733676


In [20]:
# Same pipeline for AMZN; relies on split1 assigned in the AMTD cell.
predictFor('amazon.csv', 'AMZN', split1)

Predicting for  AMZN
Train: 3447
Test: 399
Total: 3846
Accuracy:  56.64160401002506
