# Basic decision tree (the forest should be better than this)

In [175]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import math
import time

%matplotlib inline
np.random.seed(0)    # Default seeding for hw

# Data loading: read the training features and labels from text files and pass
# them through ml.shuffleData (presumably a joint shuffle that keeps rows and
# labels aligned -- confirm against mltools).
X, Y = ml.shuffleData(
    np.genfromtxt('data/X_train.txt'),
    np.genfromtxt('data/Y_train.txt'),
)

# Split into training and validation parts
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)

# Number of training rows actually used for fitting; defaults to all of them.
subsamples_train = Xtr.shape[0]
# subsamples_train = 10000
# Subsample for efficiency (you can go higher)
Xt = Xtr[:subsamples_train]
Yt = Ytr[:subsamples_train]

In [125]:
print('Xtr.shape = {}'.format(Xtr.shape))
print('Xva.shape = {}'.format(Xva.shape))
print('Xt.shape = {}'.format(Xt.shape))

Xtr.shape = (160000, 14)
Xva.shape = (40000, 14)
Xt.shape = (160000, 14)


In [178]:
# Train a single baseline tree, then score it on the data it was fitted on
# and on the held-out validation split.
# NOTE(review): the first label reads "Testing AUC" but the score is computed on
# (Xt, Yt), i.e. the training subset.
# NOTE(review): nFeatures=16 is larger than the 14 columns of Xt -- confirm how
# mltools treats a value above the column count.
learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=7, minParent=3, minLeaf=2, nFeatures=16)

train_auc = learner.auc(Xt, Yt)
print('Testing AUC   : {}'.format(train_auc))

valid_auc = learner.auc(Xva, Yva)
print('Validation AUC: {}'.format(valid_auc))

Testing AUC   : 0.6857440969185055
Validation AUC: 0.6752014857978347


# Messing with parameters

In [170]:
# Tuning nFeatures
# Sweep the number of features considered per split. For every setting, several
# trees are trained and their validation AUCs are averaged (presumably because
# feature selection is randomised per tree -- confirm against mltools).
# The upper bound is taken from the data so that the "use every column" setting
# is part of the sweep; a hard-coded range(1, 14) would stop at 13 of 14 columns.
n_repeats = 10
for _nFeatures in range(1, Xt.shape[1] + 1):
    total_auc = 0
    for i in range(n_repeats):
        learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=7, minParent=3, minLeaf=2, nFeatures=_nFeatures)
        total_auc += learner.auc(Xva, Yva)
    avg_auc = total_auc / n_repeats
    print('nFeature = {}  Avg auc={}'.format(_nFeatures, avg_auc))

# Going beyond about 6 features does not improve the predictions much.

nFeature = 1  Avg auc=0.6288440869587419
nFeature = 2  Avg auc=0.6514535042645788
nFeature = 3  Avg auc=0.6576695735624394
nFeature = 4  Avg auc=0.6615853807650984
nFeature = 5  Avg auc=0.6653308351016844
nFeature = 6  Avg auc=0.6662518670827849
nFeature = 7  Avg auc=0.6684394470694192
nFeature = 8  Avg auc=0.669220779342931
nFeature = 9  Avg auc=0.6690266824443138
nFeature = 10  Avg auc=0.6704918646807367
nFeature = 11  Avg auc=0.6722473017958346


KeyboardInterrupt: 

In [52]:
# Tuning Maxdepth
# For each depth from 1 to 16, train 10 trees and report their mean validation AUC.
for depth in range(1, 17):
    aucs = []
    for _ in range(10):
        learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=depth, minParent=3, minLeaf=2, nFeatures=14)
        aucs.append(learner.auc(Xva, Yva))
    avg_auc = sum(aucs) / 10
    print('_maxDepth = {}  Avg auc={}'.format(depth, avg_auc))

# Depths of roughly 3-7 work well; beyond that the predictions degrade noticeably.

_maxDepth = 1  Avg auc=0.5973462799440625
_maxDepth = 2  Avg auc=0.618007911519961
_maxDepth = 3  Avg auc=0.6334816449573025
_maxDepth = 4  Avg auc=0.6427321490597465
_maxDepth = 5  Avg auc=0.6382593383388953
_maxDepth = 6  Avg auc=0.6329911044117383
_maxDepth = 7  Avg auc=0.6307541527126326
_maxDepth = 8  Avg auc=0.6215438860702186
_maxDepth = 9  Avg auc=0.6154564529472364
_maxDepth = 10  Avg auc=0.6097160731326976
_maxDepth = 11  Avg auc=0.6071693753119712
_maxDepth = 12  Avg auc=0.6051109123898939
_maxDepth = 13  Avg auc=0.6029751469404045
_maxDepth = 14  Avg auc=0.5966665084726621
_maxDepth = 15  Avg auc=0.5970649049179793
_maxDepth = 16  Avg auc=0.5943862482342586


In [63]:
# Tuning MinParent
# Try even minParent values from 2 to 18; each setting is scored by the mean
# validation AUC of 10 trained trees.
for parent_size in range(2, 20, 2):
    aucs = []
    for _ in range(10):
        learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=7, minParent=parent_size, minLeaf=2, nFeatures=14)
        aucs.append(learner.auc(Xva, Yva))
    avg_auc = sum(aucs) / 10
    print('minParent = {}  Avg auc={}'.format(parent_size, avg_auc))

# minParent makes essentially no difference up to 12; after that AUC degrades slowly.

minParent = 2  Avg auc=0.6309293577360257
minParent = 4  Avg auc=0.6309809181524203
minParent = 6  Avg auc=0.6309934203072137
minParent = 8  Avg auc=0.6309235135169744
minParent = 10  Avg auc=0.6307604245132203
minParent = 12  Avg auc=0.6308346693581752
minParent = 14  Avg auc=0.6295176354720928
minParent = 16  Avg auc=0.6291832216357507
minParent = 18  Avg auc=0.6292303637561169


In [66]:
# Tuning minLeaf
# Step minLeaf from 2 upward in increments of 100 (2, 102, ..., 902); each setting
# is scored by the mean validation AUC of 10 trained trees.
for leaf_size in range(2, 1000, 100):
    aucs = []
    for _ in range(10):
        learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=7, minParent=10, minLeaf=leaf_size, nFeatures=14)
        aucs.append(learner.auc(Xva, Yva))
    avg_auc = sum(aucs) / 10
    print('minLeaf = {}  Avg auc={}'.format(leaf_size, avg_auc))

# Raising minLeaf from 2 into the 100-400 range improves validation AUC.
# Beyond that it starts to degrade slowly with every additional hundred.

minLeaf = 2  Avg auc=0.6309634928093011
minLeaf = 102  Avg auc=0.6446044222715324
minLeaf = 202  Avg auc=0.6439664089377091
minLeaf = 302  Avg auc=0.6405412087064668
minLeaf = 402  Avg auc=0.6425872714848808
minLeaf = 502  Avg auc=0.6379778057607378
minLeaf = 602  Avg auc=0.6277992370494064
minLeaf = 702  Avg auc=0.6268183156036036
minLeaf = 802  Avg auc=0.62136495268654
minLeaf = 902  Avg auc=0.62136495268654


In [127]:
# Putting all the best choices together...
# Train 10 trees with the combined settings and report their mean validation AUC.
aucs = []
for _ in range(10):
    learner = ml.dtree.treeClassify(Xt, Yt, maxDepth=4, minParent=10, minLeaf=100, nFeatures=7)
    aucs.append(learner.auc(Xva, Yva))
avg_auc = sum(aucs) / 10
print('Avg validation auc={}'.format(avg_auc))

Avg validation auc=0.644184571950524


In [310]:
# How predictions work
# Load the test features in this cell so it runs on a fresh kernel; otherwise Xte
# is only assigned by a cell further down the notebook and this cell would fail
# with a NameError under Restart & Run All.
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

# Show the first three test rows, the soft predictions of the most recently
# trained tree for those rows, and the column-wise mean of those predictions.
print(Xte[:3,:])
preds = learner.predictSoft(Xte[:3,:])
print(preds)
print(np.mean(preds, axis=0))

[[  2.46000000e+02   2.26000000e+02   2.43270000e+02   2.33430000e+02
    7.12100000e+03   6.48000000e+02   0.00000000e+00   1.12350000e+00
    5.52260000e+00   1.97470000e+00   2.33900000e+00   1.14380000e+00
    5.30220000e+00   0.00000000e+00]
 [  2.39000000e+02   2.26000000e+02   2.40770000e+02   2.33310000e+02
    2.10900000e+03   5.60000000e+02   0.00000000e+00   5.71540000e+00
    6.32130000e+00   1.59990000e+00   4.33180000e+00   3.18340000e+00
    3.05790000e+00   0.00000000e+00]
 [  2.51620000e+02   2.32000000e+02   2.45820000e+02   2.33570000e+02
    1.99000000e+02   1.40000000e+01   0.00000000e+00   3.41310000e+00
    5.73950000e+00   1.29360000e+00   5.61420000e+00   2.06010000e+00
    2.00000000e+01   0.00000000e+00]]
[[ 0.51230769  0.48769231]
 [ 0.68570108  0.31429892]
 [ 0.83820851  0.16179149]]
[ 0.6787391  0.3212609]


# Random Forest

In [322]:
n_bags = 3
bags = []   # trained trees; passed to BaggedTree as its learners
for _ in range(n_bags):
    # Each bootstrapped draw has as many rows as the training data itself.
    Xb, Yb = ml.bootstrapData(Xt, Yt, len(Xt))

    # Fit one tree on that draw and keep it.
    bags.append(ml.dtree.treeClassify(Xb, Yb, maxDepth=4, minParent=10, minLeaf=100, nFeatures=7))

In [323]:
# If you wanna check AUC of each tree
# for l in range(n_bags):
#     print(l)
#     print("{0:>15}: {1:.4f}".format('Train AUC', bags[l].auc(Xt, Yt)))
#     print("{0:>15}: {1:.4f}".format('Validation AUC', bags[l].auc(Xva, Yva)))

In [326]:
class BaggedTree(ml.base.classifier):
    """Ensemble classifier that averages the soft predictions of its learners."""

    def __init__(self, learners):
        """Keep a reference to the already-trained learners to be averaged."""
        self.learners = learners

    def predictSoft(self, X):
        """Return the element-wise mean of each learner's predictSoft(X) output."""
        per_learner = [member.predictSoft(X) for member in self.learners]
        return np.mean(per_learner, axis=0)

In [342]:
# Wrap the trained trees in the ensemble and report its AUC on both splits.
bt = BaggedTree(bags)
bt.classes = np.unique(Y)    # the ensemble is told its class labels explicitly

for split_name, feats, targets in (('Train AUC', Xt, Yt), ('Validation AUC', Xva, Yva)):
    print("{0:>15}: {1:.4f}".format(split_name, bt.auc(feats, targets)))

      Train AUC: 0.6607
 Validation AUC: 0.6558


In [341]:
# What do predictions look like? Start by reading the test features from disk.
Xte = np.genfromtxt('data/X_test.txt')

In [365]:
# Print the first two test rows, then the ensemble's soft prediction for row 17.
print(Xte[:2])
z = bt.predictSoft(Xte[:2])
# print(z)
print(bt.predictSoft(Xte[17:18]))

[[  2.46000000e+02   2.26000000e+02   2.43270000e+02   2.33430000e+02
    7.12100000e+03   6.48000000e+02   0.00000000e+00   1.12350000e+00
    5.52260000e+00   1.97470000e+00   2.33900000e+00   1.14380000e+00
    5.30220000e+00   0.00000000e+00]
 [  2.39000000e+02   2.26000000e+02   2.40770000e+02   2.33310000e+02
    2.10900000e+03   5.60000000e+02   0.00000000e+00   5.71540000e+00
    6.32130000e+00   1.59990000e+00   4.33180000e+00   3.18340000e+00
    3.05790000e+00   0.00000000e+00]]
[[ 0.33533168  0.66466832]]


In [235]:
# Save this sucker
# A context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving the handle open until garbage collection.
# NOTE(review): this pickles `learner` (a single tree), not the bagged ensemble
# `bt`, even though the file name says random forest -- confirm which is intended.
with open('pickle/1_random_forrest.pkl', "wb") as model_file:
    pickle.dump(learner, model_file)

In [366]:
# Load this sucker
# A context manager closes the file handle as soon as the object is loaded.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created.
with open('pickle/1_random_forrest.pkl', "rb") as model_file:
    random_forrest_learner = pickle.load(model_file)

In [367]:
# Print the first two test rows and run the loaded model on them.
# NOTE(review): the final print uses `bt`, not `random_forrest_learner`, so the
# printed probabilities do not exercise the model that was just loaded.
print(Xte[:2])
z = random_forrest_learner.predictSoft(Xte[:2])
# print(z)
print(bt.predictSoft(Xte[17:18]))

[[  2.46000000e+02   2.26000000e+02   2.43270000e+02   2.33430000e+02
    7.12100000e+03   6.48000000e+02   0.00000000e+00   1.12350000e+00
    5.52260000e+00   1.97470000e+00   2.33900000e+00   1.14380000e+00
    5.30220000e+00   0.00000000e+00]
 [  2.39000000e+02   2.26000000e+02   2.40770000e+02   2.33310000e+02
    2.10900000e+03   5.60000000e+02   0.00000000e+00   5.71540000e+00
    6.32130000e+00   1.59990000e+00   4.33180000e+00   3.18340000e+00
    3.05790000e+00   0.00000000e+00]]
[[ 0.33533168  0.66466832]]


In [331]:
# Test submission
# Build a two-column table (row index, second column of the ensemble's soft
# predictions) and write it out under the header "ID,Prob1".
# NOTE(review): the '%.2f' format keeps only two decimals of each probability.
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)
row_ids = np.arange(Xte.shape[0])
prob_class1 = bt.predictSoft(Xte)[:, 1]
Yte = np.vstack((row_ids, prob_class1)).T
np.savetxt('Y_submit.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')