# Putting Together Ensemble and Submitting Predictions

In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import math

%matplotlib inline
np.random.seed(0)    # Fixed NumPy seed for the homework so repeated runs match

# Data Loading: read the training features (X) and targets (Y) from text files.
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
# NOTE(review): presumably shuffleData draws from NumPy's global RNG, in which
# case the seed call must stay above this line -- confirm against mltools.
X,Y = ml.shuffleData(X,Y)

# Split the shuffled data into training (Xtr, Ytr) and validation (Xva, Yva)
# parts, using whatever split fraction ml.splitData applies by default.
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xt, Yt = Xtr[:5000], Ytr[:5000] # first 5000 training rows, subsampled for efficiency (you can go higher); not used by the other cells visible here

# Get Each Learner

In [2]:
# Learner 1: kNN classifier built from the training split, with 3 passed as
# the third constructor argument.
knn1 = ml.knn.knnClassify(Xtr, Ytr, 3)

# Sanity check: soft predictions for validation rows 1..3
va_sample = Xva[1:4]
pred = knn1.predictSoft(va_sample)
print(pred)

[[ 0.66666667  0.33333333]
 [ 0.66666667  0.33333333]
 [ 0.66666667  0.33333333]]


In [3]:
# Learner 2: kNN classifier built from the training split, with 10 passed as
# the third constructor argument.
knn2 = ml.knn.knnClassify(Xtr, Ytr, 10)

# Sanity check: soft predictions for validation rows 1..3
va_sample = Xva[1:4]
pred = knn2.predictSoft(va_sample)
print(pred)

[[ 0.5  0.5]
 [ 0.6  0.4]
 [ 0.8  0.2]]


In [26]:
#Learner 3
# Restore the learner stored in 'pickle/1_random_forrest.pkl'.
# The file is opened in a `with` block so the handle is closed as soon as the
# object has been loaded (the bare open() previously left it dangling).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by this project.
with open('pickle/1_random_forrest.pkl', "rb") as pkl_file:
    random_forrest_learner = pickle.load(pkl_file)

# Sanity check: soft predictions for validation rows 1..3
pred = random_forrest_learner.predictSoft(Xva[1:4])
print(pred)

[[ 0.79423729  0.20576271]
 [ 0.62446477  0.37553523]
 [ 0.79423729  0.20576271]]


# Put Together the Ensemble

In [8]:
class Ensemble(ml.base.classifier):
    """Combines several learners by averaging their soft predictions."""

    def __init__(self, learners):
        """Store the learners that make up the ensemble.

        Args:
            learners: sequence of objects that each expose ``predictSoft(X)``.

        Note:
            ``classes`` is not set here; the notebook assigns it on the
            instance after construction.
        """
        self.learners = learners

    def predictSoft(self, X):
        """Average the soft predictions of all learners with equal weights.

        Args:
            X: data to score; passed unchanged to each learner's
                ``predictSoft``.

        Returns:
            The element-wise mean, taken across learners, of the arrays
            returned by each learner's ``predictSoft(X)``.

        Raises:
            ValueError: if the ensemble contains no learners.
        """
        # Averaging an empty list would only yield NaN plus a RuntimeWarning;
        # fail loudly instead so the mistake is caught at the call site.
        if len(self.learners) == 0:
            raise ValueError("Ensemble needs at least one learner to predict")
        preds = [learner.predictSoft(X) for learner in self.learners]
        # axis=0 is the learner axis of the stacked predictions
        return np.mean(preds, axis=0)
    
# Collect the three learners from the cells above; the ensemble averages them
# with equal weight.
classifiers = [knn1, knn2, random_forrest_learner]

ensemble = Ensemble(classifiers)
ensemble.classes = np.unique(Y)    # the distinct label values found in Y

In [10]:
# Test driving ensemble: print each learner's soft prediction for validation
# row 1, followed by the ensemble's averaged prediction for the same row.
for probe_model in (knn1, knn2, random_forrest_learner, ensemble):
    print(probe_model.predictSoft(Xva[1:2]))

[[ 0.66666667  0.33333333]]
[[ 0.5  0.5]]
[[ 0.79423729  0.20576271]]
[[ 0.65363465  0.34636535]]


# Make Prediction + Kaggle Submission

In [None]:
# Submission
"""
This makes a text file with format like this where second column is probability of rain:
ID,Prob1
0, 0.34
1, 0.37
2, 0.23
3, 0.24
...
"""
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

# One ID per test row (0..n-1) paired with column 1 of the ensemble's soft output
row_ids = np.arange(Xte.shape[0])
prob_col1 = ensemble.predictSoft(Xte)[:,1]
Yte = np.vstack((row_ids, prob_col1)).T

# Write the header line without a comment prefix, then one "ID, prob" row per sample
np.savetxt(
    'Y_submit.txt',
    Yte,
    fmt='%d, %.2f',
    header='ID,Prob1',
    comments='',
    delimiter=',',
)