In [None]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

# Load the data
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
Xtest = np.genfromtxt("data/X_test.txt",delimiter=None)

X, Y = ml.shuffleData(X, Y)

# Random Forest

In [6]:
Xtr,Xte,Ytr,Yte = ml.splitData(X,Y,0.8)

ensemble = [None] * 25
Ytr_rf = np.zeros((len(Ytr), 25))
Yte_rf = np.zeros((len(Yte), 25))

for i in range(25):
    Xi, Yi = ml.bootstrapData(Xtr, Ytr)
    ensemble[i] = ml.dtree.treeClassify(Xi, Yi, maxDepth=20, nFeatures=10)
    Ytr_rf[:,i], Yte_rf[:,i] = ensemble[i].predict(Xtr), ensemble[i].predict(Xte)
    
    errorTrain, errorValid = np.zeros(4), np.zeros(4)
    bags = [1, 5, 10, 25]
    for i, j in enumerate(bags):
        errorTrain[i] = np.mean((Ytr - Ytr_rf[:,:i+1].mean(axis=1)>0.5))
        errorValid[i] = np.mean((Yte - Yte_rf[:,:i+1].mean(axis=1)>0.5))
        #print("{:02d} members: {} train, {} valid".format(i+1,errorTrain,errorValid))

In [7]:
print(errorTrain)
print(errorValid)

[ 0.14205  0.0796   0.13585  0.09445]
[ 0.1987   0.14295  0.207    0.17055]


In [8]:
class randomForest(ml.base.classifier):
     def __init__(self, learners):
         self.learners = learners
         self.classes = learners[0].classes
            
     def predictSoft(self,X):
         ysoft = np.zeros((X.shape[0], len(self.classes)))
         for i in range(len(self.learners)): 
            ysoft[:,1] += self.learners[i].predict(X)
         return ysoft / len(self.learners)
    
rf = randomForest(ensemble);
print("AUC Train: ", rf.auc(Xtr,Ytr))
print("AUC Valid: ", rf.auc(Xte,Yte))

AUC Train:  0.955640037652  Valid:  0.732607595574


In [9]:
predict = rf.predictSoft(Xtest)

np.savetxt('random_forest.txt',
np.vstack( (np.arange(len(predict)) , predict) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

# Linear Regression

In [25]:
D = [0, 1, 2]

ErrTrain = []
ErrTest = []

for i,d in enumerate(D):
    XtrP = ml.transforms.fpoly(Xtr, d, bias=False) 
    XtrP,params = ml.transforms.rescale(XtrP)
    lr = ml.linear.linearRegress( XtrP, Ytr )
    
    Phi = lambda X: ml.transforms.rescale(ml.transforms.fpoly(X, d, False), params)[0]
    
    ErrTrain.append(lr.mse(Phi(Xtr), Ytr))
    ErrTest.append(lr.mse(Phi(Xte), Yte))
    
print(ErrTrain)
print(ErrTest)

[0.22509126937499996, 0.20956680310642684, 0.20323935782516994]
[0.22358404062500048, 0.20745619666742099, 0.20222935929908625]


In [30]:
predict = lr.predict(Phi(Xtest))

np.savetxt('linear_regression.txt',
np.vstack( (np.arange(len(predict)) , predict[:,0]) ).T,
'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')