In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification

  from numpy.core.umath_tests import inner1d


In [39]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [40]:
def run_forest(tree_num,depth,numfeatures):
    test_err = []
    train_err = []
    ytrain = 'y_train{}.npy'
    xtrain = 'X_train_{}.npy'
    ytest = 'y_test_{}.npy'
    xtest = 'X_test_{}.npy'
    for i in range(3): 

        y_train = np.load(ytrain.format(i)).reshape(-1)
        name = 'X_train_'+ str(i) + '.npy'
        x_train = np.load(xtrain.format(i)).reshape(len(y_train),-1)
        model = RandomForestRegressor(n_estimators=tree_num,max_features=numfeatures, bootstrap = False,
                                   max_depth=depth)
        model.fit(x_train, y_train)
        y_test = np.load(ytest.format(i)).reshape(-1)
        x_test = np.load(xtest.format(i)).reshape(len(y_test),-1)
        y_hat = model.predict(x_test)
        print(y_hat)
        idx = y_test != 0
        test_err.append(metric (y_test[idx],y_hat[idx]))
        y_hat = model.predict(x_train)
        idx = y_train != 0
        train_err.append(metric (y_train[idx],y_hat[idx]))
        
   
    return test_err,train_err

In [41]:
testerr,trainerr=run_forest(100,16,14)

[ 4226.31029657 10591.19776513  4627.92506621 ...  9336.55203282
  3957.3435108   3112.9943557 ]
[9311.32789429 8652.81844717 4955.78471005 ... 4164.643232   6448.93621739
 5389.38717622]
[6656.22667524 6930.42435227 6776.4489819  ... 6052.8584268  8998.00751415
 7052.6504017 ]


In [44]:
testerr

[13.613340159577504, 13.435270860328085, 14.353581615424002]

In [45]:
trainerr


[12.79722042760723, 13.160765933038968, 13.343593105136994]

### testing on num features revealed that under 100 trees and 10 depth 16 features was best

In [51]:
numfeatures=[4,8,12,16]
testerr=[]
trainerr=[]
for num in numfeatures:
    tester,trainer=run_forest(100,10,num)
    testerr.append(np.mean(tester))
    trainerr.append(np.mean(trainer))
        

[4598.04752163 9813.58778773 4706.98450864 ... 8975.33070778 5025.69545573
 4288.23695556]
[8490.03706375 8315.49398779 4723.41852462 ... 4591.68770435 6591.72272185
 5415.33841861]
[6067.68382675 7565.74420162 6203.30360707 ... 6458.10982843 8809.20633846
 7885.30352215]
[ 4236.32513058 10521.19501186  4632.79133613 ...  9377.70278126
  4114.30518947  3516.30584996]
[8955.1479239  8565.58056924 4794.78719287 ... 4262.99499867 6406.15053745
 5308.27047092]
[6392.84648544 7234.2063102  6359.93219369 ... 5921.36756296 8853.55732514
 7348.69543305]
[ 4209.27840592 10702.62770528  4665.34917648 ...  9563.17148488
  4043.96836149  3153.77033539]
[9251.1135357  8745.88575923 4884.55670782 ... 4176.05506766 6414.11833538
 5364.09268452]
[6513.15021562 6954.80934136 6613.52931671 ... 5945.52545561 8934.94549043
 7042.12803289]
[ 4260.04185353 10543.1224265   4619.42915649 ...  9477.14686209
  4010.6049908   3090.17004604]
[9351.60590113 8655.62293717 4904.85248803 ... 4130.95034553 6462.177729

In [52]:
testerr

[18.58405755518362, 14.790968279584794, 13.900278705660313, 13.787394924403579]

In [54]:
trainerr

[17.282034347507082, 13.912703450990554, 13.193055600081486, 13.10401265439984]

### exploring different depths

In [None]:
depth=[7,10,13,16]
testerr=[]
trainerr=[]
for num in depth:
    tester,trainer=run_forest(100,num,16)
    testerr.append(np.mean(tester))
    trainerr.append(np.mean(trainer))

[ 4229.53969026 10538.04868614  4868.71887426 ...  9979.03898604
  3952.26232358  3097.41731731]
[9518.6707149  8930.5551288  4593.73918397 ... 4239.15132149 6249.68540135
 5386.39786951]
