# Random forests P22250 model

## Setup

### Imports

In [44]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
import os

## Data

Get PUF records.

In [45]:
train = pd.read_csv('puf80%training')
test = pd.read_csv('puf20%testing')

cols =  ['P22250', 'P22250_sign','DSI', 'EIC', 'MARS', 'XTOT', 'E00200', 
         'E00300', 'E00400', 'E00600', 'E00650', 'E00800', 'E00900', 'E01100', 
         'E01400', 'E01500', 'E01700', 'E02100', 'E02300', 'E02400', 'E03150', 
         'E03210', 'E03240', 'E03270', 'E03300', 'E17500', 'E18400', 'E18500', 
         'E19200', 'E19800', 'E20100', 'E20400', 'E32800', 'F2441', 'N24']

predictors = cols[2:]

train = train[cols]
test = test[cols]

In [46]:
X_train = train[predictors]
X_test = test[predictors]

Y_train = train['P22250']
Y_test = test['P22250']

Y_train_sign = train['P22250_sign']
Y_test_sign = test['P22250_sign']

## Model

Train a random forests model.

In [47]:
# 1000 estimators
N_ESTIMATORS = 1000
rf = ensemble.RandomForestRegressor(n_estimators=N_ESTIMATORS, 
                                    min_samples_leaf=1, random_state=3, 
                                    verbose=True, 
                                    n_jobs=-1)  # Use maximum number of cores.
rf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 25.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 32.3min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=3, verbose=True, warm_start=False)

### Model description

Show the ten most important features.

In [48]:
feature_importance = pd.Series(rf.feature_importances_, index=X_train.columns)
feature_importance.sort_values(ascending=False)[:10]

E00300    0.195434
E00600    0.085867
E00200    0.079515
E19200    0.067404
E18400    0.062765
E03300    0.052022
XTOT      0.047897
E20400    0.045814
E00650    0.044599
E19800    0.044535
dtype: float64

## Predict

### Top-line (average)

In [49]:
pred = pd.DataFrame({'actual': Y_test,
                     'pred': rf.predict(X_test)})
pred['error'] = pred.pred - pred.actual
pred['actual_sign'] = np.sign(pred.actual)
pred['pred_sign'] = np.sign(pred.pred)
pred['correct_sign'] = (pred.actual_sign == pred.pred_sign)
pred['count'] = 1

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    7.6s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    9.4s finished


MAE, RMSE, and % negative/zero/positive.

In [50]:
pred.error.abs().mean()

33938.6560024059

In [51]:
pred.error.pow(2).mean() ** 0.5

294872.51180626405

In [52]:
pred.pivot_table(index='actual_sign', columns='pred_sign', values='count', 
                 aggfunc=sum, margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,3235,29,1411,4675
0.0,9589,8927,5468,23984
1.0,2638,21,1295,3954
All,15462,8977,8174,32613


In [53]:
pred.correct_sign.mean()

0.41262686658694386

### All trees

In [54]:
preds = []
for estimator in rf.estimators_:
    preds.append(estimator.predict(X_test))
preds = np.array(preds).transpose()  # One row per record.

In [55]:
rand_col = np.random.randint(N_ESTIMATORS, size=preds.shape[0])
random_tree = preds[np.arange(preds.shape[0]), rand_col]

In [56]:
pred_random_tree = pd.DataFrame({'actual': Y_test,
                                 'pred': random_tree})
pred_random_tree['error'] = pred_random_tree.pred - pred_random_tree.actual
pred_random_tree['actual_sign'] = np.sign(pred_random_tree.actual)
pred_random_tree['pred_sign'] = np.sign(pred_random_tree.pred)
pred_random_tree['correct_sign'] = (
    pred_random_tree.actual_sign == pred_random_tree.pred_sign)
pred_random_tree['count'] = 1

As expected, MAE and RMSE exceed values from the point estimate prediction.

In [57]:
pred_random_tree.error.abs().mean()

55210.21825817192

In [58]:
pred_random_tree.error.pow(2).mean() ** 0.5

856710.2393972115

But the distribution of sign is closer to correct, since it's not averaging out the zeros.

In [59]:
pred_random_tree.pivot_table(index='actual_sign', columns='pred_sign', 
                             values='count', aggfunc=sum, margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,1848,1402,1425,4675
0.0,1693,20817,1474,23984
1.0,1471,1301,1182,3954
All,5012,23520,4081,32613


In [60]:
pred_random_tree.correct_sign.mean()

0.7312114800846289

#### Log-loss of sign

In [61]:
preds_neg = np.sum(preds < 0, axis=1) / 100
preds_zero = np.sum(preds == 0, axis=1) / 100
preds_pos = np.sum(preds > 0, axis=1) / 100

rf_pred_proba = list(map(list, zip(*[preds_neg, preds_zero, preds_pos])))

metrics.log_loss(Y_test_sign, rf_pred_proba)

0.6563460095593087