# Random forests imputation

## Setup

### Imports

In [47]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
import os

## Data

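Get PUF records. Note that the cells in this section, which load and filter the raw `puf2011.csv`, are currently disabled (each is wrapped in a string literal, so it only echoes its own text). The pre-split training and validation files are read in the Model section instead.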

In [48]:
# NOTE: this whole cell is a bare string literal, so none of the code inside it
# runs; evaluating the cell only echoes the text back as its output. As a
# result `df` is not created by any cell visible in this notebook.
'''df = pd.read_csv('puf2011.csv')

# Removes aggregate rows
df = df[(df.RECID != 999996) & 
        (df.RECID != 999997) & 
        (df.RECID != 999998) &
        (df.RECID != 999999)]'''

"df = pd.read_csv('puf2011.csv')\n\n# Removes aggregate rows\ndf = df[(df.RECID != 999996) & \n        (df.RECID != 999997) & \n        (df.RECID != 999998) &\n        (df.RECID != 999999)]"

In [49]:
# NOTE: this cell is also a bare string literal, so running it does NOT bind
# `P22250_indp` (and does not touch `df`). A later cell references
# `P22250_indp` when trimming the training/validation frames, so that name has
# to be defined by real code before that point for a fresh-kernel run to work.
# The list names the columns to keep: the predictors plus the target P22250.
'''P22250_indp = ['DSI', 'EIC', 'MARS', 'E00200', 'E00300','E00600', 'E00800', 'E00900', 
               'E01100', 'E01400', 'E01500', 'E01700','E02100', 'E02300', 'E02400', 
               'E03150', 'E03210', 'E03240', 'E03270','E03300', 'E17500', 'E18400', 
               'E18500', 'E19200', 'E19800', 'E20100','E20400', 'E32800', 'P22250']
df.drop([i for i in df.columns if i not in P22250_indp], axis=1, inplace=True)'''

"P22250_indp = ['DSI', 'EIC', 'MARS', 'E00200', 'E00300','E00600', 'E00800', 'E00900', \n               'E01100', 'E01400', 'E01500', 'E01700','E02100', 'E02300', 'E02400', \n               'E03150', 'E03210', 'E03240', 'E03270','E03300', 'E17500', 'E18400', \n               'E18500', 'E19200', 'E19800', 'E20100','E20400', 'E32800', 'P22250']\ndf.drop([i for i in df.columns if i not in P22250_indp], axis=1, inplace=True)"

In [51]:
# NOTE: a quoted string, not a call -- the summary table is not computed; the
# cell just echoes this text.
'df.describe().transpose()'

'df.describe().transpose()'

## Model

Train a random forests model.

In [52]:
# Columns kept for modelling: the predictors plus the target, P22250.
# The list is defined here with real code because the earlier cell that spells
# it out is wrapped in a string literal and therefore never binds the name;
# without this definition the cell fails with NameError on a fresh kernel.
P22250_indp = ['DSI', 'EIC', 'MARS', 'E00200', 'E00300', 'E00600', 'E00800', 'E00900',
               'E01100', 'E01400', 'E01500', 'E01700', 'E02100', 'E02300', 'E02400',
               'E03150', 'E03210', 'E03240', 'E03270', 'E03300', 'E17500', 'E18400',
               'E18500', 'E19200', 'E19800', 'E20100', 'E20400', 'E32800', 'P22250']

# Pre-split extracts (the file names suggest an 80% / 20% split -- the split
# itself is not performed in this notebook). Each frame is trimmed to the
# columns above, keeping the files' own column order. Selecting into a new
# frame instead of dropping in place keeps the cell idempotent on re-run.
train = pd.read_csv('puf80%training')
train = train[[col for col in train.columns if col in P22250_indp]]

test = pd.read_csv('puf20%validation')
test = test[[col for col in test.columns if col in P22250_indp]]

In [53]:
# The training/validation split is already materialised in `train` and `test`,
# so no train_test_split call is needed: only peel the target column P22250
# away from the predictors.
Y_train = train['P22250']
Y_test = test['P22250']
X_train = train.drop(columns='P22250')
X_test = test.drop(columns='P22250')

In [54]:
# Sign of the target (-1, 0 or 1) for each record; used as the class label by
# the sign-based metrics and the multinomial model further down.
Y_train_sign, Y_test_sign = (np.sign(target) for target in (Y_train, Y_test))

In [55]:
# Number of trees in the forest. Reduce for faster runtime.
N_ESTIMATORS = 100

rf = ensemble.RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=1,
    random_state=3,  # Fixed seed for the forest itself.
    n_jobs=-1,       # Use maximum number of cores.
    verbose=True,    # Emit joblib progress lines while fitting/predicting.
)
# Left as the last expression so the fitted estimator's repr is displayed.
rf.fit(X_train, Y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=3, verbose=True, warm_start=False)

### Model description

Show the ten most important features.

In [56]:
# Importance of each predictor as reported by the fitted forest, labelled with
# the training column names; display the ten largest.
feature_importance = pd.Series(data=rf.feature_importances_,
                               index=X_train.columns)
feature_importance.sort_values(ascending=False).head(10)

E00300    0.188059
E00600    0.104188
MARS      0.102279
E00200    0.097157
E18400    0.077229
E19200    0.071628
E20400    0.057189
E19800    0.049433
E03300    0.045403
E18500    0.044406
dtype: float64

## Predict

### Top-line (average)

In [57]:
# Validation-set comparison of the forest's point prediction (the average over
# all trees) against the actual values.
forest_prediction = rf.predict(X_test)
pred = pd.DataFrame({'actual': Y_test,
                     'pred': forest_prediction})
pred['error'] = pred['pred'] - pred['actual']
# Sign (-1, 0, 1) of the actual and of the predicted value, in that order.
for side in ('actual', 'pred'):
    pred[side + '_sign'] = np.sign(pred[side])
pred['correct_sign'] = pred['actual_sign'].eq(pred['pred_sign'])
# Constant column so the pivot tables below can sum it to get record counts.
pred['count'] = 1

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


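MAE, RMSE, and counts of negative/zero/positive signs (actual vs. predicted), followed by the share of records whose sign is predicted correctly.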

In [58]:
# Mean absolute error of the forest-average prediction.
pred['error'].abs().mean()

35601.24474728538

In [59]:
# Root mean squared error of the forest-average prediction.
(pred['error'] ** 2).mean() ** 0.5

296625.87750879786

In [60]:
# Confusion table of actual sign (rows) vs. predicted sign (columns); summing
# the constant `count` column yields the number of records in each cell.
# The aggregation is named by string: passing the builtin `sum` takes a
# deprecated path in recent pandas, while 'sum' gives the same result everywhere.
pred.pivot_table(index='actual_sign', columns='pred_sign', values='count',
                 aggfunc='sum', margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,2982,55,1638,4675
0.0,7572,11356,5056,23984
1.0,2392,44,1518,3954
All,12946,11455,8212,32613


In [61]:
# Share of validation records whose predicted sign matches the actual sign.
pred['correct_sign'].mean()

0.48618649004998005

### All trees

In [62]:
# Predictions of every individual tree on the validation set, arranged with
# one row per record and one column per tree.
preds = np.column_stack(
    [estimator.predict(X_test) for estimator in rf.estimators_])

In [63]:
# For each record, pick the prediction of one randomly chosen tree.
# A dedicated generator with a fixed seed makes the draw -- and therefore every
# random-tree metric computed from it -- reproducible across kernel restarts.
# (Results recorded from earlier unseeded runs may differ slightly.)
tree_rng = np.random.RandomState(3)
n_records, n_trees = preds.shape  # Rows are records, columns are trees.
rand_col = tree_rng.randint(n_trees, size=n_records)
random_tree = preds[np.arange(n_records), rand_col]

In [64]:
# Same comparison frame as for the forest average, but using the prediction of
# one randomly selected tree per record.
pred_random_tree = pd.DataFrame({'actual': Y_test,
                                 'pred': random_tree})
pred_random_tree['error'] = (
    pred_random_tree['pred'] - pred_random_tree['actual'])
# Sign (-1, 0, 1) of the actual and of the predicted value, in that order.
for side in ('actual', 'pred'):
    pred_random_tree[side + '_sign'] = np.sign(pred_random_tree[side])
pred_random_tree['correct_sign'] = pred_random_tree['actual_sign'].eq(
    pred_random_tree['pred_sign'])
# Constant column so the pivot table below can sum it to get record counts.
pred_random_tree['count'] = 1

As expected, MAE and RMSE are higher than for the point-estimate (forest-average) prediction, because a single tree's prediction is noisier than the average over all trees.

In [65]:
# Mean absolute error of the random-tree prediction.
pred_random_tree['error'].abs().mean()

50240.61039866049

In [66]:
# Root mean squared error of the random-tree prediction.
(pred_random_tree['error'] ** 2).mean() ** 0.5

542762.7703379896

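But the distribution of signs is much closer to the actual one: averaging across trees turns most zero records into small nonzero predictions, whereas a single tree often predicts exactly zero.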

In [67]:
# Confusion table of actual sign (rows) vs. random-tree predicted sign
# (columns); summing the constant `count` column yields record counts.
# The aggregation is named by string: passing the builtin `sum` takes a
# deprecated path in recent pandas, while 'sum' gives the same result everywhere.
pred_random_tree.pivot_table(index='actual_sign', columns='pred_sign',
                             values='count', aggfunc='sum', margins=True)

pred_sign,-1.0,0.0,1.0,All
actual_sign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,1764,1424,1487,4675
0.0,1671,20791,1522,23984
1.0,1363,1365,1226,3954
All,4798,23580,4235,32613


In [68]:
# Share of validation records whose random-tree sign matches the actual sign.
pred_random_tree['correct_sign'].mean()

0.7291877472173673

#### Log-loss of sign

In [69]:
# Share of trees predicting a negative / zero / positive value for each record.
# Taking the mean over the tree axis (columns of `preds`) keeps these valid
# probabilities for any forest size, instead of dividing by a hard-coded tree
# count that silently goes stale when N_ESTIMATORS is changed.
preds_neg = (preds < 0).mean(axis=1)
preds_zero = (preds == 0).mean(axis=1)
preds_pos = (preds > 0).mean(axis=1)

# One row per record; columns ordered negative, zero, positive so they line up
# with the sorted class labels (-1, 0, 1) that log_loss infers from the
# true signs when no explicit `labels` argument is given.
rf_pred_proba = np.column_stack([preds_neg, preds_zero, preds_pos])

metrics.log_loss(Y_test_sign, rf_pred_proba)

0.6379758818358536

## Multinomial logistic regression of sign

In [70]:
# Multinomial (softmax) logistic regression on the sign of the target.
# NOTE(review): the predictors are passed in as-is, with no scaling step
# visible in this notebook -- confirm that is intended for this solver.
mult = linear_model.LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    random_state=3,
)
# Left as the last expression so the fitted estimator's repr is displayed.
mult.fit(X_train, Y_train_sign)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=3, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

### Coefficients

p-values are tricky to calculate, so instead show the ten features with the largest absolute coefficients for the zero class.

In [71]:
# Rows of `coef_` are taken in order as the negative, zero and positive class
# (presumably matching the sorted sign labels -1, 0, 1 -- check mult.classes_).
neg_coef, zero_coef, pos_coef = mult.coef_

mult_coef = pd.DataFrame(
    {
        'negative': neg_coef,
        'zero': zero_coef,
        'positive': pos_coef,
        'abs_zero': np.abs(zero_coef),
    },
    index=X_train.columns,
)

# Ten features with the largest absolute coefficient on the zero class.
mult_coef.sort_values('abs_zero', ascending=False).head(10)

Unnamed: 0,abs_zero,negative,positive,zero
MARS,0.073554,-0.036209,-0.037345,0.073554
E01100,0.018495,-0.009427,-0.009067,0.018495
EIC,0.011337,-0.005715,-0.005622,0.011337
DSI,0.003024,-0.00152,-0.001504,0.003024
E03210,0.002131,-0.000908,-0.001223,0.002131
E32800,0.000271,-0.000122,-0.000149,0.000271
E02300,0.000154,-6.2e-05,-9.3e-05,0.000154
E00800,4.4e-05,1.9e-05,-6.3e-05,4.4e-05
E03150,4.2e-05,-1.6e-05,-2.5e-05,4.2e-05
E02400,2.7e-05,-8e-06,-1.8e-05,2.7e-05


### Predict

In [72]:
# Predicted vs. actual sign on the validation set for the multinomial model.
predicted_sign = mult.predict(X_test)
mult_pred = pd.DataFrame({'actual': Y_test_sign,
                          'pred': predicted_sign})
mult_pred['sign_correct'] = mult_pred['actual'].eq(mult_pred['pred'])
# Constant column so the pivot table below can sum it to get record counts.
mult_pred['count'] = 1
# Share of records whose sign is predicted correctly.
mult_pred['sign_correct'].mean()

0.7642657835832337

In [73]:
# Confusion table of actual sign (rows) vs. predicted sign (columns) for the
# multinomial model; summing the constant `count` column yields record counts.
# The aggregation is named by string: passing the builtin `sum` takes a
# deprecated path in recent pandas, while 'sum' gives the same result everywhere.
mult_pred.pivot_table(index='actual', columns='pred',
                      values='count', aggfunc='sum', margins=True)

pred,-1.0,0.0,1.0,All
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,1786,2733,156,4675
0.0,913,22987,84,23984
1.0,1342,2460,152,3954
All,4041,28180,392,32613


#### Log-loss

In [74]:
# Class probabilities from the multinomial model for every validation record,
# scored against the true signs with log-loss.
mult_pred_proba = mult.predict_proba(X_test)
metrics.log_loss(Y_test_sign,
                 mult_pred_proba)

0.8995764140809479