In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

from statsmodels import regression
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split

## Data

In [2]:
# Load the PUF extract, drop the aggregate rows (RECID 999996 through 999999),
# then replace every NaN with 0.

aggregate_recids = [999996, 999997, 999998, 999999]

puf = pd.read_csv('puf2011.csv')
puf = puf[~puf['RECID'].isin(aggregate_recids)].fillna(0)

# Constant column (listed as 'constant' among the OLS regressors below)

puf['constant'] = 1

# MARS to K - 1 dummies.
# The dummy columns are assigned positionally to MARS2/MARS3/MARS4, so MARS is
# expected to have exactly four levels (the lowest one is dropped as baseline).
# The explicit int cast matters: recent pandas releases return boolean dummies,
# and a frame mixing bool with numeric columns is converted to an object array,
# which statsmodels refuses to fit.

puf[['MARS2', 'MARS3', 'MARS4']] = pd.get_dummies(puf['MARS'], drop_first=True).astype(int)

# E19800 and E20100 combined in CPS

puf['E19800_E20100'] = puf['E19800'] + puf['E20100']

# Categorical dependent variable for 1st stage: 'zer', 'pos' or 'neg'
# according to the sign of P22250.

puf['sign'] = np.where(puf['P22250'] == 0, 'zer', np.where(puf['P22250'] > 0, 'pos', 'neg'))

def signed_log(values):
    """Return sign(x) * log(|x|) element-wise, with 0 mapped to 0.

    When x < 0 the result is -log(-x).  Zeros are replaced by 1 before the
    logarithm is taken (log(1) == 0), so log(0) is never evaluated and no
    divide-by-zero warning or -inf/NaN intermediate is produced.

    Parameters
    ----------
    values : array-like of numbers

    Returns
    -------
    numpy.ndarray of float, same length as `values`.
    """
    values = np.asarray(values, dtype=float)
    magnitude = np.where(values == 0, 1.0, np.abs(values))
    return np.sign(values) * np.log(magnitude)


# Log response column. When x < 0, result is -log(-x)

puf['log_P22250'] = signed_log(puf['P22250'])

# All variables shared between puf and cps, except for E00650 (collinear w/E00600)

predictors =  [
              'DSI', 'EIC', 'MARS2', 'MARS3', 'MARS4', 'XTOT', 
              'E00200', 'E00300', 'E00400', 'E00600', 'E00800', 'E00900', 
              'E01400', 'E01500', 'E01700', 'E02100', 'E02300', 'E02400', 
              'E03150', 'E03210', 'E03240', 'E03270', 'E03300', 'E17500', 
              'E18400', 'E18500', 'E19200', 'E19800_E20100','E20400', 
              'E32800', 'F2441', 'N24', 'E01100'
              ]

# Log columns for continuous predictors.  When x < 0, result is -log(-x)

discretes = ['DSI', 'EIC', 'MARS2', 'MARS3', 'MARS4', 'XTOT', 'F2441', 'N24']

# `logs` collects the names of the log columns, in predictor order.
logs = []
for i in predictors:
    if i not in discretes:
        puf['log_' + i] = signed_log(puf[i])
        logs.append('log_' + i)

# Restrict the frame to identifiers, responses, the constant and all predictors.
keep = ['RECID', 'AGIR1', 'sign', 'P22250', 'log_P22250', 'constant'] + predictors + logs

puf = puf[keep]

# Seed NumPy's global generator; train_test_split is called without a
# random_state, so it draws from that global state.
np.random.seed(100)

train, test = train_test_split(puf.copy(), test_size=0.2)

# Sub-frames restricted to P22250 > 0 (pos) or P22250 < 0 (neg), used for the
# 2nd-stage imputation.  Each subset is filtered first and copied once, rather
# than deep-copying the whole frame twice per subset.

pos_train = train[train['P22250'] > 0].copy()
neg_train = train[train['P22250'] < 0].copy()

pos_test = test[test['P22250'] > 0].copy()
neg_test = test[test['P22250'] < 0].copy()




## Random Forests
1-stage prediction

In [3]:
# Forest of 100 trees, fitted on the raw (untransformed) P22250 response.
N_ESTIMATORS = 100

rf_settings = dict(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=1,
    random_state=3,
    verbose=True,
    n_jobs=-1,  # Use maximum number of cores.
)
rf = ensemble.RandomForestRegressor(**rf_settings)
rf.fit(train[predictors], train['P22250'])

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.0min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=3, verbose=True, warm_start=False)

In [4]:
# Importance of each predictor in the fitted forest; show the ten largest.
feature_importance = pd.Series(data=rf.feature_importances_, index=predictors)
feature_importance.sort_values(ascending=False).iloc[:10]

E00300           0.140773
E00600           0.102607
E00200           0.100833
E18400           0.084332
E19800_E20100    0.072810
E20400           0.065087
E19200           0.057776
E18500           0.053526
E03300           0.044759
XTOT             0.041464
dtype: float64

`rf_preds` = array of each estimator's predictions on the test set (one row per record, one column per estimator)

In [5]:
# Collect every tree's prediction for every test record.
# The feature matrix is extracted once, as a plain array, instead of re-slicing
# the DataFrame on each loop iteration.
# NOTE(review): passing an array rather than a DataFrame should also avoid the
# feature-name mismatch warning newer scikit-learn versions emit when calling
# individual trees -- confirm against the installed version.
test_features = test[predictors].values
rf_preds = np.array(
    [estimator.predict(test_features) for estimator in rf.estimators_]
).transpose()  # One row per record, one column per estimator.

### Validation

Log-loss

We can calculate the RF model's predicted probability for each sign (and thus its log-loss) using the % of estimators predicting that sign for each observation.

Note: `metrics.log_loss()` assumes that the order of the columns of predicted probabilities corresponds to their categories' alphabetical order. Our categories are `'neg'`, `'zer'` and `'pos'`, which have the alphabetical order of `'neg'`, `'pos'`, `'zer'`, thus they appear in that order in `rf_pred_proba` as `[preds_neg, preds_pos, preds_zer]`

In [6]:
# Share of trees predicting each sign for every test record.  Taking the mean
# over the estimator axis keeps the divisor tied to the actual number of trees
# instead of a hard-coded 100.
preds_neg = (rf_preds < 0).mean(axis=1)
preds_zer = (rf_preds == 0).mean(axis=1)
preds_pos = (rf_preds > 0).mean(axis=1)

# One row per record; columns ordered neg, pos, zer.
rf_pred_proba = np.column_stack([preds_neg, preds_pos, preds_zer]).tolist()

# Spell out the class order so the column order above is matched explicitly
# rather than through log_loss's implicit alphabetical ordering of the labels.
metrics.log_loss(test['sign'], rf_pred_proba, labels=['neg', 'pos', 'zer'])

0.5767110204358304

Continuous prediction

A random estimator is selected for each test record, so that the imputation is stochastic

In [7]:
# For every test record draw one estimator index at random and take that
# estimator's prediction for the record.
n_test_records = rf_preds.shape[0]
rand_col = np.random.randint(N_ESTIMATORS, size=n_test_records)
random_tree = rf_preds[np.arange(n_test_records), rand_col]

In [8]:
# Actual vs. randomly-drawn-tree prediction for each test record, plus the
# prediction error and whether the predicted sign matches the actual one.
pred_random_tree = pd.DataFrame({
    'actual': test['P22250'],
    'actual_sign': test['sign'],
    'pred': random_tree,
})
pred_random_tree['error'] = pred_random_tree['pred'] - pred_random_tree['actual']

pred_values = pred_random_tree['pred'].values
pred_random_tree['pred_sign'] = np.select(
    [pred_values == 0, pred_values > 0],
    ['zer', 'pos'],
    default='neg',
)
pred_random_tree['correct_sign'] = pred_random_tree['actual_sign'].eq(
    pred_random_tree['pred_sign'])
pred_random_tree['count'] = 1

RMSE on whole test set

In [9]:
# Root-mean-square error over every test record.
(pred_random_tree['error'] ** 2).mean() ** 0.5

886796.139738029

RMSE on positive data

In [10]:
# Root-mean-square error restricted to records whose actual value is positive.
pred_random_tree.loc[pred_random_tree['actual'] > 0, 'error'].pow(2).mean() ** 0.5

1227639.780955069

RMSE on negative data

In [11]:
# Root-mean-square error restricted to records whose actual value is negative.
pred_random_tree.loc[pred_random_tree['actual'] < 0, 'error'].pow(2).mean() ** 0.5

999927.2267220098

## OLS

### Positive data

#### Regressing on log(P22250)

In [12]:
# E01100 is left out: no non-zero observation of it landed in the training
# data, so pos_train['E01100'] is a column of zeroes.
# The remaining predictors are those with p values <= 0.1.

ols_pos_predictors = [
    'constant', 'DSI', 'EIC', 'XTOT',
    'E00200', 'E00300', 'E00400', 'E00600', 'E01400',
    'E18400', 'E18500', 'E19200', 'E19800_E20100', 'E20400', 'E32800',
]

# Fit log_P22250 on the untransformed predictors, positive training rows only.
ols_pos_model = sm.OLS(endog=pos_train['log_P22250'],
                       exog=pos_train[ols_pos_predictors])
ols_pos_fit = ols_pos_model.fit()
summary = ols_pos_fit.summary()

summary

0,1,2,3
Dep. Variable:,log_P22250,R-squared:,0.119
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,152.1
Date:,"Tue, 07 Aug 2018",Prob (F-statistic):,0.0
Time:,10:19:43,Log-Likelihood:,-39127.0
No. Observations:,15800,AIC:,78280.0
Df Residuals:,15785,BIC:,78400.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,7.2283,0.054,132.703,0.000,7.122,7.335
DSI,-2.0860,0.293,-7.114,0.000,-2.661,-1.511
EIC,-1.0299,0.224,-4.605,0.000,-1.468,-0.592
XTOT,0.0957,0.018,5.192,0.000,0.060,0.132
E00200,1.545e-07,1.7e-08,9.082,0.000,1.21e-07,1.88e-07
E00300,2.98e-07,5.45e-08,5.464,0.000,1.91e-07,4.05e-07
E00400,7.882e-07,1.08e-07,7.272,0.000,5.76e-07,1e-06
E00600,1.508e-07,3.46e-08,4.365,0.000,8.31e-08,2.19e-07
E01400,7.844e-07,1.36e-07,5.772,0.000,5.18e-07,1.05e-06

0,1,2,3
Omnibus:,553.708,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.602
Skew:,-0.478,Prob(JB):,9.44e-134
Kurtosis:,3.128,Cond. No.,20200000.0


RMSE is massive

In [13]:
# Predict on the log scale, exponentiate back to the original scale, then
# report the RMSE against the actual positive test values.
ols_pos_log_pred = ols_pos_fit.predict(pos_test[ols_pos_predictors])
ols_pos_pred = np.exp(ols_pos_log_pred)
((pos_test['P22250'] - ols_pos_pred) ** 2).mean() ** 0.5

19746052052.35679

#### Regressing on log(P22250), using log-transformed continuous predictors

In [14]:
# Predictors included are those with p values <= 0.1

ols_logpos_predictors = [
    'constant', 'DSI', 'MARS3', 'N24',
    'log_E00200', 'log_E00300', 'log_E00400', 'log_E00600', 'log_E01500',
    'log_E02400', 'log_E03210', 'log_E03270', 'log_E03300', 'log_E17500',
    'log_E18400', 'log_E19200', 'log_E20400', 'log_E32800',
]

# Fit log_P22250 on the log-transformed predictors, positive training rows only.
ols_logpos_model = sm.OLS(endog=pos_train['log_P22250'],
                          exog=pos_train[ols_logpos_predictors])
ols_logpos_fit = ols_logpos_model.fit()
summary = ols_logpos_fit.summary()

summary

0,1,2,3
Dep. Variable:,log_P22250,R-squared:,0.204
Model:,OLS,Adj. R-squared:,0.203
Method:,Least Squares,F-statistic:,238.1
Date:,"Tue, 07 Aug 2018",Prob (F-statistic):,0.0
Time:,10:19:43,Log-Likelihood:,-38323.0
No. Observations:,15800,AIC:,76680.0
Df Residuals:,15782,BIC:,76820.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,4.7811,0.089,54.000,0.000,4.608,4.955
DSI,-1.0334,0.281,-3.682,0.000,-1.584,-0.483
MARS3,0.7251,0.152,4.758,0.000,0.426,1.024
N24,0.1314,0.025,5.217,0.000,0.082,0.181
log_E00200,-0.0084,0.005,-1.834,0.067,-0.017,0.001
log_E00300,0.2174,0.008,27.163,0.000,0.202,0.233
log_E00400,0.0137,0.006,2.450,0.014,0.003,0.025
log_E00600,0.1513,0.009,17.788,0.000,0.135,0.168
log_E01500,-0.0336,0.005,-7.345,0.000,-0.043,-0.025

0,1,2,3
Omnibus:,436.191,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,473.257
Skew:,-0.413,Prob(JB):,1.71e-103
Kurtosis:,3.192,Cond. No.,269.0


RMSE is lower than the random forest's RMSE on positive data

In [15]:
# Predict on the log scale, exponentiate back to the original scale, then
# report the RMSE against the actual positive test values.
ols_logpos_log_pred = ols_logpos_fit.predict(pos_test[ols_logpos_predictors])
ols_logpos_pred = np.exp(ols_logpos_log_pred)
((pos_test['P22250'] - ols_logpos_pred) ** 2).mean() ** 0.5

890112.1924438546

### Negative data

#### Regressing on log(P22250)

In [16]:
# Predictors with p value <= 0.1

ols_neg_predictors = [
    'constant', 'DSI', 'EIC', 'MARS3',
    'E00200', 'E00300', 'E00400', 'E00600', 'E01400', 'E01500',
    'E02100', 'E02400', 'E03210', 'E03270',
    'E18400', 'E18500', 'E19200', 'E20400', 'E32800', 'F2441',
]

# Fit log_P22250 on the untransformed predictors, negative training rows only.
ols_neg_model = sm.OLS(endog=neg_train['log_P22250'],
                       exog=neg_train[ols_neg_predictors])
ols_neg_fit = ols_neg_model.fit()
summary = ols_neg_fit.summary()

summary

0,1,2,3
Dep. Variable:,log_P22250,R-squared:,0.118
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,131.5
Date:,"Tue, 07 Aug 2018",Prob (F-statistic):,0.0
Time:,10:19:43,Log-Likelihood:,-45001.0
No. Observations:,18600,AIC:,90040.0
Df Residuals:,18580,BIC:,90200.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,-8.0031,0.029,-273.851,0.000,-8.060,-7.946
DSI,2.0478,0.278,7.372,0.000,1.503,2.592
EIC,0.3723,0.162,2.298,0.022,0.055,0.690
MARS3,-0.5615,0.140,-4.017,0.000,-0.836,-0.288
E00200,-6.888e-08,1.27e-08,-5.434,0.000,-9.37e-08,-4.4e-08
E00300,-1.539e-07,5.3e-08,-2.901,0.004,-2.58e-07,-4.99e-08
E00400,-6.337e-07,9.36e-08,-6.771,0.000,-8.17e-07,-4.5e-07
E00600,-2.037e-07,3.8e-08,-5.368,0.000,-2.78e-07,-1.29e-07
E01400,-8.916e-07,1.19e-07,-7.523,0.000,-1.12e-06,-6.59e-07

0,1,2,3
Omnibus:,941.187,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1099.948
Skew:,0.562,Prob(JB):,1.41e-239
Kurtosis:,3.394,Cond. No.,26100000.0


RMSE is even larger than for the positive data

In [17]:
# The fitted response is -log(-x), so the inverse transform is x = -exp(-y).
# Report the RMSE against the actual negative test values.
ols_neg_log_pred = ols_neg_fit.predict(neg_test[ols_neg_predictors])
ols_neg_pred = -np.exp(-ols_neg_log_pred)
((neg_test['P22250'] - ols_neg_pred) ** 2).mean() ** 0.5

23007880474031.184

#### Regressing on log(P22250), using log-transformed continuous predictors

In [18]:
# Predictors included are those with p values <= 0.1

ols_logneg_predictors = [
    'constant', 'DSI', 'EIC', 'MARS3', 'XTOT', 'F2441',
    'log_E00200', 'log_E00300', 'log_E00400', 'log_E00600', 'log_E01500',
    'log_E01700', 'log_E03270', 'log_E03300', 'log_E17500', 'log_E18400',
    'log_E19200', 'log_E19800_E20100', 'log_E20400',
]

# Fit log_P22250 on the log-transformed predictors, negative training rows only.
ols_logneg_model = sm.OLS(endog=neg_train['log_P22250'],
                          exog=neg_train[ols_logneg_predictors])
ols_logneg_fit = ols_logneg_model.fit()
summary = ols_logneg_fit.summary()

summary

0,1,2,3
Dep. Variable:,log_P22250,R-squared:,0.207
Model:,OLS,Adj. R-squared:,0.206
Method:,Least Squares,F-statistic:,269.0
Date:,"Tue, 07 Aug 2018",Prob (F-statistic):,0.0
Time:,10:19:43,Log-Likelihood:,-44020.0
No. Observations:,18600,AIC:,88080.0
Df Residuals:,18581,BIC:,88230.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,-5.5988,0.082,-68.144,0.000,-5.760,-5.438
DSI,1.2281,0.270,4.555,0.000,0.700,1.757
EIC,-0.4802,0.156,-3.085,0.002,-0.785,-0.175
MARS3,-0.4808,0.134,-3.576,0.000,-0.744,-0.217
XTOT,-0.0506,0.017,-3.000,0.003,-0.084,-0.018
F2441,0.2700,0.062,4.381,0.000,0.149,0.391
log_E00200,0.0338,0.004,8.690,0.000,0.026,0.041
log_E00300,-0.1557,0.007,-21.567,0.000,-0.170,-0.142
log_E00400,-0.0237,0.005,-4.801,0.000,-0.033,-0.014

0,1,2,3
Omnibus:,623.801,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,869.103
Skew:,0.355,Prob(JB):,1.89e-189
Kurtosis:,3.786,Cond. No.,328.0


RMSE is again much better when regressing on log(predictors)

In [19]:
# The fitted response is -log(-x), so the inverse transform is x = -exp(-y).
# Report the RMSE against the actual negative test values.
ols_logneg_log_pred = ols_logneg_fit.predict(neg_test[ols_logneg_predictors])
ols_logneg_pred = -np.exp(-ols_logneg_log_pred)
((neg_test['P22250'] - ols_logneg_pred) ** 2).mean() ** 0.5

602248.0174974591