## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split

## Data prep

In [2]:
# Load the PUF CSV, drop the four aggregate rows (RECID 999996-999999),
# and replace every NaN with 0.

puf = (
    pd.read_csv('puf2011.csv')
      .loc[lambda df: ~df['RECID'].isin([999996, 999997, 999998, 999999])]
      .fillna(0)
)

# MARS to K - 1 dummies.
# Each indicator is built from an explicit comparison against its MARS code,
# so a column name can never be attached to the wrong category. The previous
# positional assignment of pd.get_dummies(..., drop_first=True) output relied
# on MARS containing exactly four distinct values in the expected sorted order.
# NOTE(review): assumes MARS is numeric with codes 1-4 and that 1 is the
# omitted reference level, as the MARS2/MARS3/MARS4 names imply -- confirm.

for mars_code in (2, 3, 4):
    puf['MARS{}'.format(mars_code)] = (puf['MARS'] == mars_code).astype(int)

# E19800 and E20100 are combined in the CPS, so build the matching sum here.

puf['E19800_E20100'] = puf['E19800'] + puf['E20100']

# Predictors: all variables shared between the PUF and the CPS, except
# E01100 (crashes mnlogit) and E00650 (collinear with E00600).
# The order is significant for reproducibility, so it is left untouched.

pred = [
    'DSI', 'EIC', 'MARS2', 'MARS3',
    'MARS4', 'XTOT', 'E00200', 'E00300',
    'E00400', 'E00600', 'E00800', 'E00900',
    'E01400', 'E01500', 'E01700', 'E02100',
    'E02300', 'E02400', 'E03150', 'E03210',
    'E03240', 'E03270', 'E03300', 'E17500',
    'E18400', 'E18500', 'E19200', 'E19800_E20100',
    'E20400', 'E32800', 'F2441', 'N24',
]

# Columns to retain: RECID, AGIR1 and P22250 first, followed by the predictors.
keep = ['RECID', 'AGIR1', 'P22250']
keep.extend(pred)

# Restrict the frame to the columns listed in `keep`.
puf = puf[keep]

# Seed the global NumPy RNG. Kept so that anything later in the notebook that
# draws from the global generator still starts from a known seed.
np.random.seed(100)

# 80/20 train/test split. The seed is passed explicitly so the partition no
# longer depends on the state of the global RNG at the moment this line runs.
# random_state=100 produces the same permutation as the freshly seeded global
# generator did, so the rows in `train` and `test` are unchanged.
train, test = train_test_split(puf, test_size=0.2, random_state=100)

# Sub-frames for the 2nd-stage imputation: rows with P22250 > 0 ("pos") and
# rows with P22250 < 0 ("neg"). Rows where P22250 == 0 land in neither set.
# Filter first and copy once: the result is still an independent frame, but
# the full train/test frames are no longer duplicated twice per line.

pos_train = train[train['P22250'] > 0].copy()
neg_train = train[train['P22250'] < 0].copy()

pos_test = test[test['P22250'] > 0].copy()
neg_test = test[test['P22250'] < 0].copy()


## Models

### Positive data

LassoCV

In [3]:
# LassoCV with 10-fold cross-validation, fitted on the positive training rows
# and then used to predict P22250 for the positive test rows.
pos_Lasso = linear_model.LassoCV(cv=10, n_jobs=-1)
pos_Lasso.fit(pos_train[pred], pos_train['P22250'])
pos_Lasso_predictions = pos_Lasso.predict(pos_test[pred])

In [4]:
# Test-set mean squared error of the positive-row Lasso; the cell displays
# its square root (RMSE).
pos_Lasso_MSE = metrics.mean_squared_error(
    y_true=pos_test['P22250'],
    y_pred=pos_Lasso_predictions,
)
pos_Lasso_MSE ** 0.5

861831.864777408

Random forests

In [5]:
# Random forest regressor for the positive rows.
N_ESTIMATORS = 100  # number of trees in the forest
pos_rf = ensemble.RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=1,
    random_state=3,  # fixed seed for the forest
    verbose=True,    # emit joblib progress messages while fitting/predicting
    n_jobs=-1,       # Use maximum number of cores.
)
pos_rf.fit(pos_train[pred], pos_train['P22250'])
pos_rf_predictions = pos_rf.predict(pos_test[pred])

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


In [6]:
# Test-set mean squared error of the positive-row random forest; the cell
# displays its square root (RMSE).
pos_rf_MSE = metrics.mean_squared_error(
    y_true=pos_test['P22250'],
    y_pred=pos_rf_predictions,
)
pos_rf_MSE ** 0.5

899732.1339124828

### Negative data

LassoCV

In [7]:
# LassoCV with 10-fold cross-validation, fitted on the negative training rows
# and then used to predict P22250 for the negative test rows.
neg_Lasso = linear_model.LassoCV(cv=10, n_jobs=-1)
neg_Lasso.fit(neg_train[pred], neg_train['P22250'])
neg_Lasso_predictions = neg_Lasso.predict(neg_test[pred])

In [8]:
# Test-set mean squared error of the negative-row Lasso; the cell displays
# its square root (RMSE).
neg_Lasso_MSE = metrics.mean_squared_error(
    y_true=neg_test['P22250'],
    y_pred=neg_Lasso_predictions,
)
neg_Lasso_MSE ** 0.5

586291.2798122863

Random forests

In [9]:
# Random forest regressor for the negative rows.
N_ESTIMATORS = 100  # number of trees in the forest
neg_rf = ensemble.RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=1,
    random_state=3,  # fixed seed for the forest
    verbose=True,    # emit joblib progress messages while fitting/predicting
    n_jobs=-1,       # Use maximum number of cores.
)
neg_rf.fit(neg_train[pred], neg_train['P22250'])
neg_rf_predictions = neg_rf.predict(neg_test[pred])

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


In [10]:
# Test-set mean squared error of the negative-row random forest; the cell
# displays its square root (RMSE).
neg_rf_MSE = metrics.mean_squared_error(
    y_true=neg_test['P22250'],
    y_pred=neg_rf_predictions,
)
neg_rf_MSE ** 0.5

843195.0226865638