# nnet P22250 Imputation

## Data

In [59]:
library('nnet')

In [60]:
# Load the PUF extract, drop the four aggregate records, and add a column
# holding the sign of P22250 (-1, 0 or 1).

aggregate_recids <- c(999996, 999997, 999998, 999999)

puf <- read.csv('puf2011.csv')
puf <- puf[!(puf$RECID %in% aggregate_recids), ]
puf$P22250_sign <- sign(puf$P22250)

# Randomly label each row 1 (training, probability 0.8) or 2 (testing,
# probability 0.2). The fixed seed makes the split reproducible.

set.seed(1994)
sample <- sample(2, nrow(puf), replace = TRUE, prob = c(0.8, 0.2))
in_training <- sample == 1
training <- puf[in_training, ]
testing <- puf[!in_training, ]

# Save both partitions to disk for later use in Python's sklearn.

write.csv(training, file = 'puf80%training')
write.csv(testing, file = 'puf20%testing')

In [61]:
# Name the response and the predictor columns, turn the response in puf into
# a factor, and cut puf, training and testing down to those columns only.

response <- 'P22250_sign'
predictors <- c(
    'DSI', 'EIC', 'MARS', 'XTOT',
    'E00200', 'E00300', 'E00400', 'E00600', 'E00650', 'E00800', 'E00900',
    'E01100', 'E01400', 'E01500', 'E01700',
    'E02100', 'E02300', 'E02400',
    'E03150', 'E03210', 'E03240', 'E03270', 'E03300',
    'E17500', 'E18400', 'E18500', 'E19200', 'E19800',
    'E20100', 'E20400', 'E32800',
    'F2441', 'N24'
)

puf$P22250_sign <- as.factor(puf$P22250_sign)

# Predictors first, response last, in all three datasets.
model_columns <- c(predictors, response)
puf <- puf[, model_columns]
training <- training[, model_columns]
testing <- testing[, model_columns]

In [62]:
# Model: multinomial fit of P22250_sign on every other column of `training`
# (the `.` in the formula). maxit = 500 is passed to multinom as the
# iteration limit.
P22250_model <- multinom(P22250_sign ~ ., data = training, maxit = 500)

# weights:  105 (68 variable)
initial  value 144108.269742 
iter  10 value 140867.179647
iter  20 value 138589.315503
iter  30 value 136602.376007
iter  40 value 135230.640806
iter  50 value 131855.142331
iter  60 value 86188.631740
iter  70 value 80173.463898
iter  80 value 76944.317456
iter  90 value 76474.395805
iter 100 value 75839.531023
iter 110 value 75792.118327
iter 120 value 75774.428135
iter 130 value 75524.342249
iter 140 value 75222.597179
iter 150 value 74588.897844
iter 160 value 74437.651063
iter 170 value 74408.325228
final  value 74407.927993 
converged


## Summary

In [63]:
model_sum <- summary(P22250_model)

print('Model Summary')

# The summary stores coefficients and standard errors with one row per
# modelled outcome. Transposing puts the model terms on the rows, so column 1
# is the first outcome row and column 2 the second (labelled _0 and _1 below).

coef_by_term <- t(model_sum$coefficients)
se_by_term <- t(model_sum$standard.errors)

model_sum_df <- data.frame(
    Coefficients_0 = coef_by_term[, 1],
    Std_Errors_0 = se_by_term[, 1],
    Coefficients_1 = coef_by_term[, 2],
    Std_Errors_1 = se_by_term[, 2]
)

# t-statistic = coefficient / standard error, computed per outcome.

T_Stats_0 <- model_sum_df$Coefficients_0 / model_sum_df$Std_Errors_0
T_Stats_1 <- model_sum_df$Coefficients_1 / model_sum_df$Std_Errors_1
model_sum_df$T_Stats_0 <- T_Stats_0
model_sum_df$T_Stats_1 <- T_Stats_1

# Group each outcome's coefficient, standard error and t-statistic together.

model_sum_df <- model_sum_df[, c('Coefficients_0', 'Std_Errors_0', 'T_Stats_0',
                                 'Coefficients_1', 'Std_Errors_1', 'T_Stats_1')]

model_sum_df

# Residual deviance of the fitted model.

print('Residual Deviance')
model_sum$deviance

[1] "Model Summary"


Unnamed: 0,Coefficients_0,Std_Errors_0,T_Stats_0,Coefficients_1,Std_Errors_1,T_Stats_1
(Intercept),3.112499,2.361219e-08,131817500.0,-0.1378111,1.391492e-08,-9903836.0
DSI,0.7592383,3.687069e-12,205919200000.0,0.2528583,2.293313e-12,110258900000.0
EIC,1.534153,2.779099e-10,5520325000.0,0.006628599,1.838842e-10,36047670.0
MARS,0.06186998,4.112774e-08,1504337.0,-0.01428183,2.455599e-08,-581602.7
XTOT,-0.3631604,5.793565e-08,-6268340.0,0.002983405,3.57664e-08,83413.61
E00200,-8.671794e-07,2.285494e-08,-37.94276,-1.417845e-08,7.140882e-09,-1.985532
E00300,-2.071553e-06,1.697768e-07,-12.20163,1.076609e-07,3.477696e-08,3.095754
E00400,-2.730946e-06,2.212968e-07,-12.34065,-1.151207e-07,4.951397e-08,-2.325014
E00600,-4.431918e-05,1.289552e-06,-34.36788,-7.055359e-08,4.711402e-08,-1.497507
E00650,4.399775e-05,1.294609e-06,33.98535,8.564496e-08,5.078378e-08,1.686463


[1] "Residual Deviance"


## Intercept-only summary

In [64]:
# Intercept-only model, its summary, and its predicted probabilities.
# NOTE(review): this baseline is fit on the full `puf`, while P22250_model is
# fit on `training` -- confirm that is intended before comparing deviances.

int_P22250_model <- multinom(P22250_sign ~ 1, data = puf)
int_model_sum <- summary(int_P22250_model)
int_P22250_model_probs <- predict(int_P22250_model, newdata = puf, type = 'prob')

print('Model summary')

# One row per modelled outcome: coefficient, standard error and their ratio.
int_coefs <- int_model_sum$coefficients
int_std_errors <- int_model_sum$standard.errors
int_model_sum_df <- setNames(
    data.frame(int_coefs, int_std_errors, int_coefs / int_std_errors),
    c('Coefficients', 'Std Errors', 'T-stats')
)
int_model_sum_df

print('Residual deviance')
int_P22250_model$deviance

# Only the first row of predicted probabilities is displayed.
print('Predicted probabilities')
t(int_P22250_model_probs[1, ])

# weights:  6 (2 variable)
initial  value 179937.312312 
final  value 124030.211417 
converged
[1] "Model summary"


Unnamed: 0,Coefficients,Std Errors,T-stats
0,1.6431718,0.007149548,229.82877
1,-0.1696247,0.009673945,-17.53418


[1] "Residual deviance"


[1] "Predicted probabilities"


-1,0,1
0.1425409,0.7371572,0.1203019


# Imputation & testing

## Stochastic imputation function

In [65]:
# Predicted sign probabilities from P22250_model: probs0 for the testing
# dataset, probs0_full for the full dataset. Both are written to csv for
# log-loss computation.

probs0 <- predict(P22250_model, newdata = testing, type = 'prob')
probs0_full <- predict(P22250_model, newdata = puf, type = 'prob')

write.csv(probs0, file = 'nnet_predictions_test')
write.csv(probs0_full, file = 'nnet_predictions_full')

# Stochastic imputation of a single sign.
#
# Arguments:
#   runif - one random uniform draw in [0,1].
#   probs - one row of probs0, read as probs[[1]] = P(negative),
#           probs[[2]] = P(zero), probs[[3]] = P(positive).
#
# Returns the first category whose cumulative probability is strictly greater
# than runif: -1 if runif < P(negative), 0 if runif < P(zero) + P(negative),
# and 1 otherwise.

stoch_imp <- function(runif, probs) {
    p_negative <- probs[[1]]
    if (runif < p_negative) {
        return(-1)
    }

    p_negative_or_zero <- probs[[2]] + p_negative
    if (runif < p_negative_or_zero) {
        return(0)
    }

    1
}


## Imputation

In [66]:
# Stochastically imputes a sign for every row of the testing dataset.
# One random uniform in [0,1] is drawn per testing row (seeded so the draws
# are reproducible) and paired with the same-numbered row of probs0.

set.seed(1995)
test_rand0 <- runif(nrow(testing))

# stoch_imp is applied to each (draw, probability row) pair and the finished
# prediction column is assigned in one step. This replaces a loop that kept a
# manual row counter and wrote into the data frame one cell at a time.
# vapply with numeric(1) also checks that every call returns one number.

testing$P22250_p0_sign <- vapply(
    seq_along(test_rand0),
    function(row) stoch_imp(test_rand0[[row]], probs0[row, ]),
    numeric(1)
)


## Testing

### Percent accuracy

In [67]:
# Builds a df pairing the actual P22250 sign (P22250_sign) with the imputed one
# (P22250_p0_sign). The actual sign goes through character first so that a
# factor would yield its labels rather than its internal codes.

P22250_test <- data.frame(
    actual = as.numeric(as.character(testing$P22250_sign)),
    predicted = testing$P22250_p0_sign
)

# Logical accuracy column: TRUE = correct prediction, FALSE = wrong.

P22250_test$correct_sign <- P22250_test$actual == P22250_test$predicted

# Share of testing rows whose imputed sign matches the actual sign.

accuracy <- mean(P22250_test$correct_sign)

accuracy