# nnet P22250 Imputation

## Data

In [50]:
library('nnet')

In [51]:
# Load the PUF extract, add a column holding the sign of P22250 (-1, 0 or 1,
# still numeric at this point), and drop the aggregate rows, identified by
# their RECID.

puf <- read.csv('puf2011.csv')
puf$P22250_sign <- sign(puf$P22250)
aggregate_recids <- c(999996, 999997, 999998, 999999)
puf <- puf[!(puf$RECID %in% aggregate_recids), ]

# Random 80% / 20% split into training and testing rows. Each row draws a
# group label (1 = training, 2 = testing); the seed is fixed so the split is
# reproducible.
# NOTE(review): the label vector is called `sample`, which masks base::sample
# as a variable name; calls to sample() still resolve to the function.

set.seed(1994)
sample <- sample(2, nrow(puf), replace = TRUE, prob = c(0.8, 0.2))
in_training <- sample == 1
training <- puf[in_training, ]
testing <- puf[!in_training, ]

# Persist both splits (all columns, before pruning) for later use in Python's
# sklearn.

write.csv(training, file = 'puf80%training')
write.csv(testing, file = 'puf20%testing')

In [52]:
# Convert the sign column to a factor in puf, training and testing, then prune
# all three data frames to the predictors plus the response.
#
# training and testing were split off from puf before this cell, so converting
# only puf would leave their response numeric. All three are converted here
# with the same level set, so the response has one consistent type wherever it
# is used later.

predictors <- c('DSI', 'EIC', 'MARS', 'XTOT', 'E00200', 'E00300', 'E00400','E00600', 
               'E00800', 'E00900', 'E01100', 'E01400', 'E01500', 'E01700','E02100', 
               'E02300', 'E02400', 'E03150', 'E03210', 'E03240', 'E03270','E03300', 
               'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100','E20400', 
               'E32800')
response <- 'P22250_sign'

puf[[response]] <- as.factor(puf[[response]])
sign_levels <- levels(puf[[response]])
training[[response]] <- factor(training[[response]], levels = sign_levels)
testing[[response]] <- factor(testing[[response]], levels = sign_levels)

model_cols <- c(predictors, response)
puf <- puf[, model_cols]
training <- training[, model_cols]
testing <- testing[, model_cols]

In [53]:
# Model: multinomial fit of P22250_sign on every other column of `training`
# (the formula `~ .` uses all remaining columns as predictors).
# maxit = 500 sets the optimiser's iteration cap; the log printed below shows
# that more than 200 iterations were needed before convergence.
P22250_model <- multinom(P22250_sign ~ ., data = training, maxit = 500)

# weights:  96 (62 variable)
initial  value 144108.269742 
iter  10 value 141078.648283
iter  20 value 138576.467153
iter  30 value 136897.685827
iter  40 value 134681.888657
iter  50 value 132500.425734
iter  60 value 93978.222514
iter  70 value 78295.933067
iter  80 value 77723.679613
iter  90 value 77452.000461
iter 100 value 77420.353477
iter 110 value 77330.265376
iter 120 value 76971.508268
iter 130 value 75962.217237
iter 130 value 75962.217237
iter 140 value 75842.116701
iter 150 value 75817.000814
iter 160 value 75810.015419
iter 170 value 75808.790848
iter 180 value 75807.603888
iter 190 value 75689.430498
iter 200 value 75583.121997
iter 210 value 75567.543683
final  value 75567.531211 
converged


## Summary

In [54]:
model_sum <- summary(P22250_model)

print('Model Summary')

# Build a table with one row per model term. Rows 1 and 2 of the summary's
# coefficient and standard-error matrices (labelled _0 and _1 here) each
# contribute a coefficient, its standard error and their ratio (the t-stat).

coef_0 <- model_sum$coefficients[1, ]
coef_1 <- model_sum$coefficients[2, ]
se_0 <- model_sum$standard.errors[1, ]
se_1 <- model_sum$standard.errors[2, ]

T_Stats_0 <- unname(coef_0 / se_0)
T_Stats_1 <- unname(coef_1 / se_1)

# Row names come from the term names carried by the first named vector.
model_sum_df <- data.frame(
    Coefficients_0 = coef_0,
    Std_Errors_0 = se_0,
    T_Stats_0 = T_Stats_0,
    Coefficients_1 = coef_1,
    Std_Errors_1 = se_1,
    T_Stats_1 = T_Stats_1
)

model_sum_df

# Residual deviance of the fitted model

print('Residual Deviance')
model_sum$deviance

[1] "Model Summary"


Unnamed: 0,Coefficients_0,Std_Errors_0,T_Stats_0,Coefficients_1,Std_Errors_1,T_Stats_1
(Intercept),2.916984,2.392707e-08,121911400.0,-0.154442,1.408746e-08,-10963090.0
DSI,0.9705252,3.209538e-12,302387800000.0,0.2699394,2.051094e-12,131607500000.0
EIC,1.628664,2.607538e-10,6245983000.0,0.01636738,1.75786e-10,93109650.0
MARS,0.03929251,4.145239e-08,947895.1,-0.01756157,2.473908e-08,-709871.5
XTOT,-0.213737,5.861429e-08,-3646500.0,0.01825081,3.610222e-08,505531.6
E00200,-9.187332e-07,2.275662e-08,-40.37212,-1.385492e-08,7.13181e-09,-1.942693
E00300,-2.939419e-06,1.848629e-07,-15.90054,1.041168e-07,3.465442e-08,3.004431
E00400,-4.161534e-06,2.552957e-07,-16.30084,-1.13054e-07,4.949649e-08,-2.284081
E00600,-1.28235e-06,1.051005e-07,-12.20118,3.428553e-09,1.607993e-08,0.2132194
E00800,-1.901714e-05,5.998101e-06,-3.170527,-1.904443e-05,1.155125e-05,-1.64869


[1] "Residual Deviance"


## Intercept-only summary

In [55]:
# Intercept-only multinomial model, fit on the full (pruned) puf data frame,
# together with its summary and its predicted class probabilities.

int_P22250_model <- multinom(P22250_sign ~ 1, data = puf)
int_model_sum <- summary(int_P22250_model)
int_P22250_model_probs <- predict(int_P22250_model, puf, type = 'prob')

# Coefficient table: estimate, standard error and their ratio per outcome row.
print('Model summary')
int_coef <- int_model_sum$coefficients
int_se <- int_model_sum$standard.errors
int_model_sum_df <- setNames(
    data.frame(int_coef, int_se, int_coef / int_se),
    c('Coefficients', 'Std Errors', 'T-stats')
)
int_model_sum_df

print('Residual deviance')
int_P22250_model$deviance

# With no predictors every row gets the same probabilities, so the first row
# is shown as representative.
print('Predicted probabilities')
t(int_P22250_model_probs[1,])

# weights:  6 (2 variable)
initial  value 179937.312312 
final  value 124030.211417 
converged
[1] "Model summary"


Unnamed: 0,Coefficients,Std Errors,T-stats
0,1.6431718,0.007149548,229.82877
1,-0.1696247,0.009673945,-17.53418


[1] "Residual deviance"


[1] "Predicted probabilities"


-1,0,1
0.1425409,0.7371572,0.1203019


# Imputation & testing

## Stochastic imputation function

In [56]:
# Predicted sign probabilities from the fitted model: `probs0` for the testing
# rows, `probs0_full` for the full dataset.

probs0 <- predict(P22250_model, newdata = testing, type = 'prob')
probs0_full <- predict(P22250_model, newdata = puf, type = 'prob')

# Both probability tables are written to csv for log-loss computation.

write.csv(probs0, file = 'nnet_predictions_test')
write.csv(probs0_full, file = 'nnet_predictions_full')

# Stochastic imputation of a sign category from one row of probabilities.
#
# Arguments:
#   runif - a single uniform draw in [0, 1].
#   probs - one row of predicted probabilities, ordered as
#           probs[[1]] = P(negative), probs[[2]] = P(zero),
#           probs[[3]] = P(positive).
#
# Returns -1, 0 or 1: the first category whose cumulative probability is
# strictly greater than the draw (inverse-CDF sampling).

stoch_imp <- function(runif, probs) {
    # The draw falls inside the negative band.
    if (runif < probs[[1]]) {
        return(-1)
    }

    # The draw falls inside the zero band; the cutoff is only computed when
    # the negative band has been ruled out.
    zero_cutoff <- probs[[2]] + probs[[1]]
    if (runif < zero_cutoff) {
        return(0)
    }

    # Everything above the zero cutoff is positive.
    1
}


## Imputation

In [57]:
# Draw one uniform random number in [0, 1] per testing row (fixed seed for
# reproducibility) and impute a sign for each row from its predicted
# probabilities.

set.seed(1995)
test_rand0 <- runif(nrow(testing))

# Row k of probs0 holds the predicted probabilities for row k of testing, so
# stoch_imp is applied to each (draw, probability row) pair. The whole
# prediction column is assigned in one step: writing into the data frame one
# cell at a time inside a loop is slow, and the NA-initialised column it
# requires fails when testing has zero rows.

testing$P22250_p0_sign <- vapply(
    seq_along(test_rand0),
    function(row) stoch_imp(test_rand0[row], probs0[row, ]),
    numeric(1)
)


## Testing

### Percent accuracy

In [58]:
# Compare actual and imputed P22250 signs on the testing rows.
# `actual` comes from P22250_sign and is converted via character to numeric so
# it compares correctly with the numeric `predicted` column (P22250_p0_sign)
# even when P22250_sign is stored as a factor.

P22250_test <- data.frame(
    actual = as.numeric(as.character(testing$P22250_sign)),
    predicted = testing$P22250_p0_sign
)

# Logical accuracy column: TRUE = correct prediction, FALSE = wrong.

P22250_test$correct_sign <- P22250_test$actual == P22250_test$predicted

# Share of testing rows whose imputed sign matches the actual sign.

accuracy <- mean(P22250_test$correct_sign)

accuracy