# nnet P22250 Model

In [13]:
library('nnet')

In [14]:
# puf2011signed holds only the predictor/response variables, as well as sign
# columns (-1, 0, 1 for negative/zero/positive values).
puf2011 <- read.csv(file = "puf2011signed.csv")
puf2011test <- read.csv(file = "puf2011test")

# P22250_dum is the sign column of P22250. It is stored as a factor so that it
# is treated as a categorical variable rather than a number.
puf2011[["P22250_dum"]] <- as.factor(puf2011[["P22250_dum"]])

In [15]:
# Random split: every row is independently labelled 1 (probability 0.8) or
# 2 (probability 0.2), so the 80% / 20% proportions are only approximate.
# Note: the label vector is named `sample`, which shadows base::sample for
# variable lookups (function calls to sample() still resolve correctly).
set.seed(1994)
sample <- sample(2, size = nrow(puf2011), replace = TRUE, prob = c(0.8, 0.2))

# Label 1 -> training rows, label 2 -> validation rows.
in_training <- sample == 1
training <- puf2011[in_training, ]
validation <- puf2011[!in_training, ]

# Save both subsets as CSV so they can be reused later from Python's sklearn.
# write.csv also writes the row names as an unnamed first column.
write.csv(x = training, file = "puf80%training")
write.csv(x = validation, file = "puf20%validation")


In [16]:
# Model
# Fits a multinomial logit (nnet::multinom) for the sign of P22250
# (P22250_dum: -1, 0 or 1) on 28 predictors, using the training subset only.
# maxit = 500 sets the cap on optimizer iterations for this fit.
# NOTE(review): the predictors enter unscaled. The summary output recorded
# further down shows standard errors as small as ~1e-12 and t-stats above
# 1e11, which may indicate an ill-conditioned Hessian -- consider rescaling
# the predictors and confirming that the optimizer reports convergence.
P22250_model <- multinom(P22250_dum ~ DSI + EIC + MARS + E00200 + E00300 + E00600 + 
                                      E00800 + E00900 + E01100 + E01400 + E01500 + 
                                      E01700 + E02100 + E02300 + E02400 + E03150 + 
                                      E03210 + E03240 + E03270 + E03300 + E17500 + 
                                      E18400 + E18500 + E19200 + E19800 + E20100 + 
                                      E20400 + E32800, data = training, maxit = 500)

# weights:  90 (58 variable)
initial  value 144108.269742 
iter  10 value 141199.529004
iter  20 value 138914.010614
iter  30 value 136645.050312
iter  40 value 134404.358898
iter  50 value 132732.392183
iter  60 value 81914.988284
iter  70 value 78659.009089
iter  80 value 77592.410493
iter  90 value 77332.207517
iter 100 value 77324.500150
iter 110 value 77310.057698
iter 120 value 77134.832568
iter 130 value 76753.415788
iter 140 value 76628.082428
iter 150 value 76627.210928
iter 160 value 76626.999565
iter 170 value 76626.052425
iter 180 value 76555.734649
iter 190 value 76497.404597
iter 200 value 76496.091278
iter 210 value 76495.774015
iter 220 value 76492.753263
iter 230 value 76481.332955
iter 240 value 76390.609197
iter 250 value 76355.207421
iter 260 value 76354.989095
iter 270 value 76354.816129
iter 280 value 76354.579849
iter 290 value 76354.294635
iter 300 value 76284.182418
iter 310 value 76274.089401
iter 320 value 76274.042392
iter 330 value 76273.033974
iter 340 val

## Summary

In [17]:
# Summary of the fitted model; it carries the coefficient and standard-error
# matrices, with one row per modelled sign level and one column per term.
model_sum <- summary(P22250_model)

print("Model Summary")

# Transpose so that each model term is a row and each sign level a column.
# Column 1 becomes the "_0" set of columns below, column 2 the "_1" set.
coef_by_term <- t(model_sum$coefficients)
se_by_term <- t(model_sum$standard.errors)

# t-statistic = coefficient / standard error, computed per sign level.
T_Stats_0 <- unname(coef_by_term[, 1] / se_by_term[, 1])
T_Stats_1 <- unname(coef_by_term[, 2] / se_by_term[, 2])

# One row per term (row names are the term names); columns grouped by level
# as coefficient, standard error, t-statistic.
model_sum_df <- data.frame(
    Coefficients_0 = coef_by_term[, 1],
    Std_Errors_0 = se_by_term[, 1],
    T_Stats_0 = T_Stats_0,
    Coefficients_1 = coef_by_term[, 2],
    Std_Errors_1 = se_by_term[, 2],
    T_Stats_1 = T_Stats_1
)

model_sum_df

# Residual deviance of the fitted model.

print("Residual Deviance")
model_sum$deviance

[1] "Model Summary"


Unnamed: 0,Coefficients_0,Std_Errors_0,T_Stats_0,Coefficients_1,Std_Errors_1,T_Stats_1
(Intercept),2.701511,2.39481e-08,112806900.0,-0.1165657,1.386505e-08,-8407162.0
DSI,1.369243,3.409256e-12,401625000000.0,0.3161023,2.202593e-12,143513700000.0
EIC,1.776884,2.480736e-10,7162730000.0,0.1388389,1.753808e-10,791642700.0
MARS,-0.1004924,4.078906e-08,-2463709.0,-0.01246345,2.397615e-08,-519827.0
E00200,-1.013915e-06,2.315448e-08,-43.78917,-1.263558e-08,7.106322e-09,-1.778076
E00300,-3.355194e-06,1.894106e-07,-17.71387,9.54782e-08,3.364994e-08,2.837396
E00600,-1.575232e-06,1.113731e-07,-14.14374,-1.091615e-09,1.588065e-08,-0.06873868
E00800,-1.51625e-05,5.873933e-06,-2.58132,-2.070762e-05,1.20224e-05,-1.722419
E00900,1.911272e-08,3.617405e-08,0.5283544,-3.469739e-08,3.314916e-08,-1.046705
E01100,0.6108949,2.154595e-11,28353110000.0,0.3530921,1.759658e-11,20065960000.0


[1] "Residual Deviance"


## Intercept-only summary

In [18]:
# Intercept-only baseline: model, predicted probabilities, and summary.
# NOTE(review): this baseline is fitted on the full puf2011 data, whereas the
# main model is fitted on `training` -- confirm the two deviances are meant
# to be compared across different samples.

int_P22250_model <- multinom(P22250_dum ~ 1, data = puf2011)
int_P22250_model_probs <- predict(int_P22250_model, newdata = puf2011, type = 'prob')
int_model_sum <- summary(int_P22250_model)

print("Model summary")
# One row per modelled sign level: coefficient, standard error and their ratio.
int_coef <- int_model_sum$coefficients
int_se <- int_model_sum$standard.errors
int_model_sum_df <- setNames(
    data.frame(int_coef, int_se, int_coef / int_se),
    c("Coefficients", "Std Errors", "T-stats")
)
int_model_sum_df

print("Residual deviance")
int_P22250_model$deviance

# Only the first row of predicted probabilities is shown.
print("Predicted probabilities")
t(int_P22250_model_probs[1, ])

# weights:  6 (2 variable)
initial  value 179937.312312 
final  value 124030.211417 
converged
[1] "Model summary"


Unnamed: 0,Coefficients,Std Errors,T-stats
0,1.6431718,0.007149548,229.82877
1,-0.1696247,0.009673945,-17.53418


[1] "Residual deviance"


[1] "Predicted probabilities"


-1,0,1
0.1425409,0.7371572,0.1203019


# Imputation & validation

## Stochastic imputation function

In [25]:
# probs0 and probs0_full hold the predicted sign probabilities for the
# validation set and for the full dataset respectively. Both are written to
# CSV so that log-loss can be computed from them.

probs0 <- predict(P22250_model, newdata = validation, type = 'prob')
probs0_full <- predict(P22250_model, newdata = puf2011, type = 'prob')

write.csv(x = probs0, file = "nnet_predictions_valid")
write.csv(x = probs0_full, file = "nnet_predictions_full")

# Stochastic imputation of a sign from predicted probabilities.
#
# Arguments:
#   runif - a single uniform draw in [0, 1]
#   probs - one row of probs0, ordered as
#           probs[[1]] = P(negative), probs[[2]] = P(zero),
#           probs[[3]] = P(positive)
#
# Returns the first category whose cumulative probability exceeds the draw:
# -1 (negative), 0 (zero) or 1 (positive).

stoch_imp <- function(runif, probs) {
    # The draw falls inside the first segment of the CDF.
    if (runif < probs[[1]]) {
        return(-1)
    }
    # Otherwise compare against the upper edge of the "zero" segment; anything
    # at or beyond it is classed as positive.
    zero_upper <- probs[[2]] + probs[[1]]
    if (runif < zero_upper) 0 else 1
}


## Imputation

In [22]:
# Draws one uniform random number in [0, 1] per validation row and imputes a
# sign for every row with stoch_imp, storing the result in the new column
# validation$P22250_p0_dum.

set.seed(1995)
valid_rand0 <- runif(nrow(validation))

# probs0 must be a matrix with exactly one row of probabilities per validation
# row; otherwise draws and probabilities would be misaligned.
stopifnot(is.matrix(probs0), nrow(probs0) == nrow(validation))

# Row i of probs0 is paired with draw i. The whole column is built in one pass
# and assigned once, instead of writing into the data frame row by row (which
# re-dispatches the data-frame assignment method on every iteration).
validation$P22250_p0_dum <- vapply(
    seq_along(valid_rand0),
    function(row) stoch_imp(valid_rand0[[row]], probs0[row, ]),
    numeric(1)
)


## Validation

### Percent accuracy

In [42]:
# Data frame pairing the actual and predicted signs of P22250 for the
# validation rows (P22250_dum = actual, P22250_p0_dum = predicted).
# The actual sign is a factor, so it goes through its character labels to
# recover the numeric values -1/0/1 rather than the internal level codes.

P22250_valid <- data.frame(
    actual = as.numeric(as.character(validation$P22250_dum)),
    predicted = validation$P22250_p0_dum
)

# Logical accuracy column: TRUE = correct prediction, FALSE = wrong.

P22250_valid$correct_sign <- P22250_valid$actual == P22250_valid$predicted

# Share of validation rows whose sign was predicted correctly.

accuracy <- mean(P22250_valid$correct_sign)

accuracy