# P22250 Model

In [88]:
library('nnet')

In [89]:
# puf2011signed.csv holds only the columns of interest (dependent and independent variables) as well as
# the '<dependent>_dum' columns, which equal -1, 0 or 1 for a negative / zero / positive value of the
# given dependent variable.
puf2011 <- read.csv('puf2011signed.csv')

# Store the P22250 sign column as a factor so it is modelled as a categorical outcome.
puf2011[['P22250_dum']] <- as.factor(puf2011[['P22250_dum']])

In [90]:
# Label every row of puf2011 with 1 (probability 0.8, training) or 2 (probability 0.2, validation).
# The seed makes the random partition reproducible.
set.seed(1994)
sample <- sample(2, size = nrow(puf2011), replace = TRUE, prob = c(0.8, 0.2))

# Rows labelled 1 form the training set; rows labelled 2 form the validation set.
validation <- puf2011[sample == 2, ]
training <- puf2011[sample == 1, ]


In [91]:
# Full model: multinomial logistic regression (nnet::multinom) of the P22250 sign category on the listed
# predictors, fit on the training partition only.
# maxit = 500 raises the optimiser's iteration cap (nnet's documented default is 100); the fit logged
# below converged after 130 iterations.
P22250_model <- multinom(P22250_dum ~ DSI + EIC + MARS1 + MARS2 + MARS3 + MARS4 +
                                      E00200 + E00300 + E00600 + E00800 + E00900 +
                                      E01100 + E01400 + E01500 + E01700 + E02100 + 
                                      E02300 + E02400 + E03150 + E03210 + E03240 +
                                      E03270 + E03300 + E17500 + E18400 + E18500 + 
                                      E19200 + E19800 + E20100 + E20400 + E32800, data = training, maxit = 500)

# weights:  99 (64 variable)
initial  value 144108.269742 
iter  10 value 141199.529538
iter  20 value 138914.014521
iter  30 value 136645.121220
iter  40 value 134404.959155
iter  50 value 132804.441287
iter  60 value 85186.309161
iter  70 value 76187.667467
iter  80 value 75682.328161
iter  90 value 75488.008479
iter 100 value 75318.748665
iter 110 value 75256.852122
iter 120 value 75238.685100
iter 130 value 75238.158291
iter 130 value 75238.158194
final  value 75238.158194 
converged


# Intercept-only P22250 model

In [92]:
# Intercept-only multinomial model of the P22250 sign (no predictors).
# NOTE(review): this model is fit on the full puf2011 data, whereas the full model is fit on `training`
# only -- confirm that the difference is intended.
int_P22250_model <- multinom(P22250_dum ~ 1 , data = puf2011)
# Display the fitted intercept-only model.
int_P22250_model
# Predicted class probabilities for every row of puf2011. With no predictors, each row receives the
# same three probabilities.
int_P22250_model_probs <- predict(int_P22250_model, puf2011, type = 'prob')
# Show the first rows of the predicted probabilities.
head(int_P22250_model_probs)

# weights:  6 (2 variable)
initial  value 179937.312312 
final  value 124030.211417 
converged


Call:
multinom(formula = P22250_dum ~ 1, data = puf2011)

Coefficients:
  (Intercept)
0   1.6431718
1  -0.1696247

Residual Deviance: 248060.4 
AIC: 248064.4 

Unnamed: 0,-1,0,1
1,0.1425409,0.7371572,0.1203019
2,0.1425409,0.7371572,0.1203019
3,0.1425409,0.7371572,0.1203019
4,0.1425409,0.7371572,0.1203019
5,0.1425409,0.7371572,0.1203019
6,0.1425409,0.7371572,0.1203019


# Full model analysis

In [93]:
# Display the fitted full model (the recorded call and the coefficient table).
P22250_model

Call:
multinom(formula = P22250_dum ~ DSI + EIC + MARS1 + MARS2 + MARS3 + 
    MARS4 + E00200 + E00300 + E00600 + E00800 + E00900 + E01100 + 
    E01400 + E01500 + E01700 + E02100 + E02300 + E02400 + E03150 + 
    E03210 + E03240 + E03270 + E03300 + E17500 + E18400 + E18500 + 
    E19200 + E19800 + E20100 + E20400 + E32800, data = training, 
    maxit = 500)

Coefficients:
  (Intercept)       DSI        EIC       MARS1       MARS2       MARS3
0   2.1615806 1.0340209 1.38497540  0.73189817 -0.03252646  0.39561427
1  -0.1377918 0.2595269 0.02441548 -0.02390417  0.01099370 -0.05143389
        MARS4        E00200        E00300        E00600        E00800
0  1.06663253 -8.616475e-07 -3.250976e-06 -1.572590e-06 -2.478288e-05
1 -0.07347463 -1.338175e-08  9.594473e-08 -6.185212e-10 -1.714328e-05
         E00900    E01100        E01400        E01500        E01700
0  1.280725e-08  6.536524 -1.452021e-06 -3.888786e-07 -3.195626e-07
1 -3.442177e-08 -4.144809  5.229577e-09 -8.776849e-09 -4.377547e-

# Imputation

## Stochastic imputation function

In [94]:
# 'probs0' holds, for each validation row, the predicted probability of every categorical P22250
# outcome under the full model (one column per outcome).
probs0 <- predict(P22250_model, newdata = validation, type = 'prob')

# Stochastic imputation of one P22250 sign.
# Arguments:
#   runif - a single random uniform draw in [0,1]
#   probs - one row of probs0: probs[[1]] = P(negative), probs[[2]] = P(zero), probs[[3]] = P(positive)
# Returns the first category whose cumulative probability exceeds the draw:
#   -1 if runif < P(negative); 0 if runif < P(negative) + P(zero); otherwise 1.

stoch_imp <- function(runif, probs) {
    # The draw falls inside the negative band.
    if (runif < probs[[1]]) {
        return(-1)
    }
    # The draw falls inside the zero band.
    if (runif < (probs[[2]] + probs[[1]])) {
        return(0)
    }
    # Everything above the zero band maps to the positive category.
    1
}


## Testing imputations with different seed values

### Imputation 0

In [95]:
# Imputation 0: seed the generator and draw one uniform random number in [0,1] per validation row.
# All prediction columns are labelled P22250_p#_dum, with # corresponding to each imputation.
set.seed(1995)
valid_rand0 <- runif(nrow(validation))

# Row r of probs0 was predicted from row r of validation, so draw r is paired with probability row r
# and passed to stoch_imp(). The full prediction vector is built first and the column is assigned
# once, rather than writing into the data frame one row at a time with a hand-maintained counter.
validation$P22250_p0_dum <- vapply(
    seq_along(valid_rand0),
    function(row) stoch_imp(valid_rand0[row], probs0[row, ]),
    numeric(1)
)


In [96]:
# Pair the actual P22250 sign (P22250_dum, converted from factor back to its numeric value) with the
# predicted sign (P22250_p0_dum) for every validation row.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p0_dum = validation$P22250_p0_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p0_dum - P22250_valid$P22250_dum

# Rows where the actual P22250 sign is not zero.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]

# Correctly predicted rows, for the non-zero subset and for the full validation set.
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]

# Percent of correct predictions in each set. The final accuracy variables are labelled
# 'not_zero_percent_accurate_round#' or 'percent_accurate_round#', with # corresponding to the
# imputation; they are rounded to 2 decimals and stored as character strings.
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
percent_accurate_round0 <- as.character(round(percent_accurate, 2))
not_zero_percent_accurate_round0 <- as.character(round(not_zero_percent_accurate, 2))

# Results
cat(not_zero_percent_accurate_round0, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round0, '% of predictions for full validation dataset were accurate.')

27.09 % of predictions for non-zero validation observations were accurate.
68.55 % of predictions for full validation dataset were accurate.

The imputations below follow the same procedure as Imputation 0 but are not annotated in the same detail. The number represented by '#' in the following objects increases by 1 for each imputation: P22250_p#_dum, valid_rand#, not_zero_percent_accurate_round#, percent_accurate_round#. The seed passed to set.seed() also increases by 1 each time (1995 + #). 

### Imputation 1

In [97]:
# Imputation 1: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(1996)
valid_rand1 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p1_dum <- vapply(
    seq_along(valid_rand1),
    function(row) stoch_imp(valid_rand1[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p1_dum = validation$P22250_p1_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p1_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round1 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round1 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round1, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round1, '% of predictions for full validation dataset were accurate.')

27.29 % of predictions for non-zero validation observations were accurate.
68.42 % of predictions for full validation dataset were accurate.

### Imputation 2

In [98]:
# Imputation 2: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(1997)
valid_rand2 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p2_dum <- vapply(
    seq_along(valid_rand2),
    function(row) stoch_imp(valid_rand2[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p2_dum = validation$P22250_p2_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p2_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round2 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round2 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round2, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round2, '% of predictions for full validation dataset were accurate.')

27.76 % of predictions for non-zero validation observations were accurate.
68.49 % of predictions for full validation dataset were accurate.

### Imputation 3

In [99]:
# Imputation 3: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(1998)
valid_rand3 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p3_dum <- vapply(
    seq_along(valid_rand3),
    function(row) stoch_imp(valid_rand3[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p3_dum = validation$P22250_p3_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p3_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round3 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round3 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round3, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round3, '% of predictions for full validation dataset were accurate.')

27.85 % of predictions for non-zero validation observations were accurate.
68.66 % of predictions for full validation dataset were accurate.

### Imputation 4

In [100]:
# Imputation 4: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(1999)
valid_rand4 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p4_dum <- vapply(
    seq_along(valid_rand4),
    function(row) stoch_imp(valid_rand4[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p4_dum = validation$P22250_p4_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p4_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round4 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round4 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round4, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round4, '% of predictions for full validation dataset were accurate.')

26.98 % of predictions for non-zero validation observations were accurate.
68.47 % of predictions for full validation dataset were accurate.

### Imputation 5

In [101]:
# Imputation 5: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(2000)
valid_rand5 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p5_dum <- vapply(
    seq_along(valid_rand5),
    function(row) stoch_imp(valid_rand5[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p5_dum = validation$P22250_p5_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p5_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round5 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round5 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round5, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round5, '% of predictions for full validation dataset were accurate.')

27.01 % of predictions for non-zero validation observations were accurate.
68.14 % of predictions for full validation dataset were accurate.

### Imputation 6

In [102]:
# Imputation 6: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(2001)
valid_rand6 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p6_dum <- vapply(
    seq_along(valid_rand6),
    function(row) stoch_imp(valid_rand6[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p6_dum = validation$P22250_p6_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p6_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round6 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round6 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round6, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round6, '% of predictions for full validation dataset were accurate.')

26.79 % of predictions for non-zero validation observations were accurate.
68.4 % of predictions for full validation dataset were accurate.

### Imputation 7

In [103]:
# Imputation 7: seed the generator and draw one uniform random number in [0,1] per validation row.
set.seed(2002)
valid_rand7 <- runif(nrow(validation))

# Draw r is paired with row r of probs0 and passed to stoch_imp(). The prediction vector is built
# first and the column assigned once, rather than writing into the data frame one row at a time.
validation$P22250_p7_dum <- vapply(
    seq_along(valid_rand7),
    function(row) stoch_imp(valid_rand7[row], probs0[row, ]),
    numeric(1)
)

# Actual sign (factor converted back to its numeric value) next to the predicted sign.
P22250_valid <- data.frame(
    P22250_dum = as.numeric(as.character(validation$P22250_dum)),
    P22250_p7_dum = validation$P22250_p7_dum
)

# difference = predicted sign - actual sign. A difference of 0 marks a correct prediction.
P22250_valid$difference <- P22250_valid$P22250_p7_dum - P22250_valid$P22250_dum

# Rows whose actual sign is non-zero, then the correctly predicted rows of each set.
not_zero <- P22250_valid[P22250_valid$P22250_dum != 0, ]
not_zero_correct_prediction <- not_zero[not_zero$difference == 0, ]
correct_prediction <- P22250_valid[P22250_valid$difference == 0, ]

# Percent correct for the non-zero rows and for all validation rows, rounded to 2 decimals and stored
# as character strings.
not_zero_percent_accurate <- nrow(not_zero_correct_prediction) / nrow(not_zero) * 100
not_zero_percent_accurate_round7 <- as.character(round(not_zero_percent_accurate, 2))
percent_accurate <- nrow(correct_prediction) / nrow(P22250_valid) * 100
percent_accurate_round7 <- as.character(round(percent_accurate, 2))

cat(not_zero_percent_accurate_round7, '% of predictions for non-zero validation observations were accurate.\n')
cat(percent_accurate_round7, '% of predictions for full validation dataset were accurate.')

28.02 % of predictions for non-zero validation observations were accurate.
68.64 % of predictions for full validation dataset were accurate.

## Mean model accuracy


In [104]:
# Collect the per-imputation accuracy strings (imputation 7 first, imputation 0 last) and average them.
# NOTE(review): the inputs were already rounded to 2 decimals and stored as character strings, so each
# mean is a mean of rounded values -- confirm that this precision is acceptable.

# Accuracy on validation rows whose actual P22250 sign is non-zero.
not_zero_accuracy_stats <- c(
    not_zero_percent_accurate_round7,
    not_zero_percent_accurate_round6,
    not_zero_percent_accurate_round5,
    not_zero_percent_accurate_round4,
    not_zero_percent_accurate_round3,
    not_zero_percent_accurate_round2,
    not_zero_percent_accurate_round1,
    not_zero_percent_accurate_round0
)
not_zero_accuracy_stats_mean <- round(mean(as.numeric(not_zero_accuracy_stats)), 2)

# Accuracy on the full validation dataset.
accuracy_stats <- c(
    percent_accurate_round7,
    percent_accurate_round6,
    percent_accurate_round5,
    percent_accurate_round4,
    percent_accurate_round3,
    percent_accurate_round2,
    percent_accurate_round1,
    percent_accurate_round0
)
accuracy_stats_mean <- round(mean(as.numeric(accuracy_stats)), 2)

# Report both means.
cat('Non-zero validation data mean accuracy = %', as.character(not_zero_accuracy_stats_mean), '\n')
cat('Full validation model mean accuracy = %', as.character(accuracy_stats_mean), '\n')


Non-zero validation data mean accuracy = % 27.35 
Full validation model mean accuracy = % 68.47 
