# Predictive Maintenance

In this notebook, we will explore various methods to handle the imbalance between the number of failures compared to non-failures.  In addition, we will attempt to fit various classification methods to the data in hopes of successsfully predicting a failure before it happens.

## Customize Enviornment

In [2]:
# load packages
packages <- c("purrr", "doMC", "RMySQL", "lubridate", "ggplot2", "caret", "DMwR", "ROSE", "e1071", "randomForest", 
              "party", "Matrix", "xgboost", "DiagrammeR", "ipred", "nnet", "pROC")
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)

# set default plot size
options(repr.plot.width=10, repr.plot.height=6)

# configure multicore processing
registerDoMC(cores=8)

## Load Data

In [3]:
# create an open connection to the database
#cn <- dbConnect(drv = RMySQL::MySQL(),
#                host = "host",
#                port = 3306,
#                user = "user",
#                password = "password",
#                dbname = "db")

# query the database and store the results into a DataFrame
#df <- dbGetQuery(cn, "SELECT * FROM dryer3_1_hour_before_fail")
#dbDisconnect(cn)

In [4]:
# save df to local machine
#saveRDS(df, "dryer3_1_hour_before_fail.Rda")
df <- readRDS("dryer3_1_hour_before_fail.Rda")

In [None]:
# remove rows without complete data
df <- df_all[complete.cases(df_all),]
nrow(df)

In [5]:
# create a new variable that indicates failure in the next hour
df$Class  <- as.factor(df$FAIL)

# drop the old failure indicators
df$FAIL <- NULL

# create variable names for the Class variable
levels(df$Class) <- make.names(c("notFail", "fail"))

In [6]:
# create new variables for dates and time
df$Datetime_hour <- ymd_hms(df$Datetime)
df$Year <- year(df$Datetime_hour)
df$Month <- month(df$Datetime_hour)
df$Day <- day(df$Datetime_hour)
df$Hour <- hour(df$Datetime_hour)
df$Minute <- minute(df$Datetime_hour)
df$Datetime_hour <- as.factor(df$Datetime_hour)

In [None]:
# look at the number of variables
cat("Number of variables before:")
ncol(df)

# drop variables are are constant
df <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
    
# look at the number of variables
cat("\nNumber of variables after:")
ncol(df)

In [None]:
# split data into traning and test
set.seed(1234)
trainIndex <- createDataPartition(df$Class, 
                                  p = .6, 
                                  list = FALSE, 
                                  times = 1)

imbal_train <- df[ trainIndex,]
non_train   <- df[-trainIndex,]

trainIndex <- createDataPartition(non_train$Class, 
                                  p = .5, 
                                  list = FALSE, 
                                  times = 1)

imbal_test  <- non_train[ trainIndex,]
imbal_valid <- non_train[-trainIndex,]

In [None]:
# seperate datetime_hour from each dataset so that I can join it up late
train_dt <- imbal_train$Datetime_hour
imbal_train$Datetime_hour <- NULL

test_dt <- imbal_test$Datetime_hour
imbal_test$Datetime_hour <- NULL

valid_dt <- imbal_valid$Datetime_hour
imbal_valid$Datetime_hour <- NULL

In [None]:
set.seed(1234)
smote_train <- SMOTE(Class ~ ., 
                     data = imbal_train, 
                     perc.over = 1000)                         

cat("Number of events by type in the original dataset:")
table(imbal_train$Class)

cat("\nNumber of events by type in the balanced dataset:")
table(smote_train$Class)

In [15]:
set.seed(1234)
rose_train <- ROSE(Class ~ ., 
                   data = imbal_train)$data                         

table(rose_train$Class) 


notFail    fail 
   1644    1660 

## Compare Balancing Methods

### In order to evalaute the usefulness of the 4 different balancing methods, we will be run the same classification methods on each of these new datasets and compare the results

In [16]:
ctrl <- trainControl(method = "repeatedcv", 
                     repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)

In [90]:
set.seed(1234)
orig_fit <- train(Class ~ ., 
                   data = imbal_train, 
                   method = "treebag",
                   nbagg = 100,
                   metric = "ROC",
                   trControl = ctrl,
                   na.action=na.exclude)

In [91]:
set.seed(1234)
down_fit <- train(Class ~ ., 
                   data = down_train, 
                   method = "treebag",
                   nbagg = 100,
                   metric = "ROC",
                   trControl = ctrl,
                   na.action=na.exclude)

In [92]:
set.seed(1234)
up_fit <- train(Class ~ ., 
                data = up_train, 
                method = "treebag",
                nbagg = 100,
                metric = "ROC",
                trControl = ctrl,
                na.action=na.exclude)

In [93]:
set.seed(1234)
smote_fit <- train(Class ~ ., 
                    data = smote_train, 
                    method = "treebag",
                    nbagg = 100,
                    metric = "ROC",
                    trControl = ctrl,
                    na.action=na.exclude)

In [94]:
set.seed(1234)
rose_fit <- train(Class ~ ., 
                  data = rose_train, 
                  method = "treebag",
                  nbagg = 100,
                  metric = "ROC",
                  trControl = ctrl,
                  na.action=na.exclude)

#### To compare the different balancing methods, we will validate the models with the test data and compare the ROC values

In [97]:
balance_models <- list(original = orig_fit,
                       down = down_fit,
                       up = up_fit,
                       SMOTE = smote_fit,
                       ROSE = rose_fit)

balance_resampling <- resamples(balance_models)

test_roc <- function(model, data) {
  library(pROC)
  roc_obj <- roc(data$Class, 
                 predict(model, data, type = "prob")[, "fail"])
  ci(roc_obj)
  }

balance_test <- lapply(balance_models, test_roc, data = imbal_test)
balance_test <- lapply(balance_test, as.vector)
balance_test <- do.call("rbind", balance_test)
colnames(balance_test) <- c("lower", "ROC", "upper")
balance_test <- as.data.frame(balance_test)

#summary(balance_resampling, metric = "ROC")
balance_test

Unnamed: 0,lower,ROC,upper
original,0.4192415,0.5674266,0.7156116
down,0.4491774,0.5778408,0.7065041
up,0.3881976,0.467714,0.5472304
SMOTE,0.4790826,0.6054611,0.7318395
ROSE,0.4551639,0.528014,0.6008641


## Explore Classification Methods

In [19]:
# define contrls for each model
fit_ctrl <- trainControl(method = "repeatedcv", 
                         repeats = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary)

### Decision Tree

In [None]:
set.seed(1234)
rpart_fit <- train(Class ~ ., 
                   data = rose_train,
                   method = "rpart",
                   metric = "ROC",
                   trControl = fit_ctrl,
                   na.action = na.exclude)

In [None]:
rpart_fit

In [None]:
rpart_pred <- predict(rpart_fit, imbal_test, type = "prob")
rpart_pred$pred <- factor(ifelse(rpart_pred$notFail >= .5, "notFail", "fail"))
rpart_pred <- cbind(rpart_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = rpart_pred$pred, reference = rpart_pred$actual)

### Bagged Tree

In [None]:
ctrl <- trainControl(method = "repeatedcv", 
                     repeats = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary)

In [None]:
set.seed(1234)
treebag_fit <- train(Class ~ ., 
                     data = up_train,
                     method = "treebag",
                     nbagg = 100,
                     metric = "ROC",
                     trControl = ctrl, 
                     na.action=na.exclude)

In [None]:
treebag_fit

In [None]:
treebag_pred <- predict(treebag_fit, imbal_test, type = "prob")
treebag_pred$pred <- factor(ifelse(treebag_pred$notFail >= .5, "notFail", "fail"))
treebag_pred <- cbind(treebag_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = rpart_pred$pred, reference = rpart_pred$actual)

### Support Vector Machine

In [None]:
set.seed(1234)
svm_fit <- train(Class ~ ., 
                 data = rose_train, 
                 method = "svmRadial", 
                 trControl = fit_ctrl, 
                 preProc = c("center", "scale"),
                 tuneLength = 8,
                 metric = "ROC", 
                 na.action = na.exclude)

In [None]:
svm_fit

In [None]:
svm_pred <- predict(svm_fit, imbal_test, type = "prob")
svm_pred$pred <- factor(ifelse(svm_pred$notFail >= .5, "notFail", "fail"))
svm_pred <- cbind(svm_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = svm_pred$pred, reference = svm_pred$actual)

### AdaBoost Classification Trees

In [None]:
set.seed(1234)
ada_fit <- train(Class ~ ., 
                 data = rose_train, 
                 method = "adaboost",
                 trControl = fit_ctrl, 
                 na.action = na.exclude)

In [None]:
ada_fit

In [None]:
ada_pred <- predict(ada_fit, imbal_test, type = "prob")
ada_pred$pred <- factor(ifelse(ada_pred$notFail >= .5, "notFail", "fail"))
ada_pred <- cbind(ada_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = ada_pred$pred, reference = ada_pred$actual)

### Neural Network

In [None]:
set.seed(1234)
net_fit <- train(Class ~ ., 
                 data = imbal_train, 
                 method = "avNNet",
                 na.action = na.exclude)

In [None]:
net_fit

In [None]:
net_pred <- predict(net_fit, imbal_test, type = "prob")
net_pred$pred <- factor(ifelse(net_pred$notFail >= .5, "notFail", "fail"))
net_pred <- cbind(net_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = net_pred$pred, reference = net_pred$actual)

### Cost Sensitive Learning

In [None]:
stats <- function (data, lev = NULL, model = NULL)  {
  c(postResample(data[, "pred"], data[, "obs"]),
    Sens = sensitivity(data[, "pred"], data[, "obs"]),
    Spec = specificity(data[, "pred"], data[, "obs"]))
}

ctrl <- trainControl(method = "repeatedcv", 
                     repeats = 5,
                     summaryFunction = stats)

In [None]:
set.seed(1234)
c50_fit <- train(Class ~ ., 
                 data = rose_train, 
                 method = "C5.0Cost",
                 tuneGrid = expand.grid(model = "tree", 
                                        winnow = c(TRUE, FALSE),
                                        trials = c(1:10),
                                        cost = 1:10),
                 metric = "Kappa",
                 trControl = ctrl, 
                 na.action = na.exclude)

In [None]:
c50_fit

In [None]:
c50_pred <- predict(c50_fit, imbal_test)
c50_pred <- data.frame(predicted = c50_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = c50_pred$predicted, reference = c50_pred$actual)

### RPART with Cost

In [None]:
cctrl1 <- trainControl(method = "cv", 
                       number = 3, 
                       returnResamp = "all")

cctrl2 <- trainControl(method = "LOOCV")

cctrl3 <- trainControl(method = "none")

cctrlR <- trainControl(method = "cv", 
                       number = 3, 
                       returnResamp = "all", 
                       search = "random")

In [None]:
set.seed(1234)
rpartC_fit <- train(Class ~ ., 
                    data = rose_train, 
                    method = "rpartCost", 
                    trControl = cctrl2,
                    preProc = c("center", "scale"), 
                    metric = "Kappa",
                    na.action = na.exclude)

In [None]:
rpartC_fit

In [None]:
rpartC_fit_pred <- predict(rpartC_fit, imbal_test)
rpartC_fit_pred <- data.frame(predicted = rpartC_fit_pred, actual = imbal_test$Class)

In [None]:
confusionMatrix(data = rpartC_fit_pred$predicted, reference = rpartC_fit_pred$actual)