# SDSC3006 Group Project


## Steel Plates Faults Dataset (27 attributes, 1941 instances)


### Loading libraries


In [1]:
# Dev lib
# library(diffobj)


In [21]:
# Prod  lib
# suppressPackageStartupMessages({
library(repr)
library(ggplot2)
library(cowplot)
library(reshape2)
library(pROC)
library(caret)
library(e1071)
library(gbm)
library(kernlab)
library(MASS)
library(klaR)

# library(factoextra)
# library(haven)
# library(rpart)
# library(rpart.plot)
# })


### Supporting functions


In [3]:
# Resize notebook plots relative to the sizes stored in `repr_option_defaults`.
#
# wRatio: multiplier applied to the default `repr.plot.width`.
# hRatio: multiplier applied to the default `repr.plot.height`.
#
# Called for its side effect: it overwrites the `repr.plot.width` and
# `repr.plot.height` options for the rest of the session.
setPlotSize <- function(wRatio, hRatio) {
    scaledWidth <- wRatio * repr_option_defaults[["repr.plot.width"]]
    options(repr.plot.width = scaledWidth)

    scaledHeight <- hRatio * repr_option_defaults[["repr.plot.height"]]
    options(repr.plot.height = scaledHeight)
}


In [4]:
# Draw a confusion matrix as an annotated heat map.
#
# table: an object that melt() reshapes into long form with the columns
#        `Prediction`, `Reference` and `value`
#        (NOTE(review): presumably the `$table` of a caret confusionMatrix --
#        confirm against the callers).
#
# Returns the ggplot object; each tile is outlined in black, labelled with its
# count and filled on a reversed heat.colors() scale.
plotCM2Heatmap <- function(table) {
    cells <- melt(table)
    fillColours <- heat.colors(100, rev = TRUE)

    cmPlot <- ggplot(data = cells, aes(x = Prediction, y = Reference, fill = value))
    cmPlot <- cmPlot + geom_tile(color = "black")
    cmPlot <- cmPlot + geom_text(aes(label = value))
    cmPlot <- cmPlot + scale_fill_gradientn(colours = fillColours)
    # Fixed aspect ratio keeps the tiles square.
    cmPlot <- cmPlot + coord_fixed()
    cmPlot <- cmPlot + theme_grey(base_size = 14)
    # Slant the x-axis labels so they stay readable.
    cmPlot <- cmPlot + theme(axis.text.x = element_text(angle = 315, hjust = 0))

    cmPlot
}


In [5]:
# Overlay the ROC curves produced by a multiclass ROC analysis.
#
# prediction: passed as the first argument (the response) to multiclass.roc().
# predictor:  passed as the second argument (the predictor) to multiclass.roc().
# arrow:      forwarded as the `direction` argument of multiclass.roc().
#
# Called for its plotting side effect; returns NULL invisibly.
# NOTE: resets the global RNG with set.seed(0) so that the curve colours are
# the same on every call.
plotMultiRoc <- function(prediction, predictor, arrow) {
    set.seed(0)
    auc <- multiclass.roc(prediction, predictor, direction = arrow)
    # seq_along() yields an empty loop when there are no curves, whereas
    # 1:length(x) would iterate over c(1, 0) and fail on the first lookup.
    for (i in seq_along(auc$rocs)) {
        plot.roc(
            auc$rocs[[i]],
            # The first curve opens the plot; later curves are added to it.
            add = (i > 1),
            legacy.axes = TRUE,
            lwd = 2,
            # Draw exactly one colour index per curve instead of passing a
            # 100-element permutation as `col`.
            col = sample.int(100, 1)
        )
    }
}


### Set random seed


In [6]:
# Seed value passed to set.seed() before the train/test split; printed so the
# value in use is recorded in the notebook output.
seed <- 0
print(seed)


[1] 0


### Load dataset


In [7]:
# Read the raw data; the column names are taken from the first column (V1) of
# the companion file "Faults27x7_var".
df <- read.table(
    "Faults.NNA",
    col.names = as.vector(read.table("Faults27x7_var")$V1)
)

# Predictors: columns 1-27, centred and scaled to unit variance.
df.X <- scale(df[1:27])

# Response: columns 28-34 are collapsed into one factor by taking, per row, the
# name of the column holding the row maximum
# (NOTE(review): presumably one-hot fault indicators -- confirm).
# ties.method = "first" keeps the labelling deterministic; the default
# ("random") would pick a tied column at random.
df.Y <- data.frame(
    Faults = factor(names(df[28:34])[max.col(df[28:34], ties.method = "first")])
)


In [8]:
# setPlotSize(3, 1)
# ggplot(data = melt(cor(df.X, df[28:34])), aes(x = Var1, y = Var2, fill = value)) +
#     geom_tile(color = "black") +
#     scale_fill_gradientn(colours = heat.colors(100, rev = TRUE)) +
#     coord_fixed() +
#     theme_grey(base_size = 14) +
#     theme(axis.text.x = element_text(angle = 315, hjust = 0))


In [9]:
# setPlotSize(1, 1)


### PCA Transformation


In [10]:
# Principal component analysis on the 27 predictor columns only.
# The previous selection, df[, -length(df)], removed just the last column and
# so fed label columns 28-33 into the PCA, leaking the response into the
# transformed features.
# `scale.` is the full argument name of prcomp(); `scale` only worked through
# partial matching.
df.pca <- prcomp(df[1:27], scale. = TRUE)

# Matrix of principal component scores (one row per observation).
df.pcaX <- df.pca$x


In [11]:
# ggplot(cbind(df.pcaX, df.Y), aes(x = PC1, y = PC2, color = Faults)) +
#     geom_point() +
#     stat_ellipse(level = 0.95, show.legend = F) +
#     theme_grey(base_size = 14) +
#     theme(legend.position = c(0.2, 0.2))

# impt <- melt(summary(df.pca)$importance[3, ])
# ggplot(cbind(key = 1:nrow(impt), impt), aes(x = key, y = value, group = 1)) +
#     geom_line(color="grey") +
#     geom_point(shape=21, color="black", fill="purple", size=3) +
#     theme_grey(base_size = 14) +
#     xlab("Principal Component") +
#     ylab("Cumulative Variance Explained")


### Split dataset


In [12]:
# Reproducible hold-out split: `rand` holds the row indices of a random sample
# of roughly 80% of the observations (training set); the remaining rows form
# the test set.
set.seed(seed)
rand <- sample(nrow(df), size = nrow(df) * 0.8)

# Training partition: scaled predictors, PCA scores and labels.
# df.Y has a single column, so indexing its rows yields a plain factor vector.
df.train.X <- df.X[rand, ]
df.train.pcaX <- df.pcaX[rand, ]
df.train.Y <- df.Y[rand, ]

# Test partition: every row that was not sampled above.
df.test.X <- df.X[-rand, ]
df.test.pcaX <- df.pcaX[-rand, ]
df.test.Y <- df.Y[-rand, ]


In [16]:
# Resampling scheme shared by the model fits: 10-fold cross-validation.
# `repeats = 1` spells out caret's default for "repeatedcv", i.e. the folds are
# run once (equivalent to plain 10-fold CV).
cv <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 1
)
