<a href="https://colab.research.google.com/github/1975JHK/1975JHK.github.io/blob/main/code_for_the_second_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
##################################################
# Performance Comparison among predictive models #
#  1) Default Parameters                         #
#  2) Hyper-Parameters Optimization              #
#  3) SVM Bagging                                #
#  4) SVM Bagging + Baseline OOD Detection       #
# Coded on August 6, 2021                       #
# Coded By Robin Kim                             #
##################################################

# Installing the required packages
pkgs <- c('tidyverse', 'caret', 'see', 'e1071', 'DT', 'easypackages')
install.packages(pkgs)

# Loading the required packages
library(easypackages)
libraries(pkgs)

# Setting up the environments
Sys.setenv(LANG = 'en')
theme_set(new = theme_minimal())

# Importing the dataset
raw <- read.csv('/process_data.csv')
raw <- raw[ , -c(66:73)]
str(raw)

# Initial Preprocessing
## Extracting 2nd Plat Data from Raw Dataset
raw <- raw %>% filter(Var1 == 'P2')

## Feature Selection
df <- raw[ , -c(1:6, 10, 14:15, 39:45, 51:61, 63:64)]
colnames(df)

## Handling the Not Availables
apply(df, 2, function(x) sum(is.na(x)))
df <- na.omit(df)

## Classifying Tissu.Type
df$Var49 <- ifelse(df$Var49 == 'J' & df$Var52 < 3000, 'S', df$Var49)
df <- df[ , -36]

## Removing features with zero variance 
nearZeroVar(df, saveMetrics = T)
df.nvz <- nearZeroVar(df, saveMetrics = F)
df <- df[ , -df.nvz]

df <- df[ , -16]  # consists with zero(0)

## Separating the dates
df <- df %>% separate(Var8, into = c('Year', 'Month', 'Day'), sep = '/')
df <- df %>% filter(!c(Year == '20' & Month %in% c('01', '02')))
addmargins(table(df$Month))

## Encoding on Tissu.Type
df$Var49 <- factor(df$Var49)

## CTQ 정리
df <- df[ , -c(31:33)] # Not the Target

## Target
df$Y3 <- apply(df[ , 29:30], 1, mean)
df <- df[ , -c(29:30)]


## Removing outliers
### CTQs
df <- df %>% filter(Y3 > -2.0, Y3 < -0.1)
df <- df %>% filter(Y1 > 0.0160, Y1 < 0.210)

### React.Time
df <- df %>% filter(Var11 > 10, Var11 < 50)

### React.Temp
df <- df %>% filter(Var12 > 50, Var12 < 150)

### Line Speed
df <- df %>% filter(Var24 > 3.0)

### Upper and Lower Dryer Temp
df <- df %>% filter(Var25 > 45 & Var25 < 70)
df <- df %>% filter(Var26 > 45 & Var26 < 70)

### Pressure2
df <- df %>% filter(Var28 >= 0 & Var28 < 10)

### Pressure3
df <- df %>% filter(Var29 >= 0 & Var29 < 10)

### Pressure4
df <- df %>% filter(Var30 >= 0 & Var30 < 30)

### Pressure5
df <- df %>% filter(Var31 > 0)

### Pressure6
df <- df %>% filter(Var32 > 5)

### Pressure7
df <- df %>% filter(Var33 > 10)

### Pressure8
df <- df %>% filter(Var34 > 20)

### Pressure9
df <- df %>% filter(Var35 > 10)

### Pressure10
df <- df %>% filter(Var36 > 10)

### Pressure11
df <- df %>% filter(Var37 > 5 & Var37 < 40)

### Blower
df <- df %>% filter(Var18 < 12)

### Viscosity
df <- df %>% filter(Var9 >= 20000 & Var9 <= 50000)

# Feature Engineering
## Pressure slope on Dryers
df <- df %>% mutate(P1 = Var32 - Var30,
                    P2 = Var34 - Var32,
                    P3 = Var34 - Var37)

## Arranging dataset
colnames(df)
df <- df[ , c(1:27, 29, 31:33, 28, 30)]
colnames(df)


# Secondary Preprocessing
## Encoding : Y3(Target)
df$Y3 <- case_when(df$Y3 < -1.00 ~ 'Bad',
                   df$Y3 < -0.70 ~ 'Normal',
                   df$Y3 < 0.00 ~ 'Good',
                   TRUE ~ 'NA')

df$Y3 <- factor(df$Y3,
                levels = c('Normal', 'Good', 'Bad'),
                labels = c('Normal', 'Good', 'Bad'))

### Encoding : Var49
addmargins(table(df$Var49))
df$Var49 <- as.numeric(df$Var49)
addmargins(table(df$Var49))
table(is.na(df))
df <- na.omit(df)            


## Splitting the dataset into train and test set
set.seed(1975)
index <- sample(1:nrow(df), nrow(df)*0.70, replace = F)
train <- df[index, ]
test <- df[-index, ]
train.origin <- train
test.origin <- test

## Feature Selection
train <- train[ , -c(1:4, 13:14)]
test <- test[ , -c(1:4, 13:14)]
colnames(train)

## Handling with Imbalanced Classes with Replacement Sampling
## Oversampling over minority classes
minor_bad <- train %>% filter(Y3 == 'Bad')
minor_bad <- as.data.frame(lapply(minor_bad, rep, 10))
minor_norm <- train %>% filter(Y3 == 'Normal')
minor_norm <- as.data.frame(lapply(minor_norm, rep, 2))
train <- rbind(train, minor_bad, minor_norm)


minor_bad2 <- test %>% filter(Y3 == 'Bad')
minor_bad2 <- as.data.frame(lapply(minor_bad2, rep, 10))
minor_norm2 <- test %>% filter(Y3 == 'Normal')
minor_norm2 <- as.data.frame(lapply(minor_norm2, rep, 2))
test <- rbind(test, minor_bad2, minor_norm2)


## Scaling Features
center = apply(train[ , 1:25], 2, mean)
scale = apply(train[ , 1:25], 2, sd)
train[ , -c(26:27)] <- scale(train[ , -c(26:27)], center = center, scale = scale)
test[ , -c(26:27)] <- scale(test[ , -c(26:27)], center = center, scale = scale)
head(train, 3)
head(test, 3)
dim(train)
dim(test)

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Loading required package: tidyverse

“running command 'timedatectl' had status 1”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: caret

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift


Loading required package: s

'data.frame':	864 obs. of  65 variables:
 $ Var1 : chr  "P2" "P2" "P2" "P2" ...
 $ Var2 : chr  "P2" "P2" "P2" "P2" ...
 $ Var3 : chr  "P21" "P21" "P21" "P21" ...
 $ Var4 : chr  "P21" "P21" "P21" "P21" ...
 $ Var5 : chr  "P2D01" "P2D01" "P2D01" "P2D01" ...
 $ Var6 : chr  "P2D01" "P2D01" "P2D01" "P2D01" ...
 $ Var7 : chr  "P2D20205001" "P2D20205002" "P2D20205003" "P2D20206001" ...
 $ Var8 : chr  "20/02/05 수요일" "20/02/05 수요일" "20/02/05 수요일" "20/02/06 목요일" ...
 $ Var9 : int  32600 32600 32600 32600 32600 38000 33500 33500 31000 33500 ...
 $ Var10: num  118 118 118 118 118 93.5 115 115 129 115 ...
 $ Var11: int  25 25 25 25 25 27 26 26 23 26 ...
 $ Var12: num  118 118 118 118 118 93.5 115 115 129 115 ...
 $ Var13: int  70 80 80 50 50 50 60 60 60 60 ...
 $ Var14: int  36 36 36 36 36 36 36 36 36 36 ...
 $ Var15: int  1200 1200 1200 1200 1200 1200 1200 1200 1200 1200 ...
 $ Var16: num  107 107 107 107 107 107 107 107 107 107 ...
 $ Var17: num  20.1 19.8 19.8 20 19.5 19.5 19.5 19.5 19.5 19.5 ..

Unnamed: 0_level_0,freqRatio,percentUnique,zeroVar,nzv
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<lgl>
Var7,1.0,100.0,False,False
Var8,1.0,32.4461343,False,False
Var9,1.4,16.6032953,False,False
Var11,1.012658,3.4220532,False,False
Var12,1.346154,14.4486692,False,False
Var13,1.109756,4.0557668,False,False
Var16,3.537879,0.887199,False,False
Var17,1.061947,3.1685678,False,False
Var18,2.119403,2.661597,False,False
Var19,0.0,0.1267427,True,True



 03  04  05  06  07  08  09  10  11  12 Sum 
 83 128 102  83  72  48  61  63  59   9 708 


  I   J   S   Y Sum 
 95 400  14   0 509 


  1   2   3 Sum 
 95 400  14 509 


FALSE 
16797 

Unnamed: 0_level_0,Var9,Var11,Var12,Var13,Var16,Var17,Var18,Var21,Var24,Var25,⋯,Var34,Var35,Var36,Var37,Var49,P1,P2,P3,Y1,Y3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
274,2.02983434,-0.1166984,0.44530318,-0.3111733,-0.5370508,2.0121546,1.1399413,-0.5419886,0.1288659,0.57501977,⋯,0.5771982,1.1448911,1.1835285,1.0708624,0.5459738,-0.399917,2.4488194,-0.4234278,0.0193,Good
22,0.06769797,0.3862249,-0.67647401,-0.3111733,-0.5370508,-0.782802,-2.311496,0.5026317,0.440608,-0.05433433,⋯,0.4439544,0.1695616,0.4582493,1.0708624,0.5459738,0.7431722,-0.7358183,-0.5813623,0.0189,Normal
91,-0.38306309,-0.1166984,0.02864308,-0.107815,-0.5370508,0.7078415,0.7085117,1.0249419,0.1112255,0.09474005,⋯,0.3107107,0.1695616,0.4582493,0.7654659,0.5459738,0.1716276,0.8565006,-0.4234278,0.0192,Normal


Unnamed: 0_level_0,Var9,Var11,Var12,Var13,Var16,Var17,Var18,Var21,Var24,Var25,⋯,Var34,Var35,Var36,Var37,Var49,P1,P2,P3,Y1,Y3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
4,0.22679011,0.3862249,0.188897,0.2989017,0.1973055,-0.03748026,-0.1543477,0.8508385,-0.5501421,-0.3317961,⋯,-0.3555081,-0.610702,-0.4483497,0.6127676,0.5459738,0.55265737,-1.1338981,-1.05516578,0.0196,Good
6,0.06769797,-0.6196218,-0.1316108,-1.1246066,-0.5370508,-0.78280201,-0.1543477,-1.2384021,0.6522919,1.1610804,⋯,-0.4887519,-1.1958997,-1.5362685,-1.2196116,-1.2721974,-2.11455085,-0.9348582,0.68211362,0.0187,Good
8,-0.8603395,-1.1225451,0.7978617,-0.7178899,-0.5370508,-0.78280201,-2.311496,0.154425,0.6245301,0.5212762,⋯,0.1774669,-0.2205702,-0.2670299,0.1546728,-1.2721974,-0.01888725,-1.1338981,0.05037566,0.0188,Good


In [6]:
# Performance Comparison --------------------------------------------------
## 1. Default Parameters
default_model = svm(Y3 ~ ., data = train[ , c(1:25, 27)],
                    type = 'C-classification')

default_pred <- predict(default_model, newdata = test[ , 1:25])

default_result <- data.frame(default_pred, test$Y3) %>% 
  rename(Predicted = default_pred, Actual = test.Y3)

with(default_result, confusionMatrix(Actual, Predicted))

Confusion Matrix and Statistics

          Reference
Prediction Normal Good Bad
    Normal     78   54  12
    Good       35   62   4
    Bad         0    0  44

Overall Statistics
                                          
               Accuracy : 0.6367          
                 95% CI : (0.5783, 0.6922)
    No Information Rate : 0.4014          
    P-Value [Acc > NIR] : 6.271e-16       
                                          
                  Kappa : 0.4263          
                                          
 Mcnemar's Test P-Value : 0.0001653       

Statistics by Class:

                     Class: Normal Class: Good Class: Bad
Sensitivity                 0.6903      0.5345     0.7333
Specificity                 0.6250      0.7746     1.0000
Pos Pred Value              0.5417      0.6139     1.0000
Neg Pred Value              0.7586      0.7128     0.9347
Prevalence                  0.3910      0.4014     0.2076
Detection Rate              0.2699      0.2145     0.1522
Det