<a href="https://colab.research.google.com/github/1975JHK/1975JHK.github.io/blob/main/code_for_the_second_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##################################################
# Performance Comparison among predictive models #
#  1) Default Parameters                         #
#  2) Hyper-Parameters Optimization              #
#  3) SVM Bagging                                #
#  4) SVM Bagging + Baseline OOD Detection       #
# Coded on August 6, 2021                       #
# Coded By Robin Kim                             #
##################################################

# Installing the required packages
# Only the packages that cannot be found are installed, so re-running the
# script does not download and reinstall everything on each execution.
pkgs <- c('tidyverse', 'readxl', 'caret', 'see', 'e1071', 'DT', 'easypackages')
missing_pkgs <- pkgs[
  !vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Loading the required packages
# easypackages is attached first because it provides libraries(), which is
# then called with the names of the remaining packages.
library(easypackages)
libraries(c('tidyverse', 'readxl', 'caret', 'see',
            'e1071', 'DT'))

# Setting up the environments
# LANG is set to 'en' and theme_minimal() becomes the active plot theme.
Sys.setenv(LANG = 'en')
theme_set(new = theme_minimal())

# Importing the dataset
## Read the CSV file, discard columns 66 through 73, then print the structure
raw <- read.csv('/process_data.csv')
raw <- raw[ , -(66:73)]
str(raw)

# Initial Preprocessing
## Keep only the rows whose Var1 equals 'P2'
## (presumably the second plant -- confirm against the data dictionary)
raw <- filter(raw, Var1 == 'P2')

## Feature Selection
## The working data frame `df` is `raw` without the column positions below
df <- raw[ , -c(1:6,
                10,
                14:15,
                39:45,
                51:61,
                63:64)]
colnames(df)

## Handling the Not Availables
### Show the number of missing values in every column, then drop each row
### that contains at least one missing value
apply(df, MARGIN = 2, FUN = function(column) sum(is.na(column)))
df <- na.omit(df)

## Classifying Tissu.Type
### Rows with Var49 equal to 'J' and Var52 below 3000 are relabelled 'S';
### every other row keeps its current Var49 value
df$Var49 <- with(df, ifelse(Var49 == 'J' & Var52 < 3000, 'S', Var49))
### Column 36 is removed afterwards
### (presumably Var52, which is not needed any more -- confirm)
df <- df[ , -36]

## Removing features with zero (or near-zero) variance
### Print the per-column metrics first, then fetch the positions of the
### columns that caret::nearZeroVar() flags.
nearZeroVar(df, saveMetrics = TRUE)
df.nvz <- nearZeroVar(df, saveMetrics = FALSE)
### Guard against an empty result: df[ , -integer(0)] selects *no* columns,
### so dropping "nothing" would silently discard the whole data frame.
if (length(df.nvz) > 0) {
  df <- df[ , -df.nvz]
}

### Column 16 is dropped by position; the original note says it consists
### of zeros only. The index depends on which columns were removed above.
df <- df[ , -16]  # consists with zero(0)

## Separating the dates
### Split Var8 on '/' into the three columns Year, Month and Day
df <- separate(df, Var8, into = c('Year', 'Month', 'Day'), sep = '/')
### Discard the rows that belong to months '01' and '02' of year '20'
df <- filter(df, Year != '20' | !(Month %in% c('01', '02')))
addmargins(table(df$Month))

## Encoding on Tissu.Type
### Var49 becomes a factor
df$Var49 <- factor(x = df$Var49)

## Tidying up the CTQ columns
df <- df[ , -(31:33)] # Not the Target

## Target
### Y3 is the row-wise mean of columns 29 and 30; once it has been computed
### those two source columns are removed
df$Y3 <- apply(df[ , 29:30], MARGIN = 1, FUN = mean)
df <- df[ , -(29:30)]


## Removing outliers
### All range checks are applied in a single filter() call: a row is kept
### only when every condition below holds for it.
df <- df %>% filter(
  ### CTQs
  Y3 > -2.0, Y3 < -0.1,
  Y1 > 0.0160, Y1 < 0.210,

  ### React.Time
  Var11 > 10, Var11 < 50,

  ### React.Temp
  Var12 > 50, Var12 < 150,

  ### Line Speed
  Var24 > 3.0,

  ### Upper and Lower Dryer Temp
  Var25 > 45, Var25 < 70,
  Var26 > 45, Var26 < 70,

  ### Pressure2
  Var28 >= 0, Var28 < 10,

  ### Pressure3
  Var29 >= 0, Var29 < 10,

  ### Pressure4
  Var30 >= 0, Var30 < 30,

  ### Pressure5
  Var31 > 0,

  ### Pressure6
  Var32 > 5,

  ### Pressure7
  Var33 > 10,

  ### Pressure8
  Var34 > 20,

  ### Pressure9
  Var35 > 10,

  ### Pressure10
  Var36 > 10,

  ### Pressure11
  Var37 > 5, Var37 < 40,

  ### Blower
  Var18 < 12,

  ### Viscosity
  Var9 >= 20000, Var9 <= 50000
)

# Feature Engineering
## Pressure slope on Dryers
### Three difference features are appended as new columns:
###   P1 = Pressure6 - Pressure4, P2 = Pressure8 - Pressure6,
###   P3 = Pressure8 - Pressure11
df$P1 <- df$Var32 - df$Var30
df$P2 <- df$Var34 - df$Var32
df$P3 <- df$Var34 - df$Var37

## Arranging dataset
### Reorder by position so that the former columns 28 and 30 come last;
### the column names are printed before and after for comparison
colnames(df)
df <- df[ , c(1:27,
              29,
              31:33,
              28,
              30)]
colnames(df)


# Secondary Preprocessing
## Encoding : Y3(Target)
### The numeric target is binned into three classes:
###   below -1.00 -> 'Bad', below -0.70 -> 'Normal', below 0.00 -> 'Good'.
### Anything else receives the string 'NA'; it is not among the factor
### levels, so it turns into a missing value and is dropped by na.omit() below.
df$Y3 <- factor(
  case_when(
    df$Y3 < -1.00 ~ 'Bad',
    df$Y3 < -0.70 ~ 'Normal',
    df$Y3 < 0.00 ~ 'Good',
    TRUE ~ 'NA'
  ),
  levels = c('Normal', 'Good', 'Bad')
)

### Encoding : Var49
### Var49 is converted with as.numeric(); the class counts are printed
### before and after the conversion for comparison
addmargins(table(df$Var49))
df$Var49 <- as.numeric(df$Var49)
addmargins(table(df$Var49))

### Count the missing cells, then drop every row that still contains one
table(is.na(df))
df <- na.omit(df)


## Splitting the dataset into train and test set
### A fixed seed keeps the split reproducible; 70% of the row positions are
### drawn without replacement for training, the remaining rows form the test set
set.seed(1975)
index <- sample(seq_len(nrow(df)), size = nrow(df) * 0.70, replace = FALSE)
train <- df[index, ]
test <- df[-index, ]

### Untouched copies of both sets, taken before any column is removed
train.origin <- train
test.origin <- test

## Feature Selection
### The same column positions are removed from both sets
train <- train[ , -c(1:4, 13:14)]
test <- test[ , -c(1:4, 13:14)]
colnames(train)

## Handling with Imbalanced Classes with Replacement Sampling
## Oversampling over minority classes
### Training set: 10 extra copies of every 'Bad' row and 2 extra copies of
### every 'Normal' row are appended to the original rows
minor_bad <- filter(train, Y3 == 'Bad')
minor_bad <- as.data.frame(lapply(minor_bad, rep, times = 10))
minor_norm <- filter(train, Y3 == 'Normal')
minor_norm <- as.data.frame(lapply(minor_norm, rep, times = 2))
train <- rbind(train, minor_bad, minor_norm)


### Test set: the same replication factors are applied
### NOTE(review): replicating test rows changes the class proportions the
### models are evaluated on -- confirm this is intended
minor_bad2 <- filter(test, Y3 == 'Bad')
minor_bad2 <- as.data.frame(lapply(minor_bad2, rep, times = 10))
minor_norm2 <- filter(test, Y3 == 'Normal')
minor_norm2 <- as.data.frame(lapply(minor_norm2, rep, times = 2))
test <- rbind(test, minor_bad2, minor_norm2)


## Scaling Features
### Column means and standard deviations are estimated on the training set
### only (columns 1 to 25) and reused for the test set, so both sets are
### standardised with the same parameters. Columns 26 and 27 stay unscaled.
### The variable `scale` shares its name with the scale() function; R still
### resolves the call below to the function.
center <- apply(train[ , 1:25], MARGIN = 2, FUN = mean)
scale <- apply(train[ , 1:25], MARGIN = 2, FUN = sd)
train[ , -(26:27)] <- scale(train[ , -(26:27)], center = center, scale = scale)
test[ , -(26:27)] <- scale(test[ , -(26:27)], center = center, scale = scale)

### Quick look at the first rows and the dimensions of both sets
head(train, 3)
head(test, 3)
dim(train)
dim(test)
