<a href="https://colab.research.google.com/github/1975JHK/1975JHK.github.io/blob/main/Importing_and_preprocessing_the_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Installing the required packages
paks <- c('tidyverse', 'readxl', 'caret', 'see',
            'e1071', 'DT', 'ROSE', 'easypackages')
install.packages(paks)

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [9]:
##################################################
# Performance Comparison among predictive models #
#  0) Rebalancing the dataset with SMOTE         #
#  1) Default Parameters                         #
#  2) Hyper-Parameters Optimization              #
#  3) SVM Bagging                                #
#  4) SVM Bagging + Baseline OOD Detection       #
# Coded on August 20, 2020                       #
# Coded By Robin Kim                             #
##################################################
  
# Loading the required Package
library(easypackages)
libraries(c('tidyverse', 'readxl', 'caret', 'see',
            'e1071', 'DT', 'ROSE'))

# Setting up the environments
Sys.setenv(LANG = 'en')
theme_set(new = theme_minimal())
options(warn=-1)

# Importing the dataset
raw <- read_excel('/content/process_data.xlsx')
str(raw)


# Initial Preprocessing ---------------------------------------------------
## Extracting 2nd Plat Data from Raw Dataset
raw <- raw %>% filter(Var1 == 'P2')

## Feature Selection
df <- raw[ , -c(1:6, 10, 14:15, 38:45, 51:61, 63:64)]
colnames(df)

## Handling the Not Availables
apply(df, 2, function(x) sum(is.na(x)))
df <- na.omit(df)

## Classifying Tissu.Type
df$Var49 <- ifelse(df$Var49 == 'J' & df$Var52 < 3000, 'S', df$Var49)
df <- df[ , -35]

## Removing features with zero variance 
nearZeroVar(df, saveMetrics = T)
df.nvz <- nearZeroVar(df, saveMetrics = F)
df <- df[ , -df.nvz]

df <- df[ , -16]  # consists with zero(0)

## Separating the dates
df <- df %>% separate(Var8, into = c('Year', 'Month', 'Day'), sep = '/')
df <- df %>% filter(!c(Year == '20' & Month %in% c('01', '02', '12')))
addmargins(table(df$Month))

## Encoding on Tissu.Type
df$Var49 <- factor(df$Var49)

## CTQ 정리
df <- df[ , -c(30:32)] # Not the Target

## Target
df$Y3 <- apply(df[ , 28:29], 1, mean)
df <- df[ , -c(28:29)]


## Removing outliers
### CTQs
df <- df %>% filter(Y3 > -2.0, Y3 < -0.1)

### React.Time
df <- df %>% filter(Var11 > 10, Var11 < 50)

### React.Temp
df <- df %>% filter(Var12 > 50, Var12 < 150)

### Line Speed
df <- df %>% filter(Var24 > 3.0)

### Upper and Lower Dryer Temp
df <- df %>% filter(Var25 > 45 & Var25 < 70)
df <- df %>% filter(Var26 > 45 & Var26 < 70)

### Pressure2
df <- df %>% filter(Var28 >= 0 & Var28 < 10)

### Pressure3
df <- df %>% filter(Var29 >= 0 & Var29 < 10)

### Pressure4
df <- df %>% filter(Var30 >= 0 & Var30 < 30)

### Pressure5
df <- df %>% filter(Var31 > 0)

### Pressure6
df <- df %>% filter(Var32 > 5)

### Pressure7
df <- df %>% filter(Var33 > 10)

### Pressure8
df <- df %>% filter(Var34 > 20)

### Pressure9
df <- df %>% filter(Var35 > 10)

### Pressure10
df <- df %>% filter(Var36 > 10)

### Pressure11
df <- df %>% filter(Var37 > 5 & Var37 < 40)

### Blower
df <- df %>% filter(Var18 < 12)

### Viscosity
df <- df %>% filter(Var9 >= 20000 & Var9 <= 50000)


# Feature Engineering -----------------------------------------------------
## Pressure slope on Dryers
df <- df %>% mutate(P1 = Var32 - Var30,
                    P2 = Var34 - Var32,
                    P3 = Var34 - Var37)

## Arranging dataset
colnames(df)
df <- df[ , c(1:28, 30:32, 29)]
colnames(df)



# Secondary Preprocessing -------------------------------------------------
## Encoding : Y3(Target)
df$Y3 <- case_when(df$Y3 < -1.00 ~ 'Bad',
                   df$Y3 < 0.00 ~ 'Good',
                   TRUE ~ 'NA')

df$Y3 <- factor(df$Y3,
                levels = c('Good', 'Bad'),
                labels = c('Good', 'Bad'))

## Encoding : Var49
addmargins(table(df$Var49))
df$Var49 <- as.numeric(df$Var49)
addmargins(table(df$Var49))
table(is.na(df))
df <- na.omit(df)

## Feature Selection
df <- df[ , -c(1:4, 13:14)]
colnames(df)


## Rebalancing the extremely imbalanced dataset
## Rebalance with SMOTE Method
table(df$Y3)
df <- ROSE(Y3 ~ ., data = df, seed = 1975)$data
table(df$Y3)


## Splitting the dataset into train and test set
set.seed(1975)
index <- sample(1:nrow(df), nrow(df)*0.70, replace = F)
train <- df[index, ]
test <- df[-index, ]


## Scaling Features
center = apply(train[ , 1:25], 2, mean)
scale = apply(train[ , 1:25], 2, sd)
train[ , -26] <- scale(train[ , -26], center = center, scale = scale)
test[ , -26] <- scale(test[ , -26], center = center, scale = scale)
head(train, 3)
head(test, 3)
dim(train)
dim(test)

All packages loaded successfully



tibble [864 × 65] (S3: tbl_df/tbl/data.frame)
 $ Var1 : chr [1:864] "P2" "P2" "P2" "P2" ...
 $ Var2 : chr [1:864] "P2" "P2" "P2" "P2" ...
 $ Var3 : chr [1:864] "P21" "P21" "P21" "P21" ...
 $ Var4 : chr [1:864] "P21" "P21" "P21" "P21" ...
 $ Var5 : chr [1:864] "P2D01" "P2D01" "P2D01" "P2D01" ...
 $ Var6 : chr [1:864] "P2D01" "P2D01" "P2D01" "P2D01" ...
 $ Var7 : chr [1:864] "P2D20205001" "P2D20205002" "P2D20205003" "P2D20206001" ...
 $ Var8 : chr [1:864] "20/02/05 수요일" "20/02/05 수요일" "20/02/05 수요일" "20/02/06 목요일" ...
 $ Var9 : num [1:864] 32600 32600 32600 32600 32600 38000 33500 33500 31000 33500 ...
 $ Var10: num [1:864] 118 118 118 118 118 93.5 115 115 129 115 ...
 $ Var11: num [1:864] 25 25 25 25 25 27 26 26 23 26 ...
 $ Var12: num [1:864] 118 118 118 118 118 93.5 115 115 129 115 ...
 $ Var13: num [1:864] 70 80 80 50 50 50 60 60 60 60 ...
 $ Var14: num [1:864] 36 36 36 36 36 36 36 36 36 36 ...
 $ Var15: num [1:864] 1200 1200 1200 1200 1200 1200 1200 1200 1200 1200 ...
 $ Var16: num 

Unnamed: 0_level_0,freqRatio,percentUnique,zeroVar,nzv
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<lgl>
Var7,1.0,100.0,False,False
Var8,1.0,32.4461343,False,False
Var9,1.4,16.6032953,False,False
Var11,1.012658,3.4220532,False,False
Var12,1.346154,14.4486692,False,False
Var13,1.109756,4.0557668,False,False
Var16,3.537879,0.887199,False,False
Var17,1.061947,3.1685678,False,False
Var18,2.119403,2.661597,False,False
Var19,0.0,0.1267427,True,True



 03  04  05  06  07  08  09  10  11 Sum 
 83 128 102  83  72  48  61  63  59 699 


  I   J   S   Y Sum 
 95 400  14   0 509 


  1   2   3 Sum 
 95 400  14 509 


FALSE 
16288 


Good  Bad 
 493   16 


Good  Bad 
 269  240 

Unnamed: 0_level_0,Var9,Var11,Var12,Var13,Var16,Var17,Var18,Var21,Var24,Var25,⋯,Var33,Var34,Var35,Var36,Var37,Var49,P1,P2,P3,Y3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
274,0.8283564,0.05083231,-0.9901318,-0.8816009,-0.07925396,0.3868064,-1.3638514,-0.1426148,-0.1422607,0.9171992,⋯,1.9707398,-0.5664891,-1.2014848,-1.2827122,-1.074927,-0.644566,-1.0912905,0.6448336,0.6753097,Bad
22,1.0536558,0.85172444,-1.5301811,1.2213453,2.14824232,0.1923349,0.6507297,0.3227481,0.1300818,-0.574465,⋯,-0.5219517,-1.2801716,-0.7061533,0.3382749,-0.255653,0.8341163,0.2199544,-0.3757925,-1.2936409,Good
91,2.6699387,0.07885764,0.285324,1.3813036,2.50030384,-0.4832947,-1.4646239,3.5339896,-1.0813539,-1.7120543,⋯,-1.1126105,-1.5943254,-2.6825119,-1.4660361,-1.146756,0.2288065,-1.0616953,-1.1277256,-1.0594085,Good


Unnamed: 0_level_0,Var9,Var11,Var12,Var13,Var16,Var17,Var18,Var21,Var24,Var25,⋯,Var33,Var34,Var35,Var36,Var37,Var49,P1,P2,P3,Y3
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
4,1.5804628,-0.5207176,-0.4079966,1.8104838,1.6330475,1.02394889,0.2189316,1.6895452,-0.7172701,-0.9974547,⋯,-1.0213058,-2.1639982,-1.1588018,-1.6389083,-1.24134291,0.9812878,0.1579262,-0.28793,-0.1620307,Good
6,-0.8435108,0.5281642,-1.5120015,-0.2951873,-0.4051988,-0.63541404,0.1943267,-0.1654659,2.0455381,0.808792,⋯,1.0364368,0.6494916,2.3950233,-1.068887,-1.11150149,0.2693773,0.9553758,-0.7398168,2.5643345,Good
8,-0.7149838,0.5596349,0.8317754,-0.1408831,0.6525811,-0.06797849,-0.5665898,2.2810636,-0.6977936,-0.9179767,⋯,0.6501158,-1.6688305,-0.7940379,-0.4680053,0.02334385,0.3884793,1.1744626,-1.0659069,-0.8267937,Good
