# 6. Comparison of Classification Methods 

- 7 Classification methods 
    - LDA and QDA
    - KNN(K=1, K=5, K=20) 
    - Logistic Regression 
    - Naive Bayes method 
    
- 6 Scenarios 
    - Gaussian model with diagonal covariance 
    - Gaussian model with the same covariance(LDA) 
    - t-distribution model 
    - Gaussian model with a different covariance(QDA) 
    - Multiplicative model($X_1 + X_2 + X_1X_2$) 
    - Complicated non-parametric model 

## 6.1 [Ex] Prerequisites of Simulation Study

```R
# Importing libraries 
library(mnormt) 
library(MASS)
library(class)
library(e1071) 

# Calculate the test misclassification rate of 7 classifiers on one
# train/test split.
#
# Arguments:
#   x.tran  - matrix of training predictors (one row per observation)
#   x.test  - matrix of test predictors with the same columns as x.tran
#   y.test  - test labels; they must be coded 0/1 because the logistic
#             regression predictions are thresholded to 0/1 before being
#             compared with them
#   y.train - training labels, coded the same way as y.test
# Value:
#   named numeric vector of length 7 holding the test error rate of LDA,
#   QDA, KNN (k = 1, 5, 20), logistic regression and naive Bayes, in that
#   order.
MissClassRate <- function(x.tran, x.test, y.test, y.train) {
    nt <- nrow(x.tran)
    tr.idx <- seq_len(nt)

    # Compare on character labels so that a prediction factor and y.test
    # with different level sets do not raise an error in `!=`
    y.true <- as.character(y.test)
    err <- function(pred) mean(as.character(pred) != y.true)

    # Model 1 : LDA
    ldafit <- predict(lda(x.tran, y.train), x.test)$class
    # Model 2 : QDA
    qdafit <- predict(qda(x.tran, y.train), x.test)$class
    # Models 3-5 : KNN with k = 1, 5, 20
    knn1 <- knn(x.tran, x.test, y.train, k=1)
    knn5 <- knn(x.tran, x.test, y.train, k=5)
    knn20 <- knn(x.tran, x.test, y.train, k=20)

    # Stack training rows (first nt rows) on top of the test rows so that the
    # formula interfaces see identically named predictor columns.
    # The labels are combined through character vectors: c() applied to two
    # factors returned integer codes before R 4.1.0, which breaks the
    # binomial glm below.
    y.all <- factor(c(as.character(y.train), as.character(y.test)))
    data <- data.frame(x=rbind(x.tran, x.test), y=y.all)

    # Model 6 : Logistic Regression (fitted on the training rows only,
    # predicted class is 1 when the fitted probability is at least 0.5)
    g <- glm(y~., family="binomial", data=data, subset=tr.idx)
    prob <- predict(g, data, type="response")[-tr.idx]
    logit <- as.numeric(prob >= 0.5)

    # Model 7 : Naive Bayes (fitted on the training rows only)
    g2 <- naiveBayes(y~., data=data, subset=tr.idx)
    NB <- predict(g2, data)[-tr.idx]

    c("LDA"=err(ldafit), "QDA"=err(qdafit),
      "KNN-1"=err(knn1), "KNN-5"=err(knn5), "KNN-20"=err(knn20),
      "Logit"=err(logit), "NB"=err(NB))
}
```

## 6.2 [Ex] Scenario 1 : Gaussian model with diagonal covariance

```R
# Seed the RNG once so the whole simulation study is reproducible
set.seed(12345)
K <- 100                              # number of replications
RES1 <- matrix(NA, nrow=K, ncol=7)    # rows: replications, columns: models

# Each replication draws 150 points per class from a bivariate Gaussian with
# identity covariance (class A centred at (0,0), class B at (1,1)); the first
# 50 rows of each class train the models and the remaining 100 test them
for (i in seq_len(K)) {
    x.A <- rmnorm(150, mean=rep(0, 2), varcov=diag(2))
    x.B <- rmnorm(150, mean=rep(1, 2), varcov=diag(2))
    x.tran <- rbind(x.A[seq_len(50), ], x.B[seq_len(50), ])
    x.test <- rbind(x.A[51:150, ], x.B[51:150, ])
    y.tran <- factor(rep(0:1, times=c(50, 50)))
    y.test <- factor(rep(0:1, times=c(100, 100)))
    RES1[i, ] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
boxplot(RES1,
        boxwex=0.5,
        col=2:8,
        names=c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB"),
        main="Scenario 1",
        ylab="Test Error Rates")
``` 

![](Img/CCM1.png)

## 6.3 [Ex] Scenario 2 : Gaussian model with the same covariance(LDA) 

```R
# K replications: both classes share the same covariance matrix
# (unit variances, correlation -0.5) and differ only in their mean
RES2 <- matrix(NA, nrow=K, ncol=7)
for (i in seq_len(K)) {
    x.A <- rmnorm(150, mean=rep(0, 2), varcov=rbind(c(1, -0.5), c(-0.5, 1)))
    x.B <- rmnorm(150, mean=rep(1, 2), varcov=rbind(c(1, -0.5), c(-0.5, 1)))
    # first 50 rows of each class for training, remaining 100 for testing
    x.tran <- rbind(x.A[seq_len(50), ], x.B[seq_len(50), ])
    x.test <- rbind(x.A[51:150, ], x.B[51:150, ])
    y.tran <- factor(rep(0:1, times=c(50, 50)))
    y.test <- factor(rep(0:1, times=c(100, 100)))
    RES2[i, ] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
boxplot(RES2,
        boxwex=0.5,
        col=2:8,
        names=c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB"),
        main="Scenario 2",
        ylab="Test Error Rates")

```

![](Img/CCM2.png)

## 6.4 [Ex] Scenario 3 : t-distribution model 

```R
# K replications: each coordinate is drawn independently from a t
# distribution with 5 degrees of freedom; the classes differ only in the
# non-centrality parameter (0 for class A, 0.5 for class B)
RES3 <- matrix(NA, nrow=K, ncol=7)
for (i in seq_len(K)) {
    x.A <- cbind(rt(150, df=5, ncp=0),
                 rt(150, df=5, ncp=0))
    x.B <- cbind(rt(150, df=5, ncp=0.5),
                 rt(150, df=5, ncp=0.5))
    # first 50 rows of each class for training, remaining 100 for testing
    x.tran <- rbind(x.A[seq_len(50), ], x.B[seq_len(50), ])
    x.test <- rbind(x.A[51:150, ], x.B[51:150, ])
    y.tran <- factor(rep(0:1, times=c(50, 50)))
    y.test <- factor(rep(0:1, times=c(100, 100)))
    RES3[i, ] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
boxplot(RES3,
        boxwex=0.5,
        col=2:8,
        names=c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB"),
        main="Scenario 3",
        ylab="Test Error Rates")
```

![](Img/CCM3.png)

## 6.5 [Ex] Scenario 4 : Gaussian model with a different covariance(QDA) 

```R
# K replications: the two Gaussian classes have different covariance
# matrices (correlation +0.5 for class A, -0.5 for class B)
RES4 <- matrix(NA, nrow=K, ncol=7)
for (i in seq_len(K)) {
    x.A <- rmnorm(150, mean=rep(0, 2), varcov=rbind(c(1, 0.5), c(0.5, 1)))
    x.B <- rmnorm(150, mean=rep(1, 2), varcov=rbind(c(1, -0.5), c(-0.5, 1)))
    # first 50 rows of each class for training, remaining 100 for testing
    x.tran <- rbind(x.A[seq_len(50), ], x.B[seq_len(50), ])
    x.test <- rbind(x.A[51:150, ], x.B[51:150, ])
    y.tran <- factor(rep(0:1, times=c(50, 50)))
    y.test <- factor(rep(0:1, times=c(100, 100)))
    RES4[i, ] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
boxplot(RES4,
        boxwex=0.5,
        col=2:8,
        names=c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB"),
        main="Scenario 4",
        ylab="Test Error Rates")
```

![](Img/CCM4.png)

## 6.5 [Ex] Scenario 5 : Multiplicative model($X_1 + X_2 + X_1 X_2$)

```R
# K replications: predictors are Gaussian as in Scenario 1, but the class
# label is 1 exactly when -0.5*X1 + 0.5*X2 + X1*X2 is positive
RES5 <- matrix(NA, nrow=K, ncol=7)
for (i in seq_len(K)) {
    x.A <- rmnorm(150, mean=rep(0, 2), varcov=diag(2))
    x.B <- rmnorm(150, mean=rep(1, 2), varcov=diag(2))
    # first 50 rows of each sample for training, remaining 100 for testing
    x.tran <- rbind(x.A[seq_len(50), ], x.B[seq_len(50), ])
    x.test <- rbind(x.A[51:150, ], x.B[51:150, ])

    # interaction term and linear predictor, train then test
    tr.int <- x.tran[, 1] * x.tran[, 2]
    te.int <- x.test[, 1] * x.test[, 2]
    xb.tr <- cbind(x.tran, tr.int) %*% c(-0.5, 0.5, 1)
    xb.te <- cbind(x.test, te.int) %*% c(-0.5, 0.5, 1)

    # label is 1 where the linear predictor is positive, 0 otherwise
    y.tran <- factor(as.numeric(xb.tr > 0))
    y.test <- factor(as.numeric(xb.te > 0))
    RES5[i, ] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
boxplot(RES5,
        boxwex=0.5,
        col=2:8,
        names=c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB"),
        main="Scenario 5",
        ylab="Test Error Rates")
```

![](Img/CCM5.png)

## 6.6 [Ex] Scenario 6 : Complicated non-parametric model 

```R
# K replications: predictors are Gaussian as in Scenario 1, but the class
# label is 1 exactly when -0.5*X1 + 0.5*X2 + exp(X1)/log(|X2|) is positive
RES6 <- matrix(NA, K, 7)
for (i in seq_len(K)) {
    x.A <- rmnorm(150, rep(0, 2), diag(2))
    x.B <- rmnorm(150, rep(1, 2), diag(2))
    # first 50 rows of each sample for training, remaining 100 for testing
    x.tran <- rbind(x.A[1:50, ], x.B[1:50, ])
    x.test <- rbind(x.A[-c(1:50), ], x.B[-c(1:50), ])
    # non-linear term; it becomes very large in magnitude when |X2| is
    # close to 1 because log(|X2|) is then close to 0
    tr.int <- exp(x.tran[,1])/log(abs(x.tran[,2]))
    te.int <- exp(x.test[,1])/log(abs(x.test[,2]))
    xb.tr <- cbind(x.tran,tr.int)%*%c(-0.5,0.5,1)
    xb.te <- cbind(x.test,te.int)%*%c(-0.5,0.5,1)
    # label is 1 where the predictor is positive, 0 otherwise
    y.tran <- rep(0, 100); y.test <- rep(0, 200)
    y.tran[xb.tr > 0] <- 1; y.tran <- factor(y.tran)
    y.test[xb.te > 0] <- 1; y.test <- factor(y.test)
    RES6[i,] <- MissClassRate(x.tran, x.test, y.test, y.tran)
}

# Boxplot of the test misclassification rates of the 7 models
# (the title previously read "Scenario 5", mislabelling this figure)
boxplot(RES6, boxwex=0.5, col=2:8,names=c("LDA", "QDA", "KNN-1",
"KNN-5", "KNN-20", "Logit", "NB"), main="Scenario 6",
ylab="Test Error Rates")

```

![](Img/CCM6.png)

## 6.7 [Ex] Conclusions 

```R
# Draw the six scenario boxplots on one 2 x 3 grid, all with the same
# y-axis range so the panels can be compared directly
par(mfrow=c(2,3))
model.names <- c("LDA", "QDA", "KNN-1", "KNN-5", "KNN-20", "Logit", "NB")
results <- list(RES1, RES2, RES3, RES4, RES5, RES6)

# Panel s shows the error rates of scenario s
for (s in seq_along(results)) {
    boxplot(results[[s]], boxwex=0.5, col=2:8, ylim=c(0,0.6),
            names=model.names,
            main=paste("Scenario", s), ylab="Test Error Rates")
}
```

![](Img/CCM7.png)
- Each scenario reflects a different data-generating situation. 
- We can see that the misclassification rates of the 7 models differ across the scenarios.

#  7. [Ex] Stock Market Data 

## 7.1 [Ex] Stock Market Data
```R
# Import library and dataset
library(ISLR)
names(Smarket)
str(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
# Correlations among all columns except the Direction factor
cor(Smarket[, names(Smarket) != "Direction"])

# NOTE: columns are referenced explicitly as Smarket$... instead of calling
# attach(Smarket); attach() puts the columns on the search path, where they
# can be silently masked by (or mask) other objects of the same name.

# Explore features: one panel per column for the first 8 columns,
# points coloured by Direction
par(mfrow=c(2,4))
for (i in seq_len(8)) {
    plot(Smarket[, i], pch=20, main=colnames(Smarket)[i],
         col=as.numeric(Smarket$Direction) + 1)
}
table(Year=Smarket$Year)

# Train on the years before 2005, test on the remaining observations
train <- (Smarket$Year < 2005)
y.test <- Smarket$Direction[!train]
# Drop columns 1 and 8 from the modelling data
# (presumably Year and Today in ISLR's Smarket; confirm with names(Smarket))
Sdata <- Smarket[, -c(1, 8)]
```

## 7.2 [Ex] Logistic Regression

```R
## Logistic Regression
# Fit on the training rows only, using every remaining column as predictor
g1 <- glm(Direction ~ ., family = "binomial", data = Sdata, subset = train)

# Fitted probabilities for the test rows
p1 <- predict(g1, newdata = Sdata[!train, ], type = "response")

# Predict "Up" when the fitted probability exceeds 0.5, "Down" otherwise
pred1 <- rep("Down", length(y.test))
pred1[which(p1 > 0.5)] <- "Up"

# Test misclassification rate
mean(pred1 != y.test)
```

- Misclassification Error Rate : 0.5198413 

## 7.3 [Ex] LDA

```R
## LDA
library(MASS)
# Fit on the training rows only
g2 <- lda(Direction ~ ., data = Sdata, subset = train)
# Predicted classes for the test rows
pred2 <- predict(g2, newdata = Sdata[!train, ])[["class"]]
# Test misclassification rate
mean(pred2 != y.test)
```

- Misclassification Error Rate : 0.5198413

## 7.4 [Ex] QDA 

```R
## QDA
# Fit on the training rows only
g3 <- qda(Direction ~ ., data = Sdata, subset = train)
# Predicted classes for the test rows
pred3 <- predict(g3, newdata = Sdata[!train, ])[["class"]]
# Test misclassification rate
mean(pred3 != y.test)
```

- Misclassification Error Rate : 0.555556 

## 7.5 [Ex] Naive Bayes 

```R
## Naive Bayes
library(e1071)
# Fit on the training rows only
g4 <- naiveBayes(Direction ~ ., data = Sdata, subset = train)
# Predicted classes for the test rows
pred4 <- predict(g4, newdata = Sdata[!train, ])
# Test misclassification rate
mean(pred4 != y.test)
```

- Misclassification Error Rate : 0.5436508

## 7.6 [Ex] KNN

```R
## KNN
library(class)
# Predictors are every column of Sdata except the response; selecting by
# name avoids depending on Direction sitting in a particular position
is.pred <- names(Sdata) != "Direction"
x.train <- Sdata[train, is.pred]
x.test <- Sdata[!train, is.pred]
y.train <- Sdata$Direction[train]

# Grid Search for K
# The grid is defined once and the result vector is preallocated instead of
# being grown from NULL inside the loop
k.grid <- 1:200
CER <- numeric(length(k.grid))
for (k in k.grid) {
    g5 <- knn(x.train, x.test, y.train, k=k)
    # test misclassification rate for this k
    CER[k] <- mean(g5!=y.test)
}
summary(CER)

# Visualize result
plot(k.grid, CER, type="b", xlab="k", ylab="Error", col=2, pch=20)
```

![](Img/ScenarioKNN.png)