## Load Packages

In [None]:
library(tidyr)
library(data.table)
library(foreach)
library(lattice)
library(ggplot2)
library(glmnet)    # lasso and ridge
library(class)     # knn
library(scales)    # number formatting
library(caret)     # confusion matrix
library(MASS)      # lda
library(e1071)     # svm
library(parallelSVM)
library(SparseM)
library(LiblineaR)
library(nnet)      # neural net

## User-Defined Functions

In [2]:
# Report the accuracy and macro-F1 score of a set of class predictions.
#
# Args:
#   actual:    true class labels (vector, factor, or one-column matrix /
#              data frame); only the first column is used.
#   predicted: predicted class labels in the same row order as `actual`;
#              only the first column is used.
#   lvls:      character vector of the class labels to score. Defaults to
#              the 23 categories "1".."23" used throughout this notebook.
#
# Returns:
#   A single string of the form 'Accuracy:  <pct>     Macro-F1:  <pct>'.
metrics <- function(actual, predicted, lvls = as.character(1:23)) {
    actual_col <- as.data.frame(actual)[[1L]]
    predicted_col <- as.data.frame(predicted)[[1L]]

    # Give both factors the same level set. Deriving the reference levels
    # from the data makes confusionMatrix() fail whenever a class is absent
    # from the labels being scored.
    conf_m <- confusionMatrix(data = factor(as.character(predicted_col), levels = lvls),
                              reference = factor(as.character(actual_col), levels = lvls))

    # metrics
    tot_acc <- percent(conf_m$overall['Accuracy'])
    # byClass is a matrix for 3+ classes and a named vector for 2 classes
    by_class <- conf_m$byClass
    f1 <- if (is.matrix(by_class)) by_class[, 'F1'] else by_class['F1']
    # An undefined F1 (e.g. a class that was never predicted) counts as 0 so
    # that one such class does not turn the whole macro average into NA.
    f1[is.na(f1)] <- 0
    macro_f1 <- percent(mean(f1))

    # view metrics
    paste('Accuracy: ',tot_acc,'   ', 'Macro-F1: ', macro_f1)
}

## Load the Feature Vectors

#### Read the Feature Vector CSVs into Data.Tables

In [3]:
# Read both feature-vector files; column 1 of each CSV is the saved row
# index and is skipped while reading.
train <- fread(
    "cleaned_training_vectors.csv",
    header = TRUE,
    drop = c(1)
)
test <- fread(
    "cleaned_testing_vectors.csv",
    header = TRUE,
    stringsAsFactors = FALSE,
    drop = c(1)
)

## Define the Training and Validation Sets

In [4]:
# Put the rows of both tables into a random order. The seed is fixed so the
# same permutation is produced on every run; train is permuted first, then
# test, drawing from the same random stream.
set.seed(42)
train_order <- sample.int(nrow(train))
train <- train[train_order, ]
test_order <- sample.int(nrow(test))
test <- test[test_order, ]

In [5]:
# Split the shuffled training table into a training part (80% of the rows)
# and a validation part (the remaining rows).
set.seed(42)
n_obs <- nrow(train)
split_inds <- sample(n_obs, n_obs * (0.8))

# Response: the Category column, stored as a one-column matrix
category_labels <- train$Category
y_train_split <- as.matrix(category_labels[split_inds])
y_val_split <- as.matrix(category_labels[-split_inds])

# Predictors: every column except the first
# (presumably column 1 is Category - confirm against the CSV layout)
x_train_split <- as.matrix(train[split_inds, -c(1)])
x_val_split <- as.matrix(train[-split_inds, -c(1)])

## Modelling - Naive bayes
Much faster using H2O (v3.ipynb) or scikit-learn in Python

In [188]:
# nB_fit <- naiveBayes(Category~., data = train)

In [None]:
# Make predictions
# nb_preds <- predict(nb_fit, train[-split_inds, -c(1)])

## Modelling - Lasso

In [19]:
# Pick the lasso penalty (lambda) for a multinomial model by k-fold
# cross-validation on the training split, and time the search.
NFOLDS <- 5    # k-fold cross-validation
ALPHA <- 1     # elastic-net mixing parameter; 1 selects the L1 (lasso) penalty
t1 <- Sys.time()
# cv.glmnet assigns observations to folds at random; seed it like the other
# random steps in this notebook so the selected lambda is reproducible.
set.seed(42)
glmnet_cv <- cv.glmnet(x = Matrix(x_train_split, sparse=TRUE),
                       # class labels go in as a factor rather than being
                       # wrapped in a one-column sparse matrix
                       y = factor(as.vector(y_train_split)),
                       family = 'multinomial',
                       # L1 penalty
                       alpha = ALPHA,
                       # type of error to use
                       type.measure = 'class',
                       # cross-validation
                       nfolds = NFOLDS,
                       # high value is less accurate, but has faster training
#                        thresh = 1e-3,
                       # again lower number of iterations for faster training
#                        maxit = 1e3,
                       # since the feature vectors were already standardized in the preprocessing
                       standardize = FALSE
                       # apply group penalty (multinomial family type only)
#                        type.multinomial='grouped'
                       )
print(difftime(Sys.time(), t1, units = 'min'))

Time difference of 30.01597 mins


In [21]:
# Label the validation rows with the cross-validated model, evaluated at the
# lambda stored in glmnet_cv$lambda.min, and report how long prediction took.
t1 <- Sys.time()
glmnet_preds <- predict(
    glmnet_cv,
    newx = x_val_split,
    s = glmnet_cv$lambda.min,
    type = "class"
)
print(difftime(Sys.time(), t1, units = 'min'))

In [22]:
# Accuracy and macro-F1 of the lasso predictions on the validation split.
# Uses the metrics() helper defined under "User-Defined Functions" instead of
# repeating its body inline.
metrics(y_val_split, glmnet_preds)

## Modelling - LDA

In [23]:
# Fit a linear discriminant analysis of Category on the training rows and
# report the elapsed time.
# NOTE(review): the data handed to lda() has column 1 removed. If column 1 is
# Category (as the feature-matrix construction suggests), the formula cannot
# find its response here - confirm the column layout.
t1 <- Sys.time()
lda_fit <- lda(
    Category ~ .,
    data = train[split_inds, -c(1)]
)
print(difftime(Sys.time(), t1, units = 'min'))

Time difference of 18.92945 mins


In [24]:
# Run the fitted LDA model over the validation rows (column 1 removed, as in
# the fit).
lda_val_data <- train[-split_inds, -c(1)]
lda_preds <- predict(object = lda_fit, newdata = lda_val_data)

In [25]:
# Accuracy and macro-F1 of the LDA predictions on the validation split.
# predict() on an lda fit returns a list; the predicted labels are its `class`
# component, so pass that explicitly rather than relying on it landing in the
# second column of a data frame built from the whole list.
metrics(train$Category[-split_inds], lda_preds$class)

In [11]:
# v-fold cross-validation of a linear discriminant analysis.
#
# Rows of `data` are randomly assigned to v folds. For each fold an LDA model
# is fitted on the remaining rows and used to predict the held-out rows.
#
# Args:
#   v:       number of folds; must be a single number between 2 and
#            nrow(data).
#   formula: model formula passed to MASS::lda().
#   data:    data frame (or data.table) containing the formula's variables.
#   cl:      true class labels. Kept for backward compatibility; not used.
#
# Returns:
#   A list of length v. Element i is the predict() result for the rows held
#   out in fold i (in their original row order). The fold number of every row
#   of `data` is attached as attribute "folds", so the predictions can be
#   matched back to the true labels, e.g. data$y[attr(res, "folds") == i].
cv.lda <- function(v, formula, data, cl) {
    n <- nrow(data)
    stopifnot(is.numeric(v), length(v) == 1L, v >= 2, v <= n)

    # fold labels 1..v, shuffled across the rows
    grps <- cut(seq_len(n), v, labels = FALSE)[sample.int(n)]

    pred <- lapply(seq_len(v), function(i) {
        omit <- which(grps == i)
        z <- MASS::lda(formula, data = data[-omit, ])
        predict(z, data[omit, ])
    })

    attr(pred, "folds") <- grps
    pred
}
# https://www.stat.berkeley.edu/~s133/Class2a.html

In [None]:
# Two-fold cross-validated LDA over the training rows (all columns kept).
cv_rows <- train[split_inds]
lda_preds <- cv.lda(2, Category ~ ., cv_rows, cv_rows$Category)

In [None]:
# Predict the validation rows with the single fitted LDA model. This replaces
# whatever lda_preds currently holds.
lda_preds <- predict(
    lda_fit,
    newdata = train[-split_inds, -c(1)]
)

In [None]:
# Accuracy and macro-F1 of the LDA predictions on the validation split.
# The predicted labels are the `class` component of the list returned by
# predict() on an lda fit; scoring is delegated to the metrics() helper
# defined under "User-Defined Functions".
metrics(train$Category[-split_inds], lda_preds$class)

## Modelling - LinSVM

In [17]:
# Grid search over LiblineaR model types and cost values. With `cross` set,
# the call's result is used directly as a cross-validated accuracy, and the
# first combination with the strictly highest value is kept.
# NOTE(review): LiblineaR documents type 0 as L2-regularised logistic
# regression rather than an SVM - confirm this is the intended model.
t1 <- Sys.time()

# search settings
k <- 10 # K-fold cross validation
tryTypes <- 0                                      # alternative: c(0,2)
tryCosts <- c(.01, .1, 1, 10, 100, 1000, 2000)     # alternative: 1

# best combination seen so far
bestType <- NA
bestCost <- NA
bestAcc <- 0

for (model_type in tryTypes) {
    # log when this model type started and which type it is
    print(format(Sys.time(), "%X"))
    print(model_type)

    for (cost_value in tryCosts) {
        cv_acc <- LiblineaR(
            data = x_train_split,
            target = y_train_split,
            type = model_type,
            cost = cost_value,
            cross = k
        )
        cat("Results for C=", cost_value, " : ", cv_acc, " accuracy\n", sep = "")

        # only a strict improvement replaces the current best
        improved <- cv_acc > bestAcc
        if (improved) {
            bestType <- model_type
            bestCost <- cost_value
            bestAcc <- cv_acc
        }
    }
}

cat("Best model type is:", bestType, "\n")
cat("Best cost is:", bestCost, "\n")
cat("Best accuracy is:", bestAcc, "\n")

print(difftime(Sys.time(), t1, units = 'min'))

[1] "2:20:24 PM"
[1] 0
Results for C=0.01 : 0.7060068 accuracy
Results for C=0.1 : 0.7502938 accuracy
Results for C=1 : 0.7700578 accuracy
Results for C=10 : 0.7665092 accuracy
Results for C=100 : 0.7519976 accuracy
Results for C=1000 : 0.7447241 accuracy
Results for C=2000 : 0.7420333 accuracy
Best model type is: 0 
Best cost is: 1 
Best accuracy is: 0.7700578 
Time difference of 32.80737 mins


In [18]:
# Refit on the whole training split using the model type and cost that won
# the cross-validated search.
best <- LiblineaR(
    data = x_train_split,
    target = y_train_split,
    type = bestType,
    cost = bestCost,
    bias = 1,
    verbose = FALSE
)

In [19]:
# Predict the validation rows with the refitted model, requesting decision
# values but not class probabilities.
svm_preds <- predict(
    best,
    newx = x_val_split,
    proba = FALSE,
    decisionValues = TRUE
)

In [20]:
# Accuracy and macro-F1 of the LiblineaR predictions on the validation split.
# predict() on a LiblineaR model returns a list; the predicted labels are its
# `predictions` component, so pass that explicitly to the metrics() helper
# defined under "User-Defined Functions".
metrics(train$Category[-split_inds], svm_preds$predictions)

## Modelling - nnet

In [None]:
# Fit a multinomial model of Category with nnet::multinom on the training
# rows and report the elapsed time.
# NOTE(review): the data handed to multinom() has column 1 removed. If column
# 1 is Category, the formula cannot find its response here - confirm the
# column layout.
t1 <- Sys.time()
nnet_fit <- nnet::multinom(
    Category ~ .,
    data = train[split_inds, -c(1)],
    MaxNWts = 12000,
    maxit = 100
)
print(difftime(Sys.time(), t1, units = 'min'))

In [None]:
# Predict a class label for every validation row (column 1 removed, as in
# the fit).
nnet_preds <- predict(
    nnet_fit,
    newdata = train[-split_inds, -c(1)],
    type = "class"
)

In [None]:
# Accuracy and macro-F1 of the multinom predictions on the validation split.
# Uses the metrics() helper defined under "User-Defined Functions" instead of
# repeating its body inline.
metrics(train$Category[-split_inds], nnet_preds)

## Modelling - K-Nearest Neighbours

In [193]:
# Classify the validation rows by k-nearest neighbours (class::knn) and
# report the elapsed time.
t1 <- Sys.time()
# number of neighbours: half the square root of the training-set size
n_clust <- round(sqrt(nrow(x_train_split))/2)
set.seed(42)
# Predict on the validation split. The notebook only defines x_val_split /
# y_val_split; the x_test_split name used here before is never created.
knn_preds <- knn(train = x_train_split, test = x_val_split,
                 cl = as.vector(y_train_split), k = n_clust)
print(difftime(Sys.time(), t1, units = 'min'))

Time difference of 371.6459 mins


In [194]:
# Accuracy and macro-F1 of the k-nearest-neighbour predictions.
# Scores against y_val_split, the validation labels the notebook defines
# (y_test_split is never created), via the metrics() helper defined under
# "User-Defined Functions".
metrics(y_val_split, knn_preds)