# Applied Data Analytics: Assessment 2
Alexis Illig  
Student No: 29342872  

Date: 26/11/2018  
Environment: R v3.5.1

## Load Packages

In [65]:
library(tidyr)
library(data.table)
library(lattice)
library(ggplot2)
library(scales)    # number formatting
library(caret)     # confusion matrix
library(LiblineaR) # svm

## User-defined Functions

In [66]:
# Summarise classifier performance as overall accuracy and macro-averaged F1.
#
# Args:
#   actual:       true class labels (vector, factor, one-column matrix or
#                 one-column data frame).
#   predicted:    predicted class labels in the same order as `actual`, or the
#                 list returned by predict() on a LiblineaR model, in which
#                 case its `predictions` element is used.
#   class_levels: labels of every class, in reporting order. Defaults to
#                 '1'..'23'. Three or more classes are expected, because the
#                 per-class statistics are read as a matrix.
#
# Returns:
#   A single string of the form 'Accuracy:  <pct>     Macro-F1:  <pct>'.
metrics <- function(actual, predicted, class_levels = as.character(1:23)){
    # accept the raw predict() output as well as a plain vector of labels
    if (is.list(predicted) && !is.data.frame(predicted) &&
        !is.null(predicted[['predictions']])) {
        predicted <- predicted[['predictions']]
    }

    # flatten any supported container to a plain character vector of labels
    as_labels <- function(x) {
        if (is.data.frame(x)) x <- x[[1]]
        as.character(as.vector(x))
    }
    actual <- as_labels(actual)
    predicted <- as_labels(predicted)
    stopifnot(length(actual) == length(predicted),
              all(actual %in% class_levels))

    # Both factors share one level set, so the confusion matrix is square even
    # when some class is missing from the actual or the predicted labels.
    conf_m <- confusionMatrix(data = factor(predicted, levels = class_levels),
                              reference = factor(actual, levels = class_levels))

    # metrics
    tot_acc <- percent(conf_m$overall['Accuracy'])
    f1 <- conf_m$byClass[, 'F1']
    # An undefined F1 (class never predicted, or never present) counts as 0;
    # classes that occur in neither vector are left out of the average.
    f1[is.na(f1)] <- 0
    seen <- class_levels %in% c(actual, predicted)
    macro_f1 <- percent(mean(f1[seen]))

    # view metrics
    paste('Accuracy: ',tot_acc,'   ', 'Macro-F1: ', macro_f1)
}

## Load the Feature Vectors

In [67]:
# Load the training and testing feature vectors as data.tables.
# The first row of each CSV supplies the column names and the first column of
# each file is discarded on read.
train <- fread("cleaned_training_vectors.csv",
               header = TRUE,
               drop = c(1))
test <- fread("cleaned_testing_vectors.csv",
              header = TRUE,
              drop = c(1),
              stringsAsFactors = FALSE)

## Define the Training and Validation Sets

In [68]:
# Randomly permute the rows of both tables; the fixed seed makes the
# permutation reproducible between runs.
set.seed(42)
train <- train[sample.int(nrow(train)), ]
test <- test[sample.int(nrow(test)), ]

In [69]:
# Draw, without replacement, the row indices for the training partition
# (80% of the rows of `train`); the seed is reset so the draw is reproducible.
set.seed(42)
split_inds <- sample.int(nrow(train), size = nrow(train) * 0.8)

In [70]:
# Build the model inputs from the sampled row indices.
# Features are every column except the first (presumably the Category label
# column -- verify against the CSV layout); labels come from `Category`.
# Rows in split_inds form the training set, all remaining rows the validation set.
x_train <- as.matrix(train[split_inds, -c(1)])
x_val <- as.matrix(train[-split_inds, -c(1)])

y_train <- as.matrix(train[['Category']][split_inds])
y_val <- as.matrix(train[['Category']][-split_inds])

## Model - Linear SVM
The tuning cell is based on an example provided in the LiblineaR documentation.

In [71]:
t1 <- Sys.time()

#  set tuning ranges (toggled for assessment submission)
k <- 5                                              # K-fold cross validation
tryTypes <- c(2)                                    # 0 for logistic regression, 2 for l2-l2 linear svm
# tryCosts <- c(.01, .1, 1, 10, 100, 1000, 2000)    # cost tuning for logistic regression
# tryCosts <- c(c(0.001,0.005,0.01, 0.05), 
#               seq(0.1, 0.5, by=0.05), 
#               c(1, 5, 10, 50, 100, 500, 1000))      # cost tuning for linear svm

# set tuned parameters (toggled for assessment submission)
tryCosts <- c(0.225)                                # tune cost parameter
bestCost <- NA
bestAcc <- 0
bestType <- NA
tuning <- data.table(Type=integer(), Cost=numeric(), Accuracy=numeric())    # repository for tuning parameters

# Find the best model type and cost parameter via k-fold cross-validation.
# With cross > 0, LiblineaR() returns the cross-validated accuracy (a single
# number) instead of a fitted model, so the result is stored as `cv_acc`.
# Each trial is collected in a pre-sized list and bound into `tuning` once
# after the loop, rather than growing the table on every iteration.
trials <- vector('list', length(tryTypes) * length(tryCosts))
trial <- 0L
for(ty in tryTypes){
    for(co in tryCosts){
#         print(Sys.time())
#         flush.console()
        cv_acc <- LiblineaR(data=x_train, target=y_train,
                            type=ty, cost=co, cross=k)
        trial <- trial + 1L
        trials[[trial]] <- data.table(Type = ty, Cost = co, Accuracy = cv_acc)
        cat("Results for C=", co, " : ", cv_acc, " accuracy\n", sep="")
        # strict '>' keeps the first configuration when accuracies tie
        if(cv_acc > bestAcc){
            bestCost <- co
            bestAcc <- cv_acc
            bestType <- ty
        }
    }
}
tuning <- rbind(tuning, rbindlist(trials))

# Fail here with a clear message instead of passing NA parameters to the
# final model fit.
if(is.na(bestCost) || is.na(bestType)){
    stop("No type/cost combination reached a cross-validated accuracy above 0",
         call. = FALSE)
}

cat("Best model type is:", bestType,"\n")
cat("Best cost is:", bestCost,"\n")
# NOTE: the value reported below is the k-fold cross-validated accuracy
cat("Best training accuracy is:", bestAcc,"\n")

print(difftime(Sys.time(), t1, units = 'min'))

Results for C=0.225 : 0.7727134 accuracy
Best model type is: 2 
Best cost is: 0.225 
Best training accuracy is: 0.7727134 
Time difference of 0.9173293 mins


In [72]:
# Fit the final model on the whole training partition, using the model type
# and cost value selected by the tuning step.
best <- LiblineaR(
    data = x_train,
    target = y_train,
    type = bestType,
    cost = bestCost,
    bias = 1,
    verbose = FALSE
)

In [73]:
# Classify the validation rows with the fitted model. Decision values are
# requested in addition to the predicted labels; probabilities are not.
svm_preds <- predict(best,
                     x_val,
                     decisionValues = TRUE,
                     proba = FALSE)

In [74]:
# Accuracy and macro-F1 on the validation set.
# predict() returns a list, so only its predicted labels are passed on.
metrics(y_val, svm_preds$predictions)

#### Tuning Plot
An image of this plot is in the Report.  
The plotting code is commented out for the assessment submission; the tuned cost value is used instead of re-running the full search.

In [75]:
# Set plot width and height
# options(repr.plot.width = 8, repr.plot.height = 4)

In [76]:
# Plot the parameter tuning
# print(ggplot(data = tuning[1:17], mapping = aes(x = Cost, y = Accuracy)) +  
#       geom_line(aes(x = Cost, y = Accuracy)) + geom_point(alpha = 1.0, col = 'blue', size = 1.0))

## Predict Test Set

In [77]:
# Predict the test set classes.
# The first column of `test` is excluded from the features and the rest are
# converted to a matrix, matching how the validation features are passed.
test_preds <- predict(best, as.matrix(test[,-c(1)]), proba=FALSE, decisionValues=TRUE)

In [78]:
# Pair each test document name with its predicted class
df <- data.frame(Doc = test$Doc,
                 Predicted = test_preds$predictions)

In [79]:
# Prefix every predicted class with 'C' (e.g. 5 becomes 'C5')
df[['Predicted']] <- sub(pattern = '^', replacement = 'C', x = df[['Predicted']])

In [80]:
# Sort the rows by the number embedded in the document name: every non-digit
# character is stripped from Doc and the remainder is compared numerically.
df <- df[order(as.numeric(gsub(pattern = "[^[:digit:]]",
                               replacement = "",
                               x = df$Doc))), ]

In [81]:
# Preview the first six rows of the ordered predictions
head(df, n = 6L)

Unnamed: 0,Doc,Predicted
6189,te_doc_1,C5
23216,te_doc_2,C17
10249,te_doc_3,C22
23188,te_doc_4,C12
24157,te_doc_5,C1
2957,te_doc_6,C20


## Save Test Predictions

In [82]:
# Write one "Doc Predicted" row per document, with no header line, no row
# names and no quoting.
write.table(x = df,
            file = 'testing_labels_pred.txt',
            col.names = FALSE,
            row.names = FALSE,
            quote = FALSE)