**Install packages**

In [None]:
# Install the add-on packages used by this notebook in one call:
#   ggExtra - ggplot2 enhancements
#   mvtnorm - multivariate normal distribution
install.packages(c("ggExtra", "mvtnorm"))

**Load essential libraries**

In [None]:
library(ggplot2) # library for plotting
library(dplyr) # library for data wrangling
library(ggExtra) # library for enhanced ggplot2 plots
library(mvtnorm) # library for multivariate normal distribution
library(tidyr) # library to reorganize data

In [None]:
# Fetch the RData file (data2.RData) from its URL, restore the objects it
# contains, and print the structure of the `data2` object used below
load(file = url("https://tinyurl.com/527nxn23"))
str(data2)

In [None]:
# Training set (all samples here are non-anomalous)
X_train <- data2$X                   # features as a matrix
df_train <- as.data.frame(X_train)   # same features as a data frame
head(X_train, n = 5)                 # preview the first five samples

In [None]:
# Validation set: features plus labels
# (label 1 represents an outlier sample, label 0 a regular sample)
X_validate <- data2$Xval                   # features as a matrix
y_validate <- data2$yval                   # labels
df_validate <- as.data.frame(X_validate)   # features as a data frame
head(y_validate, n = 5)                    # preview the first five labels

In [None]:
# Fraction of servers that are outliers in the validation set
# (labels: 1 = outlier sample, 0 = regular sample).
# The `?` tokens look like blanks left for the reader to complete.
# NOTE(review): as written the blank is joined with `=`, which R treats as
# argument naming inside a call; an equality test on the labels needs `==`.
# Confirm the intended form of this line.
mean(y_? = ?)  # supervised algorithms have to address such class imbalance

In [None]:
# Reshape the training data from wide (one column per variable) to long
# (one Variable/Value pair per row), then preview the first rows
df_gather_train <- X_train %>%
  as.data.frame() %>%
  gather(key = "Variable", value = "Value")
head(df_gather_train, n = 5)

In [None]:
# Reshape the validation data from wide (one column per variable) to long
# (one Variable/Value pair per row), then preview the first rows
df_gather_validate <- X_validate %>%
  as.data.frame() %>%
  gather(key = "Variable", value = "Value")
head(df_gather_validate, n = 5)

In [None]:
# Density plots for each variable (feature) in the training data.
# The `?` tokens are blanks left for the reader to fill in with columns of
# df_gather_train. alpha = 0.3 makes the fills semi-transparent so that
# overlapping curves stay visible.
df_gather_train %>%
  ggplot(aes(x = ?, fill = ?, color = ?)) +
  geom_density(alpha = 0.3) +
  ggtitle("Distribution of variables from training data")

# Which variable appears the least normally distributed?

In [None]:
# Density plots for each variable (feature) in the validation data:
# Value goes on the x axis and each Variable gets its own fill and outline
# colour. alpha = 0.3 makes the fills semi-transparent so that overlapping
# curves stay visible.
df_gather_validate %>%
  ggplot(aes(x = Value, fill = Variable, color = Variable)) +
  geom_density(alpha = 0.3) +
  ggtitle("Distribution of variables from validation data")

# Which variable appears the least normally distributed?

In [None]:
# Make a scatter plot with the marginal densities for any pair of variables
# in the training data.
# The `?` tokens look like blanks left for the reader: the data frame to plot
# and the two columns to put on the x and y axes.
# NOTE(review): the axis labels are hard-coded to "Variable-1" and
# "Variable-5", so they only describe the plot correctly if those are the
# two variables chosen - adjust the labels otherwise.

# Layers of p1:
#   geom_point   - semi-transparent points (alpha = 0.7)
#   coord_fixed  - ratio = 1, so one unit on x is as long as one unit on y
#   stat_ellipse - three data ellipses at levels 0.68 (red), 0.95 (green)
#                  and 0.997 (blue)
p1 = ggplot(data = ?, aes(x = ?, y = ?)) + 
geom_point(size = 2, alpha = 0.7) + xlab("Variable-1") + 
   ylab("Variable-5") + ggtitle("Scatter Plot") +
   coord_fixed(ratio = 1) +
   stat_ellipse(level = 0.68, color = 'red')+
   stat_ellipse(level = 0.95, color = 'green')+
   stat_ellipse(level = 0.997, color = 'blue')

# Add marginal histogram plot to the scatter plot;
# delta is the bin width passed to the marginal histograms
delta =1
ggMarginal(p1, type = 'histogram', color = 'black', binwidth = delta)

# Add marginal density plot to the scatter plot
ggMarginal(p1, type = 'density', color = 'cyan')

In [None]:
# Append a column equal to twice column 11 of X_train, compute the sample
# covariance matrix of the widened data, and try to invert it with solve().
# The added column is an exact multiple of an existing one, so the covariance
# matrix is singular; expect solve() to raise an error or return numerically
# meaningless values.
# NOTE(review): presumably a demonstration of why linearly dependent features
# break the inverse-covariance step - confirm intent.
solve(cov(as.data.frame(cbind(X_train, 2*X_train[, 11]))))

In [None]:
## Build model using training data
# The `?` tokens are blanks left for the reader to fill in.

# Sample mean of every feature using training data. colMeans() returns one
# mean per column; mean() would collapse the whole input to a single number,
# which is not a valid centre for the distance computed below.
mu_train <- colMeans(?)
S_train <- ?(as.data.frame(X_train)) # sample covariance matrix using training data
S_train_inverse <- solve(?) # inverse of sample covariance matrix from training data

# Mahalanobis distance of training samples. apply(..., 1, f) calls f on each
# row x and evaluates sqrt((x - mu)' S^-1 (x - mu)).
M_distance_train <- apply(?, 1, function(x) {
  sqrt(t(x-?) %*% ? %*% (?-mu_train))
})

# Mahalanobis distance of validation samples using model built using training
# data (same formula, still centred on the training mean)
M_distance_validate <- apply(?, 1, function(x) {
  sqrt(t(x-mu_train) %*% ? %*% (x-mu_train))
})

# Add Mahalanobis distance as new column of training and validation data frame
df_train$MD <- ?
df_validate$? <- M_distance_validate

head(df_validate)

In [None]:
## Determine cutoff probability such that we get the best performance on the validation set
## Best performance corresponds to, for example, the best F1 score (close to 1)
# The `?` tokens look like blanks left for the reader to complete.

# Derive a distance threshold from the cutoff probability, then add a new
# logical column `Outliers` to the training and validation data frames
k = ncol(X_train) # dimensions or the number of features
j = 5 # vary this to change cutoff probability
# cutoff_probability = 1 - 10^(-j); with j = 5 this is 0.99999
cutoff_probability = 1-(10^(-j))
# qchisq() returns a chi-square quantile; note that the threshold is compared
# against the SQUARED Mahalanobis distance in the two lines below
threshold = qchisq(?, ?) # threshold for Mahalanobis distance
# A sample is marked as an outlier when its squared distance reaches the threshold
df_train$Outliers = ((?)^2 >= threshold)
df_validate$Outliers = ((df_validate$MD)^2 >= ?)

# Confusion counts on the validation set
# (Outliers = flagged by the model, label 1 = actual outlier)
tp <- sum(df_validate$Outliers & (y_validate == 1))      # true positives
fp <- sum(df_validate$Outliers & (y_validate == 0))      # false positives
fn <- sum((!df_validate$Outliers) & (y_validate == 1))   # false negatives

# Precision, recall and the F1 score derived from them
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
F1 <- (2 * precision * recall) / (precision + recall)

# Report the threshold, the validation metrics and the share of training
# samples flagged as outliers
cat(sprintf('Threshold = %f\n', threshold))
cat(sprintf(
  'For probability cutoff = %f, \nPrecision = %f,\nRecall = %f,\nF1 score = %f,\nFraction of outliers in training data = %f\n',
  cutoff_probability,
  precision,
  recall,
  F1,
  mean(df_train$Outliers)
))