This repository has been archived by the owner on Jan 8, 2020. It is now read-only.
/
knn_optim_parallelf.R
148 lines (127 loc) · 6.48 KB
/
knn_optim_parallelf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
#' the real values of the series.
#' This version of the optimization function uses a parallelized distances calculation function, and the computation of
#' the predicted values is done parallelizing by the number of d's and the number of instants to be predicted. Each thread
#' that calculates predicted values reads from a file a complete distances matrix in which the information used to predict
#' is contained.
#'
#' @param y A time series.
#' @param k Values of k's to be analyzed.
#' @param d Values of d's to be analyzed.
#' @param v Variable to be predicted if given multivariate time series.
#' @param init Variable that determines the limit of the known past for the first instant predicted.
#' @param error_metric Type of metric to evaluate the prediction error.
#' Five metrics supported:
#' \describe{
#'   \item{ME}{Mean Error}
#'   \item{RMSE}{Root Mean Squared Error}
#'   \item{MAE}{Mean Absolute Error}
#'   \item{MPE}{Mean Percentage Error}
#'   \item{MAPE}{Mean Absolute Percentage Error}
#' }
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Four supported: proximity, same, linear, expSmooth.
#' \describe{
#'   \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
#'   \item{same}{all neighbors are assigned with the same weight}
#'   \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#'   least nearest neighbor which is assigned with a weight of 1.}
#'   \item{expSmooth}{exponential smoothing weights alpha * (1 - alpha)^i over the neighbors taken in temporal order,
#'   so that more recent neighbors receive higher weight.}
#' }
#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
#' 1 if there is only one core.
#' @param file Path and id of the files where the distances matrixes are.
#' @return A matrix of errors, optimal k and d.
#' @examples
#' knn_distances(AirPassengers, 1:3, file = "AirPassengers")
#' knn_optim_parallelf(AirPassengers, 1:5, 1:3, file = "AirPassengers")
#' knn_distances(LakeHuron, 1:6, file = "LakeHuron")
#' knn_optim_parallelf(LakeHuron, 1:10, 1:6, file = "LakeHuron")
knn_optim_parallelf <- function(y, k, d, v = 1, init = NULL, error_metric = "MAE", weight = "proximity", threads = NULL, file){
  require(parallelDist)
  require(forecast)
  require(foreach)
  require(doParallel)
  require(iterators)
  # Default number of threads: all detected cores minus one (or 1 on a single-core machine)
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- ifelse(cores == 1, cores, cores - 1)
  }
  # Choose the appropriate index of the accuracy() result, depending on the error_metric.
  # Fail fast on an unsupported metric: without the stop() default, switch() would
  # return NULL and the error-matrix indexing below would fail obscurely.
  error_type <- switch(error_metric,
    ME = 1,
    RMSE = 2,
    MAE = 3,
    MPE = 4,
    MAPE = 5,
    stop("Unsupported error_metric: ", error_metric)
  )
  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }
  # Initialization of variables to be used
  y <- as.matrix(sapply(y, as.numeric), ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  init <- ifelse(is.null(init), floor(n * 0.7), init)
  expSmoVal <- 0.5  # alpha used by the 'expSmooth' weighting scheme
  real_values <- matrix(y[(init + 1):n, v])
  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))
  # This next line is only there to avoid 'No visible binding for global variable' warning
  # in R CMD check due to j variable used in foreach loop
  j <- NULL
  # For each of the combinations of d's and instants init to n - 1, a distances vector
  # according to each combination is taken from a distances matrix saved previously in a file
  # and then ordered. Later, the k's inner loop applies k-nn to predict values.
  clust <- makeCluster(threads)
  # Guarantee the parallel backend is unregistered and the cluster released even if
  # the foreach loop below throws; previously an error leaked the PSOCK cluster.
  on.exit({
    registerDoSEQ()
    stopCluster(clust)
  }, add = TRUE)
  registerDoParallel(cl = clust)
  all_predictions <- foreach(i = 1:ds, .combine = cbind) %:% foreach(j = (n - init + 1):2, .combine = cbind) %dopar% {
    predictions <- vector(mode = "numeric", ks)
    # Get the column needed from the condensed (lower-triangular) distances matrix and sort it.
    # initial_index locates the start of column (j - 1) within the flattened triangle.
    # NOTE(review): the whole matrix is re-read from disk for every (i, j) pair; hoisting
    # the read would change the parallelization layout, so it is kept as is.
    initial_index <- (n - d[i] + 1) * (j - 1) - j * (j - 1) / 2 + 1
    distances_col <- readRDS(paste0(file, "-", d[i]))[initial_index:(initial_index + n - d[i] - j)]
    sorted_distances_col <- sort.int(distances_col, index.return = TRUE)
    for (k_index in 1:ks) {
      k_value <- k[k_index]
      # Get the indexes of the k nearest 'elements', these are called neighbors
      k_nn <- head(sorted_distances_col$ix, k_value)
      # For exponential smoothing the neighbors must be in temporal order, not distance order
      if (weight == "expSmooth")
        k_nn <- sort.int(k_nn)
      # Calculate the weights for the future computation of the weighted mean
      weights <- switch(weight,
        # tiny offset (~2.2e-158) avoids division by zero for exact-match neighbors
        proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
        same = rep.int(1, k_value),
        linear = k_value:1,
        # alpha * (1 - alpha)^i for i = k-1, ..., 0. The exponent sequence MUST be
        # parenthesized: '^'/'**' binds tighter than ':', so the unparenthesized form
        # built the length-1 sequence ((1 - alpha)^(k-1)):0, and weighted.mean failed
        # with mismatched lengths for every k > 1.
        expSmooth = expSmoVal * (1 - expSmoVal) ^ ((k_value - 1):0)
      )
      # Calculate the predicted value
      predictions[k_index] <- weighted.mean(y[n - j + 2 - k_nn, v], weights)
    }
    predictions
  }
  # Calculate error values between the known values and the predicted values, these values
  # correspond to instants init to n - 1. This is done for all k's and d's analyzed
  for (i in 1:ds) {
    initial_index <- (i - 1) * (n - init) + 1
    for (k_index in 1:ks) {
      errors[k_index, i] <- accuracy(ts(all_predictions[k_index, initial_index:(initial_index + n - init - 1)]), real_values)[error_type]
    }
  }
  # Construction of the list to be returned: the minimum of the error matrix
  # determines the optimal (k, d) pair (column-major index arithmetic).
  index_min_error <- which.min(errors)
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]
  result <- list(errors = errors, k = opt_k, d = opt_d)
  result
}