R/carp.R

#' Compute \code{CARP} (Convex Clustering) Solution Path
#'
#' \code{CARP} returns a fast approximation to the Convex Clustering
#' solution path along with visualizations such as dendrograms and
#' cluster paths. \code{CARP} solves the Convex Clustering problem via an efficient
#' Algorithmic Regularization scheme.
#'
#' @param X The data matrix (\eqn{X \in R^{n \times p}}{X}): rows correspond to
#'          the observations (to be clustered) and columns to the variables (which
#'          will not be clustered). If \code{X} has missing values - \code{NA} or
#'          \code{NaN} values - they will be automatically imputed.
#' @param labels A character vector of length \eqn{n}: observations (row) labels
#' @param X.center A logical: Should \code{X} be centered columnwise?
#' @param X.scale A logical: Should \code{X} be scaled columnwise?
#' @param back_track A logical: Should back-tracking be used to exactly identify fusions?
#'                   By default, back-tracking is not used.
#' @param exact A logical: Should the exact solution be computed using an iterative algorithm?
#'              By default, algorithmic regularization is applied and the exact solution
#'              is not computed. Setting \code{exact = TRUE} often significantly increases
#'              computation time.
#' @param norm Which norm to use in the fusion penalty? Currently only \code{1}
#'             and \code{2} (default) are supported.
#' @param t A number greater than 1: the size of the multiplicative update to
#'          the cluster fusion regularization parameter (not used by
#'          back-tracking variants). Typically on the scale of \code{1.005} to \code{1.1}.
#' @param npcs An integer >= 2. The number of principal components to compute
#'             for path visualization.
#' @param dendrogram.scale A character string denoting how the scale of dendrogram
#'                         regularization proportions should be visualized.
#'                         Choices are \code{'original'} or \code{'log'}; if not
#'                         provided, a data-driven heuristic choice is used.
#' @param ... Unused arguements. An error will be thrown if any unrecognized
#'            arguments as given. All arguments other than \code{X} must be given
#'            by name.
#' @param weights One of the following: \itemize{
#'                \item A function which, when called with argument \code{X},
#'                      returns an b-by-n matrix of fusion weights.
#'                \item A matrix of size n-by-n containing fusion weights
#'                }
#' @param impute_func A function used to impute missing data in \code{X}. By default,
#'                    the \code{\link[missForest]{missForest}} function from the
#'                    package of the same name is used. This provides a flexible
#'                    potentially non-linear imputation function. This function
#'                    has to return a data matrix with no \code{NA} values.
#'                    Note that, consistent with base \code{R}, both \code{NaN}
#'                    and \code{NA} are treaded as "missing values" for imputation.
#' @param status Should a status message be printed to the console?
#' @return An object of class \code{CARP} containing the following elements (among others):
#'         \itemize{
#'         \item \code{X}: the original data matrix
#'         \item \code{n}: the number of observations (rows of \code{X})
#'         \item \code{p}: the number of variables (columns of \code{X})
#'         \item \code{alg.type}: the \code{CARP} variant used
#'         \item \code{X.center}: a logical indicating whether \code{X} was centered
#'                                column-wise before clustering
#'         \item \code{X.scale}: a logical indicating whether \code{X} was scaled
#'                               column-wise before centering
#'         \item \code{weight_type}: a record of the scheme used to create
#'                                   fusion weights
#'         }
#' @importFrom utils data
#' @importFrom dplyr %>% mutate group_by ungroup as_tibble n_distinct
#' @importFrom rlang %||%
#' @importFrom stats var
#' @importFrom missForest missForest
#' @export
#' @examples
#' carp_fit <- CARP(presidential_speech[1:10,1:4])
#' print(carp_fit)
#' plot(carp_fit)
CARP <- function(X,
                 ...,
                 weights = sparse_rbf_kernel_weights(k = "auto",
                                                     phi = "auto",
                                                     dist.method = "euclidean",
                                                     p = 2),
                 labels = rownames(X),
                 X.center = TRUE,
                 X.scale = FALSE,
                 back_track = FALSE,
                 exact = FALSE,
                 norm = 2,
                 t = 1.05,
                 npcs = min(4L, NCOL(X), NROW(X)),
                 dendrogram.scale = NULL,
                 impute_func = function(X) {if(anyNA(X)) missForest(X)$ximp else X},
                 status = (interactive() && (clustRviz_logger_level() %in% c("MESSAGE", "WARNING", "ERROR")))) {

  tic <- Sys.time()

  ####################
  ##
  ## Input validation
  ##
  ####################

  dots <- list(...)

  if (length(dots) != 0L) {
    if (!is.null(names(dots))) {
      crv_error("Unknown argument ", sQuote(names(dots)[1L]), " passed to ", sQuote("CARP."))
    } else {
      crv_error("Unknown ", sQuote("..."), " arguments passed to ", sQuote("CARP."))
    }
  }

  if (!is.matrix(X)) {
    crv_warning(sQuote("X"), " should be a matrix, not a " , class(X)[1],
                ". Converting with as.matrix().")
    X <- as.matrix(X)
  }

  if (!is.numeric(X)) {
    crv_error(sQuote("X"), " must be numeric.")
  }

  # Missing data mask: M_{ij} = 1 means we see X_{ij};
  M <- 1 - is.na(X)

  # Impute missing values in X
  # By default, we use the "Missing Forest" function from the missForest package
  # though other imputers can be supplied by the user.
  X.orig <- X

  if(anyNA(X)) {
    X <- impute_func(X)
  }

  ## Check that imputation was successful.
  if (anyNA(X)) {
    crv_error("Imputation failed. Missing values found in ", sQuote("X"), " even after imputation.")
  }

  if (!all(is.finite(X))) {
    crv_error("All elements of ", sQuote("X"), " must be finite.")
  }

  if (!is_logical_scalar(X.center)) {
    crv_error(sQuote("X.center"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE."))
  }

  if (!is_logical_scalar(X.scale)) {
    crv_error(sQuote("X.scale"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE."))
  }

  if (!is_logical_scalar(back_track)) {
    crv_error(sQuote("back_track"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE."))
  }

  if (!is_logical_scalar(exact)) {
    crv_error(sQuote("exact"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE."))
  }

  if (norm %not.in% c(1, 2)){
    crv_error(sQuote("norm"), " must be either 1 or 2.")
  }

  l1 <- (norm == 1)

  if ( (!is_numeric_scalar(t)) || (t <= 1) ) {
    crv_error(sQuote("t"), " must be a scalar greater than 1.")
  }

  if (!is.null(dendrogram.scale)) {
    if (dendrogram.scale %not.in% c("original", "log")) {
      crv_error("If not NULL, ", sQuote("dendrogram.scale"), " must be either ", sQuote("original"), " or ", sQuote("log."))
    }
  }

  if ( (!is_integer_scalar(npcs)) || (npcs < 2) || (npcs > NCOL(X)) || (npcs > NROW(X)) ){
    crv_error(sQuote("npcs"), " must be an integer scalar between 2 and ", sQuote("min(dim(X))."))
  }

  ## Get row (observation) labels
  if (is.null(labels)) {
    labels <- paste0("Obs", seq_len(NROW(X)))
  }

  if ( length(labels) != NROW(X) ){
    crv_error(sQuote("labels"), " must be of length ", sQuote("NROW(X)."))
  }

  rownames(X.orig) <- rownames(X) <- labels <- make.unique(as.character(labels), sep="_")

  n <- NROW(X)
  p <- NCOL(X)

  # Center and scale X
  if (X.center | X.scale) {
    X <- scale(X, center = X.center, scale = X.scale)
  }

  scale_vector  <- attr(X, "scaled:scale", exact=TRUE)  %||% rep(1, p)
  center_vector <- attr(X, "scaled:center", exact=TRUE) %||% rep(0, p)

  crv_message("Pre-computing weights and edge sets")

  # Calculate clustering weights
  if (is.function(weights)) { # Usual case, `weights` is a function which calculates the weight matrix
    weight_result <- weights(X)

    if (is.matrix(weight_result)) {
      weight_matrix <- weight_result
      weight_type   <- UserFunction()
    } else {
      weight_matrix <- weight_result$weight_mat
      weight_type   <- weight_result$type
    }
  } else if (is.matrix(weights)) {

    if (!is_square(weights)) {
      crv_error(sQuote("weights"), " must be a square matrix.")
    }

    if (NROW(weights) != NROW(X)) {
      crv_error(sQuote("NROW(weights)"), " must be equal to ", sQuote("NROW(X)."))
    }

    weight_matrix <- weights
    weight_type   <- UserMatrix()
  } else {
    crv_error(sQuote("CARP"), " does not know how to handle ", sQuote("weights"),
              " of class ", class(weights)[1], ".")
  }

  if (any(weight_matrix < 0) || anyNA(weight_matrix)) {
    crv_error("All fusion weights must be positive or zero.")
  }

  if (!is_connected_adj_mat(weight_matrix != 0)) {
    crv_error("Weights do not imply a connected graph. Clustering will not succeed.")
  }

  weight_matrix_ut <- weight_matrix * upper.tri(weight_matrix);

  edge_list <- which(weight_matrix_ut != 0, arr.ind = TRUE)
  edge_list <- edge_list[order(edge_list[, 1], edge_list[, 2]), ]
  cardE <- NROW(edge_list)
  D <- matrix(0, ncol = n, nrow = cardE)
  D[cbind(seq_len(cardE), edge_list[,1])] <-  1
  D[cbind(seq_len(cardE), edge_list[,2])] <- -1

  weight_vec <- weight_mat_to_vec(weight_matrix)

  crv_message("Computing Convex Clustering [CARP] Path")
  tic_inner <- Sys.time()

  carp.sol.path <- CARPcpp(X = X,
                           M = M,
                           D = D,
                           t = t,
                           epsilon = .clustRvizOptionsEnv[["epsilon"]],
                           weights = weight_vec[weight_vec != 0],
                           rho = .clustRvizOptionsEnv[["rho"]],
                           thresh = .clustRvizOptionsEnv[["stopping_threshold"]],
                           max_iter = .clustRvizOptionsEnv[["max_iter"]],
                           max_inner_iter = .clustRvizOptionsEnv[["max_inner_iter"]],
                           burn_in = .clustRvizOptionsEnv[["burn_in"]],
                           viz_max_inner_iter = .clustRvizOptionsEnv[["viz_max_inner_iter"]],
                           viz_initial_step = .clustRvizOptionsEnv[["viz_initial_step"]],
                           viz_small_step = .clustRvizOptionsEnv[["viz_small_step"]],
                           keep = .clustRvizOptionsEnv[["keep"]],
                           l1 = l1,
                           show_progress = status,
                           back_track = back_track,
                           exact = exact)

  toc_inner <- Sys.time()

  ## FIXME - Convert gamma.path to a single column matrix instead of a vector
  ##         RcppArmadillo returns a arma::vec as a n-by-1 matrix
  ##         RcppEigen returns an Eigen::VectorXd as a n-length vector
  ##         Something downstream cares about the difference, so just change
  ##         the type here for now
  carp.sol.path$gamma_path <- matrix(carp.sol.path$gamma_path, ncol=1)

  crv_message("Post-processing")

  post_processing_results <- ConvexClusteringPostProcess(X = X,
                                                         edge_matrix      = edge_list,
                                                         gamma_path       = carp.sol.path$gamma_path,
                                                         u_path           = carp.sol.path$u_path,
                                                         v_path           = carp.sol.path$v_path,
                                                         v_zero_indices   = carp.sol.path$v_zero_inds,
                                                         labels           = labels,
                                                         dendrogram_scale = dendrogram.scale,
                                                         npcs             = npcs,
                                                         smooth_U         = TRUE)

  carp.fit <- list(
    X = X.orig,
    M = M,
    D = D,
    U = post_processing_results$U,
    dendrogram = post_processing_results$dendrogram,
    rotation_matrix = post_processing_results$rotation_matrix,
    cluster_membership = post_processing_results$membership_info,
    n = n,
    p = p,
    weights = weight_matrix,
    weight_type = weight_type,
    back_track = back_track,
    exact = exact,
    norm = norm,
    t = t,
    X.center = X.center,
    center_vector = center_vector,
    X.scale = X.scale,
    scale_vector = scale_vector,
    time = Sys.time() - tic,
    fit_time = toc_inner - tic_inner
  )

  if (.clustRvizOptionsEnv[["keep_debug_info"]]) {
    carp.fit[["debug"]] <- list(path = carp.sol.path,
                                row  = post_processing_results$debug)
  }

  class(carp.fit) <- "CARP"

  return(carp.fit)
}

#' Print \code{CARP} Results
#'
#' Prints a brief descripton of a fitted \code{CARP} object.
#'
#' Reports number of observations and variables of dataset, any preprocessing
#' done by the \code{\link{CARP}} function, regularization weight information,
#' and the variant of \code{CARP} used.
#'
#' @details The \code{as.dendrogram} and \code{as.hclust} methods convert the
#'          \code{CBASS} output to an object of class \code{dendrogram} or \code{hclust}
#'          respectively.
#'
#' @param x an object of class \code{CARP} as returned by \code{\link{CARP}}
#' @param object an object of class \code{CARP} as returned by \code{\link{CARP}}
#' @param ... Additional unused arguments
#' @export
#' @rdname print_carp
#' @examples
#' carp_fit <- CARP(presidential_speech)
#' print(carp_fit)
print.CARP <- function(x, ...) {
  if(x$exact){
    if(x$back_track){
      alg_string = "ADMM-VIZ [Exact Solver + Back-Tracking Fusion Search]"
    } else {
      alg_string = paste0("ADMM (t = ", round(x$t, 3), ") [Exact Solver]")
    }
  } else {
    if(x$back_track){
      alg_string = "CARP-VIZ [Back-Tracking Fusion Search]"
    } else {
      alg_string = paste0("CARP (t = ", round(x$t, 3), ")")
    }
  }

  if(x$norm == 1){
    alg_string <- paste(alg_string, "[L1]")
  }

  cat("CARP Fit Summary\n")
  cat("====================\n\n")
  cat("Algorithm:", alg_string, "\n")
  cat("Fit Time:", sprintf("%2.3f %s", x$fit_time, attr(x$fit_time, "units")), "\n")
  cat("Total Time:", sprintf("%2.3f %s", x$time, attr(x$time, "units")), "\n\n")

  cat("Number of Observations:", x$n, "\n")
  cat("Number of Variables:   ", x$p, "\n\n")

  cat("Pre-processing options:\n")
  cat(" - Columnwise centering:", x$X.center, "\n")
  cat(" - Columnwise scaling:  ", x$X.scale, "\n\n")

  cat("Weights:\n")
  print(x$weight_type)

  invisible(x)
}

#' @export
#' @importFrom stats as.dendrogram
#' @rdname print_carp
as.dendrogram.CARP <- function(object, ...){
  as.dendrogram(object$dendrogram)
}

#' @export
#' @importFrom stats as.hclust
#' @rdname print_carp
as.hclust.CARP <- function(x, ...){
  x$dendrogram
}