In [None]:
# Parámetros y librerías

suppressPackageStartupMessages({
  library(ape)
  library(mvMORPH)
  library(readr)
  library(dplyr)
  library(purrr)
  library(tibble)
  library(glue)
})

# Inputs
# Trait table; it is read with read_csv() and must contain a `species` column.
TRAITS_ALIGNED  <- "../traits/traits_high-level_aligned.csv"
TREE_MCC_FILE   <- "../trees/BEAST_MCC_46_ultrametric_FIXED.tre"              # ultrametric MCC tree
# Set of trees read with read.nexus() for the pool screening.
POOL_UNDERSCORE <- "../trees/BEAST_posterior_aligned.nex"

# Outputs (the directory is created here, including parents, if it is missing)
OUT_DIR <- "../results"
dir.create(OUT_DIR, showWarnings = FALSE, recursive = TRUE)

OUT_SUMMARY_CSV    <- file.path(OUT_DIR, "model_screen_MCC_summary.csv")
# NOTE(review): OUT_BEST_PER_MODEL is not referenced in the visible part of
# this script; confirm whether a later section writes it.
OUT_BEST_PER_MODEL <- file.path(OUT_DIR, "model_screen_MCC_best_per_model.csv")
OUT_POOL_DETAIL    <- file.path(OUT_DIR, "model_screen_pool_detail.csv")
OUT_POOL_WIN_FREQ  <- file.path(OUT_DIR, "model_screen_pool_winner_freq.csv")

# Model configuration
# CANDIDATE_MODELS are fitted one by one; PENALTY and METHOD are forwarded to
# the `penalty` and `method` arguments of mvgls().
CANDIDATE_MODELS <- c("BM", "OU", "EB")
PENALTY          <- "RidgeArch"
METHOD           <- "H&L"
# NOTE(review): DELTA_TIE is not referenced in the visible part of this script;
# presumably a delta-GIC tie threshold. Confirm against its users.
DELTA_TIE        <- 2
SEED             <- 123
set.seed(SEED)

# Abort with an error whose message is `msg` prefixed by "[ERROR] ".
# The call is omitted from the reported error (call. = FALSE).
#
# @param msg Text of the error message.
stop_with <- function(msg) {
  full_msg <- glue("[ERROR] {msg}")
  stop(full_msg, call. = FALSE)
}

# Arrange the rows of the trait matrix in the tip order of the tree.
#
# @param tree Object with a `tip.label` character vector.
# @param Y    Matrix whose row names are species names.
# @return `Y` itself when its row names already equal `tree$tip.label`;
#   otherwise the rows of `Y` selected and ordered by `tree$tip.label`
#   (always a matrix, even for a single row or column).
# Stops via stop_with() when a tip label has no matching row name in `Y`.
reorder_Y_to_tree <- function(tree, Y) {
  tips <- tree$tip.label

  if (!identical(rownames(Y), tips)) {
    pos <- match(tips, rownames(Y))
    not_found <- is.na(pos)

    if (any(not_found)) {
      faltan <- paste(tips[not_found], collapse = ", ")
      stop_with(glue("El árbol tiene especies que no están en Y: {faltan}"))
    }

    Y <- Y[pos, , drop = FALSE]
  }

  Y
}

In [None]:
# Load the MCC tree and the trait table, build the numeric trait matrix Y
# (one row per species) and put its rows in the tip order of the tree.

tree <- read.tree(TREE_MCC_FILE)
stopifnot(inherits(tree, "phylo"))

dat <- read_csv(TRAITS_ALIGNED, show_col_types = FALSE)
stopifnot("species" %in% names(dat))

species <- dat$species
X <- dat %>%
  dplyr::select(-species)

# Force every trait column to numeric. The coercion itself stays silent, but
# values that only became NA because they could not be parsed are counted and
# reported, so they do not reach the model fits unnoticed.
X_num <- lapply(X, function(z) suppressWarnings(as.numeric(z)))
na_introduced <- vapply(
  names(X),
  function(nm) sum(is.na(X_num[[nm]]) & !is.na(X[[nm]])),
  integer(1)
)
if (any(na_introduced > 0L)) {
  bad_cols <- paste(names(na_introduced)[na_introduced > 0L], collapse = ", ")
  warning(
    glue("La conversión a numérico introdujo NA en las columnas: {bad_cols}"),
    call. = FALSE
  )
}
X[] <- X_num
Y <- as.matrix(X)
rownames(Y) <- species

# Put the rows of Y in the tip order of the tree
Y <- reorder_Y_to_tree(tree, Y)

# glue() trims a trailing "\n" from its template, so the line break is passed
# to cat() separately.
cat(glue("MCC listo: n={nrow(Y)}, p={ncol(Y)}"), "\n", sep = "")


In [None]:
# Screening en MCC (BM, OU, EB) con GIC y LogLik

# Extract the GIC and the log-likelihood reported by GIC() for a fitted model.
#
# @param fit Fitted model object passed to GIC().
# @return list(GIC, LogLik) with names stripped from the values. Both entries
#   are NA_real_ when GIC() raises an error or its result has no "GIC"
#   element; LogLik alone is NA_real_ when "LogLikelihood" is absent.
get_gic_loglik <- function(fit) {
  na_result <- list(GIC = NA_real_, LogLik = NA_real_)

  g <- try(GIC(fit), silent = TRUE)
  if (inherits(g, "try-error")) {
    return(na_result)
  }

  gic_raw <- g[["GIC"]]
  if (is.null(gic_raw)) {
    return(na_result)
  }

  ll_raw <- g[["LogLikelihood"]]
  ll_val <- NA_real_
  if (!is.null(ll_raw)) {
    ll_val <- unname(ll_raw)
  }

  list(GIC = unname(gic_raw), LogLik = ll_val)
}

# Fit one model with mvgls() on the MCC data and summarise the outcome as a
# one-row tibble.
#
# Reads the globals `Y`, `tree`, `PENALTY` and `METHOD`.
#
# @param model Model name handed to the `model` argument of mvgls().
# @return One-row tibble with columns model, ok, GIC, LogLik, elapsed_sec and
#   error. When mvgls() fails, ok is FALSE, GIC and LogLik are NA and error
#   holds the text of the captured try-error; otherwise error is NA.
fit_one <- function(model) {
  t_start <- proc.time()
  dat <- list(Y = Y)

  fit <- try(
    mvgls(Y ~ 1, data = dat, tree = tree,
          model = model, penalty = PENALTY, method = METHOD),
    silent = TRUE
  )
  fit_failed <- inherits(fit, "try-error")

  if (fit_failed) {
    metrics <- list(GIC = NA_real_, LogLik = NA_real_)
    err_txt <- as.character(fit)
  } else {
    metrics <- get_gic_loglik(fit)
    err_txt <- NA_character_
  }

  tibble(
    model       = model,
    ok          = !fit_failed,
    GIC         = metrics$GIC,
    LogLik      = metrics$LogLik,
    # Third entry of proc.time() differences is the elapsed time.
    elapsed_sec = as.numeric((proc.time() - t_start)[3]),
    error       = err_txt
  )
}

# Fit every candidate model on the MCC tree (one row per model)
results_tbl <- purrr::map_dfr(CANDIDATE_MODELS, fit_one)

# Report models that failed or produced no finite GIC; they are left out of
# the summary table below and would otherwise disappear without a trace.
failed_tbl <- results_tbl %>%
  filter(!ok | !is.finite(GIC))
if (nrow(failed_tbl) > 0) {
  failed_models <- paste(failed_tbl$model, collapse = ", ")
  warning(
    glue("Modelos sin GIC válido en MCC (excluidos de la tabla): {failed_models}"),
    call. = FALSE
  )
}

# Summary table (model, GIC, delta GIC, LogLik), best model first
paper_tbl <- results_tbl %>%
  filter(ok, is.finite(GIC)) %>%
  mutate(
    # Guard the empty case: min() over zero rows would warn and return Inf.
    delta_GIC = GIC - if (n() > 0) min(GIC, na.rm = TRUE) else NA_real_
  ) %>%
  mutate(
    across(c(GIC, delta_GIC, LogLik), ~ round(.x, 3))
  ) %>%
  select(model, GIC, delta_GIC, LogLik) %>%
  arrange(GIC)

write_csv(paper_tbl, OUT_SUMMARY_CSV)

print(paper_tbl)
# Built without glue(): glue() trims the leading and trailing "\n" of its
# template, which removed the blank line before and the line break after.
cat("\nGuardado (tabla):\n- ", OUT_SUMMARY_CSV, "\n", sep = "")

In [None]:
# Screening en pool posterior (BM, OU, EB) con GIC

# Read the tree pool; the screening loop expects a multi-tree object.
trees <- read.nexus(POOL_UNDERSCORE)
stopifnot(inherits(trees, "multiPhylo"))
# glue() trims a trailing "\n" from its template, so the line break is passed
# to cat() separately.
cat(glue("Pool leído: {length(trees)} árboles."), "\n", sep = "")

# Fit one model with mvgls() on a given tree and summarise the outcome as a
# one-row tibble.
#
# The global trait matrix `Y` is first re-ordered to the tip order of `tr`
# with reorder_Y_to_tree(); `PENALTY` and `METHOD` are read from globals.
# Timing starts after the re-ordering.
#
# @param tr    Tree passed to the `tree` argument of mvgls().
# @param model Model name handed to the `model` argument of mvgls().
# @return One-row tibble with columns model, ok, GIC, LogLik, elapsed_sec and
#   error. When mvgls() fails, ok is FALSE, GIC and LogLik are NA and error
#   holds the text of the captured try-error; otherwise error is NA.
fit_on_tree <- function(tr, model) {
  Y_sorted <- reorder_Y_to_tree(tr, Y)
  dat      <- list(Y = Y_sorted)
  t_start  <- proc.time()

  fit <- try(
    mvgls(Y ~ 1, data = dat, tree = tr,
          model = model, penalty = PENALTY, method = METHOD),
    silent = TRUE
  )
  fit_failed <- inherits(fit, "try-error")

  if (fit_failed) {
    metrics <- list(GIC = NA_real_, LogLik = NA_real_)
    err_txt <- as.character(fit)
  } else {
    metrics <- get_gic_loglik(fit)
    err_txt <- NA_character_
  }

  tibble(
    model       = model,
    ok          = !fit_failed,
    GIC         = metrics$GIC,
    LogLik      = metrics$LogLik,
    # Third entry of proc.time() differences is the elapsed time.
    elapsed_sec = as.numeric((proc.time() - t_start)[3]),
    error       = err_txt
  )
}

# Fit all candidate models on every tree of the pool. For each tree only the
# fits with a finite GIC are kept, together with their delta GIC relative to
# the best model on that tree.
pool_rows <- vector("list", length(trees))

for (i in seq_along(trees)) {
  tr <- trees[[i]]

  res_i <- purrr::map_dfr(CANDIDATE_MODELS, ~ fit_on_tree(tr, .x)) %>%
    filter(ok, is.finite(GIC)) %>%
    mutate(
      tree_id   = i,
      # Guard the empty case: min() over zero rows would warn and return Inf.
      delta_GIC = GIC - if (n() > 0) min(GIC, na.rm = TRUE) else NA_real_
    ) %>%
    select(tree_id, model, GIC, delta_GIC, LogLik)

  pool_rows[[i]] <- res_i

  if (i %% 10 == 0) {
    # glue() trims a trailing "\n" from its template; without the explicit
    # line break every progress message ends up on the same line.
    cat(glue("..procesados {i}/{length(trees)}"), "\n", sep = "")
  }
}

# Trees on which no model produced a finite GIC contribute no rows; report
# them instead of letting them vanish from the summary.
n_trees_dropped <- sum(vapply(pool_rows, nrow, integer(1)) == 0L)
if (n_trees_dropped > 0) {
  warning(
    glue("{n_trees_dropped} árbol(es) sin ningún modelo con GIC válido; excluidos del resumen."),
    call. = FALSE
  )
}

# Unrounded results are kept for picking winners; rounding is only applied to
# what gets written or shown.
pool_raw <- dplyr::bind_rows(pool_rows)

pool_tbl <- pool_raw %>%
  mutate(
    across(c(GIC, delta_GIC, LogLik), ~ round(.x, 3))
  )

# Per-tree detail
write_csv(pool_tbl, OUT_POOL_DETAIL)

# Winner(s) per tree, chosen on the unrounded GIC so that rounding to three
# decimals cannot create artificial ties. Exact ties are all kept
# (with_ties = TRUE), so a tree can contribute more than one winner.
winner_tbl <- pool_raw %>%
  group_by(tree_id) %>%
  slice_min(GIC, n = 1, with_ties = TRUE) %>%
  ungroup() %>%
  mutate(
    across(c(GIC, delta_GIC, LogLik), ~ round(.x, 3))
  )

# Win counts per model and their share of the trees that had a winner
freq_tbl <- winner_tbl %>%
  count(model, name = "n_win") %>%
  arrange(desc(n_win)) %>%
  mutate(
    n_trees_used = length(unique(winner_tbl$tree_id)),
    freq         = round(n_win / n_trees_used, 3)
  )

write_csv(freq_tbl, OUT_POOL_WIN_FREQ)

print(freq_tbl)
# Built without glue(): glue() trims the leading and trailing "\n" of its
# template, which removed the blank line before and the line break after.
cat("\nGuardados:\n- ", OUT_POOL_DETAIL, "\n- ", OUT_POOL_WIN_FREQ, "\n", sep = "")