In [None]:
# Cargar matriz high-level + muestra posterior de árboles

library(ape)
library(readr)

# Inputs
TRAITS_ALIGNED    <- "../traits/traits_high-level_aligned.csv"
TREES_POST_FILE   <- "../trees/BEAST_posterior_aligned.nex"

# Output directory (created recursively; no warning if it already exists)
OUT_DIR <- "../results/asr_POSTERIOR_OU"
dir.create(OUT_DIR, recursive = TRUE, showWarnings = FALSE)


# High-level trait matrix: the `species` column becomes the row names
traits_tbl <- read_csv(TRAITS_ALIGNED, show_col_types = FALSE)

# Fail early and explicitly: assigning a missing (NULL) column as row names
# silently resets them, and the problem would only surface further down as an
# opaque tip-label mismatch.
if (!"species" %in% colnames(traits_tbl)) {
  stop(
    "No se encontró la columna 'species' en: ", TRAITS_ALIGNED,
    call. = FALSE
  )
}

traits_df <- as.data.frame(traits_tbl)
rownames(traits_df)    <- traits_df[["species"]]
traits_df[["species"]] <- NULL

# Force every trait column to numeric. Coercion warnings are still muted per
# column, but the number of NAs *introduced* by the coercion is reported once,
# so non-numeric cells are not converted to missing data unnoticed.
na_before   <- sum(is.na(traits_df))
traits_df[] <- lapply(traits_df, function(z) suppressWarnings(as.numeric(z)))
na_coerced  <- sum(is.na(traits_df)) - na_before
if (na_coerced > 0) {
  warning(
    sprintf(
      "%d valores no numéricos se convirtieron a NA al forzar los rasgos a numérico.",
      na_coerced
    ),
    call. = FALSE
  )
}


# Posterior sample of trees (must be a multiPhylo object)
forest <- ape::read.nexus(TREES_POST_FILE)
stopifnot(inherits(forest, "multiPhylo"))

n_trees <- length(forest)
tree1   <- forest[[1L]]  # first tree is the reference for tip order

# Align the trait rows to the tip order of the first tree.
# Only the first tree is checked here; the label sets must match exactly.
stopifnot(setequal(rownames(traits_df), tree1$tip.label))
traits_df <- traits_df[tree1$tip.label, , drop = FALSE]


# Summary banner
n_species <- length(tree1$tip.label)
n_traits  <- ncol(traits_df)
na_total  <- sum(is.na(traits_df))

cat(sprintf(
  "OK \nÁrboles en muestra posterior: %d\nEspecies: %d | Rasgos high-level: %d | NAs totales: %d\nOUT_DIR: %s\n",
  n_trees, n_species, n_traits, na_total,
  normalizePath(OUT_DIR, mustWork = FALSE)
))

In [None]:
# Definir traits_proc (filtrar rasgos con sd == 0)

# Switch: when TRUE, traits whose standard deviation is exactly 0 are dropped
DROP_SD_ZERO <- TRUE

# Reference tree = first tree of the posterior sample
tree_ref <- forest[[1L]]
stopifnot(inherits(tree_ref, "phylo"))

# One trait row per tip of the reference tree
stopifnot(nrow(traits_df) == length(tree_ref$tip.label))

# Total number of missing trait values
na_total <- sum(is.na(traits_df))

# Per-trait standard deviation (NAs ignored); traits whose sd is NA are not
# counted as zero-variance.
sds     <- vapply(traits_df, sd, numeric(1), na.rm = TRUE)
idx_sd0 <- which(!is.na(sds) & sds == 0)
n_sd0   <- length(idx_sd0)

# Negative indexing is only safe when there is something to drop, hence the
# n_sd0 > 0 guard; otherwise the trait table is used unchanged.
traits_proc <- if (DROP_SD_ZERO && n_sd0 > 0) {
  traits_df[, -idx_sd0, drop = FALSE]
} else {
  traits_df
}

cat(sprintf(
  paste0(
    "OK \n",
    "Especies: %d | Rasgos (original): %d | Rasgos usados: %d | NAs: %d\n",
    "Rasgos con sd == 0 removidos: %d\n"
  ),
  nrow(traits_df),
  ncol(traits_df),
  ncol(traits_proc),
  na_total,
  n_sd0
))

In [None]:
# Configurar OU penalizado y definir fit_ou_on_tree()

library(mvMORPH)

# Settings forwarded verbatim to mvgls() by fit_ou_on_tree()
MODEL   <- "OU"
PENALTY <- "RidgeArch"
METHOD  <- "H&L"

#' Fit the penalized model on a single tree.
#'
#' @param tr   Tree object; its `tip.label` element defines the row order used
#'   for the fit.
#' @param Ymat Trait table (matrix or data frame) whose row names are the same
#'   set of labels as `tr$tip.label`.
#' @return The object returned by `mvgls()` for an intercept-only model
#'   (`Y ~ 1`) fitted with the script-level `MODEL`, `PENALTY` and `METHOD`.
#' @details Stops (via `stopifnot`) when the row names of `Ymat` and the tip
#'   labels of `tr` are not the same set.
fit_ou_on_tree <- function(tr, Ymat) {
  stopifnot(setequal(rownames(Ymat), tr$tip.label))

  # Reorder rows to follow the tip order of this particular tree, then make
  # sure the response is a double-typed matrix.
  traits_in_tip_order <- Ymat[tr$tip.label, , drop = FALSE]
  Y <- as.matrix(traits_in_tip_order)
  storage.mode(Y) <- "double"

  dat <- list(Y = Y)

  mvgls(
    Y ~ 1,
    data    = dat,
    tree    = tr,
    model   = MODEL,
    penalty = PENALTY,
    method  = METHOD
  )
}

cat("OK: función fit_ou_on_tree() definida.\n")

In [None]:
# Definir helper de ASR nodal asr_nodes_on_tree()

#' Fit the model on one tree and reconstruct ancestral states at its nodes.
#'
#' @param tr   Tree passed through to `fit_ou_on_tree()`.
#' @param Ymat Trait table passed through to `fit_ou_on_tree()`.
#' @return A list with `fit` (the fitted model) and `A_nodes` (the result of
#'   `mvMORPH::ancestral()` on that fit; expected to be an Nnode x p matrix --
#'   NOTE(review): shape is assumed by the callers, confirm against mvMORPH).
asr_nodes_on_tree <- function(tr, Ymat) {
  ou_fit <- fit_ou_on_tree(tr, Ymat)
  list(
    fit     = ou_fit,
    A_nodes = mvMORPH::ancestral(ou_fit)
  )
}

cat("OK: función asr_nodes_on_tree() definida.\n")

In [None]:
# ASR en la muestra posterior y CSV único con todos los nodos/árboles

library(mvMORPH)
library(readr)

Ntrees <- length(forest)

# Dimensions of the first tree, used only for the banner below
tree_ref <- forest[[1L]]
Ntip  <- length(tree_ref$tip.label)
Nnode <- tree_ref$Nnode
p     <- ncol(traits_proc)

cat(sprintf(
  "ASR posterior: %d árboles | Ntip = %d | Nnode = %d | Rasgos = %d\n",
  Ntrees, Ntip, Nnode, p
))

df_list     <- vector("list", Ntrees)
# One slot per tree; stays NA unless that tree failed, in which case it holds
# the error message (previously the message was discarded by try(silent = TRUE)).
failed_msgs <- rep(NA_character_, Ntrees)

for (i in seq_len(Ntrees)) {
  tr <- forest[[i]]

  cat(sprintf("Árbol %3d / %3d ... ", i, Ntrees))

  # The whole per-tree step (fit + shaping of the result) is guarded, so a
  # single malformed result skips that tree instead of aborting the run.
  df_i <- tryCatch(
    {
      res_i <- asr_nodes_on_tree(tr, traits_proc)
      A_i   <- res_i$A_nodes

      # Internal node ids under ape's numbering: Ntip + 1, ..., Ntip + Nnode.
      # NOTE(review): assumes the rows of A_i follow this node order -- confirm
      # against mvMORPH::ancestral().
      node_ids_i <- length(tr$tip.label) + seq_len(tr$Nnode)

      # The reconstruction must be (number of nodes) x (number of traits);
      # otherwise the column names / node ids below would be misassigned.
      stopifnot(identical(dim(A_i), c(length(node_ids_i), ncol(traits_proc))))

      states_i <- as.data.frame(A_i)
      colnames(states_i) <- colnames(traits_proc)

      cbind(
        tree_index = i,
        tree_id    = paste0("tree", sprintf("%03d", i)),
        node       = node_ids_i,
        states_i
      )
    },
    error = function(e) e
  )

  if (inherits(df_i, "error")) {
    failed_msgs[i] <- conditionMessage(df_i)
    cat(sprintf("ERROR (saltando este árbol): %s\n", failed_msgs[i]))
    next
  }

  df_list[[i]] <- df_i

  cat("OK\n")

  if (i %% 10 == 0) {
    cat(sprintf(">>> Checkpoint: %d árboles procesados.\n", i))
  }
}

failed_trees <- which(!is.na(failed_msgs))

# Keep only the trees that produced a result
df_list_clean <- df_list[!vapply(df_list, is.null, logical(1))]

# Without this guard, rbind over an empty list yields NULL and the CSV writer
# fails with an unrelated-looking error.
if (length(df_list_clean) == 0L) {
  stop(
    "Ningún árbol produjo un ASR válido; no se escribe el CSV.",
    call. = FALSE
  )
}

asr_all_df <- do.call(rbind, df_list_clean)

OUT_CSV_ALL <- file.path(OUT_DIR, "asr_POST_alltrees_nodes_high-level.csv")
readr::write_csv(asr_all_df, OUT_CSV_ALL)

cat("\nOK \n")
cat(sprintf("Archivo CSV con todos los ASR nodales (árboles exitosos):\n- %s\n", OUT_CSV_ALL))

if (length(failed_trees) > 0) {
  cat("Árboles que fallaron y fueron saltados: ",
      paste(failed_trees, collapse = ", "), "\n")
} else {
  cat("Todos los árboles se procesaron sin errores numéricos.\n")
}

In [None]:
# Definir clados del MCC y cargar ASR MCC + posterior

library(ape)
library(dplyr)
library(readr)

# Paths
TREE_MCC_FILE <- "../trees/BEAST_MCC_46_ultrametric_FIXED.tre"
ASR_MCC_FILE  <- "../results/asr_MCC_OU_nodes_high-level.csv"
ASR_POST_FILE <- file.path(OUT_DIR, "asr_POST_alltrees_nodes_high-level.csv")

# MCC tree and its internal clades

tree_mcc <- ape::read.tree(TREE_MCC_FILE)
stopifnot(inherits(tree_mcc, "phylo"))

# Fill in Nnode if the tree object does not carry it
if (is.null(tree_mcc$Nnode)) {
  tree_mcc$Nnode <- ape::Nnode(tree_mcc)
}

Ntip_mcc  <- length(tree_mcc$tip.label)
Nnode_mcc <- tree_mcc$Nnode

# Internal node ids: Ntip + 1, ..., Ntip + Nnode
internal_nodes <- (Ntip_mcc + 1L):(Ntip_mcc + Nnode_mcc)

# Internal node ages (units assumed to be Mya). The vector has one entry per
# internal node, in node order.
bt <- ape::branching.times(tree_mcc)
stopifnot(length(bt) == Nnode_mcc)

#' Sorted tip labels of the clade descending from `node` in tree `tr`.
get_clade_tips <- function(tr, node) {
  cl <- ape::extract.clade(tr, node)
  sort(cl$tip.label)
}

clade_list <- lapply(internal_nodes, function(nd) {
  tips <- get_clade_tips(tree_mcc, nd)
  tibble(
    node_mcc = nd,
    n_desc   = length(tips),
    clade_id = paste(tips, collapse = "|"),
    # Positional lookup: branching.times() names its result with the tree's
    # node labels when they exist (e.g. support values), so indexing by the
    # node number as a name would return NA or the wrong node.
    age_Mya  = as.numeric(bt[nd - Ntip_mcc])
  )
})

node_clades_mcc <- bind_rows(clade_list) %>%
  arrange(node_mcc)

# ASR on the MCC tree (ancestral nodes)

asr_mcc <- readr::read_csv(ASR_MCC_FILE, show_col_types = FALSE)

stopifnot("node" %in% colnames(asr_mcc))
asr_mcc <- asr_mcc %>%
  rename(node_mcc = node)

# Every column other than the node id is treated as an embedding dimension
embed_cols_mcc <- setdiff(colnames(asr_mcc), "node_mcc")

# ASR on the posterior sample (trees that were processed successfully)

asr_post <- readr::read_csv(ASR_POST_FILE, show_col_types = FALSE)

stopifnot(all(c("tree_index", "tree_id", "node") %in% colnames(asr_post)))
asr_post <- asr_post %>%
  mutate(node_post = as.integer(node))

embed_cols_post <- setdiff(colnames(asr_post), c("tree_index", "tree_id", "node", "node_post"))

# Embedding columns must match (names and order) between MCC and posterior
stopifnot(identical(embed_cols_mcc, embed_cols_post))

embed_cols <- embed_cols_mcc  # shared name

# Summary

n_nodes_mcc   <- nrow(node_clades_mcc)
n_rows_post   <- nrow(asr_post)
n_trees_used  <- length(unique(asr_post$tree_index))
n_per_node    <- n_rows_post / n_nodes_mcc

cat(sprintf(
  paste0(
    "OK \n",
    "MCC: Ntip = %d | Nnode = %d\n",
    "Nodos internos en MCC: %d\n",
    "Filas en ASR_POST: %d (≈ %.1f árboles por nodo, antes de filtrar por monofilia)\n",
    "Árboles únicos con ASR posterior: %d\n",
    "Dimensiones de embedding: %d\n"
  ),
  Ntip_mcc, Nnode_mcc,
  n_nodes_mcc,
  n_rows_post, n_per_node,
  n_trees_used,
  length(embed_cols)
))


In [None]:
# Resumen de incertidumbre filogenética por clado MCC (distancias en espacio latente + edades posteriores)

library(ape)
library(dplyr)
library(readr)

# Paths

TREE_MCC_FILE <- "../trees/BEAST_MCC_46_ultrametric_FIXED.tre"

ASR_MCC_FILE  <- "../results/asr_MCC_OU_nodes_high-level.csv"
ASR_POST_FILE <- file.path(OUT_DIR, "asr_POST_alltrees_nodes_high-level.csv")

# MCC tree: internal clades + ages

tree_mcc <- ape::read.tree(TREE_MCC_FILE)
stopifnot(inherits(tree_mcc, "phylo"))

if (is.null(tree_mcc$Nnode)) {
  tree_mcc$Nnode <- ape::Nnode(tree_mcc)
}

Ntip_mcc  <- length(tree_mcc$tip.label)
Nnode_mcc <- tree_mcc$Nnode

# Internal node ids: Ntip + 1, ..., Ntip + Nnode
internal_nodes <- (Ntip_mcc + 1L):(Ntip_mcc + Nnode_mcc)

# Internal node ages (units = Mya). One entry per internal node, in node order.
bt_mcc <- ape::branching.times(tree_mcc)
stopifnot(length(bt_mcc) == Nnode_mcc)

#' Sorted tip labels of the clade descending from `node` in tree `tr`.
get_clade_tips <- function(tr, node) {
  cl <- ape::extract.clade(tr, node)
  sort(cl$tip.label)
}

clade_list <- lapply(internal_nodes, function(nd) {
  tips <- get_clade_tips(tree_mcc, nd)
  tibble(
    node_mcc = nd,
    n_desc   = length(tips),
    clade_id = paste(tips, collapse = "|"),
    # MCC age of this node, looked up by position: branching.times() names its
    # result with the tree's node labels when present, so a lookup by node
    # number as a name would return NA or the wrong node.
    age_Mya  = as.numeric(bt_mcc[nd - Ntip_mcc])
  )
})

node_clades_mcc <- bind_rows(clade_list) %>%
  arrange(node_mcc)

# ASR on the MCC tree (ancestral nodes)

asr_mcc <- readr::read_csv(ASR_MCC_FILE, show_col_types = FALSE)

stopifnot("node" %in% colnames(asr_mcc))
asr_mcc <- asr_mcc %>%
  rename(node_mcc = node)

# Every column other than the node id is treated as an embedding dimension
embed_cols_mcc <- setdiff(colnames(asr_mcc), "node_mcc")

# ASR on the posterior sample (trees that were processed successfully)

asr_post <- readr::read_csv(ASR_POST_FILE, show_col_types = FALSE)

stopifnot(all(c("tree_index", "tree_id", "node") %in% colnames(asr_post)))
asr_post <- asr_post %>%
  mutate(node_post = as.integer(node))

embed_cols_post <- setdiff(colnames(asr_post), c("tree_index", "tree_id", "node", "node_post"))

# Embedding columns must match (names and order) between MCC and posterior
stopifnot(identical(embed_cols_mcc, embed_cols_post))
embed_cols <- embed_cols_mcc

# Match each MCC clade to its MRCA in every posterior tree (with posterior ages)

trees_ok <- sort(unique(asr_post$tree_index))
map_list <- vector("list", length(trees_ok))

for (k in seq_along(trees_ok)) {
  i  <- trees_ok[k]
  tr <- forest[[i]]

  # Node ages for this posterior tree: one entry per internal node, in node
  # order, so they are indexed by position (node id minus number of tips).
  ntip_i <- length(tr$tip.label)
  bt_i   <- ape::branching.times(tr)

  local_list <- lapply(seq_len(nrow(node_clades_mcc)), function(j) {
    clade_tips <- strsplit(node_clades_mcc$clade_id[j], "\\|")[[1]]

    # A clade can only be recovered if all of its tips exist in this tree
    if (!all(clade_tips %in% tr$tip.label)) return(NULL)

    # MRCA in tree i. getMRCA() yields NULL (not NA) when it cannot define an
    # MRCA, so both cases are guarded before the value is used in `if`.
    mrca_i <- ape::getMRCA(tr, clade_tips)
    if (is.null(mrca_i) || is.na(mrca_i)) return(NULL)

    # Monophyly: the descendants of the MRCA must be exactly these tips
    desc_i <- sort(ape::extract.clade(tr, mrca_i)$tip.label)
    if (!identical(desc_i, clade_tips)) return(NULL)

    # Positional lookup stays correct even when the tree carries node labels,
    # in which case the names of bt_i are those labels rather than node ids.
    age_post <- as.numeric(bt_i[mrca_i - ntip_i])

    tibble(
      tree_index   = i,
      node_mcc     = node_clades_mcc$node_mcc[j],
      clade_id     = node_clades_mcc$clade_id[j],
      age_Mya      = node_clades_mcc$age_Mya[j],   # MCC age
      node_post    = mrca_i,
      age_post_Mya = age_post,
      age_diff_Mya = age_post - node_clades_mcc$age_Mya[j]
    )
  })

  # NULL entries (clade not monophyletic in this tree) are dropped by bind_rows
  map_list[[k]] <- bind_rows(local_list)

  cat(sprintf("Mapa clados: árbol %3d / %3d OK (%d matches)\n",
              i, max(trees_ok), nrow(map_list[[k]])))
}

node_map <- bind_rows(map_list)

cat(sprintf(
  "Total de matches clado-MRCA (sobre todos los árboles): %d filas\n",
  nrow(node_map)
))

# Attach the clade mapping to the posterior ASR rows, then measure how far
# each posterior reconstruction lies from the MCC reconstruction of its clade.

asr_post_mapped <- inner_join(
  asr_post,
  node_map,
  by = c("tree_index", "node_post")
)

cat(sprintf(
  "Filas en ASR_POST mapeadas a clados MCC (monofiléticos): %d\n",
  nrow(asr_post_mapped)
))

# For every mapped posterior row, the row of the MCC table with the same node
idx_mcc <- match(asr_post_mapped$node_mcc, asr_mcc$node_mcc)
stopifnot(!any(is.na(idx_mcc)))

# MCC embedding matrix (one row per node_mcc), expanded to line up with the
# mapped posterior rows
mcc_mat      <- as.matrix(asr_mcc[, embed_cols])
mcc_for_post <- mcc_mat[idx_mcc, , drop = FALSE]

# Posterior embeddings in the same column order
post_mat <- as.matrix(asr_post_mapped[, embed_cols])

# Row-wise Euclidean distance between posterior and MCC embeddings
diff_mat <- post_mat - mcc_for_post
dist_vec <- sqrt(rowSums(diff_mat^2))

asr_post_mapped <- mutate(asr_post_mapped, dist_euclid = dist_vec)

# Per-MCC-clade summary: one row per (node_mcc, clade_id, MCC age)

phylo_uncert_summary <- group_by(asr_post_mapped, node_mcc, clade_id, age_Mya)

phylo_uncert_summary <- summarise(
  phylo_uncert_summary,
  # Number of mapped rows in the group
  n_trees         = n(),
  # Distances to the MCC reconstruction in embedding space
  dist_mean       = mean(dist_euclid),
  dist_median     = median(dist_euclid),
  dist_q025       = quantile(dist_euclid, 0.025),
  dist_q975       = quantile(dist_euclid, 0.975),
  # Posterior ages of the matched MRCA
  age_post_mean   = mean(age_post_Mya),
  age_post_median = median(age_post_Mya),
  age_post_q025   = quantile(age_post_Mya, 0.025),
  age_post_q975   = quantile(age_post_Mya, 0.975),
  # Posterior age minus MCC age
  age_diff_mean   = mean(age_diff_Mya),
  age_diff_median = median(age_diff_Mya),
  age_diff_q025   = quantile(age_diff_Mya, 0.025),
  age_diff_q975   = quantile(age_diff_Mya, 0.975),
  .groups = "drop"
)

# Youngest clades first
phylo_uncert_summary <- arrange(phylo_uncert_summary, age_Mya)

OUT_PHYLO_SUMMARY <- file.path(OUT_DIR, "asr_POST_phylo_uncertainty_by_clade.csv")
readr::write_csv(phylo_uncert_summary, OUT_PHYLO_SUMMARY)

cat("OK \n")
cat(sprintf(
  "Resumen (distancias + edades) guardado en:\n- %s\n",
  OUT_PHYLO_SUMMARY
))

print(head(phylo_uncert_summary, 5))


In [None]:
# Resumen de incertidumbre filogenética por 5 bandas de edad

library(dplyr)
library(readr)

# Add interval widths, then order clades from oldest to youngest (MCC age)
df_bands <- mutate(
  phylo_uncert_summary,
  dist_ic_width = dist_q975 - dist_q025,
  age_ic_width  = age_post_q975 - age_post_q025
)
df_bands <- arrange(df_bands, desc(age_Mya))

# Split the ranked clades into n_bands consecutive bands of band_size rows
# (band_size need not be an integer; the band is ceiling(rank / band_size)).
n_bands   <- 5L
n_total   <- nrow(df_bands)
band_size <- n_total / n_bands

df_bands <- mutate(
  df_bands,
  rank     = dplyr::row_number(),
  age_band = ceiling(rank / band_size)
)

# One summary row per band
age_band_summary <- group_by(df_bands, age_band)
age_band_summary <- summarise(
  age_band_summary,
  n_nodes              = n(),
  age_MCC_min          = min(age_Mya),
  age_MCC_max          = max(age_Mya),
  dist_median_min      = min(dist_median),
  dist_median_max      = max(dist_median),
  dist_median_mean     = mean(dist_median),
  dist_IC95_width_mean = mean(dist_ic_width),
  age_post_median_min  = min(age_post_median),
  age_post_median_max  = max(age_post_median),
  age_post_median_mean = mean(age_post_median),
  age_IC95_width_mean  = mean(age_ic_width),
  .groups = "drop"
)
age_band_summary <- arrange(age_band_summary, age_band)

# Same table with every numeric column rounded to 3 decimals
age_band_summary_5 <- mutate(
  age_band_summary,
  across(where(is.numeric), function(col) round(col, 3))
)

print(age_band_summary_5)

OUT_BANDS_CSV <- file.path(OUT_DIR, "asr_POST_phylo_uncertainty_agebands_5.csv")
readr::write_csv(age_band_summary_5, OUT_BANDS_CSV)

cat("\nOK \n")
cat("Tabla resumen por 5 bandas de edad guardada en:\n - ", OUT_BANDS_CSV, "\n", sep = "")

In [None]:
# Incertidumbre filogenética vs rango dinámico de los tips
# (análogo a Tabla S7, pero usando MCC vs posterior en vez de MCC vs bootstrap)

library(readr)
library(dplyr)

# Tip embeddings
TIPS_FILE <- "../traits/traits_high-level_aligned.csv"
tips_tbl  <- readr::read_csv(TIPS_FILE, show_col_types = FALSE)

# Same embedding columns as the ASR tables (embed_cols)
trait_cols <- embed_cols

# Every embedding column must be present in the tips file
missing_cols <- setdiff(trait_cols, colnames(tips_tbl))
if (length(missing_cols) > 0) {
  stop(
    "Faltan estas columnas de rasgos en TIPS_FILE: ",
    paste(missing_cols, collapse = ", ")
  )
}

# Global dynamic range: min and max taken over all tips and all embedding
# columns at once (NAs ignored)
tips_vals  <- as.matrix(tips_tbl[, trait_cols])
tips_max   <- max(tips_vals, na.rm = TRUE)
tips_min   <- min(tips_vals, na.rm = TRUE)
tips_range <- tips_max - tips_min

cat("Rango dinámico de embeddings en tips:\n")
cat("  min   =", tips_min, "\n")
cat("  max   =", tips_max, "\n")
cat("  rango =", tips_range, "\n\n")

# "Per-dimension equivalent" uncertainty derived from phylo_uncert_summary

if (!exists("phylo_uncert_summary")) {
  stop("phylo_uncert_summary no está definido (revisa la Celda 06).")
}

# Number of embedding dimensions; distances are divided by sqrt(D) below
D <- length(trait_cols)

phylo_uncert_per_dim <- mutate(
  phylo_uncert_summary,
  ic_width             = dist_q975 - dist_q025,
  step_median_per_dim  = dist_median / sqrt(D),
  step_icwidth_per_dim = ic_width    / sqrt(D)
)
# Collapse over all clades into a single row of mean / min / max
phylo_uncert_per_dim <- summarise(
  phylo_uncert_per_dim,
  mean_step_median_per_dim = mean(step_median_per_dim,  na.rm = TRUE),
  min_step_median_per_dim  = min(step_median_per_dim,   na.rm = TRUE),
  max_step_median_per_dim  = max(step_median_per_dim,   na.rm = TRUE),
  mean_step_ic_per_dim     = mean(step_icwidth_per_dim, na.rm = TRUE),
  min_step_ic_per_dim      = min(step_icwidth_per_dim,  na.rm = TRUE),
  max_step_ic_per_dim      = max(step_icwidth_per_dim,  na.rm = TRUE)
)

print(phylo_uncert_per_dim)

# Express the means as a percentage of the tips' dynamic range

mean_med_pct <- phylo_uncert_per_dim$mean_step_median_per_dim / tips_range * 100
mean_ic_pct  <- phylo_uncert_per_dim$mean_step_ic_per_dim     / tips_range * 100

cat("\nRespecto al rango dinámico de los tips (", tips_range, "):\n", sep = "")
cat(
  "  Desplazamiento medio por dimensión de la mediana de distancias ≈ ",
  round(mean_med_pct, 2),
  "% del rango por dimensión\n",
  sep = ""
)
cat(
  "  Ancho medio por dimensión del IC95% de distancias              ≈ ",
  round(mean_ic_pct, 2),
  "% del rango por dimensión\n",
  sep = ""
)

# One-row summary table (analogous to the original Table S7)

phylo_dyn_uncert_summary <- tibble::tibble(
  tips_min              = tips_min,
  tips_max              = tips_max,
  tips_range            = tips_range,
  mean_step_med_per_dim = phylo_uncert_per_dim$mean_step_median_per_dim,
  mean_step_ic_per_dim  = phylo_uncert_per_dim$mean_step_ic_per_dim,
  step_med_pct_range    = mean_med_pct,
  step_ic_pct_range     = mean_ic_pct
)

# Publication-style column names, then round every numeric column to 3 decimals

phylo_dyn_uncert_summary_3 <- rename(
  phylo_dyn_uncert_summary,
  `tips min`        = tips_min,
  `tips max`        = tips_max,
  `tips range`      = tips_range,
  `median disp dim` = mean_step_med_per_dim,
  `IC95 disp dim`   = mean_step_ic_per_dim,
  `median disp %`   = step_med_pct_range,
  `IC95 disp %`     = step_ic_pct_range
)
phylo_dyn_uncert_summary_3 <- mutate(
  phylo_dyn_uncert_summary_3,
  across(where(is.numeric), function(col) round(col, 3))
)

OUT_DYN_PHYLO <- file.path(OUT_DIR, "phylo_uncertainty_vs_tips_range_summary.csv")
write_csv(phylo_dyn_uncert_summary_3, OUT_DYN_PHYLO)

cat("\nTabla análoga a S7 (incertidumbre filogenética vs rango de tips) guardada en:\n",
    OUT_DYN_PHYLO, "\n")

phylo_dyn_uncert_summary_3

In [None]:
# Crear tabla S8 para Material Suplementario

library(dplyr)
library(readr)

# Input: the 5-band summary written earlier; output: a trimmed copy of it
IN_S8  <- file.path(OUT_DIR, "asr_POST_phylo_uncertainty_agebands_5.csv")
OUT_S8 <- file.path(OUT_DIR, "asr_POST_phylo_uncertainty_agebands_5_limpia.csv")

s8_raw <- readr::read_csv(IN_S8, show_col_types = FALSE)

# Keep only the columns reported in the supplementary table, in this order
s8_limpia <- select(
  s8_raw,
  age_band, n_nodes,
  age_MCC_min, age_MCC_max,
  dist_median_min, dist_median_max, dist_median_mean,
  dist_IC95_width_mean,
  age_IC95_width_mean
)

readr::write_csv(s8_limpia, OUT_S8)

s8_limpia
