In [None]:
# Cargar MCC + matriz high-level

library(ape)
library(readr)

# Inputs (same files as ASR_01-models.ipynb)
TRAITS_ALIGNED <- "../traits/traits_high-level_aligned.csv"
TREE_MCC_FILE  <- "../trees/BEAST_MCC_46_ultrametric_FIXED.tre"

# Common output folder for every ASR result written by this notebook
OUT_DIR <- "../results"

# MCC tree
tree <- read.tree(TREE_MCC_FILE)
stopifnot(inherits(tree, "phylo"))

# High-level trait matrix
traits_tbl <- read_csv(TRAITS_ALIGNED, show_col_types = FALSE)
traits_df  <- as.data.frame(traits_tbl)

# The species identifiers must be in a column named exactly 'species'.
# Without this check a missing column makes the extraction return NULL, the
# row names silently fall back to 1..n, and the tree/trait comparison below
# fails with a message that points at the wrong problem.
if (!"species" %in% colnames(traits_df)) {
  stop("La matriz de rasgos no tiene una columna 'species': ", TRAITS_ALIGNED)
}

# Species become row names; `[[` avoids the partial matching that `$` does
rownames(traits_df)    <- traits_df[["species"]]
traits_df[["species"]] <- NULL

# Force every trait column to numeric. Values that cannot be parsed become NA
# (the coercion warning is muted on purpose); they are counted in `na_total`.
traits_df[] <- lapply(traits_df, function(z) suppressWarnings(as.numeric(z)))

# Tree and trait matrix must cover exactly the same set of species.
# Both directions are reported so the offending names are visible.
only_in_tree   <- setdiff(tree$tip.label, rownames(traits_df))
only_in_traits <- setdiff(rownames(traits_df), tree$tip.label)

if (length(only_in_tree) > 0 || length(only_in_traits) > 0) {
  stop(
    "El conjunto de especies en el árbol MCC no coincide",
    "\n  Solo en el árbol: ", paste(only_in_tree, collapse = ", "),
    "\n  Solo en los rasgos: ", paste(only_in_traits, collapse = ", ")
  )
}

# Reorder the rows to follow the tip order of the tree
traits_df <- traits_df[tree$tip.label, , drop = FALSE]

# Summary
n_species <- length(tree$tip.label)
n_traits  <- ncol(traits_df)
na_total  <- sum(is.na(traits_df))

cat(sprintf(
  "OK \nÁrbol MCC: %d especies\nRasgos high-level: %d columnas\nNAs totales en matriz: %d\nOUT_DIR: %s\n",
  n_species, n_traits, na_total, normalizePath(OUT_DIR, mustWork = FALSE)
))

In [None]:
# Diagnostics of the trait matrix before model fitting.
# Produces `traits_proc` (the matrix used downstream) and prints a summary.

# Parameters
DROP_SD_ZERO   <- TRUE    # drop traits whose standard deviation is exactly 0
DO_COR_SUMMARY <- TRUE    # print a summary of the pairwise correlations
COR_SAMPLE_MAX <- 5e5     # maximum number of correlations to sample
SEED_COR       <- 123

# Sanity checks on the objects created by the loading cell
stopifnot(exists("tree"), inherits(tree, "phylo"))
stopifnot(exists("traits_df"))
stopifnot(nrow(traits_df) == length(tree$tip.label))

# Missing values are not tolerated from here on
na_total <- sum(is.na(traits_df))
if (na_total > 0) {
  stop(sprintf(
    "Se detectaron %d valores NA en la matriz de rasgos.\nImputa/filtra antes de continuar con ASR.",
    na_total
  ))
}

# Constant traits: sd exactly 0 (the exact comparison is intentional).
# as.matrix() makes explicit the coercion apply() would do on a data frame.
sds     <- apply(as.matrix(traits_df), 2, sd)
idx_sd0 <- which(!is.na(sds) & sds == 0)
n_sd0   <- length(idx_sd0)

traits_proc <- traits_df
if (DROP_SD_ZERO && n_sd0 > 0) {
  traits_proc <- traits_df[, -idx_sd0, drop = FALSE]
}

# Number of traits actually removed. This is 0 when DROP_SD_ZERO is FALSE,
# whereas n_sd0 only says how many constant traits were *detected*.
n_removed <- ncol(traits_df) - ncol(traits_proc)

# Simple summary of the pairwise correlations (upper triangle only)
cor_quantiles <- NULL

if (DO_COR_SUMMARY && ncol(traits_proc) >= 2) {
  set.seed(SEED_COR)
  C    <- suppressWarnings(cor(traits_proc))
  ut   <- upper.tri(C, diag = FALSE)
  cors <- C[ut]
  rm(C, ut)  # the full correlation matrix is not needed any more

  # Subsample when there are too many pairs
  if (length(cors) > COR_SAMPLE_MAX) {
    cors <- sample(cors, COR_SAMPLE_MAX)
  }

  cor_quantiles <- quantile(
    cors,
    probs = c(0, .05, .25, .5, .75, .95, 1),
    na.rm = TRUE
  )
}

# Print

cat(
  sprintf(
    paste0(
      "OK \n",
      "Especies: %d | Rasgos (original): %d | Rasgos usados: %d | NAs: %d\n",
      "Rasgos con sd==0 removidos: %d\n"
    ),
    nrow(traits_df), ncol(traits_df), ncol(traits_proc), na_total, n_removed
  ),
  if (!is.null(cor_quantiles)) {
    paste0(
      "Correlaciones (upper-tri) cuantiles: ",
      paste(
        names(cor_quantiles),
        sprintf("%.3f", as.numeric(cor_quantiles)),
        collapse = " | "
      ),
      "\n"
    )
  } else "\n"
)

In [None]:
# Ajuste OU penalizado en el MCC (mvgls)

library(mvMORPH)

# Model configuration (chosen in the earlier screening step)
MODEL   <- "OU"
PENALTY <- "RidgeArch"
METHOD  <- "H&L"

# Response matrix in double precision, wrapped in the list that mvgls reads
Y <- as.matrix(traits_proc)
storage.mode(Y) <- "double"
dat <- list(Y = Y)

# Fit the intercept-only model on the MCC tree and time it
t0 <- Sys.time()

fit_ou <- mvgls(
  Y ~ 1,
  data = dat, tree = tree,
  model = MODEL, penalty = PENALTY, method = METHOD
)

t1 <- Sys.time()
elapsed_sec <- as.numeric(t1 - t0, units = "secs")

# Information criterion (GIC) and log-likelihood.
# When GIC() returns a list both values are read from it; otherwise only the
# first numeric value is kept and the log-likelihood is reported as NA.
gic <- GIC(fit_ou)

ic_value <- if (is.list(gic)) as.numeric(gic$GIC) else as.numeric(gic)[1]
loglik   <- if (is.list(gic)) as.numeric(gic$LogLikelihood) else NA_real_

# Regularisation parameter stored in the fit (first element of `tuning`)
ridge_tuning <- fit_ou[["tuning"]][1]

# Report
cat(sprintf(
  fmt = paste0(
    "OK — OU penalizado ajustado en el MCC.\n",
    "Especies: %d | Rasgos: %d\n",
    "Método: %s | Penalización: %s\n",
    "GIC = %.3f | LogLik = %.3f | ridge_tuning = %.6f\n",
    "Tiempo de ajuste: %.1f s\n"
  ),
  nrow(Y), ncol(Y), METHOD, PENALTY, ic_value, loglik, ridge_tuning, elapsed_sec
))

In [None]:
# ASR on the MCC tree (high-level traits) and export to CSV

# Ancestral state reconstruction at the internal nodes, timed
t0 <- Sys.time()
A_nodes <- mvMORPH::ancestral(fit_ou)  # expected: matrix [Nnode x p]
t1 <- Sys.time()
elapsed_asr <- as.numeric(difftime(t1, t0, units = "secs"))

Ntip  <- length(tree$tip.label)
Nnode <- tree$Nnode

# Dimension checks: one row per internal node and one column per fitted trait.
# The column check guards the trait names assigned below.
stopifnot(
  is.matrix(A_nodes),
  nrow(A_nodes) == Nnode,
  ncol(A_nodes) == ncol(traits_proc)
)

# Standard internal-node ids of a 'phylo' object: Ntip+1, ..., Ntip+Nnode
# NOTE(review): assumes the rows of A_nodes follow that node order -- confirm
# against the row names returned by mvMORPH::ancestral().
node_ids <- Ntip + seq_len(Nnode)

# Build the table: node id first, then one column per trait
asr_tbl <- as.data.frame(A_nodes)
colnames(asr_tbl) <- colnames(traits_proc)
asr_tbl <- cbind(node = node_ids, asr_tbl)

# write_csv() does not create missing folders, so make sure OUT_DIR exists
dir.create(OUT_DIR, recursive = TRUE, showWarnings = FALSE)

OUT_ASR_CSV <- file.path(OUT_DIR, "asr_MCC_OU_nodes_high-level.csv")
readr::write_csv(asr_tbl, OUT_ASR_CSV)

# Report
cat(sprintf(
  paste0(
    "OK — ASR en MCC completado.\n",
    "Nodos internos: %d | Rasgos: %d\n",
    "Archivo guardado:\n- %s\n",
    "Tiempo de ASR: %.1f s\n"
  ),
  Nnode, ncol(traits_proc),
  OUT_ASR_CSV,
  elapsed_asr
))

In [None]:
# Parametric bootstrap of the ASR + per-node / per-trait summaries
#
# Each replicate simulates tip data under the fitted model, refits the same
# penalised model on the simulation and reconstructs the internal nodes.
# Replicates whose refit or reconstruction fails are discarded and retried.

# Bootstrap parameters
B            <- 100      # number of valid replicates to collect
BOOT_SEED    <- 123
MAX_ATTEMPTS <- 20 * B   # cap on simulate/refit attempts (avoids an endless loop)
set.seed(BOOT_SEED)

# Output folder for the bootstrap. It is created *before* the expensive loop:
# write_csv() does not create folders, so a missing directory would otherwise
# only surface after all the refits have been computed.
boot_dir <- file.path(OUT_DIR, "asr_MCC_OU_bootstrap")
dir.create(boot_dir, recursive = TRUE, showWarnings = FALSE)
if (!dir.exists(boot_dir)) {
  stop("No se pudo crear la carpeta de salida: ", boot_dir)
}

NODE_SUMMARY_CSV  <- file.path(boot_dir, "node_uncertainty_summary.csv")
TRAIT_SUMMARY_CSV <- file.path(boot_dir, "trait_uncertainty_summary.csv")

# Basic dimensions
Ntip  <- length(tree$tip.label)
Nnode <- tree$Nnode
p     <- ncol(traits_proc)

# Storage for the reconstructed internal nodes: [node, trait, valid replicate]
boot_nodes <- array(NA_real_, dim = c(Nnode, p, B))

# Counters: accepted replicates and total attempts
b_valid   <- 0
b_attempt <- 0

# Main bootstrap loop
while (b_valid < B) {
  if (b_attempt >= MAX_ATTEMPTS) {
    stop(sprintf(
      "Bootstrap detenido: solo %d réplicas válidas de %d tras %d intentos.",
      b_valid, B, b_attempt
    ))
  }
  b_attempt <- b_attempt + 1

  # 1. Simulate tip traits under the fitted model (matrix Ntip x p)
  sim_b <- simulate(fit_ou, nsim = 1, tree = tree)
  Y_sim <- as.matrix(sim_b)
  rownames(Y_sim) <- tree$tip.label
  colnames(Y_sim) <- colnames(traits_proc)

  # 2. Refit the same model (MODEL / PENALTY / METHOD) on the simulated data
  dat_b <- list(Y = Y_sim)

  fit_b <- try(
    mvgls(
      Y ~ 1,
      data    = dat_b,
      tree    = tree,
      model   = MODEL,
      penalty = PENALTY,
      method  = METHOD
    ),
    silent = TRUE
  )

  # A failed refit does not count as a replicate; try again
  if (inherits(fit_b, "try-error") || is.null(fit_b)) {
    next
  }

  # 3. ASR at the internal nodes for this replicate
  A_b <- try(mvMORPH::ancestral(fit_b), silent = TRUE)

  # A failed reconstruction does not count either
  if (inherits(A_b, "try-error") || is.null(A_b)) {
    next
  }

  # Reject reconstructions that do not fit the storage array or that contain
  # non-finite values: the former would break the assignment below, the latter
  # would make quantile() fail once the whole bootstrap has finished.
  if (!is.matrix(A_b) || nrow(A_b) != Nnode || ncol(A_b) != p ||
      !all(is.finite(A_b))) {
    next
  }

  # Everything worked: store the replicate
  b_valid <- b_valid + 1
  boot_nodes[, , b_valid] <- A_b

  # Console progress (first valid replicate, then every 10 valid ones)
  if (b_valid == 1 || b_valid %% 10 == 0) {
    cat("Réplicas válidas:", b_valid, "de", B,
        "(intentos totales:", b_attempt, ")\n")
    flush.console()
  }
}

cat("Réplicas válidas finales:", b_valid, "de", B,
    "(intentos totales:", b_attempt, ")\n")

# SD and 95% interval per node x trait.
# At this point all B layers of boot_nodes hold valid replicates.
boot_sd    <- apply(boot_nodes, c(1, 2), sd)
boot_q025  <- apply(boot_nodes, c(1, 2), quantile, probs = 0.025)
boot_q975  <- apply(boot_nodes, c(1, 2), quantile, probs = 0.975)
boot_w95   <- boot_q975 - boot_q025   # width of the 95% interval

# Per-node summary (aggregated over traits)
node_ids <- Ntip + seq_len(Nnode)

node_summary <- data.frame(
  node       = node_ids,
  sd_mean    = apply(boot_sd, 1, mean),
  sd_median  = apply(boot_sd, 1, median),
  w95_mean   = apply(boot_w95, 1, mean),
  w95_median = apply(boot_w95, 1, median)
)

readr::write_csv(node_summary, NODE_SUMMARY_CSV)

# Per-trait summary (aggregated over nodes)
trait_names <- colnames(traits_proc)

trait_summary <- data.frame(
  trait      = trait_names,
  sd_mean    = apply(boot_sd, 2, mean),
  sd_median  = apply(boot_sd, 2, median),
  w95_mean   = apply(boot_w95, 2, mean),
  w95_median = apply(boot_w95, 2, median),
  w95_max    = apply(boot_w95, 2, max)
)

# Most unstable traits first (largest w95_mean)
trait_summary <- trait_summary[order(trait_summary$w95_mean, decreasing = TRUE), ]

readr::write_csv(trait_summary, TRAIT_SUMMARY_CSV)

# Report
cat(sprintf(
  paste0(
    "OK bootstrap ASR completado.\n",
    "Réplicas válidas: %d de %d (intentos totales: %d)\n",
    "Nodos internos: %d | Rasgos: %d\n",
    "Archivos guardados:\n",
    "- %s\n- %s\n"
  ),
  b_valid, B, b_attempt,
  Nnode, p,
  NODE_SUMMARY_CSV,
  TRAIT_SUMMARY_CSV
))

In [None]:
# Análisis del bootstrap (resúmenes por nodo y por rasgo)

library(readr)
library(dplyr)
library(glue)
library(ape)

# Folder that holds the bootstrap CSV files
boot_dir <- file.path(OUT_DIR, "asr_MCC_OU_bootstrap")

# Load the per-node and per-trait summaries
node_summary <- readr::read_csv(
  file.path(boot_dir, "node_uncertainty_summary.csv"),
  show_col_types = FALSE
)

trait_summary <- readr::read_csv(
  file.path(boot_dir, "trait_uncertainty_summary.csv"),
  show_col_types = FALSE
)

# Age (Mya) of every internal node, from the MCC tree already in memory.
# The node ids are built by position (Ntip + 1, ..., Ntip + Nnode) instead of
# being parsed from names(bt): ape uses the tree's node labels as names when
# the tree has them, and those labels are not node numbers.
bt <- branching.times(tree)
age_tbl <- tibble(
  node    = length(tree$tip.label) + seq_along(bt),
  age_Mya = as.numeric(bt)
)

node_summary_age <- node_summary %>%
  left_join(age_tbl, by = "node") %>%
  arrange(desc(age_Mya))  # root first, most recent nodes last

# Global summaries

node_stats <- node_summary_age %>%
  summarise(
    n_nodes      = n(),
    w95_mean_min = min(w95_mean, na.rm = TRUE),
    w95_mean_med = median(w95_mean, na.rm = TRUE),
    w95_mean_max = max(w95_mean, na.rm = TRUE),
    sd_mean_med  = median(sd_mean,  na.rm = TRUE)
  )

trait_stats <- trait_summary %>%
  summarise(
    n_traits       = n(),
    w95_mean_med   = median(w95_mean, na.rm = TRUE),
    w95_mean_p90   = quantile(w95_mean, 0.90, na.rm = TRUE),
    w95_mean_max   = max(w95_mean, na.rm = TRUE),
    sd_mean_med    = median(sd_mean,    na.rm = TRUE)
  )

cat("Resumen por nodo (w95_mean en unidades de embedding):\n")
print(node_stats)

cat("\nResumen por rasgo (embedding):\n")
print(trait_stats)

# Most uncertain nodes (for text / figures)

top_k_nodes <- 10L

top_nodes_by_w95 <- node_summary_age %>%
  arrange(desc(w95_mean)) %>%
  slice_head(n = top_k_nodes)

top_nodes_by_sd <- node_summary_age %>%
  arrange(desc(sd_mean)) %>%
  slice_head(n = top_k_nodes)

# Headings are built with sprintf(): glue() trims one leading and one trailing
# newline from its template, which would glue each heading to the table that
# is printed right after it.
cat(sprintf("\nTop %d nodos por w95_mean (ancho IC95 promedio):\n", top_k_nodes))
print(top_nodes_by_w95 %>%
        select(node, age_Mya, w95_mean, w95_median) %>%
        mutate(across(where(is.numeric), ~ signif(.x, 4))))

cat(sprintf("\nTop %d nodos por sd_mean (desv. estándar promedio):\n", top_k_nodes))
print(top_nodes_by_sd %>%
        select(node, age_Mya, sd_mean, sd_median) %>%
        mutate(across(where(is.numeric), ~ signif(.x, 4))))

# Save the summaries for tables / figures
readr::write_csv(
  node_summary_age,
  file.path(boot_dir, "node_uncertainty_with_age.csv")
)

readr::write_csv(
  top_nodes_by_w95,
  file.path(boot_dir, sprintf("top%d_nodes_by_w95_mean.csv", top_k_nodes))
)

readr::write_csv(
  top_nodes_by_sd,
  file.path(boot_dir, sprintf("top%d_nodes_by_sd_mean.csv", top_k_nodes))
)

# Most uncertain traits (embedding dimensions)

top_k_traits <- 30L  # for the supplementary table

top_traits_by_w95 <- trait_summary %>%
  arrange(desc(w95_mean)) %>%
  slice_head(n = top_k_traits)

cat(sprintf("\nTop %d rasgos por w95_mean (ancho IC95 promedio):\n", top_k_traits))
print(top_traits_by_w95 %>%
        select(trait, w95_mean, w95_median, w95_max) %>%
        mutate(across(where(is.numeric), ~ signif(.x, 4))))

readr::write_csv(
  top_traits_by_w95,
  file.path(boot_dir, sprintf("top%d_traits_by_w95_mean.csv", top_k_traits))
)

cat(sprintf(
  paste0(
    "\nOK análisis del bootstrap completado.\n",
    "Archivos clave guardados en %s:\n",
    "- node_uncertainty_with_age.csv\n",
    "- top%d_nodes_by_w95_mean.csv\n",
    "- top%d_nodes_by_sd_mean.csv\n",
    "- top%d_traits_by_w95_mean.csv\n"
  ),
  boot_dir, top_k_nodes, top_k_nodes, top_k_traits
))

### Preparar datos para UMAP a partir de: embeddings de puntas (46 filas) + nodos ancestrales de la corrida principal (45 filas) + nodos ancestrales bootstrap (45 nodos × B réplicas; 4 500 filas con B = 100)

In [None]:
# Export the bootstrap node reconstructions (boot_nodes) and the MCC edges

boot_dir <- file.path(OUT_DIR, "asr_MCC_OU_bootstrap")

# Dimensions are read back from the array: [node, trait, replicate]
Ntip   <- length(tree$tip.label)
Nnode  <- nrow(boot_nodes)
p      <- ncol(boot_nodes)
B      <- dim(boot_nodes)[3]

node_ids <- Ntip + seq_len(Nnode)

# Wide layout: one row per (replicate, node), one column per trait.
# Moving the replicate axis next to the node axis and refolding stacks the
# replicates one under another: rows 1..Nnode are replicate 1, and so on.
boot_mat <- matrix(
  aperm(boot_nodes, c(1, 3, 2)),
  nrow = Nnode * B,
  ncol = p
)

# Identifier columns first, then the traits under their original names
boot_df <- data.frame(
  node = rep(node_ids, times = B),
  rep  = rep(seq_len(B), each = Nnode),
  boot_mat
)
colnames(boot_df) <- c("node", "rep", colnames(traits_proc))

OUT_BOOT_CSV <- file.path(boot_dir, "boot_nodes_long.csv")
readr::write_csv(boot_df, OUT_BOOT_CSV)

cat("OK boot_nodes exportado a:\n", OUT_BOOT_CSV, "\n")

# Edge list of the MCC tree (parent, child), used to overlay the tree in Python
edges_df <- setNames(as.data.frame(tree$edge), c("parent", "child"))

OUT_EDGE_CSV <- file.path(boot_dir, "tree_MCC_edges.csv")
readr::write_csv(edges_df, OUT_EDGE_CSV)

cat("Aristas del MCC exportadas a:\n", OUT_EDGE_CSV, "\n")

### CAMBIAR A PYTHON

In [None]:
# Phylomorphospace UMAP por clado + ramas coloreadas por edad

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
RESULTS_DIR     = "../results"

TRAITS_FILE     = "../traits/traits_high-level_aligned.csv"
ASR_NODES_FILE  = f"{RESULTS_DIR}/asr_MCC_OU_nodes_high-level.csv"
TREE_EDGES_FILE = f"{RESULTS_DIR}/asr_MCC_OU_bootstrap/tree_MCC_edges.csv"
CLADOS_FILE     = f"{RESULTS_DIR}/asr_MCC_OU_FIGURA-UMAP/clados.csv"
NODE_AGE_FILE   = f"{RESULTS_DIR}/asr_MCC_OU_bootstrap/node_uncertainty_with_age.csv"

OUT_FIG         = f"{RESULTS_DIR}/asr_MCC_OU_FIGURA-UMAP/phylomorphospace_umap_tips_nodes_clados_age.png"

# Name of the clade column in clados.csv
CLADO_COL       = "clado"

# UMAP parameters
N_NEIGHBORS     = 5
MIN_DIST        = 0.8
N_COMPONENTS    = 2
METRIC          = "euclidean"
RANDOM_STATE    = 1234

# Figure style
FIGSIZE         = (10, 7)
DPI_SAVE        = 300

# Internal nodes (reconstructed states)
NODE_COLOR      = "black"
NODE_SIZE       = 300
NODE_MARKER     = "*"
NODE_EDGE_COLOR = "white"
NODE_EDGE_WIDTH = 0.8

# Tips, coloured by clade
TIP_SIZE        = 100
TIP_EDGE_WIDTH  = 0.4

# Branches, coloured by age
EDGE_WIDTH      = 3.0
SHOW_COLORBAR   = True

# Palette for branch ages (2-3 colours). Alternatives that were considered:
#   ["#b2182b", "#ef8a62", "#67a9cf"]     # red-orange-blue
#   ["#762a83", "#af8dc3", "#7fbf7b"]     # purple-lilac-green
#   ["#e66101", "#fdb863", "#5e3c99"]     # orange-yellow-purple
EDGE_CMAP_COLORS = [
    "#CCCCCC",  # recent
    "#888888",  #
    "#222222",  # old
]
EDGE_CMAP = LinearSegmentedColormap.from_list("age_cmap", EDGE_CMAP_COLORS)

# Labels on internal nodes: "none" | "id" | "age"
NODE_LABEL_MODE  = "id"
NODE_LABEL_SIZE  = 12
NODE_LABEL_DY    = 0.01   # vertical offset, as a fraction of the UMAP y-range


# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------

# Tips: one row per species, one column per trait
tips_df = pd.read_csv(TRAITS_FILE)
species = tips_df["species"].values
X_tips = tips_df.drop(columns=["species"])
trait_cols = X_tips.columns
X_tips = X_tips.values
n_tips, p = X_tips.shape

# Internal nodes: reconstructed states, restricted to the tip trait columns
nodes_df = pd.read_csv(ASR_NODES_FILE)
node_ids = nodes_df["node"].astype(int).values
X_nodes = nodes_df[trait_cols].values
n_nodes = X_nodes.shape[0]

# Clade of every tip; every species must have one
clados_df = pd.read_csv(CLADOS_FILE)
tips_clados = tips_df[["species"]].merge(clados_df, on="species", how="left")

if tips_clados[CLADO_COL].isna().any():
    missing = tips_clados[tips_clados[CLADO_COL].isna()]["species"].unique()
    raise ValueError(f"Especies sin clado definido en {CLADOS_FILE}:\n{missing}")

groups = tips_clados[CLADO_COL].values

# Ages of the internal nodes (columns used: node, age_Mya)
age_df = pd.read_csv(NODE_AGE_FILE)
node_age_map = dict(zip(age_df["node"].astype(int), age_df["age_Mya"].values))


# ---------------------------------------------------------------------------
# UMAP on tips + internal nodes together
# ---------------------------------------------------------------------------

X_base = np.vstack([X_tips, X_nodes])

reducer = UMAP(
    n_neighbors=N_NEIGHBORS,
    min_dist=MIN_DIST,
    n_components=N_COMPONENTS,
    metric=METRIC,
    random_state=RANDOM_STATE,
)

Z_base = reducer.fit_transform(X_base)
Z_tips  = Z_base[:n_tips, :]
Z_nodes = Z_base[n_tips:(n_tips + n_nodes), :]


# ---------------------------------------------------------------------------
# Tree edges and branch ages
# ---------------------------------------------------------------------------

edges_df = pd.read_csv(TREE_EDGES_FILE)
edges_df["parent"] = edges_df["parent"].astype(int)
edges_df["child"]  = edges_df["child"].astype(int)

Ntip = n_tips

# Age of every node: tips are 0, internal nodes come from the CSV
# (an internal node missing from the CSV falls back to 0.0).
node_age_all = {i: 0.0 for i in range(1, Ntip + 1)}
for nid in node_ids:
    node_age_all[nid] = float(node_age_map.get(nid, 0.0))

# UMAP coordinates keyed by tree index.
# NOTE(review): tip index i+1 is taken to be row i of TRAITS_FILE, i.e. the CSV
# is assumed to be in the tip order used by the edge list -- confirm.
tip_coords = {i + 1: (Z_tips[i, 0], Z_tips[i, 1]) for i in range(Ntip)}
node_coords = {
    node_ids[i]: (Z_nodes[i, 0], Z_nodes[i, 1])
    for i in range(n_nodes)
}

# One segment per edge, plus the age at the middle of the branch
segments = []
branch_ages = []

for parent, child in zip(edges_df["parent"], edges_df["child"]):
    x0, y0 = tip_coords[parent] if parent <= Ntip else node_coords[parent]
    x1, y1 = tip_coords[child] if child <= Ntip else node_coords[child]
    segments.append(((x0, x1), (y0, y1)))

    age_parent = node_age_all.get(parent, 0.0)
    age_child  = node_age_all.get(child, 0.0)
    branch_ages.append(0.5 * (age_parent + age_child))

branch_ages = np.array(branch_ages, dtype=float)
age_min = float(branch_ages.min())
age_max = float(branch_ages.max()) if branch_ages.max() > 0 else 1.0

norm = mcolors.Normalize(vmin=age_min, vmax=age_max)


# ---------------------------------------------------------------------------
# Clade colours (tips)
# ---------------------------------------------------------------------------

unique_clados = np.sort(pd.unique(groups))
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9; it takes the same (name, lut) arguments.
cmap_clados = plt.get_cmap("tab20", len(unique_clados))
clado_color_map = {cl: cmap_clados(i) for i, cl in enumerate(unique_clados)}


# ---------------------------------------------------------------------------
# Figure
# ---------------------------------------------------------------------------

fig, ax = plt.subplots(figsize=FIGSIZE)

# Branches coloured by age (drawn first, below everything else)
for (xs, ys), age in zip(segments, branch_ages):
    ax.plot(
        xs, ys,
        linewidth=EDGE_WIDTH,
        color=EDGE_CMAP(norm(age)),
        zorder=1
    )

# Internal nodes
ax.scatter(
    Z_nodes[:, 0],
    Z_nodes[:, 1],
    s=NODE_SIZE,
    facecolor=NODE_COLOR,
    edgecolor=NODE_EDGE_COLOR,
    linewidth=NODE_EDGE_WIDTH,
    marker=NODE_MARKER,
    zorder=3,
    label="Ancestral nodes"
)

# Tips, one scatter call per clade so that each clade gets a legend entry
for cl, color in clado_color_map.items():
    mask = (groups == cl)
    ax.scatter(
        Z_tips[mask, 0],
        Z_tips[mask, 1],
        s=TIP_SIZE,
        facecolor=color,
        edgecolor="black",
        linewidth=TIP_EDGE_WIDTH,
        zorder=4,
        label=cl
    )

# Optional labels above the internal nodes (node id or node age)
if NODE_LABEL_MODE in ("id", "age"):
    y_range = Z_base[:, 1].max() - Z_base[:, 1].min()
    dy = NODE_LABEL_DY * y_range
    for nid in node_ids:
        x, y = node_coords[nid]
        if NODE_LABEL_MODE == "id":
            txt = str(nid)
        else:  # "age"
            age = node_age_all.get(nid, np.nan)
            txt = f"{age:.1f}"
        ax.text(
            x,
            y + dy,
            txt,
            ha="center",
            va="bottom",
            fontsize=NODE_LABEL_SIZE,
            color="black",
            zorder=10,
        )

ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")


# Legend without duplicated labels (the dict keeps one handle per label)
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys(), frameon=False, loc="best")

# Colorbar for branch ages
if SHOW_COLORBAR:
    sm = cm.ScalarMappable(norm=norm, cmap=EDGE_CMAP)
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=ax, pad=0.02)
    cbar.set_label("Branch age (Mya)", fontsize=9)
    # Invert the colorbar axis:
    # bottom = high values (old), top = low values (recent)
    cbar.ax.invert_yaxis()

plt.tight_layout()
fig.savefig(OUT_FIG, dpi=DPI_SAVE, bbox_inches="tight")
plt.show()

print(f"Figura guardada en alta resolución:\n{OUT_FIG}")

In [None]:
# Árbol MCC ultramétrico con edades + clados

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
RESULTS_DIR     = "../results"

TRAITS_FILE     = "../traits/traits_high-level_aligned.csv"
TREE_EDGES_FILE = f"{RESULTS_DIR}/asr_MCC_OU_bootstrap/tree_MCC_edges.csv"
NODE_AGE_FILE   = f"{RESULTS_DIR}/asr_MCC_OU_bootstrap/node_uncertainty_with_age.csv"
CLADOS_FILE     = f"{RESULTS_DIR}/asr_MCC_OU_FIGURA-UMAP/clados.csv"

OUT_FIG_TREE    = f"{RESULTS_DIR}/asr_MCC_OU_FIGURA-UMAP/tree_MCC_clados_age.png"

# Name of the clade column
CLADO_COL       = "clado"

# Figure style
FIGSIZE         = (10, 10)
DPI_SAVE        = 300

# Tips
TIP_SIZE        = 100
TIP_EDGE_WIDTH  = 0.4
TIP_LABEL_SIZE  = 13     # species names
TIP_LABEL_FRAC  = 0.03   # fraction of the time span between tip and its label

# Internal nodes (stars + id)
NODE_MARKER     = "*"
NODE_SIZE       = 300
NODE_EDGE_COLOR = "white"
NODE_EDGE_WIDTH = 0.8
NODE_LABEL_SIZE = 13
NODE_LABEL_DX   = 0.9
NODE_LABEL_DY   = 0.2

# Branches (grey colormap: light = recent, dark = old)
EDGE_WIDTH      = 3.0
EDGE_CMAP_COLORS = [
    "#CCCCCC",  # recent
    "#888888",
    "#222222",  # old
]
EDGE_CMAP = LinearSegmentedColormap.from_list("age_cmap", EDGE_CMAP_COLORS)


# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------

traits_df = pd.read_csv(TRAITS_FILE)
species = traits_df["species"].drop_duplicates().values
n_tips = len(species)

# Clade of every species; every species must have one
clados_df = pd.read_csv(CLADOS_FILE)
sp_clados = (
    pd.DataFrame({"species": species})
    .merge(clados_df, on="species", how="left")
)
if sp_clados[CLADO_COL].isna().any():
    missing = sp_clados[sp_clados[CLADO_COL].isna()]["species"].unique()
    raise ValueError(f"Especies sin clado definido en {CLADOS_FILE}:\n{missing}")

groups = sp_clados[CLADO_COL].values

edges_df = pd.read_csv(TREE_EDGES_FILE)
edges_df["parent"] = edges_df["parent"].astype(int)
edges_df["child"]  = edges_df["child"].astype(int)

age_df = pd.read_csv(NODE_AGE_FILE)
node_age_map = dict(zip(age_df["node"].astype(int), age_df["age_Mya"].values))

# NOTE(review): tip index i+1 is taken to be the i-th species of TRAITS_FILE,
# i.e. the CSV is assumed to be in the tip order used by the edge list -- confirm.
Ntip = n_tips
all_nodes = set(edges_df["parent"]).union(set(edges_df["child"]))
Nnode_total = max(all_nodes)


# ---------------------------------------------------------------------------
# Node ages and branch ages
# ---------------------------------------------------------------------------

# Tips are age 0; internal nodes take their age from the CSV
node_age_all = {i: 0.0 for i in range(1, Ntip + 1)}
for nid, age in node_age_map.items():
    node_age_all[nid] = float(age)

# Age at the middle of every branch, used only for the branch colour
branch_ages = np.array(
    [
        0.5 * (node_age_all.get(parent, 0.0) + node_age_all.get(child, 0.0))
        for parent, child in zip(edges_df["parent"], edges_df["child"])
    ],
    dtype=float,
)

age_min = float(branch_ages.min())
age_max = float(branch_ages.max()) if branch_ages.max() > 0 else 1.0
norm = mcolors.Normalize(vmin=age_min, vmax=age_max)


# ---------------------------------------------------------------------------
# Tree structure and coordinates
# ---------------------------------------------------------------------------

# Children of every node, in the order the edges appear in the file
children = {node: [] for node in all_nodes}
for parent, child in zip(edges_df["parent"], edges_df["child"]):
    children[parent].append(child)

# The root is the only parent that is never a child
all_children = set(edges_df["child"].tolist())
all_parents  = set(edges_df["parent"].tolist())
roots = list(all_parents - all_children)
if len(roots) != 1:
    raise ValueError(f"Se esperaba una única raíz, encontradas: {roots}")
root = roots[0]

y_coords = {}
current_y = 0

def assign_y(node):
    """Assign vertical positions by depth-first traversal from `node`.

    Tips (index <= Ntip) receive consecutive integers in visiting order;
    an internal node is placed at the mean of its children's positions.
    Results are written to the module-level `y_coords` dict.
    """
    global current_y
    if node <= Ntip:
        y_coords[node] = current_y
        current_y += 1
    else:
        for child in children[node]:
            assign_y(child)
        child_y = [y_coords[ch] for ch in children[node]]
        y_coords[node] = sum(child_y) / len(child_y)

assign_y(root)

# Horizontal position = node age (nodes without a known age sit at 0)
x_coords = {node: node_age_all.get(node, 0.0) for node in all_nodes}


# ---------------------------------------------------------------------------
# Clade colours (tips)
# ---------------------------------------------------------------------------

# Unique clades as they come in the CSV
clados_raw = pd.unique(groups)

# Sort alphabetically, treating underscores as spaces
clado_sort_keys = [c.replace("_", " ") for c in clados_raw]
order = np.argsort(clado_sort_keys)

# Sorted list of clades
unique_clados = [clados_raw[i] for i in order]

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9; it takes the same (name, lut) arguments.
cmap_clados = plt.get_cmap("tab20", len(unique_clados))
clado_color_map = {cl: cmap_clados(i) for i, cl in enumerate(unique_clados)}

tip_colors = np.array([clado_color_map[c] for c in groups])

tip_index_to_species = {i + 1: species[i] for i in range(Ntip)}
tip_index_to_color   = {i + 1: tip_colors[i] for i in range(Ntip)}


# ---------------------------------------------------------------------------
# Figure
# ---------------------------------------------------------------------------

fig, ax = plt.subplots(figsize=FIGSIZE)

# Branches coloured by age: a vertical stretch at the parent's age followed by
# a horizontal stretch at the child's height.
for (edge_idx, edge) in enumerate(edges_df.itertuples(index=False)):
    parent = edge.parent
    child  = edge.child

    x_p, y_p = x_coords[parent], y_coords[parent]
    x_c, y_c = x_coords[child],  y_coords[child]

    age_mid = 0.5 * (node_age_all.get(parent, 0.0) + node_age_all.get(child, 0.0))
    color = EDGE_CMAP(norm(age_mid))

    ax.plot([x_p, x_p], [y_p, y_c], color=color, linewidth=EDGE_WIDTH, zorder=1)
    ax.plot([x_p, x_c], [y_c, y_c], color=color, linewidth=EDGE_WIDTH, zorder=1)

y_min, y_max = min(y_coords.values()), max(y_coords.values())
y_range = y_max - y_min

# Horizontal extent of the drawing. It is taken from the node ages (tips at 0,
# oldest node at `tree_depth`), not from age_min/age_max: those are branch
# *midpoint* ages, always younger than the root, so limits and ticks derived
# from them can clip the root and its stem.
tree_depth = max(x_coords.values())
if tree_depth <= 0:
    tree_depth = 1.0
x_range = tree_depth

# The x axis is inverted below, so the negative side ends up on the right
# (room for the species names) and the old side on the left.
margin_right = 0.10 * x_range
margin_left  = 0.12 * x_range
x_min_plot = -margin_right
x_max_plot = tree_depth + margin_left

dx_label = TIP_LABEL_FRAC * x_range

for tip_idx in range(1, Ntip + 1):
    x_t = x_coords[tip_idx]
    y_t = y_coords[tip_idx]
    col = tip_index_to_color[tip_idx]
    sp  = tip_index_to_species[tip_idx]
    sp = sp.replace("_", " ")

    ax.scatter(
        x_t, y_t,
        s=TIP_SIZE,
        facecolor=col,
        edgecolor="black",
        linewidth=TIP_EDGE_WIDTH,
        zorder=3,
    )

    # With the inverted axis, subtracting moves the label to the right of the
    # tip; the left edge of the text is anchored there.
    x_label = x_t - dx_label

    ax.text(
        x_label, y_t,
        sp,
        ha="left",
        va="center",
        fontsize=TIP_LABEL_SIZE,
        color="black",
        zorder=4,
    )

# Internal nodes: stars with the node id next to them
for nid in range(Ntip + 1, Nnode_total + 1):
    x_n = x_coords[nid]
    y_n = y_coords[nid]
    ax.scatter(
        x_n, y_n,
        s=NODE_SIZE,
        facecolor="black",
        edgecolor=NODE_EDGE_COLOR,
        linewidth=NODE_EDGE_WIDTH,
        marker=NODE_MARKER,
        zorder=5,
    )
    dx = NODE_LABEL_DX
    dy = NODE_LABEL_DY
    ax.text(
        x_n + dx,
        y_n + dy,
        str(nid),
        ha="center",
        va="bottom",
        fontsize=NODE_LABEL_SIZE,
        color="black",
        zorder=6,
    )

# Axes and cosmetics
ax.set_ylim(y_min - 1, y_max + 1)
ax.set_ylabel("")

# Time scale: ticks every 2 Mya up to the first even value >= tree depth
ax.set_xlabel("Time (Mya)", fontsize=16)
max_tick = int(np.ceil(tree_depth / 2.0) * 2)
ticks = np.arange(0, max_tick + 0.1, 2)
ax.set_xticks(ticks)
ax.tick_params(axis="x", labelsize=13)

ax.set_xlim(x_min_plot, x_max_plot)
ax.invert_xaxis()

# Hide the vertical axis and its ticks
ax.set_yticks([])
ax.spines["left"].set_visible(False)

# Hide the top and right frame lines
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

plt.tight_layout()
fig.savefig(OUT_FIG_TREE, dpi=DPI_SAVE, bbox_inches="tight")
plt.show()

print(f"Árbol MCC guardado en alta resolución:\n{OUT_FIG_TREE}")


In [None]:
# Incertidumbre del ASR en el espacio completo de embeddings

library(readr)
library(dplyr)
library(ggplot2)

# Folders and files
OUT_DIR    <- "../results"
boot_dir   <- file.path(OUT_DIR, "asr_MCC_OU_bootstrap")

ASR_FILE   <- file.path(OUT_DIR,  "asr_MCC_OU_nodes_high-level.csv")
BOOT_FILE  <- file.path(boot_dir, "boot_nodes_long.csv")
AGE_FILE   <- file.path(boot_dir, "node_uncertainty_with_age.csv")


# Load the point reconstruction and the bootstrap replicates

asr_tbl <- readr::read_csv(ASR_FILE, show_col_types = FALSE)
boot_df <- readr::read_csv(BOOT_FILE, show_col_types = FALSE)

# Trait columns: every ASR column except 'node'
trait_cols <- setdiff(colnames(asr_tbl), "node")

if (length(trait_cols) == 0L) {
  stop("No se identificaron columnas de rasgos en ASR_FILE (además de 'node'). Revisar el CSV.")
}

# The bootstrap file must carry the same trait columns plus node/rep;
# otherwise the join below would not produce the suffixed columns used later.
missing_boot <- setdiff(c("node", "rep", trait_cols), colnames(boot_df))
if (length(missing_boot) > 0) {
  stop("Faltan estas columnas en BOOT_FILE: ",
       paste(missing_boot, collapse = ", "))
}

# Minimal visual check of both tables
cat("ASR nodal:\n")
print(asr_tbl %>% select(node, all_of(head(trait_cols, 5))) %>% head(3))

cat("\nBootstrap (formato long):\n")
print(boot_df %>% select(node, rep, all_of(head(trait_cols, 3))) %>% head(3))


# Attach the point reconstruction of the node to every bootstrap row.
# Trait columns exist in both tables, so they get the _boot / _asr suffixes.

boot_joined <- boot_df %>%
  inner_join(asr_tbl, by = "node", suffix = c("_boot", "_asr"))

# Euclidean distance in the full trait space, computed on whole matrices
# instead of row by row.

diff_mat <- as.matrix(boot_joined[paste0(trait_cols, "_boot")]) -
  as.matrix(boot_joined[paste0(trait_cols, "_asr")])

boot_dist <- tibble(
  node = boot_joined$node,
  rep  = boot_joined$rep,
  dist = sqrt(rowSums(diff_mat^2))
)

# Save the distances
OUT_DIST_CSV <- file.path(boot_dir, "bootstrap_distances_all_nodes.csv")
readr::write_csv(boot_dist, OUT_DIST_CSV)
cat("\nCSV de distancias (todos los nodos) guardado en:\n", OUT_DIST_CSV, "\n")


# Per-node summary of the distances

dist_summary <- boot_dist %>%
  group_by(node) %>%
  summarise(
    dist_mean   = mean(dist),
    dist_median = median(dist),
    dist_q025   = quantile(dist, 0.025),
    dist_q975   = quantile(dist, 0.975),
    .groups = "drop"
  )

cat("\nResumen de distancias por nodo (primeros 10):\n")
print(head(dist_summary, 10))


# Add the node age

node_age <- readr::read_csv(AGE_FILE, show_col_types = FALSE) %>%
  select(node, age_Mya)

dist_age <- dist_summary %>%
  inner_join(node_age, by = "node")

cat("\nResumen con edad (primeros 10):\n")
print(head(dist_age, 10))

# Figure

# Aesthetic parameters
STAR_SIZE        <- 4.0          # size of the stars (drawn as text)
STAR_LABEL       <- "\u2605"     # Unicode star symbol
STAR_COLOR       <- "black"      # colour of the stars
LINE_SIZE        <- 1.0          # width of the line
RIBBON_ALPHA     <- 0.2          # transparency of the band
AXIS_TITLE_SIZE  <- 13           # font size of the axis titles
AXIS_TEXT_SIZE   <- 11           # font size of the axis labels

# Axis ranges
Y_MIN <- 0
Y_MAX <- 4

MAX_AGE <- max(dist_age$age_Mya, na.rm = TRUE)
X_MAX   <- ceiling(MAX_AGE / 5) * 5   # next multiple of 5 above the oldest node
# NOTE(review): X_MAX is computed but the x scale below uses the fixed limits
# c(38, 1); nodes outside those limits (and values above Y_MAX) are dropped by
# ggplot2 -- confirm the fixed limits are intended.

# `size` for lines was deprecated in ggplot2 3.4.0 in favour of `linewidth`;
# pick whichever the installed version understands.
line_layer <- if (utils::packageVersion("ggplot2") >= "3.4.0") {
  geom_line(linewidth = LINE_SIZE)
} else {
  geom_line(size = LINE_SIZE)
}

p_dist <- ggplot(dist_age, aes(x = age_Mya, y = dist_median)) +
  # Band (Q2.5-Q97.5) of the distance
  geom_ribbon(aes(ymin = dist_q025, ymax = dist_q975),
              alpha = RIBBON_ALPHA) +
  line_layer +
  # Stars on the nodes (geom_text with the star symbol)
  geom_text(
    label = STAR_LABEL,
    color = STAR_COLOR,
    size  = STAR_SIZE
  ) +
  scale_x_reverse(
    limits = c(38, 1),
    breaks = seq(1, 38, by = 2),
    expand = expansion(mult = c(0, 0))
  ) +
  scale_y_continuous(
    limits = c(Y_MIN, Y_MAX),
    expand = expansion(mult = c(0, 0))
  ) +
  labs(
    x = "Node age [Mya]",
    y = "Median Euclidean distance"
  ) +
  theme_bw() +
  theme(
    axis.title = element_text(size = AXIS_TITLE_SIZE),
    axis.text  = element_text(size = AXIS_TEXT_SIZE)
  )

print(p_dist)

# Save the figure
OUT_FIG <- file.path(boot_dir, "asr_uncertainty_distance_vs_age.png")
dir.create(dirname(OUT_FIG), recursive = TRUE, showWarnings = FALSE)
ggsave(OUT_FIG, plot = p_dist, width = 12, height = 4.5, dpi = 300)

cat("\nFigura guardada en:\n", OUT_FIG, "\n")

In [None]:
# Uncertainty summary over five age bands holding equal numbers of nodes

library(dplyr)
library(readr)


# ntile() ranks on descending age, so band 1 holds the oldest nodes and
# band 5 the youngest. Output columns get their display names directly in
# summarise(); the interval width (dist_q975 - dist_q025) is averaged inline.
age_band_summary_5 <- dist_age %>%
  mutate(age_band_id = ntile(desc(age_Mya), 5L)) %>%
  group_by(age_band_id) %>%
  summarise(
    `n nodes`          = n(),
    `age min`          = min(age_Mya),
    `age max`          = max(age_Mya),
    `dist median min`  = min(dist_median),
    `dist median max`  = max(dist_median),
    `dist median mean` = mean(dist_median),
    `IC95 width mean`  = mean(dist_q975 - dist_q025),
    .groups = "drop"
  ) %>%
  rename(`age band` = age_band_id) %>%
  arrange(`age band`) %>%
  # Round every numeric column to three decimals
  mutate(across(where(is.numeric), function(v) round(v, digits = 3)))

# Write the table next to the other bootstrap outputs
OUT_AGE_BANDS_5 <- file.path(
  boot_dir,
  "node_uncertainty_5bands_equalnodes_summary.csv"
)
write_csv(age_band_summary_5, OUT_AGE_BANDS_5)

age_band_summary_5

In [None]:
# Per-dimension uncertainty relative to the dynamic range of the tip values

library(readr)
library(dplyr)

# Check the objects this cell depends on before doing any file I/O
if (!exists("trait_cols")) stop("trait_cols no está definido (revisar).")
if (!exists("dist_age")) stop("dist_age no está definido (revisar).")

# Load the tip values; show_col_types = FALSE silences readr's column-spec
# message.
TIPS_FILE <- "../traits/traits_high-level_aligned.csv"
tips_tbl  <- readr::read_csv(TIPS_FILE, show_col_types = FALSE)

missing_cols <- setdiff(trait_cols, colnames(tips_tbl))
if (length(missing_cols) > 0) {
  stop("Faltan estas columnas de rasgos en TIPS_FILE: ",
       paste(missing_cols, collapse = ", "))
}

# Global dynamic range of the tip values: a single min/max taken over all
# trait columns together, not one range per column.
tips_vals <- as.matrix(tips_tbl[, trait_cols])
if (is.character(tips_vals)) {
  # A character matrix means at least one trait column was not parsed as a
  # number; the range arithmetic below would fail with an opaque error.
  stop("Las columnas de rasgos en TIPS_FILE deben ser numéricas.")
}
tips_min   <- min(tips_vals, na.rm = TRUE)
tips_max   <- max(tips_vals, na.rm = TRUE)
tips_range <- tips_max - tips_min

if (!is.finite(tips_range) || tips_range <= 0) {
  # The percentages below divide by tips_range; flag a degenerate range
  # instead of silently reporting Inf/NaN.
  warning(
    "El rango dinámico de los tips no es positivo y finito; los porcentajes no son interpretables.",
    call. = FALSE
  )
}

cat("Rango dinámico de embeddings en tips:\n")
cat("  min   =", tips_min, "\n")
cat("  max   =", tips_max, "\n")
cat("  rango =", tips_range, "\n\n")

# "Per-dimension equivalent" uncertainty derived from dist_age

D <- length(trait_cols)

# Dividing by sqrt(D) gives the per-coordinate displacement that would yield
# the same distance if it were spread equally over D dimensions (assumes the
# dist_* columns are Euclidean distances over the D trait columns -- TODO
# confirm against the cell that builds dist_summary).
uncert_per_dim <- dist_age %>%
  mutate(
    ic_width              = dist_q975 - dist_q025,
    step_median_per_dim   = dist_median / sqrt(D),
    step_icwidth_per_dim  = ic_width    / sqrt(D)
  ) %>%
  summarise(
    mean_step_median_per_dim  = mean(step_median_per_dim,  na.rm = TRUE),
    min_step_median_per_dim   = min(step_median_per_dim,   na.rm = TRUE),
    max_step_median_per_dim   = max(step_median_per_dim,   na.rm = TRUE),
    mean_step_ic_per_dim      = mean(step_icwidth_per_dim, na.rm = TRUE),
    min_step_ic_per_dim       = min(step_icwidth_per_dim,  na.rm = TRUE),
    max_step_ic_per_dim       = max(step_icwidth_per_dim,  na.rm = TRUE)
  )

print(uncert_per_dim)

# Express the means as a percentage of the global tip range

mean_med_pct <- uncert_per_dim$mean_step_median_per_dim / tips_range * 100
mean_ic_pct  <- uncert_per_dim$mean_step_ic_per_dim     / tips_range * 100

cat("\nRespecto al rango dinámico de los tips (", tips_range, "):\n", sep = "")
cat("  Desplazamiento medio por dimensión de la mediana de distancias ≈ ",
    round(mean_med_pct, 1), "% del rango por dimensión\n", sep = "")
cat("  Ancho medio por dimensión del IC95% de distancias              ≈ ",
    round(mean_ic_pct, 1), "% del rango por dimensión\n", sep = "")

# Optional: one-row summary table
dyn_uncert_summary <- tibble::tibble(
  tips_min                 = tips_min,
  tips_max                 = tips_max,
  tips_range               = tips_range,
  mean_step_med_per_dim    = uncert_per_dim$mean_step_median_per_dim,
  mean_step_ic_per_dim     = uncert_per_dim$mean_step_ic_per_dim,
  step_med_pct_range       = mean_med_pct,
  step_ic_pct_range        = mean_ic_pct
)

dyn_uncert_summary


In [None]:
# Round the summary, give its columns display names, and write it to CSV

library(dplyr)
library(readr)

# Rounding and renaming are independent, so the values are rounded first and
# the display names are applied afterwards through a new-name = old-name map.
dyn_uncert_summary_3 <- dyn_uncert_summary %>%
  mutate(across(where(is.numeric), function(v) round(v, digits = 3))) %>%
  rename(all_of(c(
    `tips min`        = "tips_min",
    `tips max`        = "tips_max",
    `tips range`      = "tips_range",
    `median disp dim` = "mean_step_med_per_dim",
    `IC95 disp dim`   = "mean_step_ic_per_dim",
    `median disp %`   = "step_med_pct_range",
    `IC95 disp %`     = "step_ic_pct_range"
  )))

# Write the table next to the other bootstrap outputs
OUT_DYN <- file.path(boot_dir, "uncertainty_vs_tips_range_summary.csv")
write_csv(dyn_uncert_summary_3, OUT_DYN)

dyn_uncert_summary_3
