In [None]:
# Cargar árbol MCC y embeddings de puntas/nodos

library(ape)
library(readr)
library(dplyr)

# Input locations (relative paths).
TREE_MCC_FILE  <- "../trees/BEAST_MCC_46_ultrametric_FIXED.tre"
TRAITS_ALIGNED <- "../traits/traits_high-level_aligned.csv"
ASR_NODES_FILE <- "../results/asr_MCC_OU_nodes_high-level.csv"

# Read the MCC tree and make sure a single "phylo" object came back.
tree <- ape::read.tree(file = TREE_MCC_FILE)
stopifnot(inherits(tree, "phylo"))

# Number of internal nodes and number of tips.
Nnode <- tree[["Nnode"]]
Ntip  <- length(tree[["tip.label"]])

cat("Ntip:", Ntip, "  Nnode:", Nnode, "\n")

# Tip embeddings: one row per species; the numeric columns are the embedding.
traits_tbl <- readr::read_csv(TRAITS_ALIGNED)

# A 'species' key column is required to match rows against the tree tips.
stopifnot("species" %in% names(traits_tbl))

# Every tip must have exactly one row. Without this guard a missing or
# duplicated species produces a matrix that no longer lines up with
# tree$tip.label, and a later whole-block assignment may recycle rows
# silently when the row count happens to divide the number of tips.
missing_tips <- setdiff(tree$tip.label, traits_tbl$species)
if (length(missing_tips) > 0) {
  stop(
    "Puntas del árbol sin fila en el CSV de rasgos: ",
    paste(missing_tips, collapse = ", "),
    call. = FALSE
  )
}

dup_species <- unique(traits_tbl$species[duplicated(traits_tbl$species)])
dup_species <- intersect(dup_species, tree$tip.label)
if (length(dup_species) > 0) {
  stop(
    "Especies duplicadas en el CSV de rasgos: ",
    paste(dup_species, collapse = ", "),
    call. = FALSE
  )
}

# Keep only the species present in the tree, ordered like tree$tip.label.
traits_tbl <- traits_tbl %>%
  filter(species %in% tree$tip.label) %>%
  arrange(match(species, tree$tip.label))

# Numeric columns only; 'species' (character) is dropped by the selector.
emb_tip <- traits_tbl %>%
  select(where(is.numeric)) %>%
  as.matrix()

rownames(emb_tip) <- traits_tbl$species

# Row i of emb_tip must describe tip i of the tree.
stopifnot(identical(rownames(emb_tip), tree$tip.label))

# Embedding dimensionality.
p <- ncol(emb_tip)
cat("Dimensión del embedding por especie (p):", p, "\n")

# Internal-node embeddings (ancestral state reconstruction) read from CSV.
asr_nodes_tbl <- readr::read_csv(ASR_NODES_FILE)

stopifnot("node" %in% names(asr_nodes_tbl))

# Sort the table itself, once. Previously only the matrix was built from a
# sorted copy while its row names were taken from the unsorted table, so an
# unsorted CSV attached the wrong node id to every row.
asr_nodes_tbl <- asr_nodes_tbl %>%
  arrange(node)

if (anyDuplicated(asr_nodes_tbl$node) > 0) {
  stop("IDs de nodo duplicados en el CSV de ASR.", call. = FALSE)
}

# Embedding columns only (everything except 'node').
emb_nodes <- asr_nodes_tbl %>%
  select(-node) %>%
  as.matrix()

# A stray non-numeric column would turn the whole matrix into character.
stopifnot(is.numeric(emb_nodes))

# Row names = numeric node id, now in the same order as the matrix rows.
# NOTE(review): ids are expected to be Ntip+1 ... Ntip+Nnode, per the
# original comment; this is not verified here.
rownames(emb_nodes) <- as.character(asr_nodes_tbl$node)

cat("Nodos en ASR:", nrow(emb_nodes), "\n")

In [None]:
# Embedding matrix for every tree vertex (tips + internal nodes).

# Guard against shape mismatches: matrix sub-assignment recycles the
# right-hand side silently when the sizes happen to divide.
stopifnot(
  nrow(emb_tip) == Ntip,
  ncol(emb_tip) == p,
  ncol(emb_nodes) == p
)

# [Ntip + Nnode] x p matrix, initialised to NA so gaps are detectable.
emb_all <- matrix(NA_real_, nrow = Ntip + Nnode, ncol = p)

# Row names = numeric vertex indices as used in tree$edge.
rownames(emb_all) <- as.character(seq_len(Ntip + Nnode))

# Tips: indices 1..Ntip, in the same order as tree$tip.label.
emb_all[as.character(seq_len(Ntip)), ] <- emb_tip

# Internal nodes: emb_nodes rows are in ascending 'node' order (it is built
# from arrange(node)), so index with the sorted ids rather than with the
# table's file order, which may differ.
node_ids <- as.character(sort(asr_nodes_tbl$node))
stopifnot(
  length(node_ids) == nrow(emb_nodes),
  all(node_ids %in% rownames(emb_all))
)
emb_all[node_ids, ] <- emb_nodes

# Every vertex must have received an embedding.
stopifnot(!any(is.na(emb_all)))
cat("Matriz emb_all completa:", dim(emb_all)[1], "×", dim(emb_all)[2], "\n")


In [None]:
# Age of every vertex (tips and internal nodes), in the tree's time units
# (Mya according to the original note).

# branching.times() returns one age (time before present) per internal node,
# ordered by node index Ntip+1 ... Ntip+Nnode. Its names are the node numbers
# only when the tree has no node labels; otherwise they are tree$node.label.
bt <- branching.times(tree)
stopifnot(length(bt) == Nnode)

# Age vector indexed by vertex id.
node_age <- numeric(Ntip + Nnode)
names(node_age) <- as.character(seq_len(Ntip + Nnode))

# Tips = 0 (present). NOTE(review): assumes an ultrametric tree.
node_age[seq_len(Ntip)] <- 0

# Internal nodes: assign by position, not by names(bt). With a labelled tree,
# name-based assignment would append new elements and leave every internal
# node at age 0.
node_age[Ntip + seq_len(Nnode)] <- unname(bt)

summary(node_age)

In [None]:
# Distancias de ramas + info temporal + correlaciones (Pearson)

library(dplyr)

# Edge table: one row per branch, with parent and child vertex ids.
edge_df <- as.data.frame(tree$edge)
names(edge_df) <- c("parent", "child")

# Character keys used to index emb_all and node_age by vertex id.
parent_key <- as.character(edge_df$parent)
child_key  <- as.character(edge_df$child)

# Parent-minus-child embedding difference for all branches at once.
# drop = FALSE keeps a matrix even when the tree has a single edge.
emb_delta <- emb_all[parent_key, , drop = FALSE] -
  emb_all[child_key, , drop = FALSE]

edge_df <- edge_df %>%
  mutate(
    # Euclidean distance in embedding space between parent and child.
    dist_embedding = unname(sqrt(rowSums(emb_delta^2))),
    # unname(): keep plain numeric columns instead of id-named vectors.
    age_parent = unname(node_age[parent_key]),
    age_child  = unname(node_age[child_key]),
    age_mid    = 0.5 * (age_parent + age_child),  # mean age of the branch
    age_length = age_parent - age_child           # branch duration
  ) %>%
  as_tibble()

cat("Resumen dist_embedding:\n")
print(summary(edge_df$dist_embedding))

# Linear fits of distance against branch duration and against mean age.
lm_len <- lm(dist_embedding ~ age_length, data = edge_df)
lm_age <- lm(dist_embedding ~ age_mid,    data = edge_df)

# Pearson correlations (r, p-value, 95% CI).
ct_len <- cor.test(edge_df$dist_embedding, edge_df$age_length, method = "pearson")
ct_age <- cor.test(edge_df$dist_embedding, edge_df$age_mid,    method = "pearson")

r_len      <- unname(ct_len$estimate)
pval_len   <- ct_len$p.value
ci_len     <- ct_len$conf.int  # length-2 vector: [lower, upper]

r_age      <- unname(ct_age$estimate)
pval_age   <- ct_age$p.value
ci_age     <- ct_age$conf.int

cat("\nCorrelación dist ~ age_length (Pearson):\n")
cat("  r =", round(r_len, 3),
    "  95% CI [", round(ci_len[1], 3), ",", round(ci_len[2], 3), "]",
    "  p =", signif(pval_len, 3), "\n")

cat("\nCorrelación dist ~ age_mid (Pearson):\n")
cat("  r =", round(r_age, 3),
    "  95% CI [", round(ci_age[1], 3), ",", round(ci_age[2], 3), "]",
    "  p =", signif(pval_age, 3), "\n")


In [None]:
# Figuras

library(ggplot2)
library(dplyr)
library(grid)

# Shared appearance settings for both scatter plots.
AXIS_TITLE_SIZE <- 14
AXIS_TEXT_SIZE  <- 12
TICK_LENGTH     <- unit(0.2, "cm")
TICK_WIDTH      <- 0.5

# Output directory (created if missing; no warning when it already exists).
BRANCH_DIR <- "../results/asr_MCC_OU_branches"
dir.create(BRANCH_DIR, recursive = TRUE, showWarnings = FALSE)

# Build the two-line annotation "r (95% CI) / p" from a correlation result.
format_corr_label <- function(r, ci, pval) {
  paste0(
    "r = ", round(r, 2),
    " (95% CI: ", round(ci[1], 2), "\u2013", round(ci[2], 2), ")",
    "\n p = ", formatC(pval, format = "e", digits = 2)
  )
}

label_len <- format_corr_label(r_len, ci_len, pval_len)
label_age <- format_corr_label(r_age, ci_age, pval_age)

# Data extents used to place the annotation text.
y_len_max <- max(edge_df$dist_embedding, na.rm = TRUE)
y_age_max <- y_len_max
x_len_max <- max(edge_df$age_length, na.rm = TRUE)
x_age_max <- max(edge_df$age_mid, na.rm = TRUE)

# Black-and-white theme with the axis text and tick settings above.
branch_theme <- theme_bw() +
  theme(
    axis.title        = element_text(size = AXIS_TITLE_SIZE),
    axis.text         = element_text(size = AXIS_TEXT_SIZE),
    axis.ticks.length = TICK_LENGTH,
    axis.ticks        = element_line(linewidth = TICK_WIDTH)
  )

# Figure 1: embedding distance vs. branch duration, with a linear fit.
p_len <- ggplot(edge_df, aes(x = age_length, y = dist_embedding)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, colour = "black") +
  annotate(
    "text",
    x = 0.95 * x_len_max, y = 0.95 * y_len_max,
    label = label_len,
    hjust = 1, vjust = 1, size = 4.5
  ) +
  labs(
    x = "Branch length [Myr]",
    y = "Euclidean distance in latent space"
  ) +
  branch_theme

print(p_len)

# Figure 2: embedding distance vs. mean branch age, with a linear fit and a
# reversed x axis.
range_age <- range(edge_df$age_mid, na.rm = TRUE)
x_age_lab <- range_age[2] - 0.2 * diff(range_age)

p_age <- ggplot(edge_df, aes(x = age_mid, y = dist_embedding)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, colour = "black") +
  scale_x_reverse(expand = expansion(mult = c(0.02, 0.02))) +
  annotate(
    "text",
    x = x_age_lab, y = 0.95 * y_age_max,
    label = label_age,
    hjust = 0, vjust = 1, size = 4.5
  ) +
  labs(
    x = "Mean branch age [Mya]",
    y = "Euclidean distance in latent space"
  ) +
  branch_theme

print(p_age)

# Export the branch table as CSV, with rounded values.

age_cols <- c("age_parent", "age_child", "age_mid", "age_length")

# Work on a copy so edge_df keeps full precision.
edge_df_export <- edge_df
edge_df_export$dist_embedding <- round(edge_df_export$dist_embedding, 3)
# Age columns are rounded to one decimal place.
edge_df_export[age_cols] <- lapply(edge_df_export[age_cols], round, digits = 1)

OUT_BRANCH_CSV <- file.path(
  BRANCH_DIR,
  "asr_MCC_OU_branch_distances_embeddings_round.csv"
)
readr::write_csv(edge_df_export, OUT_BRANCH_CSV)
cat("CSV de ramas (redondeado) guardado en:\n", OUT_BRANCH_CSV, "\n")

# Write both figures as 6 x 6 PNG files at 600 dpi.

save_branch_png <- function(path, fig) {
  ggsave(path, plot = fig, width = 6, height = 6, dpi = 600)
}

OUT_FIG_LEN <- file.path(BRANCH_DIR, "asr_MCC_OU_branch_dist_vs_time_length.png")
save_branch_png(OUT_FIG_LEN, p_len)
cat("Figura longitud de rama guardada en:\n", OUT_FIG_LEN, "\n")

OUT_FIG_AGE <- file.path(BRANCH_DIR, "asr_MCC_OU_branch_dist_vs_age_mid.png")
save_branch_png(OUT_FIG_AGE, p_age)
cat("Figura edad media de rama guardada en:\n", OUT_FIG_AGE, "\n")
