In [7]:
import pandas as pd
import networkx as nx
import os

# === CONFIG ===
structures_csv = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\code\cfos_preprocessing\allen_mouse_10um_v1.2\structures.csv"
category_csv = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\pe_grouped.csv"
input_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\mean_dff_per_region_by_mouse_brainmapper.xlsx"
output_file = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\collapsed_region_matrix.xlsx"
target_n_regions = 160

# === LOAD REGION HIERARCHY ===
region_table = pd.read_csv(structures_csv)
region_table['structure_id_path'] = region_table['structure_id_path'].astype(str)

# Build graph from structure_id_path
G = nx.DiGraph()
for _, row in region_table.iterrows():
    path = row["structure_id_path"].strip("/").split("/")
    for i in range(len(path) - 1):
        G.add_edge(path[i], path[i + 1])

# === COLLAPSE TREE TO ~130 REGIONS (STRUCTURE ONLY) ===
def collapse_tree_to_n(graph, n_regions):
    leaves = [node for node in graph.nodes if graph.out_degree(node) == 0]
    all_paths = [nx.shortest_path(graph, source="997", target=leaf) for leaf in leaves if nx.has_path(graph, "997", leaf)]
    from collections import Counter
    counts = Counter()
    for path in all_paths:
        counts.update(path)
    ranked = [node for node, _ in counts.most_common()]
    selected = set()
    for node in ranked:
        descendants = nx.descendants(graph, node)
        if not any(d in selected for d in descendants):
            selected.add(node)
        if len(selected) >= n_regions:
            break
    return list(selected)

collapsed_region_ids = collapse_tree_to_n(G, target_n_regions)


# === MAP ALL STRUCTURE IDs TO THEIR COLLAPSED TARGET REGION ===
structure_to_target = {}
for _, row in region_table.iterrows():
    sid = str(row["id"])
    path = row["structure_id_path"].strip("/").split("/")
    for node in reversed(path):
        if node in collapsed_region_ids:
            structure_to_target[sid] = node
            break

# === LOAD AND MAP DFF DATA FROM ALL MICE ===
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
matrix_data = {}

for sheet_name, df_mouse in df_by_mouse.items():
    if sheet_name.lower() == "summary" or "structure_id_path" not in df_mouse.columns:
        continue

    region_means = {}
    for _, row in df_mouse.iterrows():
        path = row["structure_id_path"].strip("/").split("/")
        leaf_id = path[-1]
        target = structure_to_target.get(leaf_id)
        if target:
            region_means.setdefault(target, []).append(row["mean_dff"])

    averaged = {rid: sum(vals) / len(vals) for rid, vals in region_means.items()}
    matrix_data[sheet_name] = averaged


# === ASSEMBLE WIDE MATRIX ===
df_matrix = pd.DataFrame(matrix_data).T  # Mice × Regions
available_regions = [rid for rid in collapsed_region_ids if rid in df_matrix.columns]
missing = set(collapsed_region_ids) - set(df_matrix.columns)
print(f"[INFO] {len(missing)} regions had no data and were excluded.")
df_matrix = df_matrix[available_regions]
df_matrix = df_matrix.T  # Regions × Mice

# === ADD REGION INFO: acronym, name, depth ===
id_to_meta = region_table.set_index("id")[["acronym", "name", "structure_id_path"]].astype(str)
df_matrix["acronym"] = df_matrix.index.map(lambda x: id_to_meta.loc[int(x), "acronym"] if int(x) in id_to_meta.index else "")
df_matrix["name"] = df_matrix.index.map(lambda x: id_to_meta.loc[int(x), "name"] if int(x) in id_to_meta.index else "")
df_matrix["depth"] = df_matrix.index.map(lambda x: len(id_to_meta.loc[int(x), "structure_id_path"].strip("/").split("/")) if int(x) in id_to_meta.index else None)
df_matrix.insert(0, "region_id", df_matrix.index)

# === ADD CATEGORY FROM pe_grouped.csv ===
category_df = pd.read_csv(category_csv)
category_df["abbrev"] = category_df["abbrev"].astype(str)
category_df["name"] = category_df["name"].astype(str)

df_matrix["category"] = df_matrix["acronym"].map(dict(zip(category_df["abbrev"], category_df["category"])))
unmatched = df_matrix["category"].isna()
df_matrix.loc[unmatched, "category"] = df_matrix.loc[unmatched, "name"].map(
    dict(zip(category_df["name"], category_df["category"]))
)

# === SAVE ===
df_matrix.to_excel(output_file, index=False)
print(f"Final matrix saved to:\n{output_file}")

[INFO] 47 regions had no data and were excluded.
Final matrix saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\collapsed_region_matrix.xlsx


In [None]:
354	Medulla	/997/8/343/1065/354/
771	Pons	/997/8/343/1065/771/
1097	Hypothalamus	/997/8/343/1129/1097/
549	Thalamus	/997/8/343/1129/549/
313	Midbrain	/997/8/343/313/
512	Cerebellum	/997/8/512/
695	Cortical plate	/997/8/567/688/695/
703	Cortical subplate	/997/8/567/688/703/
803	Pallidum	/997/8/567/623/803/
477	Striatum	/997/8/567/623/477/


In [10]:
import pandas as pd
import networkx as nx
import os

# === CONFIG ===
structures_csv = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\code\cfos_preprocessing\allen_mouse_10um_v1.2\structures.csv"
input_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\mean_dff_per_region_by_mouse_brainmapper.xlsx"
output_file = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\colapsed_dff_cleaned_210.xlsx"
target_n_regions = 210

# === LOAD REGION HIERARCHY ===
region_table = pd.read_csv(structures_csv)
region_table['structure_id_path'] = region_table['structure_id_path'].astype(str)
id_to_path = region_table.set_index("id")["structure_id_path"].astype(str).to_dict()

# Build anatomical graph
G = nx.DiGraph()
for _, row in region_table.iterrows():
    path = row["structure_id_path"].strip("/").split("/")
    for i in range(len(path) - 1):
        G.add_edge(path[i], path[i + 1])

# Collapse to 130 nodes
def collapse_tree_to_n(graph, n_regions):
    leaves = [node for node in graph.nodes if graph.out_degree(node) == 0]
    all_paths = [nx.shortest_path(graph, source="997", target=leaf) for leaf in leaves if nx.has_path(graph, "997", leaf)]
    from collections import Counter
    counts = Counter()
    for path in all_paths:
        counts.update(path)
    ranked = [node for node, _ in counts.most_common()]
    selected = set()
    for node in ranked:
        descendants = nx.descendants(graph, node)
        if not any(d in selected for d in descendants):
            selected.add(node)
        if len(selected) >= n_regions:
            break
    return list(selected)

collapsed_region_ids = collapse_tree_to_n(G, target_n_regions)

# === REMOVE REDUNDANT PARENT REGIONS ===
collapsed_with_paths = [(rid, id_to_path.get(int(rid), "")) for rid in collapsed_region_ids]

def is_subpath(p1, p2):
    return p2.startswith(p1) and p1 != p2

filtered_regions = []
paths = [p for _, p in collapsed_with_paths]
for i, (rid1, path1) in enumerate(collapsed_with_paths):
    if any(is_subpath(path1, p2) for j, p2 in enumerate(paths) if j != i):
        continue
    filtered_regions.append(rid1)

collapsed_region_ids = filtered_regions
print(f"[INFO] Reduced to {len(collapsed_region_ids)} non-overlapping regions.")

# === MAP STRUCTURE TO COLLAPSED REGIONS ===
structure_to_target = {}
for _, row in region_table.iterrows():
    sid = str(row["id"])
    path = row["structure_id_path"].strip("/").split("/")
    for node in reversed(path):
        if node in collapsed_region_ids:
            structure_to_target[sid] = node
            break

# === LOAD MOUSE DFF ===
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
matrix_data = {}

for sheet_name, df_mouse in df_by_mouse.items():
    if sheet_name.lower() == "summary" or "structure_id_path" not in df_mouse.columns:
        continue
    region_means = {}
    for _, row in df_mouse.iterrows():
        path = row["structure_id_path"].strip("/").split("/")
        leaf_id = path[-1]
        target = structure_to_target.get(leaf_id)
        if target:
            region_means.setdefault(target, []).append(row["mean_dff"])
    averaged = {rid: sum(vals) / len(vals) for rid, vals in region_means.items()}
    matrix_data[sheet_name] = averaged

# === FINAL MATRIX ===
df_matrix = pd.DataFrame(matrix_data).T
available_regions = [rid for rid in collapsed_region_ids if rid in df_matrix.columns]
missing = set(collapsed_region_ids) - set(df_matrix.columns)
print(f"[INFO] {len(missing)} collapsed regions had no data.")

df_matrix = df_matrix[available_regions].T
df_matrix["region_id"] = df_matrix.index.astype(int)

# Add structure metadata
id_meta = region_table.set_index("id")[["acronym", "name", "structure_id_path"]].astype(str)
df_matrix["acronym"] = df_matrix["region_id"].map(lambda x: id_meta.loc[x, "acronym"] if x in id_meta.index else "")
df_matrix["name"] = df_matrix["region_id"].map(lambda x: id_meta.loc[x, "name"] if x in id_meta.index else "")
df_matrix["structure_id_path"] = df_matrix["region_id"].map(lambda x: id_meta.loc[x, "structure_id_path"] if x in id_meta.index else "")
df_matrix["depth"] = df_matrix["structure_id_path"].map(lambda p: len(p.strip("/").split("/")))

# Add anatomical category based on path prefix
category_map = {
    "Medulla": "/997/8/343/1065/354/",
    "Pons": "/997/8/343/1065/771/",
    "Hypothalamus": "/997/8/343/1129/1097/",
    "Thalamus": "/997/8/343/1129/549/",
    "Midbrain": "/997/8/343/313/",
    "Cerebellum": "/997/8/512/",
    "Cortical plate": "/997/8/567/688/695/",
    "Cortical subplate": "/997/8/567/688/703/",
    "Pallidum": "/997/8/567/623/803/",
    "Striatum": "/997/8/567/623/477/"
}

def assign_category(path):
    for cat, prefix in category_map.items():
        if path.startswith(prefix):
            return cat
    return "Other"

df_matrix["category"] = df_matrix["structure_id_path"].map(assign_category)

# Reorder columns
meta_cols = ["region_id", "acronym", "name", "structure_id_path", "depth", "category"]
df_matrix = df_matrix[meta_cols + [col for col in df_matrix.columns if col not in meta_cols]]

# === SAVE ===
df_matrix.to_excel(output_file, index=False)
print(f" Final matrix saved to:\n{output_file}")

[INFO] Reduced to 134 non-overlapping regions.
[INFO] 18 collapsed regions had no data.
 Final matrix saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\colapsed_dff_cleaned_210.xlsx


In [12]:
import pandas as pd

# === CONFIG ===
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\code\cfos_preprocessing\allen_mouse_10um_v1.2\structures.csv"
output_csv = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\child_to_collapsed_map.csv"

# === Load region hierarchy ===
region_table = pd.read_csv(structures_path)
region_table["structure_id_path"] = region_table["structure_id_path"].astype(str)

id_to_path = region_table.set_index("id")["structure_id_path"].astype(str).to_dict()
id_to_name = region_table.set_index("id")["name"].astype(str).to_dict()

# === Define the 10 collapsed categories (based on root IDs) ===
collapsed_region_ids = [
    354,   # Medulla
    771,   # Pons
    1097,  # Hypothalamus
    549,   # Thalamus
    313,   # Midbrain
    512,   # Cerebellum
    695,   # Cortical plate
    703,   # Cortical subplate
    803,   # Pallidum
    477    # Striatum
]
collapsed_region_ids = list(map(str, collapsed_region_ids))

# === Build structure_id_path mapping ===
collapsed_paths = {cid: id_to_path[int(cid)] for cid in collapsed_region_ids}

# === Map each region to its collapsed parent ===
mapping_rows = []
for sid, path in id_to_path.items():
    assigned_parent = None
    for parent_id, parent_path in collapsed_paths.items():
        parent_path_clean = parent_path.strip("/")
        if path.startswith(parent_path_clean + "/") or path == parent_path_clean:
            assigned_parent = parent_id
            break

    if assigned_parent:
        mapping_rows.append({
            "child_region_id": sid,
            "child_region_name": id_to_name.get(int(sid), ""),
            "child_structure_id_path": path,
            "collapsed_region_id": assigned_parent,
            "collapsed_region_name": id_to_name.get(int(assigned_parent), "")
        })

df_map = pd.DataFrame(mapping_rows)

# === Save or display ===
print(f"Mapped {len(df_map)} regions to collapsed parents.")
print(df_map.head(10))

df_map.to_csv(output_csv, index=False)
print(f"Saved to: {output_csv}")


Mapped 0 regions to collapsed parents.
Empty DataFrame
Columns: []
Index: []
Saved to: Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\child_to_collapsed_map.csv


In [15]:
import pandas as pd
import networkx as nx
import os
from collections import defaultdict

# === CONFIG ===
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\code\cfos_preprocessing\allen_mouse_10um_v1.2\structures.csv"
input_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\mean_dff_per_region_by_mouse_brainmapper.xlsx"
output_matrix = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\final_collapsed_matrix.xlsx"
output_log = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\dff_collapsing_log.csv"
TARGET_N = 160

# === Load hierarchy ===
region_table = pd.read_csv(structures_path)
region_table["structure_id_path"] = region_table["structure_id_path"].astype(str)
id_to_path = region_table.set_index("id")["structure_id_path"].astype(str).to_dict()
id_to_name = region_table.set_index("id")["name"].astype(str).to_dict()

# === Build graph from structure_id_path ===
G = nx.DiGraph()
for _, row in region_table.iterrows():
    path = row["structure_id_path"].strip("/").split("/")
    for i in range(len(path) - 1):
        G.add_edge(path[i], path[i + 1])

# === Collapse function ===
def collapse_tree_to_n(graph, n):
    leaves = [n for n in graph.nodes if graph.out_degree(n) == 0]
    all_paths = [nx.shortest_path(graph, source="997", target=leaf)
                 for leaf in leaves if nx.has_path(graph, "997", leaf)]
    from collections import Counter
    counts = Counter()
    for path in all_paths:
        counts.update(path)
    ranked = [node for node, _ in counts.most_common()]
    selected = set()
    for node in ranked:
        descendants = nx.descendants(graph, node)
        if not any(d in selected for d in descendants):
            selected.add(node)
        if len(selected) >= n:
            break
    return list(selected)

collapsed_region_ids = collapse_tree_to_n(G, TARGET_N)

# === Map child → parent ===
structure_to_collapsed = {}
for sid, path in id_to_path.items():
    parts = path.strip("/").split("/")
    for node in reversed(parts):
        if node in collapsed_region_ids:
            structure_to_collapsed[str(sid)] = node
            break

# === Load per-mouse DFF ===
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
matrix_data = defaultdict(dict)
log_rows = []

for sheet_name, df_mouse in df_by_mouse.items():
    if sheet_name.lower() == "summary" or "structure_id_path" not in df_mouse.columns:
        continue
    mouse_id = sheet_name
    temp_assignments = defaultdict(list)

    for _, row in df_mouse.iterrows():
        path = row["structure_id_path"].strip("/").split("/")
        region_id = path[-1]
        collapsed_id = structure_to_collapsed.get(region_id)
        if collapsed_id:
            temp_assignments[collapsed_id].append((region_id, row["mean_dff"]))

            # Logging for traceability
            log_rows.append({
                "mouse": mouse_id,
                "collapsed_region_id": collapsed_id,
                "collapsed_region_name": id_to_name.get(int(collapsed_id), ""),
                "child_region_id": region_id,
                "child_region_name": id_to_name.get(int(region_id), ""),
                "child_structure_id_path": row["structure_id_path"],
                "mean_dff_contribution": row["mean_dff"]
            })

    for cid, values in temp_assignments.items():
        mean_val = sum(v for _, v in values) / len(values)
        matrix_data[mouse_id][cid] = mean_val

# === Build matrix
df_matrix = pd.DataFrame(matrix_data).T
df_matrix.index.name = "mouse"
df_matrix.columns.name = "collapsed_region_id"
df_matrix = df_matrix.T
df_matrix["region_id"] = df_matrix.index.astype(int)
df_matrix["name"] = df_matrix["region_id"].map(lambda x: id_to_name.get(x, ""))
df_matrix["structure_id_path"] = df_matrix["region_id"].map(lambda x: id_to_path.get(x, ""))
df_matrix["depth"] = df_matrix["structure_id_path"].map(lambda p: len(p.strip("/").split("/")))

# Move metadata columns to front
meta_cols = ["region_id", "name", "structure_id_path", "depth"]
df_matrix = df_matrix[meta_cols + [col for col in df_matrix.columns if col not in meta_cols]]

# === Save
df_matrix.to_excel(output_matrix, index=False)
print(f"Collapsed matrix saved to:\n{output_matrix}")

log_df = pd.DataFrame(log_rows)
log_df.to_csv(output_log, index=False)
print(f"Collapse log with ΔF/F contributions saved to:\n{output_log}")


Collapsed matrix saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\final_collapsed_matrix.xlsx
Collapse log with ΔF/F contributions saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\dff_collapsing_log.csv


In [None]:
import pandas as pd
import networkx as nx
import os
from collections import defaultdict
import numpy as np

# === CONFIG ===
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\\structures.csv"
input_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\per_mouse_sheets.xlsx"
output_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsed_matrix.xlsx"
output_log = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsing_log.csv"
TARGET_N = 160

# === Load Allen structure data ===
region_table = pd.read_csv(structures_path)
region_table["id"] = region_table["id"].astype(str)
region_table["structure_id_path"] = region_table["structure_id_path"].astype(str)

id_to_acronym = region_table.set_index("id")["acronym"].astype(str).to_dict()
id_to_path    = region_table.set_index("id")["structure_id_path"].astype(str).to_dict()
id_to_name    = region_table.set_index("id")["name"].astype(str).to_dict()

# === Build anatomical hierarchy graph (string node IDs) ===
G = nx.DiGraph()
for _, row in region_table.iterrows():
    parts = [p for p in row["structure_id_path"].strip("/").split("/") if p]
    for i in range(len(parts) - 1):
        G.add_edge(parts[i], parts[i + 1])


# === Collapse region selection ===
def collapse_tree_to_n(graph, n):
    leaves = [n for n in graph.nodes if graph.out_degree(n) == 0]
    all_paths = [nx.shortest_path(graph, source="997", target=leaf)
                 for leaf in leaves if nx.has_path(graph, "997", leaf)]
    from collections import Counter
    counts = Counter()
    for path in all_paths:
        counts.update(path)
    ranked = [node for node, _ in counts.most_common()]
    selected = set()
    for node in ranked:
        descendants = nx.descendants(graph, node)
        if not any(d in selected for d in descendants):
            selected.add(node)
        if len(selected) >= n:
            break
    return list(selected)

collapsed_region_ids = collapse_tree_to_n(G, TARGET_N)

# === Map child → collapsed parent ===
structure_to_collapsed = {}
for sid, path in id_to_path.items():
    parts = path.strip("/").split("/")
    for node in reversed(parts):
        if node in collapsed_region_ids:
            structure_to_collapsed[str(sid)] = node
            break

# === Process all mice ===
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
mean_data = defaultdict(dict)
std_data = defaultdict(dict)
sem_data = defaultdict(dict)
log_rows = []

for sheet_name, df_mouse in df_by_mouse.items():
    if sheet_name.lower() == "summary" or "structure_id_path" not in df_mouse.columns:
        continue
    mouse_id = sheet_name
    temp_assignments = defaultdict(list)

    for _, row in df_mouse.iterrows():
        path = row["structure_id_path"].strip("/").split("/")
        region_id = path[-1]
        collapsed_id = structure_to_collapsed.get(region_id)
        if collapsed_id:
            temp_assignments[collapsed_id].append(row["mean_dff"])
            log_rows.append({
                "mouse": mouse_id,
                "collapsed_region_id": collapsed_id,
                "collapsed_region_name": id_to_name.get(int(collapsed_id), ""),
                "child_region_id": region_id,
                "child_region_name": id_to_name.get(int(region_id), ""),
                "child_structure_id_path": row["structure_id_path"],
                "mean_dff_contribution": row["mean_dff"]
            })

    for cid, values in temp_assignments.items():
        arr = np.array(values, dtype=float)
        mean_data[mouse_id][cid] = np.mean(arr)
        std_data[mouse_id][cid] = np.std(arr, ddof=1) if len(arr) > 1 else 0.0
        sem_data[mouse_id][cid] = std_data[mouse_id][cid] / np.sqrt(len(arr)) if len(arr) > 1 else 0.0

# === Format outputs
def build_df(data_dict, id_to_name, id_to_path, id_to_acronym):
    df = pd.DataFrame(data_dict).T
    df.index.name = "mouse"
    df.columns.name = "collapsed_region_id"
    df = df.T
    df["region_id"] = df.index.astype(int)
    df["name"] = df["region_id"].map(lambda x: id_to_name.get(x, ""))
    df["acronym"] = df["region_id"].map(lambda x: id_to_acronym.get(x, ""))
    df["structure_id_path"] = df["region_id"].map(lambda x: id_to_path.get(x, ""))
    df["depth"] = df["structure_id_path"].map(lambda p: len(p.strip("/").split("/")))
    meta_cols = ["region_id", "name", "acronym", "structure_id_path", "depth"]
    return df[meta_cols + [col for col in df.columns if col not in meta_cols]]

df_mean = build_df(mean_data, id_to_name, id_to_path, id_to_acronym)
df_std = build_df(std_data, id_to_name, id_to_path, id_to_acronym)
df_sem = build_df(sem_data, id_to_name, id_to_path, id_to_acronym)

# === Export all
with pd.ExcelWriter(output_excel) as writer:
    df_mean.to_excel(writer, sheet_name="mean_dff", index=False)
    df_std.to_excel(writer, sheet_name="std_dff", index=False)
    df_sem.to_excel(writer, sheet_name="sem_dff", index=False)
print(f"Matrix with mean, std, sem saved to:\n{output_excel}")

# Save log
pd.DataFrame(log_rows).to_csv(output_log, index=False)
print(f"Detailed ΔF/F log saved to:\n{output_log}")


Matrix with mean, std, sem saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\final_collapsed_matrix_2.xlsx
Detailed ΔF/F log saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\dff_collapsing_log_2.csv


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import networkx as nx
from collections import defaultdict, Counter

# === CONFIG ===
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\structures.csv"
input_excel     = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\per_mouse_sheets.xlsx"
output_excel    = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsed_matrix.xlsx"
output_log      = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsing_log.csv"
out_dir = Path(r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data")
out_dir.mkdir(parents=True, exist_ok=True)
sel_csv  = out_dir / "collapsed_selection.csv"
map_csv  = out_dir / "child_to_collapsed_map.csv"
unmap_csv= out_dir / "unmapped_regions.csv"
TARGET_N        = 160
ROOT_ID         = "997"   # Allen root (mouse)

# === Load Allen structure data ===
region_table = pd.read_csv(structures_path)
# --- after loading region_table ---
region_table = region_table.copy()
region_table["id"] = region_table["id"].astype(str)
region_table["structure_id_path"] = region_table["structure_id_path"].astype(str)

# maps with STRING KEYS
id_to_name = region_table.set_index("id")["name"].astype(str).to_dict()
id_to_path = region_table.set_index("id")["structure_id_path"].astype(str).to_dict()

# build graph with STRING node ids
G = nx.DiGraph()
for _, row in region_table.iterrows():
    parts = [p for p in row["structure_id_path"].strip("/").split("/") if p]
    for i in range(len(parts) - 1):
        G.add_edge(parts[i], parts[i+1])

def node_depth(v: str) -> int:
    """Depth from path (edges from root)."""
    p = id_to_path.get(v, "")
    return max(len(p.strip("/").split("/")) - 1, 0)

def collapse_tree_to_n_antichain(graph: nx.DiGraph, n: int, root_id: str = ROOT_ID) -> list[str]:
    """Pick n nodes with no ancestor/descendant conflicts (antichain)."""
    # rank by root->leaf path frequency
    leaves = [v for v in graph.nodes if graph.out_degree(v) == 0 and nx.has_path(graph, root_id, v)]
    counts = Counter()
    for leaf in leaves:
        for v in nx.shortest_path(graph, root_id, leaf):
            counts[v] += 1

    # prefer deeper nodes when ties (avoids huge parents like 549)
    ranked = sorted(counts.keys(),
                    key=lambda v: (counts[v], node_depth(v)),
                    reverse=True)

    selected: list[str] = []
    selset: set[str] = set()
    for v in ranked:
        if v == root_id:
            continue
        # reject if ANY ancestor or descendant is already selected
        if any((v in nx.descendants(graph, s)) or (s in nx.descendants(graph, v)) for s in selset):
            continue
        selset.add(v)
        selected.append(v)
        if len(selected) >= n:
            break
    return selected

def prune_to_antichain(selected_ids: list[str], graph: nx.DiGraph) -> list[str]:
    """Hard prune until no ancestor/descendant pairs remain (keep the deeper node)."""
    sel = set(map(str, selected_ids))
    changed = True
    while changed:
        changed = False
        to_drop = set()
        for u in list(sel):
            for v in list(sel):
                if u == v:
                    continue
                # u ancestor of v?
                if v in nx.descendants(graph, u):
                    # drop the shallower one (smaller depth)
                    drop = u if node_depth(u) <= node_depth(v) else v
                    to_drop.add(drop)
        if to_drop:
            sel -= to_drop
            changed = True
    return list(sel)

def verify_antichain(selected_ids: list[str], graph: nx.DiGraph) -> list[tuple[str,str]]:
    """Return list of (ancestor, descendant) conflicts; empty means OK."""
    S = set(map(str, selected_ids))
    viol = []
    for u in S:
        for v in S:
            if u != v and v in nx.descendants(graph, u):
                viol.append((u, v))
    return viol

collapsed_region_ids = collapse_tree_to_n_antichain(G, TARGET_N, root_id=ROOT_ID)
collapsed_region_ids = prune_to_antichain(collapsed_region_ids, G)
conflicts = verify_antichain(collapsed_region_ids, G)
assert not conflicts, f"Still have ancestor/descendant pairs: {conflicts[:5]}"

# preview
preview = [(cid, id_to_name.get(cid, ""), id_to_path.get(cid, "")) for cid in collapsed_region_ids[:10]]
for cid, nm, path in preview:
    print(cid, nm, path)

def node_depth(sid: str) -> int:
    p = id_to_path.get(sid, "")
    return max(len(str(p).strip("/").split("/")) - 1, 0)

# --- 1) verify antichain (no ancestor/descendant pairs) ---
conflicts = []
S = set(map(str, collapsed_region_ids))
for u in S:
    # if any selected v is a descendant of u -> conflict
    bad = S.intersection(nx.descendants(G, u))
    if bad:
        conflicts.append((u, list(bad)[:3]))  # keep a few for print
assert not conflicts, f"Antichain violation(s): {conflicts[:5]}"

# --- 2) collapsed_selection.csv ---
rows = []
for cid in collapsed_region_ids:
    sid = str(cid)
    nm  = id_to_name.get(sid, "")
    acr = id_to_acronym.get(sid, "")
    pth = id_to_path.get(sid, "")
    d   = node_depth(sid)
    # region_id as int if possible
    try:
        rid = int(sid)
    except ValueError:
        rid = sid
    rows.append({
        "region_id": rid,
        "acronym": acr,
        "name": nm,
        "structure_id_path": pth,
        "depth": d
    })
collapsed_df = pd.DataFrame(rows).sort_values(["depth","region_id"])
collapsed_df.to_csv(sel_csv, index=False)

# --- 3) child_to_collapsed_map.csv + unmapped_regions.csv ---
selected_set = set(map(str, collapsed_region_ids))
map_rows = []
unmapped = []
for sid, pth in id_to_path.items():       # sid is already str (we coerced earlier)
    parts = [p for p in str(pth).strip("/").split("/") if p]
    mapped = next((p for p in reversed(parts) if p in selected_set), None)
    if mapped is None:
        unmapped.append({
            "child_region_id": int(sid) if sid.isdigit() else sid,
            "child_name": id_to_name.get(sid, ""),
            "child_acronym": id_to_acronym.get(sid, ""),
            "child_structure_id_path": pth
        })
        continue
    map_rows.append({
        "child_region_id": int(sid) if sid.isdigit() else sid,
        "child_name": id_to_name.get(sid, ""),
        "child_acronym": id_to_acronym.get(sid, ""),
        "child_structure_id_path": pth,
        "collapsed_region_id": int(mapped) if mapped.isdigit() else mapped,
        "collapsed_name": id_to_name.get(mapped, ""),
        "collapsed_acronym": id_to_acronym.get(mapped, ""),
        "collapsed_structure_id_path": id_to_path.get(mapped, ""),
        "collapsed_depth": node_depth(mapped)
    })

pd.DataFrame(map_rows).sort_values(["collapsed_depth","collapsed_region_id","child_region_id"]).to_csv(map_csv, index=False)
pd.DataFrame(unmapped).to_csv(unmap_csv, index=False)

print(f"[OK] Collapsed selection: {len(collapsed_region_ids)} regions → {sel_csv}")
print(f"[OK] Child→collapsed map: {len(map_rows)} rows → {map_csv}")
if unmapped:
    print(f"[WARN] Unmapped regions: {len(unmapped)} → {unmap_csv}")
else:
    print("[OK] All regions mapped to a collapsed parent.")

# === Map child → nearest collapsed parent (by walking up the id_path) ===
structure_to_collapsed: dict[str, str] = {}
for sid, path in id_to_path.items():
    parts = path.strip("/").split("/")
    for node in reversed(parts):
        if node in collapsed_region_ids:
            structure_to_collapsed[str(sid)] = node
            break

# === Process all mice (each sheet = one mouse), keep L/R separate ===
# Expect per-mouse sheets with metadata + optional columns "L" and "R"
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)

mean_L: dict[str, dict[str, float]] = defaultdict(dict)  # mouse -> {collapsed_id: mean}
std_L:  dict[str, dict[str, float]] = defaultdict(dict)
sem_L:  dict[str, dict[str, float]] = defaultdict(dict)

mean_R: dict[str, dict[str, float]] = defaultdict(dict)
std_R:  dict[str, dict[str, float]] = defaultdict(dict)
sem_R:  dict[str, dict[str, float]] = defaultdict(dict)

log_rows = []

for sheet_name, df_mouse in df_by_mouse.items():
    if str(sheet_name).lower() == "summary":
        continue

    # We need structure_id_path. If missing but region_id present, reconstruct from structures.csv
    if "structure_id_path" not in df_mouse.columns:
        if "region_id" in df_mouse.columns:
            df_mouse["structure_id_path"] = (
                df_mouse["region_id"]
                .map(lambda x: id_to_path.get(int(x), ""))
                .astype(str)
            )
        else:
            # If there's only structure_name, you can’t reliably map to ID path — skip this sheet
            print(f"[WARN] Sheet {sheet_name}: no structure_id_path/region_id; skipping.")
            continue

    # Detect hemisphere columns. Our earlier exporter named them 'L'/'R'.
    has_L = "L" in df_mouse.columns
    has_R = "R" in df_mouse.columns
    if not (has_L or has_R):
        # Fall back: look for cells_per_mm3_L / cells_per_mm3_R
        for candidate in df_mouse.columns:
            if candidate.endswith("_L"):
                df_mouse["L"] = df_mouse[candidate]
                has_L = True
            if candidate.endswith("_R"):
                df_mouse["R"] = df_mouse[candidate]
                has_R = True

    if not (has_L or has_R):
        print(f"[WARN] Sheet {sheet_name}: no L/R columns; skipping.")
        continue

    mouse_id = str(sheet_name)

    # temp assignments per hemisphere: collapsed_id -> list of child values
    assign_L: dict[str, list[float]] = defaultdict(list)
    assign_R: dict[str, list[float]] = defaultdict(list)

    for _, row in df_mouse.iterrows():
        path = str(row["structure_id_path"]).strip("/")
        if not path:
            continue
        region_id = path.split("/")[-1]
        collapsed_id = structure_to_collapsed.get(region_id)
        if not collapsed_id:
            continue

        if has_L:
            vL = row["L"]
            if pd.notna(vL):
                assign_L[collapsed_id].append(float(vL))
                log_rows.append({
                    "mouse": mouse_id,
                    "hemisphere": "L",
                    "collapsed_region_id": collapsed_id,
                    "collapsed_region_name": id_to_name.get(int(collapsed_id), ""),
                    "child_region_id": region_id,
                    "child_region_name": id_to_name.get(int(region_id), ""),
                    "child_structure_id_path": row["structure_id_path"],
                    "cells_per_mm3_contribution": float(vL),
                })
        if has_R:
            vR = row["R"]
            if pd.notna(vR):
                assign_R[collapsed_id].append(float(vR))
                log_rows.append({
                    "mouse": mouse_id,
                    "hemisphere": "R",
                    "collapsed_region_id": collapsed_id,
                    "collapsed_region_name": id_to_name.get(int(collapsed_id), ""),
                    "child_region_id": region_id,
                    "child_region_name": id_to_name.get(int(region_id), ""),
                    "child_structure_id_path": row["structure_id_path"],
                    "cells_per_mm3_contribution": float(vR),
                })

    # reduce to mean/std/sem per collapsed region
    for cid, vals in assign_L.items():
        arr = np.asarray(vals, dtype=float)
        m = float(np.mean(arr)) if arr.size else np.nan
        s = float(np.std(arr, ddof=1)) if arr.size > 1 else 0.0
        e = float(s / np.sqrt(arr.size)) if arr.size > 1 else 0.0
        mean_L[mouse_id][cid] = m
        std_L[mouse_id][cid]  = s
        sem_L[mouse_id][cid]  = e

    for cid, vals in assign_R.items():
        arr = np.asarray(vals, dtype=float)
        m = float(np.mean(arr)) if arr.size else np.nan
        s = float(np.std(arr, ddof=1)) if arr.size > 1 else 0.0
        e = float(s / np.sqrt(arr.size)) if arr.size > 1 else 0.0
        mean_R[mouse_id][cid] = m
        std_R[mouse_id][cid]  = s
        sem_R[mouse_id][cid]  = e

# === Build wide dataframes with meta: columns = <mouse>_L / <mouse>_R ===
def build_df_hemis(mean_L, mean_R, label: str):
    """
    mean_L/mean_R: dict mouse -> {collapsed_id: value}
    label: thing we're summarizing ('cells_per_mm3')
    """
    cols_dict = {}
    all_mice = sorted(set(mean_L.keys()) | set(mean_R.keys()))
    for m in all_mice:
        if m in mean_L and mean_L[m]:
            cols_dict[f"{m}_L"] = mean_L[m]
        if m in mean_R and mean_R[m]:
            cols_dict[f"{m}_R"] = mean_R[m]

    # rows are collapsed_region_id (as index)
    df = pd.DataFrame(cols_dict)
    df.index.name = "collapsed_region_id"

    # attach metadata
    df["region_id"] = df.index.astype(int)
    df["name"] = df["region_id"].map(lambda x: id_to_name.get(x, ""))
    df["acronym"] = df["region_id"].map(lambda x: id_to_acronym.get(x, ""))
    df["structure_id_path"] = df["region_id"].map(lambda x: id_to_path.get(x, ""))
    df["depth"] = df["structure_id_path"].map(lambda p: len(str(p).strip("/").split("/")))

    meta_cols = ["region_id", "name", "acronym", "structure_id_path", "depth"]
    ordered = meta_cols + [c for c in df.columns if c not in meta_cols]
    return df[ordered].reset_index(drop=True)

df_mean = build_df_hemis(mean_L, mean_R, label="cells_per_mm3")
df_std  = build_df_hemis(std_L,  std_R,  label="cells_per_mm3")
df_sem  = build_df_hemis(sem_L,  sem_R,  label="cells_per_mm3")

# === Export all ===
with pd.ExcelWriter(output_excel) as writer:
    df_mean.to_excel(writer, sheet_name="mean_cells_per_mm3", index=False)
    df_std.to_excel(writer,  sheet_name="std_cells_per_mm3",  index=False)
    df_sem.to_excel(writer,  sheet_name="sem_cells_per_mm3",  index=False)

print(f"Collapsed L/R mean/std/sem saved to:\n{output_excel}")

# Save log
pd.DataFrame(log_rows).to_csv(output_log, index=False)
print(f"Detailed density collapsing log saved to:\n{output_log}")


Collapsed L/R mean/std/sem saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsed_matrix.xlsx
Detailed density collapsing log saved to:
Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsing_log.csv


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict, Counter
from pathlib import Path

# ===================== CONFIG =====================
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\structures.csv"
input_excel     = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\per_mouse_sheets.xlsx"
output_excel    = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsed_matrix.xlsx"
output_log      = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\den_collapsing_log.csv"
audit_dir       = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data"

TARGET_N        = 160
ROOT_ID         = "997"   # Allen mouse root
BAN_IDS         = {"8"}   # ban shallow hub: "Basic cell groups and regions"

# Category roots: nothing ABOVE these IDs will ever be selected
category_map = {
    "Medulla": "/997/8/343/1065/354/",
    "Pons": "/997/8/343/1065/771/",
    "Hypothalamus": "/997/8/343/1129/1097/",
    "Thalamus": "/997/8/343/1129/549/",
    "Midbrain": "/997/8/343/313/",
    "Cerebellum": "/997/8/512/",
    "Cortical plate": "/997/8/567/688/695/",
    "Cortical subplate": "/997/8/567/688/703/",
    "Pallidum": "/997/8/567/623/803/",
    "Striatum": "/997/8/567/623/477/",
}
# Selection behavior inside categories
MIN_OFFSET_BELOW_CATEGORY = 1     # require at least 1 level below the category root
ALLOW_CATEGORY_ROOTS      = False # forbid selecting the category root itself initially
RELAX_DEPTH_IF_N_SHORT    = True  # carefully relax toward the root if needed
# ==================================================
def canon(p) -> str:
    """Ensure leading and trailing slash: '/.../'."""
    p = "" if p is None else str(p)
    return "/" + p.strip("/") + "/"


# ------------------ Structures & Graph ------------------
def load_structures(structures_csv: str):
    S = pd.read_csv(structures_path)
    S["id"] = S["id"].astype(str)
    S["structure_id_path"] = S["structure_id_path"].apply(canon)

    id_to_acronym = S.set_index("id")["acronym"].astype(str).to_dict()
    id_to_path    = S.set_index("id")["structure_id_path"].to_dict()  # canonical now
    id_to_name    = S.set_index("id")["name"].astype(str).to_dict()
    return S, id_to_name, id_to_path, id_to_acronym

def build_graph_from_paths(S: pd.DataFrame) -> nx.DiGraph:
    G = nx.DiGraph()
    for _, row in S.iterrows():
        # row["structure_id_path"] is already canonical '/.../'
        parts = row["structure_id_path"].strip("/").split("/")
        for i in range(len(parts) - 1):
            G.add_edge(parts[i], parts[i + 1])  # IDs are strings
    return G


def node_depth_from_path(sid: str, id_to_path: dict) -> int:
    p = id_to_path.get(sid, "")
    return max(len(str(p).strip("/").split("/")) - 1, 0)

# ------------------ Diagnostics ------------------
def diagnose_categories(id_to_path: dict, category_map: dict):
    print("\n[DIAG] Category coverage:")
    for name, pref in category_map.items():
        pref_s = canon(pref)                     # <-- canonicalize the prefix
        depths, count = [], 0
        for sid, pth in id_to_path.items():      # pth already canonical
            if pth.startswith(pref_s):
                count += 1
                depths.append(len(pth.strip("/").split("/")) - 1)
        if count == 0:
            print(f"  - {name}: 0 nodes under prefix {pref_s}  <<< CHECK THIS PREFIX")
        else:
            print(f"  - {name}: {count} nodes; depth range {min(depths)}–{max(depths)} (prefix {pref_s})")
    print("")


# --------- Category-bounded antichain selection ----------
def select_collapsed_antichain_within_categories(
    G,
    id_to_path: dict,
    category_map: dict,
    target_n: int,
    root_id: str = "997",
    *,
    min_offset: int = 1,
    allow_category_roots: bool = False,
    ban_ids: set | None = None,
    relax: bool = True,
):
    """
    Pick up to target_n region IDs as an antichain (no ancestor/descendant pairs),
    constrained to lie INSIDE the provided category subtrees (never above them).

    Ranking: deeper nodes first, then root→leaf path frequency.
    """
    ban_ids = set(map(str, ban_ids or set()))
    root_id = str(root_id)

    # Canonicalize category prefixes (ensure '/.../')
    cat_prefixes = [canon(p) for p in category_map.values()]
    cat_roots    = {pref.strip("/").split("/")[-1] for pref in cat_prefixes}

    def depth(sid: str) -> int:
        return node_depth_from_path(sid, id_to_path)

    def in_categories(sid: str):
        """Return (inside_any_category, min_required_depth) for sid, using canonical paths."""
        p = id_to_path.get(sid, "")  # already canonical from load_structures
        if not p:
            return False, 10**9
        best_root_depth = -1
        for pref in cat_prefixes:
            if p.startswith(pref):
                d = len(pref.strip("/").split("/")) - 1
                if d > best_root_depth:
                    best_root_depth = d
        if best_root_depth < 0:
            return False, 10**9
        return True, best_root_depth + min_offset


    def depth(sid: str) -> int:
        return node_depth_from_path(sid, id_to_path)

    # Root->leaf path frequency counts (strings everywhere)
    leaves = [v for v in G.nodes if G.out_degree(v) == 0 and nx.has_path(G, root_id, v)]
    counts = Counter()
    for leaf in leaves:
        for v in nx.shortest_path(G, root_id, leaf):
            counts[v] += 1

    # Build candidate list under current offset
    def candidates_for_offset(offset: int):
        C = []
        for v in counts.keys():
            if v == root_id or v in ban_ids:
                continue
            inside, min_depth_req = in_categories(v)
            if not inside:
                continue
            if (not allow_category_roots) and (v in cat_roots):
                continue
            # relax toward roots by reducing the min depth requirement
            min_depth_adj = min_depth_req - (min_offset - offset)
            if depth(v) >= min_depth_adj:
                C.append(v)
        # Rank: deeper first, then frequency, then ID for reproducibility
        C.sort(key=lambda x: (depth(x), counts[x], x), reverse=True)
        return C

    def greedy_antichain_pick(cands):
        selected, selset = [], set()
        for v in cands:
            # forbid ancestor/descendant conflicts
            if any((v in nx.descendants(G, s)) or (s in nx.descendants(G, v)) for s in selset):
                continue
            selset.add(v)
            selected.append(v)
            if len(selected) >= target_n:
                break
        return selected

    # Pass 1: strict (min_offset)
    selected = greedy_antichain_pick(candidates_for_offset(min_offset))

    # Pass 2: optional relax within categories
    if relax and len(selected) < target_n:
        for off in range(min_offset - 1, -1, -1):
            cands = [v for v in candidates_for_offset(off) if v not in selected]
            if not cands:
                continue
            curr = list(selected); selset = set(curr)
            for v in cands:
                if any((v in nx.descendants(G, s)) or (s in nx.descendants(G, v)) for s in selset):
                    continue
                selset.add(v); curr.append(v)
                if len(curr) >= target_n:
                    break
            selected = curr
            if len(selected) >= target_n:
                break

    return selected

# -------------- Mapping & Aggregation --------------
def map_child_to_collapsed(id_to_path: dict, selected_ids: list[str]) -> dict:
    """Map every atlas region id -> nearest selected ancestor (if any)."""
    selected = set(selected_ids)
    mapping = {}
    for sid, pth in id_to_path.items():
        parts = [p for p in str(pth).strip("/").split("/") if p]
        mapped = next((p for p in reversed(parts) if p in selected), None)
        if mapped:
            mapping[sid] = mapped
    return mapping

def aggregate_per_mouse_hemi(input_excel: str,
                             structure_to_collapsed: dict,
                             id_to_name: dict,
                             id_to_path: dict):
    """
    Read per_mouse_sheets.xlsx and aggregate mean/std/sem across child regions, per mouse & hemisphere.
    Accepts L/R or any *_L / *_R columns.
    Reconstructs structure_id_path from region_id if needed.
    """
    df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
    mean_L, std_L, sem_L = defaultdict(dict), defaultdict(dict), defaultdict(dict)
    mean_R, std_R, sem_R = defaultdict(dict), defaultdict(dict), defaultdict(dict)
    log_rows = []

    for sheet_name, df_mouse in df_by_mouse.items():
        if str(sheet_name).lower() == "summary":
            continue

        # Ensure structure_id_path exists (reconstruct from region_id if available)
        if "structure_id_path" not in df_mouse.columns:
            if "region_id" in df_mouse.columns:
                df_mouse = df_mouse.copy()
                df_mouse["structure_id_path"] = df_mouse["region_id"].astype(str).map(id_to_path).fillna("")
            else:
                print(f"[WARN] {sheet_name}: missing structure_id_path; skipping.")
                continue

        # Detect hemisphere columns (L/R). Fallback: any *_L / *_R
        has_L = "L" in df_mouse.columns
        has_R = "R" in df_mouse.columns
        if not (has_L or has_R):
            for c in df_mouse.columns:
                if c.endswith("_L") and not has_L:
                    df_mouse["L"] = df_mouse[c]; has_L = True
                if c.endswith("_R") and not has_R:
                    df_mouse["R"] = df_mouse[c]; has_R = True
        if not (has_L or has_R):
            print(f"[WARN] {sheet_name}: no L/R columns; skipping.")
            continue

        mouse_id = str(sheet_name)
        assign_L, assign_R = defaultdict(list), defaultdict(list)

        for _, row in df_mouse.iterrows():
            path = str(row["structure_id_path"]).strip("/")
            if not path:
                continue
            region_id = path.split("/")[-1]  # leaf id as string
            cid = structure_to_collapsed.get(region_id)
            if not cid:
                continue

            if has_L and pd.notna(row["L"]):
                vL = float(row["L"])
                assign_L[cid].append(vL)
                log_rows.append({
                    "mouse": mouse_id, "hemisphere": "L",
                    "collapsed_region_id": cid,
                    "collapsed_region_name": id_to_name.get(cid, ""),
                    "child_region_id": region_id,
                    "child_structure_id_path": row["structure_id_path"],
                    "cells_per_mm3_contribution": vL
                })
            if has_R and pd.notna(row["R"]):
                vR = float(row["R"])
                assign_R[cid].append(vR)
                log_rows.append({
                    "mouse": mouse_id, "hemisphere": "R",
                    "collapsed_region_id": cid,
                    "collapsed_region_name": id_to_name.get(cid, ""),
                    "child_region_id": region_id,
                    "child_structure_id_path": row["structure_id_path"],
                    "cells_per_mm3_contribution": vR
                })

        # reduce to mean/std/sem per collapsed region
        for cid, vals in assign_L.items():
            arr = np.asarray(vals, float)
            m = float(np.mean(arr))
            s = float(np.std(arr, ddof=1)) if arr.size > 1 else 0.0
            e = float(s / np.sqrt(arr.size)) if arr.size > 1 else 0.0
            mean_L[mouse_id][cid] = m; std_L[mouse_id][cid] = s; sem_L[mouse_id][cid] = e

        for cid, vals in assign_R.items():
            arr = np.asarray(vals, float)
            m = float(np.mean(arr))
            s = float(np.std(arr, ddof=1)) if arr.size > 1 else 0.0
            e = float(s / np.sqrt(arr.size)) if arr.size > 1 else 0.0
            mean_R[mouse_id][cid] = m; std_R[mouse_id][cid] = s; sem_R[mouse_id][cid] = e

    return (mean_L, std_L, sem_L), (mean_R, std_R, sem_R), log_rows

def build_wide_with_meta(mean_L, mean_R, id_to_name, id_to_path, id_to_acronym):
    """Build a wide table: meta columns + one column per mouse hemisphere (<mouse>_L / <mouse>_R)."""
    cols = {}
    all_mice = sorted(set(mean_L.keys()) | set(mean_R.keys()))
    for m in all_mice:
        if m in mean_L and mean_L[m]:
            cols[f"{m}_L"] = mean_L[m]   # dict: collapsed_id -> value
        if m in mean_R and mean_R[m]:
            cols[f"{m}_R"] = mean_R[m]

    df = pd.DataFrame(cols)
    df.index.name = "collapsed_region_id"

    # attach metadata
    df["region_id"] = df.index.astype(str)  # keep as string key for maps
    df["acronym"] = df["region_id"].map(lambda x: id_to_acronym.get(x, ""))
    df["name"] = df["region_id"].map(lambda x: id_to_name.get(x, ""))
    df["structure_id_path"] = df["region_id"].map(lambda x: id_to_path.get(x, ""))
    df["depth"] = df["structure_id_path"].map(lambda p: len(str(p).strip("/").split("/")))

    # if you prefer integer region_id in output, coerce safely
    def to_int_maybe(x):
        try: return int(x)
        except: return x
    df["region_id"] = df["region_id"].map(to_int_maybe)

    meta_cols = ["region_id", "acronym", "name", "structure_id_path", "depth"]
    ordered = meta_cols + [c for c in df.columns if c not in meta_cols]
    return df[ordered].reset_index(drop=True)


# -------------------- Audit Exports (safe) --------------------
def export_audit_files(collapsed_region_ids, id_to_name, id_to_acronym, id_to_path,
                       structure_to_collapsed, audit_dir: str):
    """Write: collapsed_selection.csv, child_to_collapsed_map.csv, unmapped_regions.csv. Safe if empty selection."""
    out_dir = Path(audit_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    sel_csv  = out_dir / "collapsed_selection.csv"
    map_csv  = out_dir / "child_to_collapsed_map.csv"
    unmap_csv= out_dir / "unmapped_regions.csv"

    # 1) collapsed_selection.csv
    rows = []
    for cid in collapsed_region_ids:
        sid = str(cid)
        rows.append({
            "region_id": int(sid) if sid.isdigit() else sid,
            "acronym":   id_to_acronym.get(sid, ""),
            "name":      id_to_name.get(sid, ""),
            "structure_id_path": id_to_path.get(sid, ""),
            "depth":     node_depth_from_path(sid, id_to_path)
        })
    sel_df = pd.DataFrame(rows)
    if not sel_df.empty:
        sel_df.sort_values(["depth","region_id"]).to_csv(sel_csv, index=False)
    else:
        # write headers so downstream tools don't choke
        pd.DataFrame(columns=["region_id","acronym","name","structure_id_path","depth"]).to_csv(sel_csv, index=False)

    # 2) child_to_collapsed_map.csv
    map_rows, unmapped = [], []
    selected_set = set(map(str, collapsed_region_ids))
    for sid, pth in id_to_path.items():
        parts = [p for p in str(pth).strip("/").split("/") if p]
        mapped = next((p for p in reversed(parts) if p in selected_set), None)
        if mapped is None:
            unmapped.append({
                "child_region_id": int(sid) if sid.isdigit() else sid,
                "child_name": id_to_name.get(sid, ""),
                "child_acronym": id_to_acronym.get(sid, ""),
                "child_structure_id_path": pth
            })
        else:
            map_rows.append({
                "child_region_id": int(sid) if sid.isdigit() else sid,
                "child_name": id_to_name.get(sid, ""),
                "child_acronym": id_to_acronym.get(sid, ""),
                "child_structure_id_path": pth,
                "collapsed_region_id": int(mapped) if mapped.isdigit() else mapped,
                "collapsed_name": id_to_name.get(mapped, ""),
                "collapsed_acronym": id_to_acronym.get(mapped, ""),
                "collapsed_structure_id_path": id_to_path.get(mapped, ""),
                "collapsed_depth": node_depth_from_path(mapped, id_to_path)
            })
    pd.DataFrame(map_rows).to_csv(map_csv, index=False)
    pd.DataFrame(unmapped).to_csv(unmap_csv, index=False)

    print(f"[AUDIT] collapsed_selection → {sel_csv} ({len(sel_df)} rows)")
    print(f"[AUDIT] child_to_collapsed_map → {map_csv} ({len(map_rows)} rows)")
    if unmapped:
        print(f"[AUDIT] unmapped_regions → {unmap_csv}  (count={len(unmapped)})")
    else:
        print("[AUDIT] All regions mapped.")


# ------------------------- MAIN -------------------------
def main():
    # 1) Load structures & build graph
    S, id_to_name, id_to_path, id_to_acronym = load_structures(structures_path)
    G = build_graph_from_paths(S)
    if ROOT_ID not in G:
        raise ValueError(f"Root id {ROOT_ID} not in graph.")

    # Diagnostics: ensure your prefixes actually match nodes
    diagnose_categories(id_to_path, category_map)

    # 2) Select collapsed regions **within categories** (never above them)
    collapsed_region_ids = select_collapsed_antichain_within_categories(
        G,
        id_to_path=id_to_path,
        category_map=category_map,
        target_n=TARGET_N,
        root_id=ROOT_ID,
        min_offset=MIN_OFFSET_BELOW_CATEGORY,
        allow_category_roots=ALLOW_CATEGORY_ROOTS,
        ban_ids=BAN_IDS,
        relax=RELAX_DEPTH_IF_N_SHORT,
    )

    # Fallback if empty or too small: allow category roots
    if len(collapsed_region_ids) == 0:
        print("[WARN] No nodes selected under current constraints. Retrying allowing category roots...")
        collapsed_region_ids = select_collapsed_antichain_within_categories(
            G, id_to_path, category_map, TARGET_N, root_id=ROOT_ID,
            min_offset=0, allow_category_roots=True, ban_ids=BAN_IDS, relax=True
        )

    depths = [node_depth_from_path(s, id_to_path) for s in collapsed_region_ids] if collapsed_region_ids else []
    if collapsed_region_ids:
        print(f"[SELECT] Selected {len(collapsed_region_ids)} collapsed nodes within categories. "
              f"Depth range: {min(depths)}–{max(depths)}")
    else:
        print("[SELECT] Still selected 0 nodes — check category prefixes in the DIAG output above.")

    # 3) Map every atlas region to its nearest selected ancestor (will be empty map if selection empty)
    structure_to_collapsed = map_child_to_collapsed(id_to_path, collapsed_region_ids)

    # 4) Aggregate per mouse & hemisphere (cells_per_mm3)
    (mean_L, std_L, sem_L), (mean_R, std_R, sem_R), log_rows = aggregate_per_mouse_hemi(
        input_excel, structure_to_collapsed, id_to_name, id_to_path
    )

    # 5) Build wide tables with metadata
    if collapsed_region_ids:
        df_mean = build_wide_with_meta(mean_L, mean_R, id_to_name, id_to_path, id_to_acronym)
        df_std  = build_wide_with_meta(std_L,  std_R,  id_to_name, id_to_path, id_to_acronym)
        df_sem  = build_wide_with_meta(sem_L,  sem_R,  id_to_name, id_to_path, id_to_acronym)
    else:
        # empty frames with just meta headers
        df_mean = pd.DataFrame(columns=["region_id","acronym","name","structure_id_path","depth"])
        df_std  = df_mean.copy(); df_sem = df_mean.copy()

    # 6) Save outputs
    with pd.ExcelWriter(output_excel) as writer:
        df_mean.to_excel(writer, sheet_name="mean_cells_per_mm3", index=False)
        df_std.to_excel(writer,  sheet_name="std_cells_per_mm3",  index=False)
        df_sem.to_excel(writer,  sheet_name="sem_cells_per_mm3",  index=False)
    print(f"[OK] Collapsed matrices written → {output_excel}")

    pd.DataFrame(log_rows).to_csv(output_log, index=False)
    print(f"[OK] Contribution log written → {output_log} (rows={len(log_rows)})")

    # 7) Audit CSVs (safe if empty)
    export_audit_files(collapsed_region_ids, id_to_name, id_to_acronym, id_to_path,
                       structure_to_collapsed, audit_dir)


if __name__ == "__main__":
    main()



[DIAG] Category coverage:
  - Medulla: 0 nodes under prefix 997/8/343/1065/354/  <<< CHECK THIS PREFIX
  - Pons: 0 nodes under prefix 997/8/343/1065/771/  <<< CHECK THIS PREFIX
  - Hypothalamus: 0 nodes under prefix 997/8/343/1129/1097/  <<< CHECK THIS PREFIX
  - Thalamus: 0 nodes under prefix 997/8/343/1129/549/  <<< CHECK THIS PREFIX
  - Midbrain: 0 nodes under prefix 997/8/343/313/  <<< CHECK THIS PREFIX
  - Cerebellum: 0 nodes under prefix 997/8/512/  <<< CHECK THIS PREFIX
  - Cortical plate: 0 nodes under prefix 997/8/567/688/695/  <<< CHECK THIS PREFIX
  - Cortical subplate: 0 nodes under prefix 997/8/567/688/703/  <<< CHECK THIS PREFIX
  - Pallidum: 0 nodes under prefix 997/8/567/623/803/  <<< CHECK THIS PREFIX
  - Striatum: 0 nodes under prefix 997/8/567/623/477/  <<< CHECK THIS PREFIX

[WARN] No nodes selected under current constraints. Retrying allowing category roots...
[SELECT] Still selected 0 nodes — check category prefixes in the DIAG output above.
[OK] Collapsed matric