In [None]:
import pandas as pd
import networkx as nx
import os
from collections import defaultdict
import numpy as np

# === CONFIG ===
# Input locations: the atlas structure table and the per-mouse workbook.
structures_path = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\code\cfos_preprocessing\allen_mouse_10um_v1.2\structures.csv"
input_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\mean_dff_per_region_by_mouse_brainmapper.xlsx"
# Output locations: the collapsed matrix workbook and the per-row collapsing log.
output_excel = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\final_collapsed_matrix_2.xlsx"
output_log = r"Y:\public\projects\AnAl_20240405_Neuromod_PE\PE_mapping\processed_data\dff_downsampled\dff_collapsing_log_2.csv"
# Number of hierarchy nodes requested from collapse_tree_to_n.
TARGET_N = 160

# === Load Allen structure data ===
region_table = pd.read_csv(structures_path)
# Paths are handled as "/"-delimited strings everywhere below.
region_table["structure_id_path"] = region_table["structure_id_path"].astype(str)

# Lookup tables keyed by the value of the "id" column.
structures_by_id = region_table.set_index("id")
id_to_acronym = structures_by_id["acronym"].astype(str).to_dict()
id_to_path = structures_by_id["structure_id_path"].astype(str).to_dict()
id_to_name = structures_by_id["name"].astype(str).to_dict()

# === Build anatomical hierarchy graph ===
# Every consecutive pair of ids along a structure's path becomes a
# parent -> child edge; node ids are the string tokens of the path.
G = nx.DiGraph()
for id_path in region_table["structure_id_path"]:
    chain = id_path.strip("/").split("/")
    G.add_edges_from(zip(chain[:-1], chain[1:]))

# === Collapse region selection ===
def collapse_tree_to_n(graph, n, root="997"):
    """Select up to ``n`` hierarchy nodes, favouring those covering most leaves.

    Each node is scored by the number of root-to-leaf paths passing through
    it. Nodes are then visited in descending score order (ties keep the order
    in which they were first encountered, so along a path an ancestor comes
    before its descendants) and added to the selection unless one of their
    descendants has already been selected. Selection stops once ``n`` nodes
    have been chosen.

    NOTE(review): in a strict tree an ancestor always ranks at or above its
    descendants, so the descendant check never rejects anything and the
    result is a nested set that starts with ``root`` rather than a disjoint
    partition of the hierarchy -- confirm this is the intended behaviour.

    Args:
        graph: networkx directed graph whose edges point from parent to child.
        n: maximum number of nodes to select.
        root: id of the hierarchy root; defaults to ``"997"``, the value that
            was previously hard-coded.

    Returns:
        List of selected node ids in selection order (highest leaf coverage
        first). Empty when ``n <= 0``; previously one node was still
        returned in that case.

    Raises:
        networkx.NodeNotFound: if ``root`` is not a node of ``graph``.
    """
    from collections import Counter

    if n <= 0:
        return []

    # A single traversal from the root yields the path to every reachable
    # node, replacing one has_path + shortest_path search per leaf. In a tree
    # that path is unique, so the per-node counts are unchanged.
    paths_from_root = nx.single_source_shortest_path(graph, root)

    leaf_coverage = Counter()
    for node in graph.nodes:
        # Leaves are nodes without children; unreachable leaves are ignored.
        if graph.out_degree(node) == 0 and node in paths_from_root:
            leaf_coverage.update(paths_from_root[node])

    selected = []            # keeps selection order for a deterministic result
    selected_lookup = set()  # O(1) membership for the descendant check
    for node, _ in leaf_coverage.most_common():
        if selected_lookup.isdisjoint(nx.descendants(graph, node)):
            selected.append(node)
            selected_lookup.add(node)
        if len(selected) >= n:
            break
    return selected

collapsed_region_ids = collapse_tree_to_n(G, TARGET_N)

# === Map child → collapsed parent ===
# Each structure is assigned to the deepest selected node on its own id path:
# the path is walked from the structure itself back toward the root and the
# first selected node wins. Structures with no selected node on their path
# get no entry. Membership is tested against a set built once, so each lookup
# is O(1) instead of a scan over the list of selected ids.
collapsed_region_lookup = set(collapsed_region_ids)
structure_to_collapsed = {}
for sid, path in id_to_path.items():
    for node in reversed(path.strip("/").split("/")):
        if node in collapsed_region_lookup:
            # Keys are strings so they match ids parsed out of path strings.
            structure_to_collapsed[str(sid)] = node
            break

# === Process all mice ===
# sheet_name=None loads every worksheet into a {sheet name: DataFrame} mapping.
df_by_mouse = pd.read_excel(input_excel, sheet_name=None)
mean_data, std_data, sem_data = defaultdict(dict), defaultdict(dict), defaultdict(dict)
log_rows = []

for mouse_id, mouse_table in df_by_mouse.items():
    # Skip the summary sheet and any sheet that carries no structure paths.
    if mouse_id.lower() == "summary":
        continue
    if "structure_id_path" not in mouse_table.columns:
        continue

    # collapsed region id -> list of mean_dff values contributed by its children
    contributions = defaultdict(list)

    for _, record in mouse_table.iterrows():
        # The last id on the path identifies the region the row describes.
        region_id = record["structure_id_path"].strip("/").split("/")[-1]
        collapsed_id = structure_to_collapsed.get(region_id)
        if not collapsed_id:
            # Regions without a collapsed parent are left out entirely.
            continue

        dff_value = record["mean_dff"]
        contributions[collapsed_id].append(dff_value)
        log_rows.append({
            "mouse": mouse_id,
            "collapsed_region_id": collapsed_id,
            "collapsed_region_name": id_to_name.get(int(collapsed_id), ""),
            "child_region_id": region_id,
            "child_region_name": id_to_name.get(int(region_id), ""),
            "child_structure_id_path": record["structure_id_path"],
            "mean_dff_contribution": dff_value
        })

    for collapsed_id, samples in contributions.items():
        samples = np.array(samples, dtype=float)
        sample_count = len(samples)
        if sample_count > 1:
            # Sample standard deviation (ddof=1) and the matching standard error.
            spread = np.std(samples, ddof=1)
            std_error = spread / np.sqrt(sample_count)
        else:
            # A single contribution has no spread; both are reported as 0.0.
            spread = 0.0
            std_error = 0.0
        mean_data[mouse_id][collapsed_id] = np.mean(samples)
        std_data[mouse_id][collapsed_id] = spread
        sem_data[mouse_id][collapsed_id] = std_error

# === Format outputs
def build_df(data_dict, id_to_name, id_to_path, id_to_acronym):
    """Build a region-by-mouse table with structure metadata columns in front.

    Args:
        data_dict: nested mapping ``{mouse_id: {collapsed_region_id: value}}``;
            region ids must be convertible to ``int``.
        id_to_name: mapping from integer region id to region name.
        id_to_path: mapping from integer region id to its "/"-delimited
            structure id path.
        id_to_acronym: mapping from integer region id to region acronym.

    Returns:
        DataFrame with one row per region id found in ``data_dict``. The
        columns are ``region_id``, ``name``, ``acronym``,
        ``structure_id_path`` and ``depth``, followed by one column per
        mouse. Regions missing from a lookup get ``""`` for that field, and
        a region without a path gets ``depth`` 0 (previously 1, which made it
        indistinguishable from a top-level structure).
    """
    # Outer keys (mice) become columns and inner keys (region ids) become the
    # row index directly, so no transpose round-trip is needed to label axes.
    df = pd.DataFrame(data_dict)
    df.index.name = "collapsed_region_id"
    df.columns.name = "mouse"

    def _path_depth(id_path):
        # Number of ids on the path, e.g. "/997/8/" -> 2; empty path -> 0.
        trimmed = id_path.strip("/")
        return len(trimmed.split("/")) if trimmed else 0

    # The lookup tables are keyed by integer id, the index by string id.
    region_ids = df.index.astype(int).tolist()
    id_paths = [id_to_path.get(rid, "") for rid in region_ids]

    df["region_id"] = region_ids
    df["name"] = [id_to_name.get(rid, "") for rid in region_ids]
    df["acronym"] = [id_to_acronym.get(rid, "") for rid in region_ids]
    df["structure_id_path"] = id_paths
    df["depth"] = [_path_depth(id_path) for id_path in id_paths]

    meta_cols = ["region_id", "name", "acronym", "structure_id_path", "depth"]
    mouse_cols = [col for col in df.columns if col not in meta_cols]
    return df[meta_cols + mouse_cols]

# One region-by-mouse table per statistic, all sharing the same lookup tables.
lookup_tables = (id_to_name, id_to_path, id_to_acronym)
df_mean = build_df(mean_data, *lookup_tables)
df_std = build_df(std_data, *lookup_tables)
df_sem = build_df(sem_data, *lookup_tables)

# === Export all
# Each statistic goes to its own worksheet of the same workbook.
sheet_tables = (
    ("mean_dff", df_mean),
    ("std_dff", df_std),
    ("sem_dff", df_sem),
)
with pd.ExcelWriter(output_excel) as writer:
    for sheet_label, table in sheet_tables:
        table.to_excel(writer, sheet_name=sheet_label, index=False)
print(f"Matrix with mean, std, sem saved to:\n{output_excel}")

# Save log
# One row per (mouse, child region) contribution collected during processing.
log_table = pd.DataFrame(log_rows)
log_table.to_csv(output_log, index=False)
print(f"Detailed ΔF/F log saved to:\n{output_log}")
