# Merge

In [1]:
%load_ext autoreload
%autoreload 2

import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import decoupler as dc

import os
from pathlib import Path

from utils import preprocessing
from utils import DEG


BASE_PATH = os.getenv("BASE_PATH_FOR_SAVING")
DISEASED_SAMPLES_FOLDER = f"{BASE_PATH}/diseased"
MSN_BICAN_FOLDER = "/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad"

# Cell types to Analyse by GroupName in mmc
CT_TO_ANALYSE = [
"STRd D1 Matrix MSN","STRv D1 MSN","Tafa1 D1 Matrix MSN","STRd D2 Matrix MSN","STRv D2 MSN",
"Tafa1 D2 Matrix MSN","STRd D1 Striosome MSN","STRd D2 Striosome MSN","STRv D1 NUDAP MSN","STR D1D2 Hybrid MSN",
"STRd D2 StrioMat Hybrid MSN","Tafa1 STRd D2 StrioMat Hybrid MSN"]#,"OPC","Astrocyte"]

# Cell type annotation to use for DEG
CT_ANNOTATION_FOR_DEG = ""


2026-01-16 16:59:54 | [INFO] cffi mode is CFFI_MODE.ANY
2026-01-16 16:59:54 | [INFO] R home found: /usr/lib/R
2026-01-16 16:59:55 | [INFO] R library path: /usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64
2026-01-16 16:59:55 | [INFO] LD_LIBRARY_PATH: /usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64
2026-01-16 16:59:55 | [INFO] Default options to initialize R: rpy2, --quiet, --no-save
2026-01-16 16:59:55 | [INFO] Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64", R: "/usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64:/usr/local/cuda-12.2/lib64"
2026-01-16 16:59:55 

# Create Healthy Diseased merged adata

Create adata obj with both dieseased anf helathy.

Helthy and diseased samples are saved in differt locations and have differ nomenclture for columns.

In addition, diseased are split btu sample, while healthy are already split by cell type but not by sample.

### Merge Diseased adata

In [2]:
adata_diseased_list = []
sample_diseased_list = []

# Load Diseased data (each adata is a differt sample/patient)
for sample_folder in [p for p in Path(DISEASED_SAMPLES_FOLDER).iterdir() if p.is_dir()]:

    print(sample_folder)

    adata_path = sample_folder / "adata" / "all_lib_adata_zoned.h5ad"

    adata_tmp = sc.read_h5ad(adata_path, backed="r")

    # print(adata_tmp.obs.columns)
    # display(adata_tmp.obs)
    # print(sample_folder)

    # restart from raw counts
    adata_minimal = sc.AnnData(
        X=adata_tmp.layers["counts"].copy(),                   
        obs=adata_tmp.obs.copy(),
        var=adata_tmp.var.copy()
    )
    del adata_tmp

    # Very that .X is raw counts
    all_integer_like = np.all(np.mod(adata_minimal.X.data, 1) == 0)
    assert all_integer_like, f"Expression matrix .X is not integer-like (raw counts). --> {adata_path}"

    # Filter cell types
    adata_minimal = adata_minimal[adata_minimal.obs["Group_name"].isin(CT_TO_ANALYSE)]

    # add sample:
    sample_diseased_list.append(sample_folder.name)

    # add ot list
    adata_diseased_list.append(adata_minimal)

    #break

adata_diseased_list

/home/gdallagl/myworkdir/data/XDP/diseased/sample_01
/home/gdallagl/myworkdir/data/XDP/diseased/recon_241105


[View of AnnData object with n_obs × n_vars = 11130 × 32780
     obs: 'x', 'y', 'pct_intronic', 'is_cell', 'dbscan_clusters', 'dbscan_score', 'has_spatial', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Neighborhood_aggregate_probability', 'Class_name', 'Class_bootstrapping_probability', 'Class_aggregate_probability', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Subclass_aggregate_probability', 'Group_name', 'Group_bootstrapping_probability', 'Group_aggregate_probability', 'Cluster_label', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'my_hierarchy_all', 'library', 'library_name', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'leiden_scANVI', 'mapping_MSN_for_zoning', 'mapping_coupling_for_zoning', 'zone', 'zone_probability'
     var: 'feature_types', 'genome', 'gene_symbol', 'highly_var

In [3]:
# Save var info from first dataset before concat
# So all dataset will have same metadata for genes
var_info = adata_diseased_list[0].var[['feature_types', 'genome', 'gene_symbol']].copy()

# Concate
all_adata_diseased = sc.concat(adata_diseased_list, 
                  axis=0, # concatenate along observations, i.e., stacking cells)
                  index_unique='-' , # causes new barcodes to be orig_idx + '-' + key
                  keys=sample_diseased_list,
                  label="donor_id",
                  join='inner')  # keeps COMMON genes
del adata_diseased_list

# Restore var info for common genes
all_adata_diseased.var = var_info.loc[all_adata_diseased.var_names]

# Gene naems need to be symbols to be the same as healthy adata
all_adata_diseased.var_names = all_adata_diseased.var['gene_symbol']

print(all_adata_diseased)
display(all_adata_diseased.obs.head(3))
display(all_adata_diseased.var.head(3))
assert all_adata_diseased.obs_names.is_unique, "ERROR: all_adata_diseased.obs_names are not unique!"

AnnData object with n_obs × n_vars = 30844 × 30914
    obs: 'x', 'y', 'pct_intronic', 'is_cell', 'has_spatial', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Neighborhood_aggregate_probability', 'Class_name', 'Class_bootstrapping_probability', 'Class_aggregate_probability', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Subclass_aggregate_probability', 'Group_name', 'Group_bootstrapping_probability', 'Group_aggregate_probability', 'Cluster_label', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'my_hierarchy_all', 'library', 'leiden_1', 'leiden_2', 'leiden_3', 'leiden_4', 'mapping_coupling_for_zoning', 'zone', 'zone_probability', 'donor_id'
    var: 'feature_types', 'genome', 'gene_symbol'


AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906', 'ENSG00000241860']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")


Unnamed: 0,x,y,pct_intronic,is_cell,has_spatial,Neighborhood_name,Neighborhood_bootstrapping_probability,Neighborhood_aggregate_probability,Class_name,Class_bootstrapping_probability,...,my_hierarchy_all,library,leiden_1,leiden_2,leiden_3,leiden_4,mapping_coupling_for_zoning,zone,zone_probability,donor_id
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACCAGCACTAGTCG-1-SI-TT-H1-sample_01,47925.552952,-3509.217099,0.647217,True,True,Subpallium GABA,1.0,1.0,CN LGE GABA,1.0,...,SPN,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,4,7,13,20,STR D1 Matrix MSN,3,0.819,sample_01
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACCCCAGCCATAAT-1-SI-TT-H1-sample_01,39898.157546,394.065473,0.596793,True,True,Subpallium GABA,1.0,1.0,CN LGE GABA,1.0,...,SPN,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,6,8,16,25,STR D2 Matrix MSN,5,1.0,sample_01
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACTAGAGCCTAGAC-1-SI-TT-H1-sample_01,2380.50731,2218.759126,0.657474,True,False,Subpallium GABA,1.0,1.0,CN LGE GABA,1.0,...,SPN,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,7,14,24,33,STRv D1 NUDAP MSN,3,1.0,sample_01


Unnamed: 0_level_0,feature_types,genome,gene_symbol
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIR1302-2HG,Gene Expression,GRCh38,MIR1302-2HG
ENSG00000238009,Gene Expression,GRCh38,ENSG00000238009
ENSG00000239945,Gene Expression,GRCh38,ENSG00000239945


### Merge Healthy adata

In [4]:
adata_healthy_list = []

# Load Helahy data (each adata is a differt cell type)
for adata_path in [p for p in Path(MSN_BICAN_FOLDER).iterdir() if not p.is_dir()]:

    print(adata_path)

    adata_tmp = sc.read_h5ad(adata_path)#, backed="r")

    # print(adata_tmp.obs.columns)
    # display(adata_tmp.obs)
    # print(sample_folder)

    # restart from raw counts
    adata_minimal = sc.AnnData(
        X=adata_tmp.X.copy(),           # raw counts in X       
        obs=adata_tmp.obs.copy(),
        var=adata_tmp.var.copy()
    )
    del adata_tmp

        # Filter cell types
    adata_minimal = adata_minimal[adata_minimal.obs["Group_name"].isin(CT_TO_ANALYSE)]

    # Very that .X is raw counts
    all_integer_like = np.all(np.mod(adata_minimal.X.data, 1) == 0)
    assert all_integer_like, f"Expression matrix .X is not integer-like (raw counts). --> {adata_path}"

    # add ot list
    adata_healthy_list.append(adata_minimal)

adata_healthy_list

/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STRd_D2_Striosome_MSN.h5ad


/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STRd_D1_Striosome_MSN.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STRd_D2_StrioMat_Hybrid_MSN.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/Astrocyte.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STRv_D1_NUDAP_MSN.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STR_D1_Matrix_MSN.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STR_D2_Matrix_MSN.h5ad
/home/gdallagl/myworkdir/XDP/data/STR_zonation_references_h5ad/STR_D1D2_Hybrid_MSN.h5ad


[View of AnnData object with n_obs × n_vars = 17130 × 37905
     obs: 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'PREFIX', 'CELL_BARCODE', 'NUM_GENIC_READS', 'NUM_TRANSCRIPTS', 'NUM_GENES', 'num_retained_transcripts', 'pct_coding', 'pct_utr', 'pct_intergenic', 'pct_genic', 'pct_intronic', 'pct_mt', 'pct_ribosomal', 'frac_contamination', 'experiment', 'donor_external_id', 'is_primary_data', 'Neighborhood_label', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Class_label', 'Class_name', 'Class_bootstrapping_probability', 'Subclass_label', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Group_label', 'Group_name', 'Group_bootstrapping_probability', 'Cluster_label', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'donor_id', 'n_counts', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.8', 'seurat_clusters', 'library', 'donor', 'unique_cell_ID', 'AK_celltype', 'RNA_snn_res.0.2', 'cb', 'clusters', 'num_dbscan', 'logumi', 'cb_index',

In [5]:
# Concate
all_adata_healthy = sc.concat(adata_healthy_list, 
                  axis=0, # concatenate along observations, i.e., stacking cells)
                  index_unique='-' , # causes new barcodes to be orig_idx + '-' + key
                #   keys=sample_diseased_list,
                #   label="donor_id",
                  join='inner')  # keeps COMMON genes
del adata_healthy_list

# Rename columns to match diseased
all_adata_healthy.obs['zone'] = all_adata_healthy.obs['harmony_clusters'].astype(str).copy()

# claculte pythonic QC
all_adata_healthy.var['mt'] = all_adata_healthy.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(all_adata_healthy, qc_vars=['mt'], inplace=True)

print(all_adata_healthy)
display(all_adata_healthy.obs.head(3))
display(all_adata_healthy.var.head(3))
assert all_adata_healthy.obs_names.is_unique, "ERROR: all_adata_healthy.obs_names are not unique!"

AnnData object with n_obs × n_vars = 179572 × 37902
    obs: 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'PREFIX', 'CELL_BARCODE', 'NUM_GENIC_READS', 'NUM_TRANSCRIPTS', 'NUM_GENES', 'num_retained_transcripts', 'pct_coding', 'pct_utr', 'pct_intergenic', 'pct_genic', 'pct_intronic', 'pct_mt', 'pct_ribosomal', 'frac_contamination', 'experiment', 'donor_external_id', 'is_primary_data', 'Neighborhood_label', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Class_label', 'Class_name', 'Class_bootstrapping_probability', 'Subclass_label', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Group_label', 'Group_name', 'Group_bootstrapping_probability', 'Cluster_label', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'donor_id', 'n_counts', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.8', 'seurat_clusters', 'library', 'donor', 'unique_cell_ID', 'AK_celltype', 'RNA_snn_res.0.2', 'cb', 'clusters', 'num_dbscan', 'logumi', 'cb_index', 'percent

Unnamed: 0,orig.ident,nCount_originalexp,nFeature_originalexp,PREFIX,CELL_BARCODE,NUM_GENIC_READS,NUM_TRANSCRIPTS,NUM_GENES,num_retained_transcripts,pct_coding,...,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mt,log1p_total_counts_mt,pct_counts_mt
s5_s5_CGTTGGTAGATCCAAT_1-0,2024-10-25,144267.0,11179,2024-10-25_s5_Slide-tag_10X-GEMX-5P-GEX_BN_rxn1,CGTTGGTAGATCCAAT,262799.0,139798.0,10686.0,139539.0,0.1612,...,9.321882,144267.0,11.879428,24.225221,31.915823,40.788261,54.600151,60.0,4.110874,0.04159
s5_s5_AACGCACTCCCCTATG_2-0,2024-10-25,33064.0,7429,2024-10-25_s5_Slide-tag_10X-GEMX-5P-GEX_BN_rxn2,AACGCACTCCCCTATG,68618.0,32512.0,7260.0,32112.0,0.2329,...,8.913281,33064.0,10.406231,21.691265,28.097024,35.930317,49.283208,28.0,3.367296,0.084684
s5_s5_CTAAGCGGTCCGAAAT_2-0,2024-10-25,350285.0,14106,2024-10-25_s5_Slide-tag_10X-GEMX-5P-GEX_BN_rxn2,CTAAGCGGTCCGAAAT,694067.0,335038.0,13281.0,334810.0,0.1425,...,9.554426,350285.0,12.766505,21.112237,28.276404,37.604522,52.113565,319.0,5.768321,0.091069


Unnamed: 0,mt,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts
A1BG,False,33370,0.227496,0.204977,81.416925,40852.0,10.617736
A1BG-AS1,False,78478,0.689695,0.524548,56.297196,123850.0,11.726835
A1CF,False,2836,0.017436,0.017286,98.420689,3131.0,8.049427


### Merge All togeteher

In [6]:
len(set(all_adata_diseased.var_names) & set(all_adata_healthy.var_names)) #* 100 / len(set(all_adata_diseased.var_names))


30771

In [7]:
import gc
gc.collect()  # Run once
gc.collect()  # Run TWICE for circular references
gc.collect()  # Even three times for stubborn cases

0

In [8]:
important_columns = ["donor_id", "library", "log1p_n_genes_by_counts", "pct_counts_mt", "zone"]


# Save var info from first dataset before concat
# So all dataset will have same metadata for genes
var_info = all_adata_diseased.var[['feature_types', 'genome', 'gene_symbol']].copy()

# Concate
adata = sc.concat([all_adata_diseased, all_adata_healthy], 
                  axis=0, # concatenate along observations, i.e., stacking cells)
                  index_unique='-' , # causes new barcodes to be orig_idx + '-' + key
                  keys=["diseased", "healthy"],
                  label="state",
                  join='inner')  # keeps COMMON genes
#del all_adata_diseased, all_adata_healthy

# Restore var info for common genes
adata.var = var_info.loc[adata.var_names]

print(adata)
display(adata.obs.head(3))
display(adata.var.head(3))
assert adata.obs_names.is_unique, "ERROR: adata.obs_names are not unique!"

AnnData object with n_obs × n_vars = 210416 × 30771
    obs: 'pct_intronic', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Class_name', 'Class_bootstrapping_probability', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Group_name', 'Group_bootstrapping_probability', 'Cluster_label', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'library', 'zone', 'donor_id', 'state'
    var: 'feature_types', 'genome', 'gene_symbol'


AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'ENSG00000238009', 'ENSG00000239906', 'ENSG00000241860', 'ENSG00000286448']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0,pct_intronic,Neighborhood_name,Neighborhood_bootstrapping_probability,Class_name,Class_bootstrapping_probability,Subclass_name,Subclass_bootstrapping_probability,Group_name,Group_bootstrapping_probability,Cluster_label,...,log1p_n_genes_by_counts,total_counts,log1p_total_counts,total_counts_mt,log1p_total_counts_mt,pct_counts_mt,library,zone,donor_id,state
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACCAGCACTAGTCG-1-SI-TT-H1-sample_01-diseased,0.647217,Subpallium GABA,1.0,CN LGE GABA,1.0,STR D1 MSN,1.0,STRv D1 MSN,1.0,CS20250428_CLUST_0524,...,9.069583,52026.0,10.859518,10.0,2.397895,0.019221,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,3,sample_01,diseased
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACCCCAGCCATAAT-1-SI-TT-H1-sample_01-diseased,0.596793,Subpallium GABA,1.0,CN LGE GABA,1.0,STR D2 MSN,1.0,Tafa1 D2 Matrix MSN,1.0,CS20250428_CLUST_0546,...,9.047704,43110.0,10.671533,57.0,4.060443,0.13222,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,5,sample_01,diseased
240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1---AAACTAGAGCCTAGAC-1-SI-TT-H1-sample_01-diseased,0.657474,Subpallium GABA,1.0,CN LGE GABA,1.0,STR Hybrid MSN,1.0,STRv D1 NUDAP MSN,1.0,CS20250428_CLUST_0515,...,8.819813,28455.0,10.256114,24.0,3.218876,0.084344,240805_SL-EXD_0328_B22FKKYLT4---SI-TT-H1,3,sample_01,diseased


Unnamed: 0,feature_types,genome,gene_symbol
MIR1302-2HG,Gene Expression,GRCh38,MIR1302-2HG
ENSG00000238009,Gene Expression,GRCh38,ENSG00000238009
ENSG00000239906,Gene Expression,GRCh38,ENSG00000239906


### Save

In [9]:
adata.obs['zone'] = adata.obs['zone'].astype(str).astype('category')

In [10]:
adata.write(f"{BASE_PATH}/adata_for_DEG/adata_for_DEG_slidetag.h5ad")

----
---

---

In [None]:
# df = pd.read_csv("/home/gdallagl/myworkdir/XDP/utils/dorsal_ventral_gradient/exp_by_spline_combined.csv")
# df = df[(np.abs(df.mean_spearman) >= 0.1) & (df.combined_p <= 0.05)].sort_values("mean_spearman")
# print(df.shape)
# df.to_csv("/home/gdallagl/myworkdir/XDP/utils/dorsal_ventral_gradient/exp_by_spline_combined_corr>0.1.csv")

(1241, 3)


# Convert to .qs to .h5ad