In [1]:
import scanpy as sc
import pandas as pd
from scipy.io import mmread


base = "/home/gdallagl/myworkdir/XDP/data/_old/XDP_striatum_RNA_final_051925"

counts_mtx    = base + "_counts.mtx"
genes_tsv     = base + "_genes.tsv"
barcodes_tsv  = base + "_barcodes.tsv"
metadata_csv  = base + "_metadata.csv"

In [7]:
%%bash
R --vanilla << 'EOF'

library(qs)
library(Seurat)

# Set the path
QS_PATH <- "/home/gdallagl/myworkdir/XDP/data/XDP/artificial_bican/geneset_001/zonated_objs_combined_with_md__combined__rep_001__ventral_matrix_keep_1.0.qs"
QS_PATH_NOEXT <- tools::file_path_sans_ext(QS_PATH)

# Load the Seurat object
print("Loading Seurat object...")
seurat <- qread(QS_PATH)
print(seurat)
print(paste0("Loaded: ", ncol(seurat), " cells, ", nrow(seurat), " genes"))

# Extract the counts matrix
counts <- GetAssayData(seurat, assay = "RNA", layer = "counts")
summary(counts)

# Save files
print("Saving counts matrix...")
saveRDS(counts, paste0(QS_PATH_NOEXT, "_counts.rds"), compress = FALSE)

print("Saving gene names...")
write.table(rownames(counts), paste0(QS_PATH_NOEXT, "_genes.tsv"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)

print("Saving cell barcodes...")
write.table(colnames(counts), paste0(QS_PATH_NOEXT, "_barcodes.tsv"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)

print("Saving metadata...")
write.csv(seurat@meta.data, paste0(QS_PATH_NOEXT, "_metadata.csv"), row.names = TRUE)

EOF


R version 4.5.2 (2025-10-31) -- "[Not] Part in a Rumble"
Copyright (C) 2025 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.



> 
> library(qs)


qs 0.27.3. Announcement: https://github.com/qsbase/qs/issues/103


> library(Seurat)


Loading required package: SeuratObject
Loading required package: sp

Attaching package: ‘SeuratObject’

The following objects are masked from ‘package:base’:

    %||%, intersect, t


Attaching package: ‘Seurat’

The following object is masked from ‘package:base’:

    %||%



> 
> # Set the path
> QS_PATH <- "/home/gdallagl/myworkdir/XDP/data/XDP/artificial_bican/geneset_001/zonated_objs_combined_with_md__combined__rep_001__ventral_matrix_keep_1.0.qs"
> QS_PATH_NOEXT <- tools::file_path_sans_ext(QS_PATH)
> 
> # Load the Seurat object
> print("Loading Seurat object...")
[1] "Loading Seurat object..."
> seurat <- qread(QS_PATH)
> print(seurat)
An object of class Seurat 
37905 features across 191639 samples within 1 assay 
Active assay: RNA (37905 features, 0 variable features)
 1 layer present: counts
> print(paste0("Loaded: ", ncol(seurat), " cells, ", nrow(seurat), " genes"))
[1] "Loaded: 191639 cells, 37905 genes"
> 
> # Extract the counts matrix
> counts <- GetAssayData(seurat, assay = "RNA", layer = "counts")
> summary(counts)
37905 x 191639 sparse Matrix of class "dgCMatrix", with 1680369802 entries
          i j    x
1         1 1    1
2         2 1    1
3        14 1    1
4        15 1    2
5        24 1    9
6        25 1   30
7        26 1    3
8   

In [10]:
import rpy2.robjects as ro
r = ro.r
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csc_matrix

import os


print("\nLoading sparse matrix from RDS...")

# Load the RDS file
counts_r = r.readRDS(base + "_counts.rds")

# Extract sparse matrix components manually
# R's dgCMatrix has slots: x (values), i (row indices), p (column pointers), Dim (dimensions)
data = np.array(counts_r.slots['x'])
indices = np.array(counts_r.slots['i'])
indptr = np.array(counts_r.slots['p'])
shape = tuple(counts_r.slots['Dim'])

# Create scipy sparse matrix and transpose for scanpy
X = csc_matrix((data, indices, indptr), shape=shape).T.tocsr()

# Load gene and cell names
genes = pd.read_csv(base + "_genes.tsv", header=None)[0].values
cells = pd.read_csv(base + "_barcodes.tsv", header=None)[0].values

# Create AnnData object
adata = sc.AnnData(X=X)
adata.var_names = genes
adata.obs_names = cells

# Load and attach metadata
metadata = pd.read_csv(base + "_metadata.csv", index_col=0)
adata.obs = metadata.loc[adata.obs_names]

# Save as h5ad
adata.write_h5ad(base + "_raw.h5ad")

print(f"\nSuccess! Created AnnData with {adata.n_obs} cells and {adata.n_vars} genes")
print(f"Saved to: {base}_raw.h5ad")



Loading sparse matrix from RDS...


  metadata = pd.read_csv(base + "_metadata.csv", index_col=0)



Success! Created AnnData with 191639 cells and 37905 genes
Saved to: /home/gdallagl/myworkdir/XDP/data/XDP/artificial_bican/geneset_001/zonated_objs_combined_with_md__combined__rep_001__ventral_matrix_keep_1.0_raw.h5ad


In [11]:
adata

AnnData object with n_obs × n_vars = 191639 × 37905
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'donor_id', 'PREFIX', 'CELL_BARCODE', 'NUM_GENIC_READS', 'NUM_TRANSCRIPTS', 'NUM_GENES', 'num_retained_transcripts', 'pct_coding', 'pct_utr', 'pct_intergenic', 'pct_genic', 'pct_intronic', 'pct_mt', 'pct_ribosomal', 'frac_contamination', 'experiment', 'Neighborhood_name', 'Neighborhood_bootstrapping_probability', 'Class_name', 'Class_bootstrapping_probability', 'Subclass_name', 'Subclass_bootstrapping_probability', 'Group_name', 'Group_bootstrapping_probability', 'Cluster_name', 'Cluster_alias', 'Cluster_bootstrapping_probability', 'x', 'y', 'library', 'donor', 'x_um2', 'y_um2', 'unique_cell_ID', 'cb', 'Final_Zone_Assignments', 'Group_name.1', 'cell_type', 'Cohort', 'Biobank', 'Age.at.Death', 'Sex', 'Race', 'Ethnicity', 'Cause.of.Death', 'PMI', 'spn_type', 'case_control', 'RNA_snn_res.0.2', 'seurat_clusters', 'RNA_snn_res.0.5', 'Row.names', 'PC_1', 'PC_2', 'PC_3', 'PC_4', 'PC_5', 'P

In [12]:
import numpy as np

np.all(np.mod(adata.X.data, 1) == 0)


np.True_

In [None]:
obs_df = adata.obs.copy()

for col in obs_df.columns:
    unique_vals = obs_df[col].unique()
    print(f"Column: {col}")
    print(f"Unique values ({len(unique_vals)}): {unique_vals}\n")

Column: orig.ident
Unique values (15): ['2024-10-25', '2025-03-11', '2025-03-18', '2025-03-19', '2025-03-24', ..., '2025-04-15', '2025-05-13', '2025-05-23', '2025-05-29', '2025-06-04']
Length: 15
Categories (15, object): ['2024-10-25', '2025-03-11', '2025-03-18', '2025-03-19', ..., '2025-05-29', '2025-05-30', '2025-06-03', '2025-06-04']

Column: nCount_RNA
Unique values (104327): [149923. 105479.  86852. ...  82117.  28432.  33696.]

Column: nFeature_RNA
Unique values (10993): [11741 10583  9882 ...  2508  3418  3965]

Column: donor_id
Unique values (19): ['PT13935', 'UMBEB23073', 'UMBEB23033', 'MS913848', 'UMBEB23127', ..., 'UMBEB24013', 'UMBEB23158', 'MD6927', 'MS876075', 'MS986638']
Length: 19
Categories (19, object): ['MD6927', 'MD9129', 'MD9162', 'MD9244', ..., 'UMBEB23127', 'UMBEB23158', 'UMBEB23164', 'UMBEB24013']

Column: PREFIX
Unique values (77): ['2024-10-25_s5_Slide-tag_10X-GEMX-5P-GEX_BN_rxn1', '2024-10-25_s5_Slide-tag_10X-GEMX-5P-GEX_BN_rxn2', '2024-10-25_s5_Slide-tag_10X

In [None]:
adata.obs.donor_id.unique()
adata.obs.region.unique()
adata.obs.repeat_length.unique()
adata.obs.age_of_onset.unique()
adata.obs.age_of_death.unique()
adata.obs.disease_duration.unique()
adata.obs["immediate.cause.of.death"].unique()
adata.obs.infection_related_death.unique()
adata.obs.sex.unique()
adata.obs.condition.unique()


adata.obs.columns



table = (
    adata.obs
    .groupby(["infection_related_death", "condition"])["donor_id"]
    .nunique()
    .unstack(fill_value=0) 
)

table