In [1]:
# notebook to generate VISp_neighbor_stats_v0.h5ad

import os

import anndata as ad
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.neighbors import NearestNeighbors

from txtox.utils import get_paths

# Load the source dataset from data_root. os.path.join is used instead of string
# concatenation so the path is valid whether or not data_root ends with a separator.
path = os.path.join(get_paths()["data_root"], "VISp.h5ad")
adata = ad.read_h5ad(path)

# Number of spatial nearest neighbors collected per cell (the cell itself excluded).
k = 15

# Fit a Euclidean nearest-neighbor index on the per-cell section coordinates.
xyz_coords = adata.obs[["x_section", "y_section", "z_section"]].to_numpy()
# k + 1 because querying a point that is part of the fitted data returns the point itself as well
nn = NearestNeighbors(n_neighbors=k + 1, metric="euclidean")
nn.fit(xyz_coords)


def get_top_k_neighbors(row_index):
    """Return the nearest-neighbor row indices of one row, excluding the row itself.

    Queries the module-level fitted ``nn`` model with ``xyz_coords[row_index]``
    and returns one fewer index than the model yields (i.e. ``n_neighbors - 1``
    entries), ordered as returned by ``nn.kneighbors``.

    The queried row is removed by *index* rather than by position: when several
    rows share identical coordinates, the row itself is not guaranteed to be the
    first hit, so dropping position 0 could discard a genuine neighbor and keep
    the row as its own neighbor.

    Parameters
    ----------
    row_index : int
        Position of the query row in ``xyz_coords`` (negative values count from
        the end).

    Returns
    -------
    numpy.ndarray
        1-D array of neighbor row indices.
    """
    _, indices = nn.kneighbors(xyz_coords[row_index].reshape(1, -1))
    candidates = indices[0]
    n_keep = len(candidates) - 1
    # Normalise negative indices so the comparison below matches the returned (non-negative) indices.
    self_index = row_index % len(xyz_coords)
    # If the row itself is not among the hits (more exact duplicates than hits),
    # nothing is filtered and the farthest hit is dropped by the slice instead.
    return candidates[candidates != self_index][:n_keep]


# Query every fitted point in a single batched call instead of one call per row.
# Calling kneighbors() without a query array returns neighbors of the fitted
# points themselves, and in that mode sklearn does not count a point as its own
# neighbor, so the result is already the k nearest *other* cells per row.
neighbor_indices = nn.kneighbors(n_neighbors=k, return_distance=False)
assert neighbor_indices.shape == (len(adata.obs), k)



In [2]:
# keep only non-blank entries
# na=False: a missing gene symbol is treated as non-blank instead of putting NaN into the mask
blank_genes = adata.var["gene_symbol"].str.startswith("Blank", na=False)
# adata.X may be stored sparse or dense; only sparse matrices need densifying
X_dense = adata.X.toarray() if sp.issparse(adata.X) else np.asarray(adata.X)
X_nonblank = X_dense[:, ~blank_genes.to_numpy(dtype=bool)]

# Gather each cell's neighbor expression rows via fancy indexing.
# NOTE(review): this materialises a dense (n_cells, k, n_genes) array; confirm it fits in memory for larger datasets.
X_neighbors = X_nonblank[neighbor_indices]  # shape (n_cells, n_neighbors, n_genes)
assert X_neighbors.shape == (X_nonblank.shape[0], k, X_nonblank.shape[1])
X_means = X_neighbors.mean(axis=1)  # shape (n_cells, n_genes)
X_stds = X_neighbors.std(axis=1)  # shape (n_cells, n_genes); NumPy default ddof=0

# Final feature matrix: [own expression | neighbor mean | neighbor std] per non-blank gene.
X = np.concatenate([X_nonblank, X_means, X_stds], axis=1)

In [3]:
# Feature names: the non-blank gene symbols, followed by one "<gene>_mean" and
# one "<gene>_std" entry per non-blank gene, in that order.
nonblank_symbols = adata.var["gene_symbol"][~blank_genes]
mean_names = [f"{gene}_mean" for gene in nonblank_symbols]
std_names = [f"{gene}_std" for gene in nonblank_symbols]
gene_names = np.concatenate([nonblank_symbols, mean_names, std_names])

# Assemble the output AnnData: sparse feature matrix, the original per-cell
# annotations, and a var table holding the feature names.
var_df = pd.DataFrame({"gene_symbol": gene_names})
X_sparse = sp.csr_matrix(X)
adata_neighbor_stats_v0 = ad.AnnData(X_sparse, obs=adata.obs, var=var_df)



In [4]:
# Write the augmented dataset under data_root. os.path.join is used instead of
# string concatenation so the result is valid whether or not data_root ends with a separator.
data_path = get_paths()["data_root"]
adata_neighbor_stats_v0.write_h5ad(os.path.join(data_path, "VISp_neighbor_stats_v0.h5ad"))

In [None]:
# TODO: extract patches together with their neighbors and save them to a separate file; this would make a reasonable validation set.