In [4]:
import sys
print(sys.executable)
!conda info | grep 'active env'

/sc/arion/work/massen06/.conda/envs/scvi/bin/python
     active environment : scvi
    active env location : /sc/arion/work/massen06/.conda/envs/scvi


In [5]:
import os
import copy
import scvi
import anndata as ad
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse as sparse
import h5py

# Mouse Living Brain - Process Allen Brain Map Data
Nicolas Masse, Donghoon Lee  
August 2023

Allen Brain Map data was downloaded from http://portal.brain-map.org/atlases-and-data/rnaseq/mouse-whole-cortex-and-hippocampus-10x.

### Get highly variable genes from processed living brain data

In [46]:
living_brain_path = "/sc/arion/work/massen06/living_brain1/anndata/mouse_living_brain_pre_final_clean.h5ad"
adata_lb = sc.read_h5ad(living_brain_path)
var_genes_lb = [k for k, v in adata_lb.var.highly_variable_features.items() if v]

### Allen data file path names

In [24]:
allen_meta_path = "/sc/arion/projects/psychAD/massen06/allen_brain_map/metadata.csv"
allen_data_path = "/sc/arion/projects/psychAD/massen06/allen_brain_map/expression_matrix.hdf5"

### Load metadata and filter by region label
We only include cells from visual cortex, retrosplenial cortex

In [25]:
allen_meta = pd.read_csv(allen_meta_path)

In [26]:
allen_meta.region_label.unique()

array(['RSP', 'TEa-PERI-ECT', 'ACA', 'AI', 'SSs-GU-VISC-AIp', 'AUD',
       'ENT', 'MOp', 'MOs_FRP', 'PAR-POST-PRE-SUB-ProS', 'PL-ILA-ORB',
       'PTLp', 'SSp', 'VIS', 'VISl', 'VISm', 'VISp', 'HIP'], dtype=object)

In [27]:
idx = [n.startswith("VIS") or n.startswith("RSP") for n in allen_meta.region_label]
allen_meta = allen_meta[idx]

### Find Allen Brain genes that match var_genes_lb

In [51]:
with h5py.File(allen_data_path, "r") as f:
    genes_allen = f["data"]["gene"][:]

In [53]:
var_genes_lb = [v.lower() for v in var_genes_lb]

idx_allen = []
genes_allen = [g.decode('ascii').lower() for g in genes_allen]
idx_allen = [n for n, g in enumerate(genes_allen) if g in var_genes_lb]         

In [56]:
print(f"Number of Allen genes: {len(idx_allen)} and number of living brain genes: {len(var_genes_lb)}")

Number of Allen genes: 1707 and number of living brain genes: 1744


### Create the AnnData structures using only the highly variable genes

In [33]:
with h5py.File(allen_data_path, "r") as f:
    x = f["data"]["counts"][idx_allen, :]
    genes = f["data"]["gene"][idx_allen]

In [34]:
x = sparse.csr_matrix(x.T)

In [35]:
adata = ad.AnnData(x)

In [36]:
adata.var["gene_name"] = list(genes)

In [37]:
with h5py.File(allen_data_path, "r") as f:
    samples = f["data"]["samples"][:]

In [19]:
region_id = []
region_label = []
class_label = []
subclass_label = []
index = []
sample_name = np.array(allen_meta["sample_name"])
for n, s in enumerate(samples):
    s = s.decode("ascii")
    
    # no visual or RSP cells occur before index 675_000
    if n < 675_000:
        continue
    if n % 5000 == 0:
        print(n, len(samples), len(subclass_label), len(index))

    if s in sample_name:
        idx = np.where(s == sample_name)[0][0]
        region_id.append(allen_meta["region_id"].values[idx])
        class_label.append(allen_meta["class_label"].values[idx])
        subclass_label.append(allen_meta["subclass_label"].values[idx])
        region_label.append(allen_meta["region_label"].values[idx])
        index.append(n)



675000 1169320 0 0
680000 1169320 0 0
685000 1169320 3460 3460
690000 1169320 8460 8460
695000 1169320 13460 13460
700000 1169320 18460 18460
705000 1169320 23460 23460
710000 1169320 28460 28460
715000 1169320 33460 33460
720000 1169320 38460 38460
725000 1169320 43460 43460
730000 1169320 48460 48460
735000 1169320 53460 53460
740000 1169320 58460 58460
745000 1169320 63460 63460
750000 1169320 65710 65710
755000 1169320 65710 65710
760000 1169320 65710 65710
765000 1169320 65710 65710
770000 1169320 65710 65710
775000 1169320 65710 65710
780000 1169320 65710 65710
785000 1169320 65710 65710
790000 1169320 65710 65710
795000 1169320 65710 65710
800000 1169320 65710 65710
805000 1169320 65710 65710
810000 1169320 65710 65710
815000 1169320 65710 65710
820000 1169320 65710 65710
825000 1169320 65710 65710
830000 1169320 67410 67410
835000 1169320 72410 72410
840000 1169320 77410 77410
845000 1169320 82410 82410
850000 1169320 87410 87410
855000 1169320 92410 92410
860000 1169320 97410 

In [20]:
adata_sample = adata[index]

In [21]:
adata_sample.obs["region_id"] = region_id
adata_sample.obs["region_label"] = region_label
adata_sample.obs["class_label"] = class_label
adata_sample.obs["subclass_label"] = subclass_label


  adata_sample.obs["region_id"] = region_id


In [23]:
adata_sample.write("allen_brain_map_vis_rsp_v2.h5ad")