In [1]:
import sys
print(sys.executable)
!conda info | grep 'active env'

/sc/arion/work/massen06/.conda/envs/qc/bin/python3.8
     active environment : qc
    active env location : /sc/arion/work/massen06/.conda/envs/qc


In [2]:
import os
import copy
import pge
import anndata as ad
import scanpy as sc
import pandas as pd
import pegasus as pg
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


scanpy==1.9.3 anndata==0.9.1 umap==0.5.3 numpy==1.24.4 scipy==1.10.1 pandas==2.0.3 scikit-learn==1.3.0 statsmodels==0.14.0 python-igraph==0.10.6 louvain==0.8.1 pynndescent==0.5.10


# Mouse Living Brain - Data Loading
Nicolas Masse, Donghoon Lee  
July 2023

### Paths for metadata spreadsheet, gene info, and target path for AnnData

In [3]:
metadata_dir = "/sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/metadata"
base_dir = "/sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams"
save_dir = "/sc/arion/work/massen06/living_brain1/anndata"
gene_info_fn = "/hpc/users/massen06/MM_GRCm39_full.txt" 
mouse_mitocarta_fn = "MouseMitoCarta3.0.csv" 

### Extract metadata from Living_brain_snRNA_template.xlsx


In [4]:
metadata_fn = os.path.join(metadata_dir, "Living_brain_snRNA_template.xlsx")
metadata = pd.read_excel(metadata_fn, sheet_name="Mouse_Master Libraries")

In [5]:
def extract_donor_information(data):
    """Extract donor name, PMI and living status"""
    
    donor_info = {
        "unique_id": [],
        "SubID" : [],
        "pmi": [],
        "living": [],
    }

    for d in data.values:
        
        donor_info["unique_id"].append(d)
        
        # get donor name
        i0 = d.find("_")
        donor_info["SubID"].append(d[: i0])
        
        # get PMI
        time = int("".join([s for s in d[i0 :] if s.isdigit()]))
        if "hr" in d[i0 :]:
            time *= 60
        donor_info["pmi"].append(time)
        
        if "PM" in d or "Post" in d or time > 0:
            donor_info["living"].append(False)
        else:
            donor_info["living"].append(True)
        
    return donor_info

In [6]:
for k in metadata.keys():
    
    column_name = metadata[k].iloc[0]

    if column_name == "unique sample ID":
        donor_info = extract_donor_information(metadata[k].iloc[1 :])
        
    elif column_name == "Step 2 date":
        dates = metadata[k].iloc[1 :]
        dates = [d.strftime("%m%d%Y") for d in dates]
        donor_info["Channel"] = dates
        
    elif column_name == "cDNA Amp cycle #s":
        cycles = metadata[k].iloc[1 :]
        cycles = [int(c) for c in cycles]
        donor_info["cdna_amp_cycle"] = cycles
        
    elif column_name == "input (ng)":
        inputs = metadata[k].iloc[1 :]
        inputs = [round(float(i), 1) for i in inputs]
        donor_info["input_ng"] = inputs
        
    elif column_name == "Indexing PCR cycle #":
        cycle = metadata[k].iloc[1 :]
        cycle = [int(c) for c in cycle]
        donor_info["pcr_cycle"] = cycle
        
donor_info = pd.DataFrame(data = donor_info)

print(donor_info)  

              unique_id SubID  pmi  living   Channel  cdna_amp_cycle  \
0   WTA_0min_Living_PFC   WTA    0    True  06022023              11   
1           WTA_0min_PM   WTA    0   False  06022023              11   
2              912_0min   912    0    True  06022023              11   
3              912_5min   912    5   False  06022023              11   
4              WTF_0min   WTF    0    True  06022023              11   
5               WTF_1hr   WTF   60   False  06022023              11   
6   WTB_0min_Living_PFC   WTB    0    True  06062023              12   
7           WTB_0min_PM   WTB    0   False  06062023              12   
8              913_0min   913    0    True  06062023              12   
9              913_5min   913    5   False  06062023              12   
10      WTE_0min_Living   WTE    0    True  06062023              12   
11         WTE_6hr_Post   WTE  360   False  06062023              12   
12      WTC_0min_Living   WTC    0    True  06072023            

### Load gene info from Biomart

In [7]:
gene_info = pd.read_csv(gene_info_fn, sep='\t')
gene_info.drop_duplicates(subset = "Gene name", inplace=True)
gene_info.reset_index(inplace=True)
gene_info

  gene_info = pd.read_csv(gene_info_fn, sep='\t')


Unnamed: 0,index,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Gene start (bp),Gene end (bp),Gene name,Gene type,Gene description,Chromosome/scaffold name
0,0,ENSMUSG00000064336,ENSMUSG00000064336.1,ENSMUST00000082387,ENSMUST00000082387.1,1,68,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,MT
1,1,ENSMUSG00000064337,ENSMUSG00000064337.1,ENSMUST00000082388,ENSMUST00000082388.1,70,1024,mt-Rnr1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,MT
2,2,ENSMUSG00000064338,ENSMUSG00000064338.1,ENSMUST00000082389,ENSMUST00000082389.1,1025,1093,mt-Tv,Mt_tRNA,mitochondrially encoded tRNA valine [Source:MG...,MT
3,3,ENSMUSG00000064339,ENSMUSG00000064339.1,ENSMUST00000082390,ENSMUST00000082390.1,1094,2675,mt-Rnr2,Mt_rRNA,mitochondrially encoded 16S rRNA [Source:MGI S...,MT
4,4,ENSMUSG00000064340,ENSMUSG00000064340.1,ENSMUST00000082391,ENSMUST00000082391.1,2676,2750,mt-Tl1,Mt_tRNA,mitochondrially encoded tRNA leucine 1 [Source...,MT
...,...,...,...,...,...,...,...,...,...,...,...
56477,149518,ENSMUSG00000054766,ENSMUSG00000054766.14,ENSMUST00000134364,ENSMUST00000134364.8,29947390,29962589,Set,protein_coding,SET nuclear oncogene [Source:MGI Symbol;Acc:MG...,2
56478,149525,ENSMUSG00000026785,ENSMUSG00000026785.15,ENSMUST00000125346,ENSMUST00000125346.8,29967696,29981034,Pkn3,protein_coding,protein kinase N3 [Source:MGI Symbol;Acc:MGI:2...,2
56479,149533,ENSMUSG00000015335,ENSMUSG00000015335.17,ENSMUST00000148717,ENSMUST00000148717.8,29980956,29983660,Zdhhc12,protein_coding,"zinc finger, DHHC domain containing 12 [Source...",2
56480,149537,ENSMUSG00000039686,ENSMUSG00000039686.15,ENSMUST00000044751,ENSMUST00000044751.14,29987295,30014597,Zer1,protein_coding,"zyg-11 related, cell cycle regulator [Source:M...",2


In [8]:
def link_gene_info(adata, gene_info, verbose=False):
    """Add gene info to AnnData"""
    
    adata.var = adata.var.merge(gene_info, left_on="gene_ids", right_on='Gene stable ID', how="outer")

    good_genes_idx = [i for i, name in enumerate(adata.var["Gene name"].values) if isinstance(name, str)]
       
    new_adata = ad.AnnData(adata.X[:, good_genes_idx])
    new_adata.obs = adata.obs
    new_adata.var["gene_name"] = [v for n, v in enumerate(adata.var["Gene name"].values) if n in good_genes_idx]
    new_adata.var["gene_type"] = [v for n, v in enumerate(adata.var["Gene type"].values)  if n in good_genes_idx]
    new_adata.var["gene_id"] = [v for n, v in enumerate(adata.var["Gene stable ID"].values)  if n in good_genes_idx]
    new_adata.var["gene_chrom"] = [v for n, v in enumerate(adata.var["Chromosome/scaffold name"].values)  if n in good_genes_idx]
    new_adata.var["gene_start"] = [v for n, v in enumerate(adata.var["Gene start (bp)"].values)  if n in good_genes_idx]
    new_adata.var["gene_end"] = [v for n, v in enumerate(adata.var["Gene end (bp)"].values)  if n in good_genes_idx]
    
    print(f"Variables added. Number of genes remaining: {new_adata.X.shape[1]}")
    
    return new_adata

### Create AnnData from moise samples

In [9]:
target_dirs = [fn for fn in os.listdir(base_dir) if "Solo.out" in fn]
print(f"Number of target directories found: {len(target_dirs)}")

Number of target directories found: 18


In [10]:
adata = None

for n, t in enumerate(target_dirs):

    # Load 10x data
    data_path = os.path.join(base_dir, t, "GeneFull", "filtered")
    print(f"Creating anndata from {data_path}")
    adata_temp = sc.read_10x_mtx(data_path)
    
    # Find the matching donor ID
    idx = t.find("Solo")
    sample_id = t[3: idx - 1] # index starts at 3 to remove prefix "LB_"
    donor_idx = [i for i, donor_id in enumerate(donor_info.unique_id) if sample_id in donor_id]
    assert len(donor_idx) <= 1, "Multiple matches for sample ID"
    
    for k, v in donor_info.iloc[donor_idx[0]].items():
        adata_temp.obs[k] = v
    if adata is None:
        adata = copy.deepcopy(adata_temp)
    else:        
        # adata = ad.concat([adata, adata_temp], axis=0, merge="same")   
        adata = adata.concatenate(adata_temp)

for k, v in adata_temp.var.items():
    adata.var[k] = v

adata = link_gene_info(adata, gene_info)
adata.var.index = adata.var.gene_name

adata.var.gene_chrom = adata.var.gene_chrom.astype('str')

anndata_fn = os.path.join(save_dir, "mouse_living_brain.h5ad")
adata.write(anndata_fn)
print(f"Saved {anndata_fn}")

Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_912_0min_Solo.out/GeneFull/filtered
Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_WTE_0min_Living_Solo.out/GeneFull/filtered
Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_WTF_0min_Solo.out/GeneFull/filtered
Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_WTD_6hr_Post_Solo.out/GeneFull/filtered
Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_WTD_0min_Living_Solo.out/GeneFull/filtered
Creating anndata from /sc/arion/projects/CommonMind/single-cell-neurogenomics/LIVING_BRAIN/pre_processed_files/STARsolo_bams/LB_WTG_0min_Solo.out/GeneFull/filtere