Research Notes



In [None]:
# These packages are pre-installed on Google Colab, but are included here to facilitate running this notebook locally
!pip install --quiet matplotlib
!pip install --quiet scikit-learn
!pip install --quiet numpy
!pip install --quiet scipy
!pip install --quiet pacmap
!pip install --quiet leidenalg
!pip install --quiet sinfo
# snRNA-seq analysis
!pip install --quiet scanpy
!pip install --quiet omnipath
!pip install --quiet decoupler

In [None]:
!git clone https://github.com/EugOT/CN-pr-MDD-snRNA-seq.git
%cd /content/CN-pr-MDD-snRNA-seq/

In [None]:
import os
import random
import pacmap
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import decoupler as dc

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from anndata.experimental.multi_files import AnnCollection

# Load Data

In [None]:
# Per-run sample metadata for the two BioProjects; only the columns listed here
# are kept from each tab-separated file.
sample_columns = ["Run", "Condition", "LibraryName", "BioProject", "Sex", "NTotalCells"]

samples_males = pd.read_csv("data/PRJNA602867.tsv", delimiter="\t")[sample_columns]
samples_females = pd.read_csv("data/PRJNA883411.tsv", delimiter="\t")[sample_columns]

In [None]:
# Display the sample-metadata table loaded from data/PRJNA883411.tsv.
samples_females

In [6]:
def _load_annotated_dataset(h5ad_path, sample_table, project_name):
    """Read one dataset, attach per-run sample metadata and apply basic QC filters.

    Parameters
    ----------
    h5ad_path : str
        Path of the ``.h5ad`` file to read.
    sample_table : pandas.DataFrame
        One row per sequencing run; must contain a ``Run`` column.
    project_name : str
        Stored in ``.uns["name"]`` of the returned object.

    Returns
    -------
    AnnData
        The dataset with sample metadata merged into ``.obs`` (indexed by
        ``cell_name``), restricted to cells with at least 200 detected genes and
        genes detected in at least 5 cells.

    Raises
    ------
    ValueError
        If some cells belong to a run that is absent from ``sample_table``.
    pandas.errors.MergeError
        If ``sample_table`` lists the same run more than once.
    """
    dataset = sc.read_h5ad(h5ad_path)
    dataset.obs["Run"] = dataset.obs["orig.ident"]

    # The per-cell table must be the LEFT operand of a left join: that is what
    # keeps the merged rows in the same order as the rows of the expression
    # matrix. With the sample table on the left, rows come back grouped in
    # sample-table order, and assigning them to `.obs` would attach metadata to
    # the wrong cells without any error.
    merged_obs = pd.merge(
        dataset.obs,
        sample_table,
        on="Run",
        how="left",
        validate="many_to_one",
        indicator=True,
    )
    unmatched = merged_obs["_merge"] != "both"
    if unmatched.any():
        missing_runs = sorted(merged_obs.loc[unmatched, "Run"].astype(str).unique())
        raise ValueError(
            f"{project_name}: no sample metadata for run(s) {missing_runs}"
        )

    # pd.merge discards the original index, so restore the cell names as index.
    dataset.obs = merged_obs.drop(columns="_merge").set_index("cell_name", drop=False)
    dataset.uns["name"] = project_name

    sc.pp.filter_cells(dataset, min_genes=200)
    sc.pp.filter_genes(dataset, min_cells=5)
    return dataset


males = _load_annotated_dataset(
    "data/PRJNA602867-whole_dataset-fpr_0.001-clusters.h5ad",
    samples_males,
    "PRJNA602867",
)
females = _load_annotated_dataset(
    "data/PRJNA883411-whole_dataset-fpr_0.001-clusters.h5ad",
    samples_females,
    "PRJNA883411",
)

In [8]:
# Reduce the per-cell annotations of both datasets to one shared set of columns,
# in this order.
obs_columns = [
    "cell_name",
    "background_fraction",
    "droplet_efficiency",
    "doublet_score",
    "nFeature_Diff",
    "nCount_Diff",
    "percent_mito",
    "percent_ribo",
    "percent_mito_ribo",
    "percent_hb",
    "log10GenesPerUMI",
    "k_tree",
    "Run",
    "Condition",
    "BioProject",
    "Sex",
    "n_genes",
]

males.obs = males.obs.loc[:, obs_columns]
females.obs = females.obs.loc[:, obs_columns]

In [9]:
# Flag the 5000 most variable genes (Pearson-residual flavor) in each dataset.
for dataset in (males, females):
    sc.experimental.pp.highly_variable_genes(
        dataset, flavor="pearson_residuals", n_top_genes=5000
    )

# Prefix each cell's k_tree label with its Sex value ("<Sex>_<k_tree>").
# Re-running this cell prepends the prefix a second time.
for dataset in (males, females):
    sex_label = dataset.obs["Sex"].astype(str)
    tree_label = dataset.obs["k_tree"].astype(str)
    dataset.obs["k_tree"] = sex_label + "_" + tree_label

In [None]:

# Stack both datasets cell-wise; join="inner" is passed for the gene axis.
adata = ad.concat([males, females], join="inner")

# Keep a copy of the matrix as it is right after concatenation.
adata.layers["raw"] = adata.X.copy()

# Square root of the depth-normalised matrix, stored as its own layer;
# inplace=False leaves adata.X untouched.
depth_normalized = sc.pp.normalize_total(adata, inplace=False)["X"]
adata.layers["sqrt_norm"] = np.sqrt(depth_normalized)

# Store the current state in .raw before further processing.
adata.raw = adata

# Pearson-residual recipe with 2000 top genes, using "Run" as batch key. The next
# cells read adata.var["highly_variable"] and adata.obsm["X_pca"] afterwards.
sc.experimental.pp.recipe_pearson_residuals(adata, n_top_genes=2000, batch_key="Run")


# Check quality of data

In [None]:
# Neither `hvgs` nor `n_cells` is referenced again in the cells shown in this
# notebook; both are kept in case other code relies on them.
hvgs = adata.var["highly_variable"]

# 2-D PaCMAP embedding of the PCA coordinates (apply_pca=False because the input
# is already PCA-reduced). A fixed random_state makes the embedding reproducible
# across kernel restarts; without it every run yields a different layout.
embedding = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=None,
    MN_ratio=0.5,
    FP_ratio=2.0,
    apply_pca=False,
    random_state=0,
)
adata.obsm["X_pacmap"] = embedding.fit_transform(adata.obsm["X_pca"], init="pca")
n_cells = len(adata)

# Neighbor graph on the first 50 PCs, followed by UMAP and Leiden clustering
# (the cluster labels end up in adata.obs["leiden"], which later cells use).
sc.pp.neighbors(adata, n_neighbors=20, n_pcs=50, method='umap')
sc.tl.umap(adata, method='umap')
sc.tl.leiden(adata)

In [15]:
# Normalize with count depth scaling (each cell scaled to a total of 1e4) and
# apply log1p.
# Both calls overwrite adata.X, so running this cell a second time applies the
# transformation again on top of the already transformed values.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


In [None]:
# Rank genes per Leiden cluster with logistic regression.
# use_raw=False is required here: `adata.raw` was set earlier, and with the
# default setting scanpy would rank on `adata.raw` instead of the normalised,
# log-transformed adata.X prepared for this step.
# NOTE(review): key_added="leiden" stores the ranking under adata.uns["leiden"];
# this presumably replaces whatever sc.tl.leiden stored under that key - confirm
# that nothing downstream needs it. The plotting cell reads the ranking from this
# key, so the name is left unchanged.
sc.tl.rank_genes_groups(
    adata,
    "leiden",
    method="logreg",
    key_added="leiden",
    use_raw=False,
)

In [None]:
# Plot the gene ranking stored under the "leiden" key by sc.tl.rank_genes_groups.
sc.pl.rank_genes_groups(adata, key='leiden')

In [22]:
# Leiden clustering and UMAP both need a neighbor graph. No cell shown in this
# notebook builds one for `females`, so unless the loaded h5ad already carries a
# graph this cell fails on a fresh kernel. Build it on demand, using the same
# number of top genes as the earlier HVG call on `females` and the same neighbor
# settings as for the combined object.
# NOTE(review): the results of this cell are not used by any later cell shown
# here - confirm whether it is still needed.
if "neighbors" not in females.uns:
    sc.experimental.pp.recipe_pearson_residuals(females, n_top_genes=5000)
    sc.pp.neighbors(females, n_neighbors=20, n_pcs=50, method="umap")

sc.tl.leiden(females, flavor="igraph", n_iterations=2)

sc.tl.umap(females)

In [None]:
# The QC columns plotted below are not created by any earlier cell, and they are
# not among the obs columns retained when the datasets were loaded, so plotting
# would fail with a missing-key error. Compute them from the untouched counts in
# the "raw" layer when absent.
qc_columns = ["log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"]
if not set(qc_columns).issubset(adata.obs.columns):
    # Mitochondrial genes are identified by the "MT-" symbol prefix
    # (matched case-insensitively).
    adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=["mt"],
        layer="raw",
        percent_top=None,
        log1p=True,
        inplace=True,
    )

# UMAP coloured by cluster and by the three QC metrics, two panels per row.
sc.pl.umap(
    adata,
    color=["leiden", *qc_columns],
    wspace=0.5,
    ncols=2,
)

# Marker Genes

In [None]:
# Query Omnipath for the PanglaoDB resource and display the returned table.
markers = dc.get_resource("PanglaoDB")
markers

In [None]:
# Keep only entries flagged as human and as canonical marker.
# The flags are compared through their string form so that the filter works both
# when the columns hold booleans and when they hold the strings "True"/"False";
# comparing with `== True` silently yields an empty table in the latter case.
is_human = markers["human"].astype(str) == "True"
is_canonical = markers["canonical_marker"].astype(str) == "True"
markers = markers[is_human & is_canonical]


In [None]:
# Remove duplicated (cell_type, genesymbol) pairs, keeping the first occurrence,
# and add a column with the capitalised gene symbol (first letter upper case,
# the rest lower case).
# drop_duplicates/assign return new frames, which avoids writing a column into a
# filtered slice of the previous `markers` table (SettingWithCopyWarning).
markers = (
    markers
    .drop_duplicates(subset=["cell_type", "genesymbol"])
    .assign(genesymbol_hs=lambda frame: frame["genesymbol"].str.capitalize())
)
markers

In [None]:
# Over Representation Analysis of the marker sets: each cell type ("cell_type")
# is a gene set made of its marker symbols ("genesymbol"); use_raw=True is
# passed so the analysis reads from adata.raw.
ora_settings = dict(
    source="cell_type",
    target="genesymbol",
    min_n=3,
    verbose=False,
    use_raw=True,
)
dc.run_ora(mat=adata, net=markers, **ora_settings)

# Wrap the ORA estimates stored in adata.obsm["ora_estimate"] in their own object
# so they can be plotted like expression values.
acts = dc.get_acts(adata, obsm_key="ora_estimate")
acts

In [None]:
# ORA scores of selected cell types drawn on the PaCMAP embedding, four panels
# per row, with a 6x3 figure size applied only inside this block.
cell_types_to_plot = [
    "Astrocytes",
    "Neuroblasts",
    "Neurons",
    "Pyramidal cells",
    "Oligodendrocytes",
    "Oligodendrocyte progenitor cells",
    "Endothelial cells",
    "Mast cells",
]
with rc_context({'figure.figsize': (6, 3)}):
    sc.pl.embedding(acts, basis="X_pacmap", color=cell_types_to_plot, ncols=4)

In [None]:
# Annotation: summarise the ORA scores per Leiden cluster, then let decoupler
# assign one label to each cluster. The resulting dict is indexed by the cluster
# label as a string in the next cell.
mean_enr = dc.summarize_acts(acts, groupby="leiden", min_std=1)
annotation_dict = dc.assign_groups(mean_enr)
annotation_dict

In [None]:
# Translate every cell's Leiden cluster into the cell type assigned to that
# cluster and store the result as a new obs column.
cluster_of_cell = adata.obs["leiden"]
celltype_of_cell = []
for cluster_label in cluster_of_cell:
    celltype_of_cell.append(annotation_dict[str(cluster_label)])
adata.obs["ora_celltype"] = celltype_of_cell

In [None]:
# PaCMAP embedding coloured by the assigned cell type, with labels drawn on the
# data points.
pacmap_label_style = dict(
    add_outline=True,
    legend_loc="on data",
    legend_fontsize=12,
    legend_fontoutline=2,
    frameon=False,
    palette="tab20",
)
sc.pl.embedding(
    adata,
    basis="X_pacmap",
    color="ora_celltype",
    title="PaCMAP: {feature}".format(feature="ora_celltype"),
    **pacmap_label_style,
)

In [None]:
# UMAP embedding coloured by the assigned cell type, with labels drawn on the
# data points.
umap_label_style = dict(
    add_outline=True,
    legend_loc="on data",
    legend_fontsize=12,
    legend_fontoutline=2,
    frameon=False,
    palette="tab20",
)
sc.pl.umap(
    adata,
    color="ora_celltype",
    title="UMAP: {feature}".format(feature="ora_celltype"),
    **umap_label_style,
)

In [None]:
# Number of cells per Sex among the cells whose assigned cell type is "Neurons".
is_neuron = adata.obs["ora_celltype"] == "Neurons"
neurons = adata[is_neuron]
neurons.obs.groupby("Sex")["cell_name"].count().reset_index()

In [None]:

# Keep only cells that express the PVALB gene (value > 0 in adata.X).
# obs_vector returns the gene's values as a 1-D array regardless of whether
# adata.X is stored sparse or dense, so the mask always has one boolean per cell.
mask = adata.obs_vector("PVALB") > 0
only_pvalb_expressing_cells = adata[mask]

# Of those, keep only the cells whose assigned cell type is "Neurons".
only_pvalb_expressing_neurons = only_pvalb_expressing_cells[
    only_pvalb_expressing_cells.obs["ora_celltype"] == "Neurons"
]
# The historical, misspelled name is kept as an alias because the following cell
# refers to it.
only_ovalb_expressing_neurons = only_pvalb_expressing_neurons


In [None]:
# Count the PVALB-expressing neurons for every combination of Sex and Condition.
pvalb_neuron_obs = only_ovalb_expressing_neurons.obs
cluster_condition_counts = (
    pvalb_neuron_obs
    .groupby(["Sex", "Condition"])["cell_name"]
    .count()
    .reset_index()
)
cluster_condition_counts