In [2]:
# Analysis of 10x Multiome RNA+ATAC data with Python
# Based on the Seurat WNN and ATAC-seq integration vignettes
# Implements weighted nearest neighbor analysis for multimodal data

import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import muon as mu
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Input locations: everything is read from one directory, relative to the notebook
data_dir = '../multiome_data'
fragment_file = os.path.join(data_dir, 'pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz')
h5_file = os.path.join(data_dir, 'pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5')

# Scanpy configuration: logging verbosity 3, frameless figures rendered at 100 dpi
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100, frameon=False)

In [3]:
# 1. Load the data
print("Loading the 10x Multiome data...")
# muon reads the 10x HDF5 file into a single multimodal container
mdata = mu.read_10x_h5(h5_file)

# Pull each modality out of the container so the two assays can be handled separately
rna, atac = (mdata.mod[modality] for modality in ('rna', 'atac'))

Loading the 10x Multiome data...
reading ../multiome_data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5
 (0:00:05)
Added `interval` annotation for features from ../multiome_data/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5


AnnData object with n_obs × n_vars = 11909 × 108377
    var: 'gene_ids', 'feature_types', 'genome', 'interval'

In [6]:
# 2. Quality control
print("Performing quality control...")

# QC thresholds (exclusive bounds on features detected per cell, and max MT percentage)
RNA_MIN_GENES, RNA_MAX_GENES = 1000, 25000
RNA_MAX_PERCENT_MT = 20
ATAC_MIN_PEAKS, ATAC_MAX_PEAKS = 5000, 70000

# Flag mitochondrial genes *before* computing metrics so scanpy can report their
# share of counts itself; this avoids hand-rolled sums over a sparse matrix,
# which return an (n, 1) np.matrix rather than a flat vector.
rna.var['mt'] = rna.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(rna, qc_vars=['mt'], inplace=True)
# scanpy writes the percentage to 'pct_counts_mt'; keep the 'percent_mt' name used below
rna.obs['percent_mt'] = rna.obs['pct_counts_mt']

# Calculate QC metrics for ATAC ('n_genes_by_counts' here counts peaks, not genes)
sc.pp.calculate_qc_metrics(atac, inplace=True)

# Plot QC metrics
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
sns.histplot(rna.obs['n_genes_by_counts'], bins=50, ax=axs[0])
axs[0].set_title('RNA: Genes per cell')
sns.histplot(rna.obs['percent_mt'], bins=50, ax=axs[1])
axs[1].set_title('RNA: MT percent')
sns.histplot(atac.obs['n_genes_by_counts'], bins=50, ax=axs[2])
axs[2].set_title('ATAC: Peaks per cell')
fig.tight_layout()
fig.savefig('qc_metrics.png')
plt.close(fig)

# Filter cells based on QC metrics
print("Filtering cells based on QC metrics...")
rna_pass = (
    (rna.obs['n_genes_by_counts'] > RNA_MIN_GENES) &
    (rna.obs['n_genes_by_counts'] < RNA_MAX_GENES) &
    (rna.obs['percent_mt'] < RNA_MAX_PERCENT_MT)
)
rna = rna[rna_pass.to_numpy()].copy()

atac_pass = (
    (atac.obs['n_genes_by_counts'] > ATAC_MIN_PEAKS) &
    (atac.obs['n_genes_by_counts'] < ATAC_MAX_PEAKS)
)
atac = atac[atac_pass.to_numpy()].copy()

# Barcodes passing QC in both modalities, kept in RNA order.
# (A set intersection has arbitrary order, which would make the row order of
# every downstream matrix differ from run to run.)
common_cells = rna.obs_names[rna.obs_names.isin(atac.obs_names)]
print(f"Number of cells after QC: {len(common_cells)}")

# Filter to keep only common cells; both objects end up with identical row order
rna = rna[common_cells].copy()
atac = atac[common_cells].copy()

Performing quality control...
Filtering cells based on QC metrics...
Number of cells after QC: 9186


In [7]:
# 3. Process RNA data
print("Processing RNA data...")
# Keep the raw counts: the steps below overwrite .X in place. Restoring from the
# saved layer also makes this cell safe to re-run without normalising twice.
if 'counts' in rna.layers:
    rna.X = rna.layers['counts'].copy()
else:
    rna.layers['counts'] = rna.X.copy()

# Normalize RNA data
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)
sc.pp.highly_variable_genes(rna, n_top_genes=3000)

# Freeze the log-normalised values before scaling; sc.pp.scale zero-centres and
# clips .X, after which expression levels can no longer be recovered from it.
rna.raw = rna

sc.pp.scale(rna, max_value=10)
sc.tl.pca(rna, svd_solver='arpack', n_comps=50)
sc.pp.neighbors(rna, n_neighbors=30, n_pcs=50)
sc.tl.umap(rna, min_dist=0.3)

Processing RNA data...
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
... as `zero_center=True`, sparse input is densified and may lead to large memory consumption
computing PCA
    with n_comps=50
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:20)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm)
    'umap', UMAP parameters (adata.uns) (0:00:17)


In [8]:
# 4. Process ATAC data
print("Processing ATAC data...")
# TF-IDF normalization for ATAC data
# Similar to RunTFIDF in Signac
def run_tfidf(adata):
    """
    TF-IDF transform a cells x regions count matrix.

    TF is each cell's counts divided by that cell's total counts; IDF is
    ``log1p(n_cells / (1 + total counts of the region))``. The result is
    ``TF * IDF``.

    Parameters:
    -----------
    adata : object exposing ``.X`` (scipy sparse or array-like, cells x regions)
        and ``.shape`` (e.g. an AnnData). ``adata.X`` is not modified.

    Returns:
    --------
    tfidf : scipy.sparse.csr_matrix if ``adata.X`` is sparse, else numpy.ndarray
        Same shape as ``adata.X``. Rows of cells with zero total counts stay zero.
    """
    n_cells = adata.shape[0]
    is_sparse = sparse.issparse(adata.X)
    # Keep a floating input dtype (avoids doubling memory for float32 data);
    # integer counts are promoted so the division below is not truncated.
    in_dtype = adata.X.dtype
    dtype = in_dtype if np.issubdtype(in_dtype, np.floating) else np.float64

    if is_sparse:
        tfidf = sparse.csr_matrix(adata.X, dtype=dtype, copy=True)
    else:
        tfidf = np.array(adata.X, dtype=dtype, copy=True)

    counts_per_cell = np.asarray(tfidf.sum(axis=1), dtype=np.float64).ravel()
    counts_per_region = np.asarray(tfidf.sum(axis=0), dtype=np.float64).ravel()

    # Inverse document frequency, one value per region
    idf = np.log1p(n_cells / (1 + counts_per_region))
    # Term-frequency scaling, one value per cell; empty cells get 0 instead of a division by zero
    inv_depth = np.divide(
        1.0, counts_per_cell,
        out=np.zeros_like(counts_per_cell),
        where=counts_per_cell > 0,
    )

    if is_sparse:
        # Scale only the stored values: each CSR entry is multiplied by its row's
        # TF factor and its column's IDF, with no per-row Python loop.
        tfidf.data *= np.repeat(inv_depth, np.diff(tfidf.indptr))
        tfidf.data *= idf[tfidf.indices]
    else:
        tfidf *= inv_depth[:, None]
        tfidf *= idf[None, :]

    return tfidf

# Apply TF-IDF
atac.layers['tfidf'] = run_tfidf(atac)

# Run dimensionality reduction
# Similar to RunSVD in Signac
# TruncatedSVD works directly on scipy sparse input, so the cells x peaks matrix
# is NOT densified here (a dense copy of ~10k cells x ~100k peaks needs several GB).
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=0)  # fixed seed: the default solver is randomized
X_lsi = svd.fit_transform(atac.layers['tfidf'])
atac.obsm['X_lsi'] = X_lsi

# Use components 2-50 (skip first component as it often correlates with sequencing depth)
atac.obsm['X_lsi_used'] = X_lsi[:, 1:50]
sc.pp.neighbors(atac, use_rep='X_lsi_used', n_neighbors=30)
sc.tl.umap(atac, min_dist=0.3)


Processing ATAC data...


: 

In [None]:
# 5. Generate gene activity scores from ATAC data using proper genomic annotations
print("Generating gene activity scores from ATAC data...")

# In Python, we can use packages like pyranges to work with genomic intervals
import pyranges as pr
import gffutils
from pyranges import PyRanges

# Function to calculate gene activity scores from ATAC data
def calculate_gene_activity(atac_data, fragments_file, annotations_file, 
                           promoter_region=2000):
    """
    Calculate gene activity scores from ATAC-seq data

    For every cell, counts the fragments that overlap each gene body extended
    by `promoter_region` bases upstream of the TSS.

    Parameters:
    -----------
    atac_data : AnnData
        AnnData object containing ATAC-seq data; only `obs_names` (the cell
        barcodes to keep) is read.
    fragments_file : str
        Path to a tab-separated fragments file whose first four columns are
        chromosome, start, end, barcode. Lines starting with '#' are skipped.
    annotations_file : str
        Path to a tab-separated gene annotations file with the columns
        'chromosome', 'start', 'end', 'gene_name' and 'strand'.
    promoter_region : int
        Size of promoter region upstream of TSS

    Returns:
    --------
    gene_activity : scipy.sparse.csr_matrix
        Cells x genes fragment counts; rows follow `atac_data.obs_names`.
    gene_names : list
        Gene names, one per column of `gene_activity`.

    Raises:
    -------
    ValueError
        If the annotations file lacks one of the required columns.
    """
    # Load gene annotations
    annotations = pd.read_csv(annotations_file, sep="\t")
    required_columns = ["chromosome", "start", "end", "gene_name", "strand"]
    missing = [col for col in required_columns if col not in annotations.columns]
    if missing:
        raise ValueError(
            f"Annotation file {annotations_file!r} is missing required columns: {missing}"
        )

    gene_names = annotations["gene_name"].unique().tolist()
    cells = atac_data.obs_names.tolist()
    n_cells, n_genes = len(cells), len(gene_names)

    # "Upstream" depends on strand: the TSS is `start` on '+' genes and `end` on '-' genes
    on_minus = (annotations["strand"] == "-").to_numpy()
    gene_starts = annotations["start"].to_numpy() - np.where(on_minus, 0, promoter_region)
    gene_ends = annotations["end"].to_numpy() + np.where(on_minus, promoter_region, 0)
    genes = PyRanges(pd.DataFrame({
        "Chromosome": annotations["chromosome"].astype(str).to_numpy(),
        "Start": np.clip(gene_starts, 0, None),  # do not run off the chromosome start
        "End": gene_ends,
        # Column index in the output matrix; rows sharing a gene_name share a column
        "GeneIdx": pd.Index(gene_names).get_indexer(annotations["gene_name"]),
    }))

    barcode_index = pd.Index(cells)
    gene_activity = sparse.csr_matrix((n_cells, n_genes), dtype=np.float64)

    # Fragment files are large, so stream them in chunks instead of loading
    # everything at once. pandas is used rather than pr.read_bed so that '#'
    # header lines are skipped and the barcode column keeps an explicit name.
    fragment_chunks = pd.read_csv(
        fragments_file,
        sep="\t",
        comment="#",
        header=None,
        names=["Chromosome", "Start", "End", "Barcode", "Count"],
        usecols=["Chromosome", "Start", "End", "Barcode"],
        dtype={"Chromosome": str, "Start": np.int64, "End": np.int64, "Barcode": str},
        chunksize=5_000_000,
    )
    for chunk in fragment_chunks:
        cell_idx = barcode_index.get_indexer(chunk["Barcode"])
        in_dataset = cell_idx >= 0  # -1 marks barcodes that are not in atac_data
        if not in_dataset.any():
            continue

        fragments = chunk.loc[in_dataset, ["Chromosome", "Start", "End"]].copy()
        fragments["CellIdx"] = cell_idx[in_dataset]
        fragments["FragIdx"] = np.arange(len(fragments))

        # Inner join: one row per (fragment, overlapping annotation row)
        overlaps = PyRanges(fragments).join(genes, strandedness=False).df
        if overlaps.empty:
            continue
        # A fragment hitting several annotation rows of the same gene counts once
        overlaps = overlaps.drop_duplicates(["FragIdx", "GeneIdx"])

        # COO -> CSR conversion sums repeated (cell, gene) pairs into counts
        gene_activity = gene_activity + sparse.coo_matrix(
            (
                np.ones(len(overlaps), dtype=np.float64),
                (overlaps["CellIdx"].to_numpy(), overlaps["GeneIdx"].to_numpy()),
            ),
            shape=(n_cells, n_genes),
        ).tocsr()

    return gene_activity.tocsr(), gene_names

# Calculate gene activity
gene_activity, gene_names = calculate_gene_activity(
    atac, 
    fragment_file,
    os.path.join(data_dir, 'pbmc_granulocyte_sorted_10k_atac_peak_annotation.tsv')
)

# Create gene activity matrix (cells x genes) as its own AnnData
gene_activity_adata = ad.AnnData(X=gene_activity)
gene_activity_adata.var_names = gene_names
gene_activity_adata.obs_names = atac.obs_names

# Attach the gene activity to the ATAC object.
# AnnData layers must have the same shape as .X (cells x peaks), so a
# cells x genes matrix cannot be a layer; per-cell matrices of any width
# belong in .obsm. The column labels are kept alongside in .uns.
atac.obsm['gene_activity'] = gene_activity
atac.uns['gene_activity_names'] = list(gene_names)

In [None]:
# 6. Find anchors between RNA and ATAC datasets using MNN
print("Finding anchors between RNA and ATAC datasets...")

# Cross-modal neighbour search only makes sense if both matrices have the same
# columns in the same order: use the highly variable genes that also have a
# gene activity score.
shared_genes = rna.var_names[rna.var['highly_variable'].to_numpy()].intersection(
    gene_activity_adata.var_names
)
if len(shared_genes) == 0:
    raise ValueError("No highly variable RNA genes are present in the gene activity matrix")
# Row i must be the same cell in both matrices for the per-cell weights computed later
if not rna.obs_names.equals(gene_activity_adata.obs_names):
    raise ValueError("RNA and gene activity matrices do not list the same cells in the same order")

# Extract RNA expression and gene activity matrices
rna_expr = rna[:, shared_genes].X
if sparse.issparse(rna_expr):
    rna_expr = rna_expr.toarray()
rna_expr = np.asarray(rna_expr, dtype=np.float64)

atac_activity = gene_activity_adata[:, shared_genes].X
if sparse.issparse(atac_activity):
    atac_activity = atac_activity.toarray()
atac_activity = np.asarray(atac_activity, dtype=np.float64)

# Put gene activity on the same footing as the scaled RNA matrix: depth-normalise
# to 1e4 per cell, log1p, then z-score each gene and clip at 10. Otherwise the
# Euclidean distances below would be dominated by the raw-count scale.
activity_depth = atac_activity.sum(axis=1, keepdims=True)
atac_activity = np.log1p(atac_activity / np.where(activity_depth > 0, activity_depth, 1.0) * 1e4)
activity_mean = atac_activity.mean(axis=0)
activity_std = atac_activity.std(axis=0)
activity_std[activity_std == 0] = 1.0  # constant genes become all-zero columns instead of NaN
atac_activity = np.clip((atac_activity - activity_mean) / activity_std, None, 10)

# Find nearest neighbors between modalities
k = 30  # Number of neighbors to consider

# Find nearest neighbors from RNA to ATAC
nn_rna_to_atac = NearestNeighbors(n_neighbors=k)
nn_rna_to_atac.fit(atac_activity)
rna_to_atac_distances, rna_to_atac_indices = nn_rna_to_atac.kneighbors(rna_expr)

# Find nearest neighbors from ATAC to RNA
nn_atac_to_rna = NearestNeighbors(n_neighbors=k)
nn_atac_to_rna.fit(rna_expr)
atac_to_rna_distances, atac_to_rna_indices = nn_atac_to_rna.kneighbors(atac_activity)

In [None]:
# 7. Calculate cell-specific modality weights (simplified version of Seurat's approach)
print("Calculating cell-specific modality weights...")
# In Seurat, these weights are calculated based on the entropy of the neighborhood graph
# Here we'll implement a simplified version

# Defined here so the cell does not depend on a variable left over from another cell
n_cells = rna.n_obs

# Mean distance from each cell to its k cross-modal neighbours, one value per cell
rna_dist = np.asarray(atac_to_rna_distances).mean(axis=1)
atac_dist = np.asarray(rna_to_atac_distances).mean(axis=1)
total_dist = rna_dist + atac_dist

# Convert distances to weights (inverse relationship): each modality receives the
# other modality's share of the total, so the two weights sum to 1 per cell.
# Cells whose distances are all zero fall back to an even 0.5 / 0.5 split.
rna_weights = np.full(n_cells, 0.5)
atac_weights = np.full(n_cells, 0.5)
has_distance = total_dist > 0
rna_weights[has_distance] = atac_dist[has_distance] / total_dist[has_distance]
atac_weights[has_distance] = rna_dist[has_distance] / total_dist[has_distance]

# Store weights in AnnData objects
for modality_adata in (rna, atac):
    modality_adata.obs['rna_weight'] = rna_weights
    modality_adata.obs['atac_weight'] = atac_weights

In [None]:
# 8. Create weighted nearest neighbor graph
print("Creating weighted nearest neighbor graph...")
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_distances
from umap import UMAP

# Defined here so the cell does not depend on a variable left over from another cell
n_cells = rna.n_obs
NON_NEIGHBOR_DISTANCE = 1e6      # distance assigned to pairs that are not kNN neighbours
k_wnn = min(15, n_cells - 1)     # Number of neighbors for the WNN graph


def _dense_knn_distances(indices, distances, fill_value):
    """Scatter per-cell kNN distances into a dense (n x n) matrix; all other pairs get `fill_value`."""
    n = indices.shape[0]
    dense = np.full((n, n), fill_value, dtype=np.float64)
    dense[np.arange(n)[:, None], indices] = distances
    return dense


# Combine RNA and ATAC distances using weights:
#   weighted[i, j] = rna_weight[i] * rna_dist[i, j] + atac_weight[i] * atac_dist[i, j]
# This is a simplified demonstration, not the actual Seurat algorithm. It is
# built with array scatters, since a Python loop over all n^2 pairs is infeasible
# for thousands of cells. Weights are per row, so the matrix is not symmetric.
weighted_distances = _dense_knn_distances(
    atac_to_rna_indices, atac_to_rna_distances, NON_NEIGHBOR_DISTANCE
)
weighted_distances *= np.asarray(rna_weights, dtype=np.float64)[:, None]
atac_part = _dense_knn_distances(
    rna_to_atac_indices, rna_to_atac_distances, NON_NEIGHBOR_DISTANCE
)
atac_part *= np.asarray(atac_weights, dtype=np.float64)[:, None]
weighted_distances += atac_part
del atac_part  # release the temporary n x n buffer
# A cell is at distance 0 from itself
np.fill_diagonal(weighted_distances, 0.0)

# Create a new AnnData object for the WNN results
wnn = ad.AnnData(X=sparse.csr_matrix((n_cells, 1)))
wnn.obs_names = rna.obs_names
wnn.obs['rna_weight'] = np.asarray(rna_weights)
wnn.obs['atac_weight'] = np.asarray(atac_weights)

# For each cell, find its k nearest neighbors based on weighted distances.
# Self is excluded explicitly rather than by assuming it sorts first.
neighbor_idx = np.empty((n_cells, k_wnn), dtype=np.intp)
for i in range(n_cells):
    row = weighted_distances[i].copy()
    row[i] = np.inf
    nearest = np.argpartition(row, k_wnn - 1)[:k_wnn]
    neighbor_idx[i] = nearest[np.argsort(row[nearest])]

# Store the graph in AnnData as sparse kNN matrices (k entries per row)
graph_rows = np.repeat(np.arange(n_cells), k_wnn)
graph_cols = neighbor_idx.ravel()
wnn.obsp['connectivities'] = sparse.csr_matrix(
    (np.ones(graph_rows.size), (graph_rows, graph_cols)), shape=(n_cells, n_cells)
)
wnn.obsp['distances'] = sparse.csr_matrix(
    (weighted_distances[graph_rows, graph_cols], (graph_rows, graph_cols)),
    shape=(n_cells, n_cells),
)

# Run UMAP on the WNN distances
standard_embedding = UMAP(
    n_neighbors=k_wnn,
    n_components=2,
    metric='precomputed',
    min_dist=0.3,
    random_state=42
).fit_transform(weighted_distances)

wnn.obsm['X_umap'] = standard_embedding


In [None]:
# 9. Clustering based on WNN graph
print("Performing clustering based on WNN graph...")
from sklearn.cluster import AgglomerativeClustering, KMeans
from scipy.cluster.hierarchy import fcluster, linkage

# Complete-linkage hierarchical clustering, fed the weighted distance matrix
# directly (metric='precomputed') and cut at 15 clusters, roughly the number
# in the Seurat tutorial
clustering = AgglomerativeClustering(
    linkage='complete',
    metric='precomputed',
    n_clusters=15,
)
cluster_labels = clustering.fit_predict(weighted_distances)

# Labels are stored as strings so they are handled as categories, not numbers
wnn.obs['clusters'] = cluster_labels.astype(str)

In [None]:
# 10. Manual annotation of clusters (as shown in the Seurat tutorial)
print("Annotating clusters based on marker genes...")
# Placeholder annotation: a real analysis would name clusters from their marker
# genes. Here the names below are handed out by position, for demonstration only.
cell_types = [
    "CD14 Mono", "CD16 Mono", "CD4 Naive", "CD4 TCM", 
    "CD4 TEM", "CD8 Naive", "CD8 TEM_1", "CD8 TEM_2", 
    "Naive B", "Intermediate B", "Memory B", "Plasma", 
    "NK", "pDC", "HSPC"
]

# The i-th cluster label (in np.unique's sorted order) gets the i-th name;
# clusters beyond the end of the list fall back to a generic "Cluster <label>".
clusters = np.unique(wnn.obs['clusters'])
cluster_to_celltype = {
    label: (cell_types[position] if position < len(cell_types) else f"Cluster {label}")
    for position, label in enumerate(clusters)
}

wnn.obs['cell_type'] = wnn.obs['clusters'].map(cluster_to_celltype)

In [None]:
# 11. Visualizations
print("Creating visualizations...")
# Plot UMAPs for RNA, ATAC, and WNN
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

sc.pl.umap(rna, color='rna_weight', ax=axs[0], show=False, title='RNA UMAP')
sc.pl.umap(atac, color='atac_weight', ax=axs[1], show=False, title='ATAC UMAP')

# Plot WNN UMAP with cell types
sc.pl.embedding(wnn, 'X_umap', color='cell_type', ax=axs[2], show=False, title='WNN UMAP')

fig.tight_layout()
fig.savefig('umap_comparison.png', dpi=300)
plt.close(fig)

# Plot RNA weights for different cell types.
# The cell-type labels live on `wnn` and the weights on `rna`; both objects list
# the same cells in the same order, so combine them by position into one frame.
weights_by_cell_type = pd.DataFrame({
    'cell_type': wnn.obs['cell_type'].to_numpy(),
    'rna_weight': rna.obs['rna_weight'].to_numpy(),
})
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='cell_type', y='rna_weight', data=weights_by_cell_type, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_title('RNA weights by cell type')
fig.tight_layout()
fig.savefig('rna_weights_by_celltype.png', dpi=300)
plt.close(fig)


In [None]:
# 12. Simulate (or calculate) chromVAR motif activities
print("Simulating motif activities...")
# In a real analysis, we would use chromVAR to calculate motif enrichment
# Here we'll create simulated motif activities for demonstration
n_motifs = 50  # Number of motifs to simulate
motif_names = [f"Motif_{i}" for i in range(n_motifs)]
# Defined here so the cell does not depend on a variable left over from another cell
n_cells = wnn.n_obs

# Simulate motif activities (standard normal); seeded so reruns give the same values
motif_rng = np.random.default_rng(42)
motif_activities = motif_rng.normal(0, 1, (n_cells, n_motifs))
motif_adata = ad.AnnData(X=motif_activities)
motif_adata.var_names = motif_names
motif_adata.obs_names = wnn.obs_names
# AnnData layers must have the same shape as .X (cells x peaks); a cells x motifs
# matrix is a per-cell annotation and therefore goes in .obsm
atac.obsm['motif_activities'] = motif_activities
atac.uns['motif_names'] = list(motif_names)

In [None]:
# 13. Find marker motifs and marker genes for each cell type
print("Finding marker genes and motifs...")
# In a real analysis, we would run differential expression tests
# Here we'll simulate some markers for demonstration

# Function to find markers for each cell type
def find_markers(adata, groupby='cell_type', n_top=5):
    """
    Rank features per group by how much higher their mean is inside the group.

    The score is the difference ``mean(group) - mean(all other cells)``. A
    difference is used rather than a ratio because the input may be
    zero-centred (scaled expression, z-scored motif activity); there a ratio
    of means flips sign or blows up when the reference mean is near zero.

    Parameters:
    -----------
    adata : object exposing ``.X`` (cells x features, dense or scipy sparse),
        ``.obs`` (DataFrame with a `groupby` column) and ``.var_names``.
    groupby : str
        Column of ``adata.obs`` holding the group label of each cell.
    n_top : int
        Number of top-scoring features to report per group.

    Returns:
    --------
    markers : dict
        Maps each group label to a list of up to `n_top` feature names, best
        first. A group that contains every cell has no reference population
        and maps to an empty list.
    """
    markers = {}
    labels = adata.obs[groupby].to_numpy()
    for cell_type in adata.obs[groupby].unique():
        # Plain boolean array: indexes both numpy arrays and scipy sparse matrices
        in_group = labels == cell_type
        if in_group.all():
            markers[cell_type] = []
            continue

        # np.asarray(...).ravel() flattens the 1 x n np.matrix that sparse .mean() returns
        mean_expr_group = np.asarray(adata.X[in_group].mean(axis=0)).ravel()
        mean_expr_other = np.asarray(adata.X[~in_group].mean(axis=0)).ravel()

        # Difference of means; stable sort keeps ties in feature order
        mean_difference = mean_expr_group - mean_expr_other
        top_marker_indices = np.argsort(-mean_difference, kind='stable')[:n_top]
        markers[cell_type] = adata.var_names[top_marker_indices].tolist()

    return markers

# Create a merged object with RNA expression and motif activities for markers
# Restrict RNA to the highly variable genes; the same mask selects the matching
# names, so columns and labels cannot get out of step.
hvg_mask = rna.var['highly_variable'].to_numpy()
hvg_names = rna.var_names[hvg_mask]
rna_hvg_expr = rna[:, hvg_mask].X
# .X is a dense array once sc.pp.scale has run and sparse before that; handle both
if sparse.issparse(rna_hvg_expr):
    rna_hvg_expr = rna_hvg_expr.toarray()

merged_adata = ad.AnnData(
    X=np.concatenate([np.asarray(rna_hvg_expr), np.asarray(motif_activities)], axis=1)
)
merged_adata.var_names = list(hvg_names) + list(motif_names)
merged_adata.obs = wnn.obs.copy()

# Find markers for each cell type
markers = find_markers(merged_adata)

In [None]:
# 14. Visualize markers for specific cell types
print("Visualizing markers for specific cell types...")


def _plot_marker_row(axes_row, weight_key, weight_title, preferred_gene, fallback_gene, motif_idx):
    """Draw one figure row: modality weight and a gene on the RNA UMAP, a motif on the ATAC UMAP."""
    sc.pl.umap(rna, color=weight_key, ax=axes_row[0], show=False, title=weight_title)

    # Fall back to an arbitrary gene when the preferred marker is absent from the data
    gene_name = preferred_gene if preferred_gene in rna.var_names else fallback_gene
    sc.pl.umap(rna, color=gene_name, ax=axes_row[1], show=False, title=f'Gene: {gene_name}')

    # sc.pl.umap colours by an .obs column or a variable name, so the motif's
    # activity is copied into atac.obs under the motif name and passed as `color`
    motif_name = motif_names[motif_idx]
    atac.obs[motif_name] = np.asarray(motif_activities)[:, motif_idx]
    sc.pl.umap(atac, color=motif_name, color_map='viridis', ax=axes_row[2], show=False,
               title=f'Motif: {motif_name}')


# Plot RNA and motif activities for example cell types
fig, axs = plt.subplots(2, 3, figsize=(18, 10))

# NK cell markers (example from Seurat tutorial)
# In real analysis, we would use actual marker genes like TBX21
_plot_marker_row(axs[0], 'rna_weight', 'RNA weight', "CD3D", rna.var_names[0], motif_idx=0)

# B cell markers
# In real analysis, we would use actual marker genes like PAX5
_plot_marker_row(axs[1], 'atac_weight', 'ATAC weight', "CD19", rna.var_names[1], motif_idx=1)

fig.tight_layout()
fig.savefig('marker_visualization.png', dpi=300)
plt.close(fig)

print("Analysis complete! See output files for visualizations.")