In [1]:
import muon as mu
import mudata as md
import anndata as ad
import scanpy as sc
import loompy as lp
import numpy as np
import pandas as pd
import os
import re

In [2]:
# Switch muon's `pull_on_update` option off for the rest of this session.
mu.set_options(pull_on_update=False)

<muon._core.config.set_options at 0x14d9803bd280>

### Step 1. Load mudata objects

In [3]:
# Read the day-14 and day-28 MuData objects (in that order) from their .h5mu files.
day_14_data_combined, day_28_data_combined = map(
    mu.read_h5mu,
    ('D14_CITESeq_ALL.h5mu', 'D28_CITESeq_ALL.h5mu'),
)

### Step 2. Create .loom files of the raw counts, var_names, and obs_names for pyscenic implementation

In [4]:
adatas = [day_14_data_combined, day_28_data_combined]
names = ["D14", "D28"]
# Folder that receives the loom files; created on demand so the cell also runs
# when the folder does not exist yet.
os.makedirs('pyscenic', exist_ok=True)

# The loop variable is deliberately not called `ad`: that name is the alias of the
# imported anndata module and would be shadowed for the rest of the notebook.
for mdata, name in zip(adatas, names):
    # Take the raw counts straight from the layer. `.X` is left untouched, so the
    # objects written back to disk at the end of the notebook keep their original `.X`.
    counts = mdata['rna'].layers['counts']

    # The matrix is transposed for the loom file: rows are described by 'Gene',
    # columns by 'CellID'.
    adata_row_attrs = {
        'Gene': np.array(mdata['rna'].var_names)
        }
    adata_col_attrs = {
        'CellID': np.array(mdata['rna'].obs_names),
        # per column of the transposed matrix: number of entries > 0
        'nGene': np.array(np.sum(counts.transpose() > 0, axis=0)).flatten(),
        # per column of the transposed matrix: sum of all entries
        'nUMI': np.array(np.sum(counts.transpose(), axis=0)).flatten()
        }

    lp.create(f'pyscenic/{name}.loom', counts.transpose(), adata_row_attrs, adata_col_attrs)

### Step 3: PySCENIC Execution (HPC/SLURM)
The following steps were performed on the Ohio Supercomputer Center (OSC) cluster due to memory and runtime requirements. The input files `D14.loom` and `D28.loom` were each processed using the standard PySCENIC CLI pipeline (GRN -> CTX -> AUCell).  
The files in pySCENIC_files below were downloaded from https://resources.aertslab.org/

In [5]:
%%writefile pyscenic/run_pyscenic_job.sh
#!/bin/bash
#SBATCH --account=PAS2527
#SBATCH --time=4:00:00
#SBATCH --mail-type=ALL
#SBATCH --ntasks-per-node=80
#SBATCH --partition=nextgen
#SBATCH --output=CITEseq_pyscenic.slurm-%j.out
#SBATCH --error=CITEseq_pyscenic.slurm-%j.err

# Exit on error
set -e

# Load modules and activate conda environment with pyscenic
module load miniconda3/24.1.2-py310
source activate pyscenic

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# SRC_DIR holds the input loom files and receives the result folders;
# DB_DIR holds the reference files used by pyscenic.
SRC_DIR="$HOME/Single_Cell_Files/03012023_CITESeq/scanpy_muon/pyscenic"
DB_DIR="$SRC_DIR/pySCENIC_files"

# Stop right away if TMPDIR is unset or empty: otherwise the rsync calls below
# would copy into "/" and the cd would silently land in $HOME.
: "${TMPDIR:?TMPDIR is not set - it must point to the scratch directory of the job}"

# Copy Static Database Files to $TMPDIR (Do this once)
echo "Copying reference databases to $TMPDIR..."
for ref_file in \
    allTFs_hg38.txt \
    motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl \
    hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather \
    hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather
do
    rsync -av "$DB_DIR/$ref_file" "$TMPDIR/"
done

# Move to scratch
cd "$TMPDIR"

# ==============================================================================
# FUNCTION: Run Pipeline for a specific Sample
# ==============================================================================
# Run the GRN -> CTX -> AUCell steps for one sample.
#   $1  sample id; "<id>.loom" is read from $SRC_DIR and the results are copied
#       to "$SRC_DIR/<id>_results".
# Uses the global variables SRC_DIR and TMPDIR and works in the current directory.
run_pyscenic() {
    # Keep the per-sample variables local so that successive calls cannot
    # interfere with each other or with the rest of the script.
    local sample_id="${1:?usage: run_pyscenic <sample_id>}"
    local out_folder="$SRC_DIR/${sample_id}_results"

    echo "----------------------------------------------------------------"
    echo "STARTING PIPELINE FOR: $sample_id"
    echo "----------------------------------------------------------------"

    # 1. Copy specific loom file to TMP
    echo "Copying $sample_id.loom..."
    rsync -av "$SRC_DIR/$sample_id.loom" "$TMPDIR/"

    # 2. GRN Step (writes the adjacency table <id>_adj.csv)
    echo "Running GRN for $sample_id..."
    pyscenic grn \
        -o "${sample_id}_adj.csv" \
        "${sample_id}.loom" allTFs_hg38.txt \
        --num_workers 80 \
        --seed 123

    # 3. CTX Step (reads <id>_adj.csv, writes <id>_reg.csv)
    echo "Running CTX for $sample_id..."
    pyscenic ctx \
        -o "${sample_id}_reg.csv" \
        --expression_mtx_fname "${sample_id}.loom" \
        --annotations_fname motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl \
        "${sample_id}_adj.csv" \
        hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather \
        hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather \
        --num_workers 40 \
        --mode custom_multiprocessing \
        --mask_dropouts

    # 4. AUCell Step (reads <id>_reg.csv, writes <id>_pyscenic_results.loom)
    echo "Running AUCell for $sample_id..."
    pyscenic aucell \
        -o "${sample_id}_pyscenic_results.loom" \
        "${sample_id}.loom" "${sample_id}_reg.csv" \
        --num_workers 40 \
        --seed 123

    # 5. Copy Results Back
    echo "Copying results for $sample_id to a results folder"
    mkdir -p "$out_folder"
    rsync -av "${sample_id}_adj.csv" "$out_folder"
    rsync -av "${sample_id}_reg.csv" "$out_folder"
    rsync -av "${sample_id}_pyscenic_results.loom" "$out_folder"

    # Clean up the per-sample files from TMP to save space for next run
    rm "${sample_id}.loom" \
       "${sample_id}_adj.csv" \
       "${sample_id}_reg.csv" \
       "${sample_id}_pyscenic_results.loom"

    echo "Completed $sample_id"
}

# ==============================================================================
# EXECUTION
# ==============================================================================

# Process the samples one after another: D28 first, then D14
for sample in "D28" "D14"; do
    run_pyscenic "$sample"
done

echo "All samples processed successfully!"

Overwriting pyscenic/run_pyscenic_job.sh


**Note:** The script above was submitted to the scheduler (`sbatch run_pyscenic_job.sh`). Once the job completed successfully, the output files `D14_pyscenic_results.loom` and `D28_pyscenic_results.loom` were downloaded for downstream analysis.

In [6]:
# ------------------------------------------------------------------------------
# Settings
# ------------------------------------------------------------------------------
# Location of the "pyscenic" folder, relative to this notebook; adjust if it lives elsewhere.
PYSCENIC_DIR = "pyscenic"

# ==============================================================================
# FUNCTION: Import PySCENIC Results
# ==============================================================================
def import_pyscenic_results(mdata, sample_id):
    """
    Attach the AUCell matrix of one sample to the 'rna' entry of its MuData object.

    Reads `<PYSCENIC_DIR>/<sample_id>_results/<sample_id>_pyscenic_results.loom`,
    shortens the regulon column names, orders the rows like
    `mdata['rna'].obs_names` and stores the resulting DataFrame in
    `mdata['rna'].obsm['Pyscenic_AUC']`.

    Parameters
    ----------
    mdata
        Object whose 'rna' entry exposes `obs_names` and `obsm`; modified in place.
    sample_id : str
        Sample prefix used in the results folder and loom file name (e.g. "D14").

    Returns
    -------
    None
        If the loom file does not exist, an error is printed and `mdata` is
        left unchanged.
    """
    # 1. Construct the path based on the Bash script's folder structure
    #    Path: pyscenic/D28_results/D28_pyscenic_results.loom
    loom_path = os.path.join(PYSCENIC_DIR, f"{sample_id}_results", f"{sample_id}_pyscenic_results.loom")

    if not os.path.exists(loom_path):
        print(f"❌ Error: File not found: {loom_path}")
        return

    print(f"Loading results for {sample_id} from: {loom_path}")

    # 2. Connect and load data (one row per CellID, one column per regulon)
    with lp.connect(loom_path, mode='r', validate=False) as lf:
        auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)

    # 3. Clean the column names (RegEx)
    #    Turns "Regulon(SOX2(+))" -> "SOX2".
    #    The name part is matched with [^()]+ rather than \w+ so that names
    #    containing characters such as '-' (e.g. "NKX2-1") are cleaned as well.
    #    Columns that do not have the "Regulon(...)" form are kept as they are.
    auc_mtx.columns = [
        re.sub(r"Regulon\(([^()]+)\(.*\)\)", r"\1", c) for c in auc_mtx.columns
    ]

    # 4. Re-index to match the specific MuData object
    #    Rows end up in the same order as mdata['rna'].obs_names; cells that are
    #    absent from the loom file get NaN rows, which is reported instead of
    #    passing silently.
    rna = mdata['rna']
    n_missing = int((~pd.Index(rna.obs_names).isin(auc_mtx.index)).sum())
    if n_missing:
        print(f"⚠️ Warning: {n_missing} cells of {sample_id} are missing from {loom_path}; their rows are NaN")
    auc_mtx = auc_mtx.reindex(rna.obs_names)

    # 5. Save to .obsm
    rna.obsm['Pyscenic_AUC'] = auc_mtx

    print(f"✅ Success! Added {auc_mtx.shape[1]} regulons to .obsm['Pyscenic_AUC']")
    print("-" * 40)

# ==============================================================================
# EXECUTION
# ==============================================================================

# Import the results for Day 14 first, then for Day 28
for mdata_obj, sample_label in (
    (day_14_data_combined, "D14"),
    (day_28_data_combined, "D28"),
):
    import_pyscenic_results(mdata_obj, sample_label)

Loading results for D14 from: pyscenic/D14_results/D14_pyscenic_results.loom
✅ Success! Added 222 regulons to .obsm['Pyscenic_AUC']
----------------------------------------
Loading results for D28 from: pyscenic/D28_results/D28_pyscenic_results.loom
✅ Success! Added 103 regulons to .obsm['Pyscenic_AUC']
----------------------------------------


In [7]:
# Write both MuData objects back to their .h5mu files, using lzf compression
for mdata_obj, h5mu_path in (
    (day_14_data_combined, 'D14_CITESeq_ALL.h5mu'),
    (day_28_data_combined, 'D28_CITESeq_ALL.h5mu'),
):
    mdata_obj.write_h5mu(h5mu_path, compression='lzf')

  return func(*args, **kwargs)
  return func(*args, **kwargs)


## Session Info

In [8]:
import session_info
# Show the session report; the 'distributed' module is excluded from it.
session_info.show(
    excludes=['distributed'],
)

  mod_version = _find_version(mod.__version__)
  mod_version = _find_version(mod.__version__)
  mod_version = _find_version(mod.__version__)
