In [8]:
import os
import glob
import yaml
import tqdm.auto as tqdm


Caching the list of root modules, please wait!
(This will only be done once - type '%rehashx' to reset cache!)



# Load in template

In [7]:
path_template_config = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/evaluation_pipeline.yml"

In [10]:
# load template config
with open(path_template_config, 'r') as f:
    template_config = yaml.safe_load(f)
template_config

{'io': {'path_mdata': '/path/to/your/mdata/file.h5mu',
  'path_out': '/path/to/output/directory',
  'data_key': 'rna',
  'prog_key': 'cNMF'},
 'categorical_association': {'categorical_key': 'sample',
  'pseudobulk_key': None,
  'test': 'dunn',
  'n_jobs': -1,
  'inplace': False},
 'perturbation_association': {'groupby_key': None,
  'collapse_targets': True,
  'pseudobulk': False,
  'reference_targets': ['non-targeting'],
  'n_jobs': -1,
  'inplace': False},
 'gene_set_enrichment': {'prog_nam': None,
  'organism': 'human',
  'libraries': ['Reactome_2022', 'GO_Biological_Process_2023'],
  'method': 'fisher',
  'database': 'enrichr',
  'loading_rank_thresh': 300,
  'n_jobs': -1,
  'inplace': False,
  'user_geneset': None,
  'max_size': 500,
  'min_size': 5},
 'trait_enrichment': {'gwas_data': '/path/to/your/gwas/data.csv.gz',
  'prog_nam': None,
  'library': 'OT_GWAS',
  'n_jobs': -1,
  'inplace': False,
  'key_column': 'trait_efos',
  'gene_column': 'gene_name',
  'method': 'fisher',
  '

# Identify all possible MuData's to be evaluated

In [12]:
path_inference = "/cellar/users/aklie/opt/gene_program_evaluation/examples/inference/iPSC_EC"
path_out = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC"

In [11]:
path_h5mus = glob.glob(os.path.join(path_inference, "*", "*.h5mu"))

In [14]:
# Create path_outs using basename of path_out + basename of path_h5mus
path_outs = [os.path.join(path_out, os.path.basename(path_h5mu)).split("_0.2_gene_names.h5mu")[0] for path_h5mu in path_h5mus]
path_outs

['/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/factor_analysis_100',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_250',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_80',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_100',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_300',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_60',
 '/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_200']

In [26]:
# Grab prog_keys from path_outs as the basename of the path_outs
prog_keys = [os.path.basename(path_out) for path_out in path_outs]
prog_keys = [prog_key.split("_")[0] if "cNMF" in prog_key else prog_key for prog_key in prog_keys]
prog_keys

['factor_analysis_100', 'cNMF', 'cNMF', 'cNMF', 'cNMF', 'cNMF', 'cNMF', 'cNMF']

In [29]:
# Write out config files where each config file has a different path_mdata, path_out and prog_key
path_configs = []
for path_h5mu, path_out, prog_key in tqdm.tqdm(zip(path_h5mus, path_outs, prog_keys), total=len(path_h5mus)):
    config = template_config.copy()
    config["io"]["path_mdata"] = path_h5mu
    config["io"]["path_out"] = path_out
    config["io"]["prog_key"] = prog_key

    path_config = os.path.join(path_out, "evaluation_pipeline.yml")
    os.makedirs(path_out, exist_ok=True)
    # order the keys in the yaml file
    with open(path_config, 'w') as f:
        yaml.dump(config, f, sort_keys=False)
    print(f"Wrote {path_config}")
    path_configs.append(path_config)

100%|██████████| 8/8 [00:00<00:00, 55.15it/s]

Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/factor_analysis_100/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_250/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_80/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_30/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_100/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_300/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_60/evaluation_pipeline.yml
Wrote /cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC/cNMF_200/evaluation_pipeline.yml





In [None]:
#!/bin/bash

#####
# USAGE:
# sbatch meme.sh --SLURM_SETINGS <input_bed> <genome> <output>
#####

# Date
date
echo -e "Job ID: $SLURM_JOB_ID\n"

# Configuring env (choose either singularity or conda)
source activate /cellar/users/aklie/opt/miniconda3/envs/chrombpnet
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/opt/miniconda3/lib/

# Inputs
input_tsv=$1
input_peak_paths=($(cut -f1 $input_tsv))
names=($(cut -f2 $input_tsv))
input_peak_path=${input_peak_paths[$SLURM_ARRAY_TASK_ID-1]}
name=${names[$SLURM_ARRAY_TASK_ID-1]}
genome=$2
out=$3/${name}

# Echo inputs
echo -e "input_peak_path: $input_peak_path"
echo -e "name: $name"
echo -e "genome: $genome"
echo -e "out: $out"

# make output directory
mkdir -p $out

# use bedtools to extract temporary fasta file from input peak file
fasta=$out/temp.fa
bedtools getfasta -fi $genome -bed $input_peak_path -fo $fasta
 
# Cmd
cmd="meme \
$fasta \
-dna \
-revcomp \
-mod anr \
-nmotifs 3 \
-minw 6 \
-maxw 50 \
-objfun classic \
-markov_order 0 \
-oc $out \
-seed 1234"

# Run the command
echo "Running command: $cmd"
eval $cmd

# clean up 
rm $fasta

# Date
date

In [32]:
# Write a SLURM array script to run all the configs in parallel in the following format
path_script = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/evaluation_pipeline.py"
path_env = "/cellar/users/aklie/opt/miniconda3/envs/test_celloracle"

script_content = f"""#!/bin/bash
#SBATCH --partition=carter-compute
#SBATCH --job-name=evaluation_pipelines
#SBATCH --output=evaluation_pipelines.out
#SBATCH --time=24:00:00
#SBATCH --mem=16G
#SBATCH --cpus-per-task=4
#SBATCH --array=1-{len(path_configs)}

#####
# USAGE:
# sbatch evaluation_pipelines.sh <config>
#####

# Date
date
echo -e "Job ID: $SLURM_JOB_ID\n"

# Configuring env
source activate {path_env}

# Inputs
configs=({path_configs})
config=${{configs[$SLURM_ARRAY_TASK_ID-1]}}

# Run the command
echo "Running command: python {path_script} $config"
python {path_script} $config

# Date
date
"""

path_slurm_out = "/cellar/users/aklie/opt/gene_program_evaluation/examples/evaluation/iPSC_EC"
path_slurm_script = os.path.join(path_slurm_out, "evaluation_pipelines.sh")
with open(path_slurm_script, 'w') as f:
    f.write(script_content)

# DONE!

---