# Slurm Job Submission

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 20.03.2023
- **Date of Last Modification:** 27.06.2023

## 1. Setup

### 1.1 Import Libraries

In [2]:
import os

### 1.2 Define Parameters

In [3]:
# Name of a second conda environment; it is not referenced by the job
# submission cells visible in this part of the notebook.
reproducibility_conda_env_name = "nichecompass-reproducibility"
# Conda environment that the generated job files activate (passed to
# `submit_python_script` as `conda_env_name`).
conda_env_name = "nichecompass"

### 1.3 Define Functions

In [4]:
def submit_python_script(
        job_name_prefix,
        job_id,
        job_folder_path,
        conda_env_name,
        script_folder_path,
        script_name,
        script_args,
        t="48:00:00",
        p="gpu_p",
        gres="gpu:1",
        qos="gpu",
        nice=10000,
        c=6,
        mem="64GB"):
    """
    Write a Slurm batch file that runs a Python script and submit it via
    ``sbatch``.

    The batch file is written to
    ``<job_folder_path without '/aih'>/job_<job_name_prefix>_<job_id>.cmd``
    and the job's stdout / stderr are directed to
    ``<job_folder_path>/logs/out_<job name>.txt`` and
    ``<job_folder_path>/logs/err_<job name>.txt``.

    Parameters
    ----------
    job_name_prefix:
        Prefix of the job name; the job name is ``{job_name_prefix}_{job_id}``.
    job_id:
        Identifier appended to ``job_name_prefix`` to form the job name.
    job_folder_path:
        Folder in which the job file and the ``logs`` subfolder are stored.
    conda_env_name:
        Conda environment activated in the job before the script is run.
    script_folder_path:
        Folder the job changes into before running the script.
    script_name:
        File name of the script; it is run as ``python ../{script_name}``,
        i.e. relative to the parent folder of ``script_folder_path``.
    script_args:
        Command line arguments appended to the python command. A separating
        space is inserted if the string does not already start with
        whitespace.
    t:
        Value of the ``#SBATCH -t`` directive (time limit).
    p:
        Value of the ``#SBATCH -p`` directive (partition).
    gres:
        Value of the ``#SBATCH --gres`` directive.
    qos:
        Value of the ``#SBATCH --qos`` directive.
    nice:
        Value of the ``#SBATCH --nice`` directive.
    c:
        Value of the ``#SBATCH -c`` directive (CPUs per task).
    mem:
        Value of the ``#SBATCH --mem`` directive.

    Returns
    ----------
    exit_status:
        Status returned by ``os.system`` for the ``sbatch`` call (0 on
        success). A warning is printed if it is non-zero.
    """
    import shlex

    job_name = f"{job_name_prefix}_{job_id}"
    # Account for fact that submit node has different home path than compute node
    job_file_path = f"{job_folder_path.replace('/aih', '')}/job_{job_name}.cmd"
    out_file_path = f"{job_folder_path}/logs/out_{job_name}.txt"
    err_file_path = f"{job_folder_path}/logs/err_{job_name}.txt"

    os.makedirs(f"{job_folder_path}/logs", exist_ok=True)
    # The job file folder differs from 'job_folder_path' whenever '/aih' was
    # stripped above, so make sure it exists as well before writing to it
    os.makedirs(os.path.dirname(job_file_path), exist_ok=True)

    # Avoid gluing the arguments onto the script name if the caller did not
    # start them with a separator
    if script_args and not script_args[0].isspace():
        script_args = f" {script_args}"

    job_file_lines = [
        "#!/bin/bash\n",
        f"#SBATCH -J {job_name}\n",
        f"#SBATCH -o {out_file_path}\n",
        f"#SBATCH -e {err_file_path}\n",
        f"#SBATCH -t {t}\n",
        f"#SBATCH -p {p}\n",
        f"#SBATCH -c {c}\n",
        f"#SBATCH --gres={gres}\n",
        f"#SBATCH --qos={qos}\n",
        f"#SBATCH --mem={mem}\n",
        f"#SBATCH --nice={nice}\n",
        "source $HOME/.bashrc\n",
        f"conda activate {conda_env_name}\n",
        "cd /\n",
        f"cd {script_folder_path}\n",
        f"python ../{script_name}{script_args}\n",
    ]
    with open(job_file_path, "w") as handle:
        handle.writelines(job_file_lines)

    # Quote the path so that whitespace or shell metacharacters in it cannot
    # break (or alter) the submitted command
    exit_status = os.system(f"sbatch {shlex.quote(job_file_path)}")
    if exit_status != 0:
        print(f"Warning: sbatch returned non-zero status {exit_status} "
              f"for job '{job_name}' ({job_file_path}).")
    return exit_status

## 2. NicheCompass Ablation

### 2.1 Loss Weights & Gene Program Mask

Ablating:
- Loss Weights: Gene Expression Reconstruction, Edge Reconstruction, KL Divergence
- Gene Program Mask: FC GP Mask, NicheCompass Default GP Mask
- Spatial Neighborhood Graph: Number of Neighbors

#### 2.1.1 Xenium Human Breast Cancer

##### 2.1.1.1 Fully Connected GP Mask

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the Xenium human
# breast cancer dataset, passing '--add_fc_gps_instead_of_gp_dict_gps'.
# NOTE(review): other cells in this notebook also start at job_id = 1 with the
# same job name prefix and job folder, so job and log file names coincide
# across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --add_fc_gps_instead_of_gp_dict_gps",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.1.2 NicheCompass GP Mask (Keep All Target Genes)

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the Xenium human
# breast cancer dataset with '--nichenet_keep_target_genes_ratio 1.'.
# NOTE(review): other cells in this notebook also start at job_id = 1 with the
# same job name prefix and job folder, so job and log file names coincide
# across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# Partial grid on the Xenium human breast cancer dataset: a single edge
# reconstruction loss weight (50000) and a single gene expression
# reconstruction loss weight (300) across 4 neighbor counts (4 jobs), with
# '--nichenet_keep_target_genes_ratio 1.' and job ids counted from 1000.
# NOTE(review): presumably a re-run of part of the full sweep under separate
# job ids -- confirm.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1000
for lambda_edge_recon in [50000]:
    for lambda_gene_expr_recon in [300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# Partial grid on the Xenium human breast cancer dataset: a single edge
# reconstruction loss weight (500000) across 3 gene expression reconstruction
# loss weights and 4 neighbor counts (12 jobs), with
# '--nichenet_keep_target_genes_ratio 1.' and job ids counted from 2000.
# NOTE(review): presumably a re-run of part of the full sweep under separate
# job ids -- confirm.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 2000
for lambda_edge_recon in [500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.1.3 NicheCompass GP Mask (Weakly Filter Target Genes)

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the Xenium human
# breast cancer dataset with '--nichenet_keep_target_genes_ratio 0.1'.
# NOTE(review): other cells in this notebook also start at job_id = 1 with the
# same job name prefix and job folder, so job and log file names coincide
# across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 0.1",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.1.4 NicheCompass GP Mask (Strongly Filter Target Genes)

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the Xenium human
# breast cancer dataset with '--nichenet_keep_target_genes_ratio 0.01'.
# NOTE(review): other cells in this notebook also start at job_id = 1 with the
# same job name prefix and job folder, so job and log file names coincide
# across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 0.01",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

#### 2.1.2 STARmap PLUS Mouse Central Nervous System

##### 2.1.2.1 Fully Connected GP Mask

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the STARmap PLUS
# mouse CNS dataset, passing '--add_fc_gps_instead_of_gp_dict_gps'.
# NOTE(review): other STARmap PLUS cells in this notebook also start at
# job_id = 1 with the same job name prefix and job folder, so job and log file
# names coincide across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --add_fc_gps_instead_of_gp_dict_gps",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.2.2 NicheCompass GP Mask (Keep All Target Genes)

In [None]:
# Grid over edge reconstruction loss weight x gene expression reconstruction
# loss weight x number of neighbors (3 x 3 x 4 = 36 jobs) on the STARmap PLUS
# mouse CNS dataset with '--nichenet_keep_target_genes_ratio 1.'.
# NOTE(review): other STARmap PLUS cells in this notebook also start at
# job_id = 1 with the same job name prefix and job folder, so job and log file
# names coincide across sweeps -- confirm this is intended.

# Settings shared by every job of this sweep
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # Each fragment starts with a space, so joining them yields the
            # full argument string appended to the python command
            script_args = "".join([
                f" --dataset {dataset}",
                f" --reference_batches {reference_batches}",
                f" --n_neighbors {n_neighbors}",
                " --no-filter_genes",
                " --nichenet_keep_target_genes_ratio 1.",
                " --nichenet_max_n_target_genes_per_gp 250",
                " --include_mebocost_gps",
                f" --species {species}",
                " --gp_filter_mode subset",
                " --combine_overlap_gps",
                " --overlap_thresh_source_genes 0.9",
                " --overlap_thresh_target_genes 0.9",
                " --overlap_thresh_genes 0.9",
                " --counts_key counts",
                " --spatial_key spatial",
                " --adj_key spatial_connectivities",
                " --mapping_entity_key mapping_entity",
                " --gp_targets_mask_key nichecompass_gp_targets",
                " --gp_sources_mask_key nichecompass_gp_sources",
                " --gp_names_key nichecompass_gp_names",
                f" --model_label {ablation_task}_{task}",
                " --active_gp_names_key nichecompass_active_gp_names",
                " --latent_key nichecompass_latent",
                " --active_gp_thresh_ratio 0.",
                " --gene_expr_recon_dist nb",
                " --log_variational",
                f" --node_label_method {node_label_method}",
                " --n_layers_encoder 1",
                " --n_hidden_encoder None",
                " --conv_layer_encoder gcnconv",
                " --n_epochs 100",
                " --n_epochs_all_gps 25",
                " --lr 0.001",
                f" --lambda_edge_recon {lambda_edge_recon}",
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                " --lambda_group_lasso 0.",
                " --lambda_l1_masked 0.",
                f" --edge_batch_size {edge_batch_size}",
                " --node_batch_size None",
                f" --timestamp_suffix _{job_id}",
            ])

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.2.3 NicheCompass GP Mask (Weakly Filter Target Genes)

In [None]:
# Loss-weight sweep for dataset "starmap_plus_mouse_cns" with
# --nichenet_keep_target_genes_ratio 0.1: submit_python_script is called once
# per (lambda_edge_recon, lambda_gene_expr_recon, n_neighbors) combination.

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0, 50000, 500000)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.1",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

In [8]:
# Jobs for dataset "starmap_plus_mouse_cns" with
# --nichenet_keep_target_genes_ratio 0.1, restricted to
# lambda_edge_recon = 50000; job ids start at 1000.

# Settings that are identical for every job of this cell.
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1000
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (50000,)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.1",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12416085
Submitted batch job 12416086
Submitted batch job 12416087
Submitted batch job 12416088
Submitted batch job 12416089
Submitted batch job 12416090
Submitted batch job 12416091
Submitted batch job 12416092
Submitted batch job 12416093
Submitted batch job 12416094
Submitted batch job 12416095
Submitted batch job 12416096


In [9]:
# Single job for dataset "starmap_plus_mouse_cns" with
# --nichenet_keep_target_genes_ratio 0.1: lambda_edge_recon = 0,
# lambda_gene_expr_recon = 0, n_neighbors = 16; job ids start at 2000.

# Settings that are identical for every job of this cell.
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 2000
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0,)
        for gene_expr_weight in (0,)
        for k in (16,)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.1",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12416097


##### 2.1.2.4 NicheCompass GP Mask (Strongly Filter Target Genes)

In [None]:
# Loss-weight sweep for dataset "starmap_plus_mouse_cns" with
# --nichenet_keep_target_genes_ratio 0.01: submit_python_script is called once
# per (lambda_edge_recon, lambda_gene_expr_recon, n_neighbors) combination.

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0, 50000, 500000)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.01",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

#### 2.1.3 Vizgen MERFISH Human Ovarian Cancer

##### 2.1.3.1 Fully Connected GP Mask

In [8]:
# Loss-weight sweep for dataset "vizgen_merfish_human_ovarian_cancer" with
# --add_fc_gps_instead_of_gp_dict_gps and
# --nichenet_keep_target_genes_ratio 1.: submit_python_script is called once
# per (lambda_edge_recon, lambda_gene_expr_recon, n_neighbors) combination.

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0, 50000, 500000)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --add_fc_gps_instead_of_gp_dict_gps",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12363204
Submitted batch job 12363205
Submitted batch job 12363206
Submitted batch job 12363207
Submitted batch job 12363208
Submitted batch job 12363209
Submitted batch job 12363210
Submitted batch job 12363211
Submitted batch job 12363212
Submitted batch job 12363213
Submitted batch job 12363214
Submitted batch job 12363215
Submitted batch job 12363216
Submitted batch job 12363217
Submitted batch job 12363218
Submitted batch job 12363219
Submitted batch job 12363220
Submitted batch job 12363221
Submitted batch job 12363222
Submitted batch job 12363223
Submitted batch job 12363224
Submitted batch job 12363225
Submitted batch job 12363226
Submitted batch job 12363227
Submitted batch job 12363228
Submitted batch job 12363229
Submitted batch job 12363230
Submitted batch job 12363231
Submitted batch job 12363232
Submitted batch job 12363233
Submitted batch job 12363234
Submitted batch job 12363235
Submitted batch job 12363236
Submitted batch job 12363237
Submitted batc

##### 2.1.3.2 NicheCompass GP Mask (Keep All Target Genes)

In [13]:
# Loss-weight sweep for dataset "vizgen_merfish_human_ovarian_cancer" with
# --nichenet_keep_target_genes_ratio 1.: submit_python_script is called once
# per (lambda_edge_recon, lambda_gene_expr_recon, n_neighbors) combination.

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0, 50000, 500000)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12420157
Submitted batch job 12420158
Submitted batch job 12420159
Submitted batch job 12420160
Submitted batch job 12420161
Submitted batch job 12420162
Submitted batch job 12420163
Submitted batch job 12420164
Submitted batch job 12420165
Submitted batch job 12420166
Submitted batch job 12420167
Submitted batch job 12420168
Submitted batch job 12420169
Submitted batch job 12420170
Submitted batch job 12420171
Submitted batch job 12420172
Submitted batch job 12420173
Submitted batch job 12420174
Submitted batch job 12420175
Submitted batch job 12420176
Submitted batch job 12420177
Submitted batch job 12420178
Submitted batch job 12420179
Submitted batch job 12420180
Submitted batch job 12420181
Submitted batch job 12420182
Submitted batch job 12420183
Submitted batch job 12420184
Submitted batch job 12420185
Submitted batch job 12420186
Submitted batch job 12420187
Submitted batch job 12420188
Submitted batch job 12420189
Submitted batch job 12420190
Submitted batc

In [5]:
# Single job for dataset "vizgen_merfish_human_ovarian_cancer" with
# --nichenet_keep_target_genes_ratio 1.: lambda_edge_recon = 500000,
# lambda_gene_expr_recon = 0, n_neighbors = 16; job ids start at 5000.

# Settings that are identical for every job of this cell.
task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 5000
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (500000,)
        for gene_expr_weight in (0,)
        for k in (16,)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12442278


##### 2.1.3.3 NicheCompass GP Mask (Weakly Filter Target Genes)

In [4]:
# Loss-weight sweep for dataset "vizgen_merfish_human_ovarian_cancer" with
# --nichenet_keep_target_genes_ratio 0.1: submit_python_script is called once
# per (lambda_edge_recon, lambda_gene_expr_recon, n_neighbors) combination.

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Running counter: becomes part of the job name and the --timestamp_suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in (
        (edge_weight, gene_expr_weight, k)
        for edge_weight in (0, 50000, 500000)
        for gene_expr_weight in (0, 30, 300)
        for k in (4, 8, 12, 16)):
    # Each fragment carries its own leading space, so a plain join yields
    # the full command line argument string.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.1",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

    job_id += 1

Submitted batch job 12374961
Submitted batch job 12374962
Submitted batch job 12374963
Submitted batch job 12374964
Submitted batch job 12374965
Submitted batch job 12374966
Submitted batch job 12374967
Submitted batch job 12374968
Submitted batch job 12374969
Submitted batch job 12374970
Submitted batch job 12374971
Submitted batch job 12374972
Submitted batch job 12374973
Submitted batch job 12374974
Submitted batch job 12374975
Submitted batch job 12374976
Submitted batch job 12374977
Submitted batch job 12374978
Submitted batch job 12374979
Submitted batch job 12374980
Submitted batch job 12374981
Submitted batch job 12374982
Submitted batch job 12374983
Submitted batch job 12374985
Submitted batch job 12374986
Submitted batch job 12374987
Submitted batch job 12374988
Submitted batch job 12374989
Submitted batch job 12374990
Submitted batch job 12374991
Submitted batch job 12374992
Submitted batch job 12374993
Submitted batch job 12374994
Submitted batch job 12374995
Submitted batc

##### 2.1.3.4 NicheCompass GP Mask (Strongly Filter Target Genes)

In [8]:
# Loss-weight ablation on the Vizgen MERFISH human ovarian cancer dataset with
# the strongly filtered target gene setting (NicheNet keep ratio 0.01).
# One training job is submitted per combination of edge reconstruction weight,
# gene expression reconstruction weight and number of neighbors
# (3 x 3 x 4 = 36 jobs); `job_id` numbers them consecutively starting at 1.
job_id = 1

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # One entry per command line option, in the order they are
            # passed to the training script.
            script_arg_list = [
                f"--dataset {dataset}",
                f"--reference_batches {reference_batches}",
                f"--n_neighbors {n_neighbors}",
                "--no-filter_genes",
                "--nichenet_keep_target_genes_ratio 0.01",
                "--nichenet_max_n_target_genes_per_gp 250",
                "--include_mebocost_gps",
                f"--species {species}",
                "--gp_filter_mode subset",
                "--combine_overlap_gps",
                "--overlap_thresh_source_genes 0.9",
                "--overlap_thresh_target_genes 0.9",
                "--overlap_thresh_genes 0.9",
                "--counts_key counts",
                "--spatial_key spatial",
                "--adj_key spatial_connectivities",
                "--mapping_entity_key mapping_entity",
                "--gp_targets_mask_key nichecompass_gp_targets",
                "--gp_sources_mask_key nichecompass_gp_sources",
                "--gp_names_key nichecompass_gp_names",
                f"--model_label {ablation_task}_{task}",
                "--active_gp_names_key nichecompass_active_gp_names",
                "--latent_key nichecompass_latent",
                "--active_gp_thresh_ratio 0.",
                "--gene_expr_recon_dist nb",
                "--log_variational",
                f"--node_label_method {node_label_method}",
                "--n_layers_encoder 1",
                "--n_hidden_encoder None",
                "--conv_layer_encoder gcnconv",
                "--n_epochs 100",
                "--n_epochs_all_gps 25",
                "--lr 0.001",
                f"--lambda_edge_recon {lambda_edge_recon}",
                f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
                "--lambda_group_lasso 0.",
                "--lambda_l1_masked 0.",
                f"--edge_batch_size {edge_batch_size}",
                "--node_batch_size None",
                f"--timestamp_suffix _{job_id}",
            ]
            # Every option is preceded by a single space, including the first.
            script_args = " " + " ".join(script_arg_list)

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

Submitted batch job 12411010
Submitted batch job 12411011
Submitted batch job 12411012
Submitted batch job 12411013
Submitted batch job 12411014
Submitted batch job 12411015
Submitted batch job 12411016
Submitted batch job 12411017
Submitted batch job 12411018
Submitted batch job 12411019
Submitted batch job 12411020
Submitted batch job 12411021
Submitted batch job 12411022
Submitted batch job 12411023
Submitted batch job 12411024
Submitted batch job 12411025
Submitted batch job 12411026
Submitted batch job 12411027
Submitted batch job 12411028
Submitted batch job 12411029
Submitted batch job 12411030
Submitted batch job 12411031
Submitted batch job 12411032
Submitted batch job 12411033
Submitted batch job 12411034
Submitted batch job 12411035
Submitted batch job 12411036
Submitted batch job 12411037
Submitted batch job 12411038
Submitted batch job 12411039
Submitted batch job 12411040
Submitted batch job 12411041
Submitted batch job 12411042
Submitted batch job 12411043
Submitted batc

#### 2.1.4 Metrics Computation

In [None]:
# Submit a single job that runs the metrics computation script for the
# loss-weight ablation. The multi-valued options are passed as
# space-separated strings.
# NOTE(review): the entries of `datasets`, `cell_type_keys`, `condition_keys`
# and `experiment_ids` presumably correspond position-wise - confirm against
# compute_metrics.py.
task = "ablation"
ablation_task = "loss_weights"
datasets = "xenium_human_breast_cancer starmap_plus_mouse_cns"
cell_type_keys = "cell_states Main_molecular_cell_type"
condition_keys = "None None"
experiment_ids = "3 4"

job_id = 1
job_name_prefix = f"nichecompass_{ablation_task}_{task}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

# One entry per command line option; each is preceded by a single space in
# the final argument string.
script_arg_list = [
    f"--ablation_task {ablation_task}",
    f"--datasets {datasets}",
    f"--cell_type_keys {cell_type_keys}",
    f"--condition_keys {condition_keys}",
    f"--experiment_ids {experiment_ids}",
]
script_args = " " + " ".join(script_arg_list)

# Slurm settings that deviate from the defaults of `submit_python_script`.
slurm_overrides = {
    "t": "12:00:00",
    "p": "interactive_gpu_p",
    "gres": "gpu:1",
    "qos": "interactive_gpu",
    "nice": 9999,
}

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=reproducibility_conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    **slurm_overrides,
)

### 2.2 Gene Program & Gene Selection

Ablating:
- Gene Program Selection: Active GP Threshold Ratio, Group Lasso Regularization
- Gene Selection: L1 Regularization
- Spatial Neighborhood Graph: Number of Neighbors

#### 2.2.1 Xenium Human Breast Cancer

In [None]:
# Gene program & gene selection ablation on the Xenium human breast cancer
# dataset. One training job is submitted per combination of active GP
# threshold ratio, group lasso weight, masked L1 weight and number of
# neighbors (3 x 3 x 5 x 4 = 180 jobs); `job_id` numbers them consecutively.
job_id = 1

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "gene_program_gene_selection"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for active_gp_thresh_ratio in [0., 0.05, 0.1]:
    for lambda_group_lasso in [0, 5, 10]:
        for lambda_l1_masked in [0., 5., 10., 50., 100.]:
            for n_neighbors in [4, 8, 12, 16]:
                # One entry per command line option, in the order they are
                # passed to the training script.
                script_arg_list = [
                    f"--dataset {dataset}",
                    f"--reference_batches {reference_batches}",
                    f"--n_neighbors {n_neighbors}",
                    "--no-filter_genes",
                    "--nichenet_keep_target_genes_ratio 1.",
                    "--nichenet_max_n_target_genes_per_gp 250",
                    "--include_mebocost_gps",
                    f"--species {species}",
                    "--gp_filter_mode subset",
                    "--combine_overlap_gps",
                    "--overlap_thresh_source_genes 0.9",
                    "--overlap_thresh_target_genes 0.9",
                    "--overlap_thresh_genes 0.9",
                    "--counts_key counts",
                    "--spatial_key spatial",
                    "--adj_key spatial_connectivities",
                    "--mapping_entity_key mapping_entity",
                    "--gp_targets_mask_key nichecompass_gp_targets",
                    "--gp_sources_mask_key nichecompass_gp_sources",
                    "--gp_names_key nichecompass_gp_names",
                    f"--model_label {ablation_task}_{task}",
                    "--active_gp_names_key nichecompass_active_gp_names",
                    "--latent_key nichecompass_latent",
                    f"--active_gp_thresh_ratio {active_gp_thresh_ratio}",
                    "--gene_expr_recon_dist nb",
                    "--log_variational",
                    f"--node_label_method {node_label_method}",
                    "--n_layers_encoder 1",
                    "--n_hidden_encoder None",
                    "--conv_layer_encoder gcnconv",
                    "--n_epochs 100",
                    "--n_epochs_all_gps 25",
                    "--lr 0.001",
                    f"--lambda_edge_recon {lambda_edge_recon}",
                    f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
                    f"--lambda_group_lasso {lambda_group_lasso}",
                    f"--lambda_l1_masked {lambda_l1_masked}",
                    f"--edge_batch_size {edge_batch_size}",
                    "--node_batch_size None",
                    f"--timestamp_suffix _{job_id}",
                ]
                # Every option is preceded by a single space, including the
                # first.
                script_args = " " + " ".join(script_arg_list)

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

#### 2.2.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Gene program & gene selection ablation on the STARmap PLUS mouse CNS
# dataset. One training job is submitted per combination of active GP
# threshold ratio, group lasso weight, masked L1 weight and number of
# neighbors (3 x 3 x 5 x 4 = 180 jobs); `job_id` numbers them consecutively.
job_id = 1
for active_gp_thresh_ratio in [0., 0.05, 0.1]:
    for lambda_group_lasso in [0, 5, 10]:
        for lambda_l1_masked in [0., 5., 10., 50., 100.]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                ablation_task = "gene_program_gene_selection"
                dataset = "starmap_plus_mouse_cns"
                reference_batches = "batch1"
                species = "mouse"
                node_label_method = "one-hop-norm"
                edge_batch_size = 1024
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = (
                    f" --dataset {dataset}"
                    f" --reference_batches {reference_batches}"
                    f" --n_neighbors {n_neighbors}"
                    " --no-filter_genes"
                    " --nichenet_keep_target_genes_ratio 1."
                    " --nichenet_max_n_target_genes_per_gp 250"
                    " --include_mebocost_gps"
                    f" --species {species}"
                    " --gp_filter_mode subset"
                    " --combine_overlap_gps"
                    " --overlap_thresh_source_genes 0.9"
                    " --overlap_thresh_target_genes 0.9"
                    " --overlap_thresh_genes 0.9"
                    " --counts_key counts"
                    " --spatial_key spatial"
                    " --adj_key spatial_connectivities"
                    " --mapping_entity_key mapping_entity"
                    " --gp_targets_mask_key nichecompass_gp_targets"
                    " --gp_sources_mask_key nichecompass_gp_sources"
                    " --gp_names_key nichecompass_gp_names"
                    f" --model_label {ablation_task}_{task}"
                    " --active_gp_names_key nichecompass_active_gp_names"
                    " --latent_key nichecompass_latent"
                    f" --active_gp_thresh_ratio {active_gp_thresh_ratio}"
                    " --gene_expr_recon_dist nb"
                    " --log_variational"
                    f" --node_label_method {node_label_method}"
                    " --n_layers_encoder 1"
                    " --n_hidden_encoder None"
                    " --conv_layer_encoder gcnconv"
                    " --n_epochs 100"
                    " --n_epochs_all_gps 25"
                    " --lr 0.001"
                    f" --lambda_edge_recon {lambda_edge_recon}"
                    f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                    f" --lambda_group_lasso {lambda_group_lasso}"
                    # Pass the swept value; a fixed 0. here would submit
                    # five identical jobs per configuration.
                    f" --lambda_l1_masked {lambda_l1_masked}"
                    f" --edge_batch_size {edge_batch_size}"
                    " --node_batch_size None"
                    f" --timestamp_suffix _{job_id}"
                )

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

#### 2.2.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Gene program & gene selection ablation on the Vizgen MERFISH human ovarian
# cancer dataset. One training job is submitted per combination of active GP
# threshold ratio, group lasso weight, masked L1 weight and number of
# neighbors (3 x 3 x 5 x 4 = 180 jobs); `job_id` numbers them consecutively.
job_id = 1
for active_gp_thresh_ratio in [0., 0.05, 0.1]:
    for lambda_group_lasso in [0, 5, 10]:
        for lambda_l1_masked in [0., 5., 10., 50., 100.]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                ablation_task = "gene_program_gene_selection"
                dataset = "vizgen_merfish_human_ovarian_cancer"
                reference_batches = "batch2"
                species = "human"
                node_label_method = "one-hop-norm"
                edge_batch_size = 1024
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = (
                    f" --dataset {dataset}"
                    f" --reference_batches {reference_batches}"
                    f" --n_neighbors {n_neighbors}"
                    " --no-filter_genes"
                    " --nichenet_keep_target_genes_ratio 1."
                    " --nichenet_max_n_target_genes_per_gp 250"
                    " --include_mebocost_gps"
                    f" --species {species}"
                    " --gp_filter_mode subset"
                    " --combine_overlap_gps"
                    " --overlap_thresh_source_genes 0.9"
                    " --overlap_thresh_target_genes 0.9"
                    " --overlap_thresh_genes 0.9"
                    " --counts_key counts"
                    " --spatial_key spatial"
                    " --adj_key spatial_connectivities"
                    " --mapping_entity_key mapping_entity"
                    " --gp_targets_mask_key nichecompass_gp_targets"
                    " --gp_sources_mask_key nichecompass_gp_sources"
                    " --gp_names_key nichecompass_gp_names"
                    f" --model_label {ablation_task}_{task}"
                    " --active_gp_names_key nichecompass_active_gp_names"
                    " --latent_key nichecompass_latent"
                    f" --active_gp_thresh_ratio {active_gp_thresh_ratio}"
                    " --gene_expr_recon_dist nb"
                    " --log_variational"
                    f" --node_label_method {node_label_method}"
                    " --n_layers_encoder 1"
                    " --n_hidden_encoder None"
                    " --conv_layer_encoder gcnconv"
                    " --n_epochs 100"
                    " --n_epochs_all_gps 25"
                    " --lr 0.001"
                    f" --lambda_edge_recon {lambda_edge_recon}"
                    f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                    f" --lambda_group_lasso {lambda_group_lasso}"
                    # Pass the swept value; a fixed 0. here would submit
                    # five identical jobs per configuration.
                    f" --lambda_l1_masked {lambda_l1_masked}"
                    f" --edge_batch_size {edge_batch_size}"
                    " --node_batch_size None"
                    f" --timestamp_suffix _{job_id}"
                )

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

### 2.3 Encoder Architecture

#### 2.3.1 Xenium Human Breast Cancer

In [None]:
# Encoder architecture ablation on the Xenium human breast cancer dataset.
# One training job is submitted per combination of number of encoder layers,
# encoder conv layer type, encoder hidden size and number of neighbors
# (2 x 2 x 1 x 4 = 16 jobs); `job_id` numbers them consecutively.
job_id = 1
for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                ablation_task = "encoder_architecture"
                dataset = "xenium_human_breast_cancer"
                reference_batches = "batch1"
                species = "human"
                node_label_method = "one-hop-norm"
                edge_batch_size = 512
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.
                active_gp_thresh_ratio = 0.05
                lambda_group_lasso = 0.
                # Set explicitly so the cell does not depend on a value left
                # behind by a previously executed sweep (or fail with a
                # NameError on a fresh kernel). 0 matches the other encoder
                # architecture cells.
                lambda_l1_masked = 0.

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = (
                    f" --dataset {dataset}"
                    f" --reference_batches {reference_batches}"
                    f" --n_neighbors {n_neighbors}"
                    " --no-filter_genes"
                    " --nichenet_keep_target_genes_ratio 1."
                    " --nichenet_max_n_target_genes_per_gp 250"
                    " --include_mebocost_gps"
                    f" --species {species}"
                    " --gp_filter_mode subset"
                    " --combine_overlap_gps"
                    " --overlap_thresh_source_genes 0.9"
                    " --overlap_thresh_target_genes 0.9"
                    " --overlap_thresh_genes 0.9"
                    " --counts_key counts"
                    " --spatial_key spatial"
                    " --adj_key spatial_connectivities"
                    " --mapping_entity_key mapping_entity"
                    " --gp_targets_mask_key nichecompass_gp_targets"
                    " --gp_sources_mask_key nichecompass_gp_sources"
                    " --gp_names_key nichecompass_gp_names"
                    f" --model_label {ablation_task}_{task}"
                    " --active_gp_names_key nichecompass_active_gp_names"
                    " --latent_key nichecompass_latent"
                    f" --active_gp_thresh_ratio {active_gp_thresh_ratio}"
                    " --gene_expr_recon_dist nb"
                    " --log_variational"
                    f" --node_label_method {node_label_method}"
                    f" --n_layers_encoder {n_layers_encoder}"
                    # A hidden size of None is passed as the literal text
                    # "None".
                    f" --n_hidden_encoder {n_hidden_encoder}"
                    f" --conv_layer_encoder {conv_layer_encoder}"
                    " --n_epochs 100"
                    " --n_epochs_all_gps 25"
                    " --lr 0.001"
                    f" --lambda_edge_recon {lambda_edge_recon}"
                    f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                    f" --lambda_group_lasso {lambda_group_lasso}"
                    f" --lambda_l1_masked {lambda_l1_masked}"
                    f" --edge_batch_size {edge_batch_size}"
                    " --node_batch_size None"
                    f" --timestamp_suffix _{job_id}"
                )

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

#### 2.3.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Encoder architecture ablation on the STARmap PLUS mouse CNS dataset.
# One training job is submitted per combination of number of encoder layers,
# encoder conv layer type, encoder hidden size and number of neighbors
# (2 x 2 x 1 x 4 = 16 jobs); `job_id` numbers them consecutively.
job_id = 1

# Settings that are identical for every job of this sweep.
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
active_gp_thresh_ratio = 0.05
lambda_group_lasso = 0.

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_neighbors in [4, 8, 12, 16]:
                # One entry per command line option, in the order they are
                # passed to the training script. A hidden size of None is
                # passed as the literal text "None".
                script_arg_list = [
                    f"--dataset {dataset}",
                    f"--reference_batches {reference_batches}",
                    f"--n_neighbors {n_neighbors}",
                    "--no-filter_genes",
                    "--nichenet_keep_target_genes_ratio 1.",
                    "--nichenet_max_n_target_genes_per_gp 250",
                    "--include_mebocost_gps",
                    f"--species {species}",
                    "--gp_filter_mode subset",
                    "--combine_overlap_gps",
                    "--overlap_thresh_source_genes 0.9",
                    "--overlap_thresh_target_genes 0.9",
                    "--overlap_thresh_genes 0.9",
                    "--counts_key counts",
                    "--spatial_key spatial",
                    "--adj_key spatial_connectivities",
                    "--mapping_entity_key mapping_entity",
                    "--gp_targets_mask_key nichecompass_gp_targets",
                    "--gp_sources_mask_key nichecompass_gp_sources",
                    "--gp_names_key nichecompass_gp_names",
                    f"--model_label {ablation_task}_{task}",
                    "--active_gp_names_key nichecompass_active_gp_names",
                    "--latent_key nichecompass_latent",
                    f"--active_gp_thresh_ratio {active_gp_thresh_ratio}",
                    "--gene_expr_recon_dist nb",
                    "--log_variational",
                    f"--node_label_method {node_label_method}",
                    f"--n_layers_encoder {n_layers_encoder}",
                    f"--n_hidden_encoder {n_hidden_encoder}",
                    f"--conv_layer_encoder {conv_layer_encoder}",
                    "--n_epochs 100",
                    "--n_epochs_all_gps 25",
                    "--lr 0.001",
                    f"--lambda_edge_recon {lambda_edge_recon}",
                    f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
                    f"--lambda_group_lasso {lambda_group_lasso}",
                    "--lambda_l1_masked 0.",
                    f"--edge_batch_size {edge_batch_size}",
                    "--node_batch_size None",
                    f"--timestamp_suffix _{job_id}",
                ]
                # Every option is preceded by a single space, including the
                # first.
                script_args = " " + " ".join(script_arg_list)

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

#### 2.3.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Encoder architecture ablation on the Vizgen MERFISH human ovarian cancer
# dataset. One training job is submitted per combination of number of encoder
# layers, encoder conv layer type, encoder hidden size and number of neighbors
# (2 x 2 x 1 x 4 = 16 jobs); `job_id` numbers them consecutively.
job_id = 1
for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                ablation_task = "encoder_architecture"
                dataset = "vizgen_merfish_human_ovarian_cancer"
                reference_batches = "batch2"
                species = "human"
                node_label_method = "one-hop-norm"
                edge_batch_size = 1024
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.
                # Set explicitly (same values as the other encoder
                # architecture cells) so the cell does not depend on values
                # left behind by a previously executed cell or fail with a
                # NameError on a fresh kernel.
                active_gp_thresh_ratio = 0.05
                lambda_group_lasso = 0.

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = (
                    f" --dataset {dataset}"
                    f" --reference_batches {reference_batches}"
                    f" --n_neighbors {n_neighbors}"
                    " --no-filter_genes"
                    " --nichenet_keep_target_genes_ratio 1."
                    " --nichenet_max_n_target_genes_per_gp 250"
                    " --include_mebocost_gps"
                    f" --species {species}"
                    " --gp_filter_mode subset"
                    " --combine_overlap_gps"
                    " --overlap_thresh_source_genes 0.9"
                    " --overlap_thresh_target_genes 0.9"
                    " --overlap_thresh_genes 0.9"
                    " --counts_key counts"
                    " --spatial_key spatial"
                    " --adj_key spatial_connectivities"
                    " --mapping_entity_key mapping_entity"
                    " --gp_targets_mask_key nichecompass_gp_targets"
                    " --gp_sources_mask_key nichecompass_gp_sources"
                    " --gp_names_key nichecompass_gp_names"
                    f" --model_label {ablation_task}_{task}"
                    " --active_gp_names_key nichecompass_active_gp_names"
                    " --latent_key nichecompass_latent"
                    f" --active_gp_thresh_ratio {active_gp_thresh_ratio}"
                    " --gene_expr_recon_dist nb"
                    " --log_variational"
                    f" --node_label_method {node_label_method}"
                    f" --n_layers_encoder {n_layers_encoder}"
                    # A hidden size of None is passed as the literal text
                    # "None".
                    f" --n_hidden_encoder {n_hidden_encoder}"
                    f" --conv_layer_encoder {conv_layer_encoder}"
                    " --n_epochs 100"
                    " --n_epochs_all_gps 25"
                    " --lr 0.001"
                    f" --lambda_edge_recon {lambda_edge_recon}"
                    f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                    f" --lambda_group_lasso {lambda_group_lasso}"
                    " --lambda_l1_masked 0."
                    f" --edge_batch_size {edge_batch_size}"
                    " --node_batch_size None"
                    f" --timestamp_suffix _{job_id}"
                )

                submit_python_script(
                    job_name_prefix=job_name_prefix,
                    job_id=job_id,
                    job_folder_path=job_folder_path,
                    conda_env_name=conda_env_name,
                    script_folder_path=script_folder_path,
                    script_name=script_name,
                    script_args=script_args,
                    nice=10000,
                )

                job_id += 1

### 2.5 Categorical Covariate Embedding & Contrastive Integration

#### 2.5.1 Xenium Human Breast Cancer

In [None]:
# Grid over categorical covariate embedding injection, contrastive loss weight,
# `contrastive_logits_pos_ratio` and neighborhood size for the Xenium human
# breast cancer dataset; one Slurm job is created per combination and `job_id`
# is incremented after each one.
#
# NOTE(review): this cell reads `cat_covariates_keys`,
# `cat_covariates_no_edges`, `cat_covariates_embeds_nums`,
# `n_epochs_no_cat_covariates_contrastive`, `contrastive_logits_neg_ratio` and
# `lambda_l1_masked` without assigning them. They must already exist in the
# notebook namespace, otherwise building `script_args` raises a NameError.
job_id = 1
for cat_covariates_embeds_injection in ["encoder", "gene_expr_decoder"]:
    for lambda_cat_covariates_contrastive in [0., 1, 2]:
        # NOTE(review): the values swept here look like conv layer names, not
        # ratios -- presumably a leftover from the encoder architecture
        # ablation; confirm the intended values before submitting.
        for contrastive_logits_pos_ratio in ["gcnconv", "gatv2conv"]:
            for n_hidden_encoder in [None]:
                for n_neighbors in [4, 8, 12, 16]:
                    task = "ablation"
                    # NOTE(review): label is identical to the encoder
                    # architecture ablation; confirm that job names and model
                    # labels are meant to be shared with that ablation.
                    ablation_task = "encoder_architecture"
                    dataset = "xenium_human_breast_cancer"
                    reference_batches = "batch1"
                    species = "human"
                    node_label_method = "one-hop-norm"
                    edge_batch_size = 512
                    lambda_edge_recon = 500000.
                    lambda_gene_expr_recon = 300.
                    active_gp_thresh_ratio = 0.05
                    lambda_group_lasso = 0.
                    n_layers_encoder = 1
                    conv_layer_encoder = "gcnconv"

                    job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                    job_folder_path = f"../scripts/{task}/slurm_jobs"
                    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                    script_name = "train_nichecompass_reference_model.py"
                    script_args = f" --dataset {dataset}" \
                                  f" --reference_batches {reference_batches}" \
                                  f" --n_neighbors {n_neighbors}" \
                                  " --no-filter_genes" \
                                  " --nichenet_keep_target_genes_ratio 1." \
                                  " --nichenet_max_n_target_genes_per_gp 250" \
                                  " --include_mebocost_gps" \
                                  f" --species {species}" \
                                  " --gp_filter_mode subset" \
                                  " --combine_overlap_gps" \
                                  " --overlap_thresh_source_genes 0.9" \
                                  " --overlap_thresh_target_genes 0.9" \
                                  " --overlap_thresh_genes 0.9" \
                                  " --counts_key counts" \
                                  f" --cat_covariates_keys {cat_covariates_keys}" \
                                  f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
                                  " --spatial_key spatial" \
                                  " --adj_key spatial_connectivities" \
                                  " --mapping_entity_key mapping_entity" \
                                  " --gp_targets_mask_key nichecompass_gp_targets" \
                                  " --gp_sources_mask_key nichecompass_gp_sources" \
                                  " --gp_names_key nichecompass_gp_names" \
                                  f" --model_label {ablation_task}_{task}" \
                                  " --active_gp_names_key nichecompass_active_gp_names" \
                                  " --latent_key nichecompass_latent" \
                                  f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                                  " --gene_expr_recon_dist nb" \
                                  f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
                                  f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
                                  " --log_variational" \
                                  f" --node_label_method {node_label_method}" \
                                  f" --n_layers_encoder {n_layers_encoder}" \
                                  f" --n_hidden_encoder {n_hidden_encoder}" \
                                  f" --conv_layer_encoder {conv_layer_encoder}" \
                                  " --n_epochs 100" \
                                  " --n_epochs_all_gps 25" \
                                  f" --n_epochs_no_cat_covariates_contrastive {n_epochs_no_cat_covariates_contrastive}" \
                                  " --lr 0.001" \
                                  f" --lambda_edge_recon {lambda_edge_recon}" \
                                  f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                                  f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
                                  f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
                                  f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
                                  f" --lambda_group_lasso {lambda_group_lasso}" \
                                  f" --lambda_l1_masked {lambda_l1_masked}" \
                                  f" --edge_batch_size {edge_batch_size}" \
                                  " --node_batch_size None" \
                                  f" --timestamp_suffix _{job_id}"

                    submit_python_script(
                            job_name_prefix=job_name_prefix,
                            job_id=job_id,
                            job_folder_path=job_folder_path,
                            conda_env_name=conda_env_name,
                            script_folder_path=script_folder_path,
                            script_name=script_name,
                            script_args=script_args,
                            nice=10000)

                    # Unique id per grid point; used in the job name and in
                    # the `--timestamp_suffix` argument.
                    job_id += 1

## 3. NicheCompass Single Sample Method Benchmarking Models Training

### 3.1 seqFISH Mouse Organogenesis

#### 3.1.1 Spatial Transcriptomics Data

In [None]:
# Single-sample method benchmarking on the full seqFISH mouse organogenesis
# dataset (embryo 2): one job that receives 10 seeds / run indices.
task = "single_sample_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_embryo2"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script; every fragment carries its
# own leading space, so the fragments are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Single-sample method benchmarking on the subsampled seqFISH mouse
# organogenesis datasets: one job per subsample percentage.
# Settings that are identical for every subsample are defined once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

# The job id stays 1 for all subsamples; job names differ through the dataset.
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment carries its own leading space, so no separator is needed.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.2 STARmap PLUS Mouse Central Nervous System

#### 3.2.1 Spatial Transcriptomics Data

In [None]:
# Single-sample method benchmarking on the full STARmap PLUS mouse CNS
# dataset (batch 1): one job that receives 10 seeds / run indices.
task = "single_sample_method_benchmarking"
dataset = "starmap_plus_mouse_cns_batch1"
cell_type_key = "Main_molecular_cell_type"
species = "mouse"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script; every fragment carries its
# own leading space, so the fragments are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Single-sample method benchmarking on the subsampled STARmap PLUS mouse CNS
# datasets: one job per subsample percentage.
# Settings that are identical for every subsample are defined once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "Main_molecular_cell_type"
species = "mouse"
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

# The job id stays 1 for all subsamples; job names differ through the dataset.
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"starmap_plus_mouse_cns_subsample_{subsample_pct}pct_batch1"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment carries its own leading space, so no separator is needed.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.3 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 3.3.1 Spatial Transcriptomics Data

In [None]:
# Single-sample method benchmarking on the full nanoString CosMx SMI human
# NSCLC dataset: one job that receives 10 seeds / run indices.
task = "single_sample_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
cell_type_key = "cell_type"
# The dataset is human tissue, so the species passed to the training script
# must be "human" (it was previously set to "mouse").
species = "human"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset

job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments for the training script; every fragment carries its
# own leading space.
script_args =  " --adata_new_name None" \
               " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20" \
               f" --edge_batch_size_list {edge_batch_size_str}" \
               " --node_batch_size_list None None None None None None None None None None" \
               " --seeds 0 1 2 3 4 5 6 7 8 9" \
               " --run_index 1 2 3 4 5 6 7 8 9 10" \
               f" --cell_type_key {cell_type_key}" \
               " --nichenet_keep_target_genes_ratio 1." \
               " --nichenet_max_n_target_genes_per_gp 250" \
               " --include_mebocost_gps" \
               f" --species {species}" \
               " --gp_filter_mode subset" \
               " --combine_overlap_gps" \
               " --overlap_thresh_source_genes 0.9" \
               " --overlap_thresh_target_genes 0.9" \
               " --overlap_thresh_genes 0.9" \
               f" --dataset {dataset}" \
               " --reference_batches None" \
               " --counts_key counts" \
               " --spatial_key spatial" \
               " --adj_key spatial_connectivities" \
               " --mapping_entity_key mapping_entity" \
               " --no-filter_genes" \
               " --gp_targets_mask_key nichecompass_gp_targets" \
               " --gp_sources_mask_key nichecompass_gp_sources" \
               " --gp_names_key nichecompass_gp_names" \
               f" --model_label {node_label_method}_{task}" \
               " --active_gp_names_key nichecompass_active_gp_names" \
               " --latent_key nichecompass_latent" \
               " --active_gp_thresh_ratio 0.05" \
               " --gene_expr_recon_dist nb" \
               " --log_variational" \
               f" --node_label_method {node_label_method}" \
               " --n_layers_encoder 1" \
               " --n_hidden_encoder None" \
               " --conv_layer_encoder gcnconv" \
               " --n_epochs 100" \
               " --n_epochs_all_gps 25" \
               " --lr 0.001" \
               " --lambda_edge_recon 500000." \
               " --lambda_gene_expr_recon 300." \
               " --lambda_group_lasso 0." \
               " --lambda_l1_masked 0." \
               f" --timestamp_suffix _{job_id}"

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

#### 3.3.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Single-sample method benchmarking on the subsampled nanoString CosMx SMI
# human NSCLC datasets: one job per subsample percentage.
for subsample_pct in [50, 25, 10, 5, 1]:
    task = "single_sample_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    cell_type_key = "cell_type"
    # The dataset is human tissue, so the species passed to the training
    # script must be "human" (it was previously set to "mouse").
    species = "human"
    node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
    edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset

    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
    # The job id stays 1 for all subsamples; job names differ through the
    # dataset name, which contains the subsample percentage.
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"
    # Command line arguments for the training script; every fragment carries
    # its own leading space.
    script_args =  " --adata_new_name None" \
                   " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20" \
                   f" --edge_batch_size_list {edge_batch_size_str}" \
                   " --node_batch_size_list None None None None None None None None None None" \
                   " --seeds 0 1 2 3 4 5 6 7 8 9" \
                   " --run_index 1 2 3 4 5 6 7 8 9 10" \
                   f" --cell_type_key {cell_type_key}" \
                   " --nichenet_keep_target_genes_ratio 1." \
                   " --nichenet_max_n_target_genes_per_gp 250" \
                   " --include_mebocost_gps" \
                   f" --species {species}" \
                   " --gp_filter_mode subset" \
                   " --combine_overlap_gps" \
                   " --overlap_thresh_source_genes 0.9" \
                   " --overlap_thresh_target_genes 0.9" \
                   " --overlap_thresh_genes 0.9" \
                   f" --dataset {dataset}" \
                   " --reference_batches None" \
                   " --counts_key counts" \
                   " --spatial_key spatial" \
                   " --adj_key spatial_connectivities" \
                   " --mapping_entity_key mapping_entity" \
                   " --no-filter_genes" \
                   " --gp_targets_mask_key nichecompass_gp_targets" \
                   " --gp_sources_mask_key nichecompass_gp_sources" \
                   " --gp_names_key nichecompass_gp_names" \
                   f" --model_label {node_label_method}_{task}" \
                   " --active_gp_names_key nichecompass_active_gp_names" \
                   " --latent_key nichecompass_latent" \
                   " --active_gp_thresh_ratio 0.05" \
                   " --gene_expr_recon_dist nb" \
                   " --log_variational" \
                   f" --node_label_method {node_label_method}" \
                   " --n_layers_encoder 1" \
                   " --n_hidden_encoder None" \
                   " --conv_layer_encoder gcnconv" \
                   " --n_epochs 100" \
                   " --n_epochs_all_gps 25" \
                   " --lr 0.001" \
                   " --lambda_edge_recon 500000." \
                   " --lambda_gene_expr_recon 300." \
                   " --lambda_group_lasso 0." \
                   " --lambda_l1_masked 0." \
                   f" --timestamp_suffix _{job_id}"

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

### 3.4 Vizgen MERFISH Mouse Liver

#### 3.4.1 Spatial Transcriptomics Data

In [None]:
# Single-sample method benchmarking on the full Vizgen MERFISH mouse liver
# dataset: one job that receives 10 seeds / run indices.
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
cell_type_key = "Cell_Type"
species = "mouse"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script; every fragment carries its
# own leading space, so the fragments are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.4.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Single-sample method benchmarking on the subsampled Vizgen MERFISH mouse
# liver datasets: one job per subsample percentage.
# Settings that are identical for every subsample are defined once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
# One edge batch size per run; 16384 is full dataset
edge_batch_size_str = " ".join(["16384"] * 10)

# The job id stays 1 for all subsamples; job names differ through the dataset.
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment carries its own leading space, so no separator is needed.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.5 Slide-seqV2 Mouse Hippocampus

#### 3.5.1 Spatial Transcriptomics Data

In [None]:
# Submit the NicheCompass single-sample benchmarking job for the full
# Slide-seqV2 mouse hippocampus dataset.

# Dataset and model settings.
task = "single_sample_method_benchmarking"
dataset = "slideseqv2_mouse_hippocampus"
species = "mouse"
cell_type_key = "cell_type"
node_label_method = "one-hop-norm"  # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"  # 16384 is full dataset

# Slurm job identification and file locations.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script. Every fragment starts with
# a space, so joining them with an empty separator yields one argument string.
# The list-valued options each carry ten entries (presumably one per run
# index -- confirm against the training script).
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.5.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass single-sample benchmarking job per subsample of the
# Slide-seqV2 mouse hippocampus dataset.

# Settings that are identical for every subsample are assigned once up front.
task = "single_sample_method_benchmarking"
species = "mouse"
cell_type_key = "cell_type"
node_label_method = "one-hop-attention"  # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"  # 16384 is full dataset

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # The dataset name (and therefore the job name) encodes the subsample
    # percentage, so each iteration produces a distinct job.
    dataset = f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment starts with a space, so an empty-separator join yields
    # one argument string for the training script.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file for this subsample and submit it.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

## 4. NicheCompass Sample Integration Method Benchmarking Models Training

### 4.1 seqFISH Mouse Organogenesis

#### 4.1.1 Spatial Transcriptomics Data

In [None]:
# Submit the NicheCompass sample-integration benchmarking job for the full
# seqFISH mouse organogenesis dataset.

# Dataset and model settings.
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate (batch) settings.
cat_covariates_keys = "batch"
cat_covariates_embeds_nums = "6"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read below but never assigned in
# this cell; it has to exist in the notebook namespace already (e.g. from
# another cell that was run earlier) -- confirm the intended value.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

# Slurm job identification and file locations.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script. Every fragment starts with
# a space, so joining them with an empty separator yields one argument string.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass sample-integration benchmarking job per subsample of
# the seqFISH mouse organogenesis dataset.

# Settings that are identical for every subsample are assigned once up front.
task = "sample_integration_method_benchmarking"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate (batch) settings.
cat_covariates_keys = "batch"
cat_covariates_embeds_nums = "6"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read inside the loop but never
# assigned in this cell; it has to exist in the notebook namespace already
# (e.g. from another cell that was run earlier) -- confirm the intended value.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # The dataset name (and therefore the job name) encodes the subsample
    # percentage, so each iteration produces a distinct job.
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment starts with a space, so an empty-separator join yields
    # one argument string for the training script.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file for this subsample and submit it.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 4.2 STARmap PLUS Mouse Central Nervous System

#### 4.2.1 Spatial Transcriptomics Data

In [None]:
# Submit the NicheCompass sample-integration benchmarking job for the full
# STARmap PLUS mouse central nervous system dataset.

# Dataset and model settings.
task = "sample_integration_method_benchmarking"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
reference_batches = "batch1 batch2 batch3"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate (batch) settings.
cat_covariates_keys = "batch"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read below but never assigned in
# this cell; it has to exist in the notebook namespace already (e.g. from
# another cell that was run earlier) -- confirm the intended value.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

# Slurm job identification and file locations.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script. Every fragment starts with
# a space, so joining them with an empty separator yields one argument string.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass sample-integration benchmarking job per subsample of
# the STARmap PLUS mouse central nervous system dataset.

# Settings that are identical for every subsample are assigned once up front.
task = "sample_integration_method_benchmarking"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
reference_batches = "batch1 batch2 batch3"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate (batch) settings.
cat_covariates_keys = "batch"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read inside the loop but never
# assigned in this cell; it has to exist in the notebook namespace already
# (e.g. from another cell that was run earlier) -- confirm the intended value.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # The dataset name (and therefore the job name) encodes the subsample
    # percentage, so each iteration produces a distinct job.
    dataset = f"starmap_plus_mouse_cns_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment starts with a space, so an empty-separator join yields
    # one argument string for the training script.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file for this subsample and submit it.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 4.3 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 4.3.1 Spatial Transcriptomics Data

In [None]:
# Submit the NicheCompass sample-integration benchmarking job for the full
# nanoString CosMx SMI human NSCLC dataset.

# Dataset and model settings.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate settings: two keys ("batch" and "fov"), each with its
# own entry in the embedding sizes string.
cat_covariates_keys = "batch fov"
cat_covariates_embeds_nums = "3 10"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read below but never assigned in
# this cell; it has to exist in the notebook namespace already (e.g. from
# another cell that was run earlier) -- confirm the intended value, and that
# it provides one entry per covariate key if the training script expects that.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

# Slurm job identification and file locations.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script. Every fragment starts with
# a space, so joining them with an empty separator yields one argument string.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.3.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass sample-integration benchmarking job per subsample of
# the nanoString CosMx SMI human NSCLC dataset.

# Settings that are identical for every subsample are assigned once up front.
task = "sample_integration_method_benchmarking"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"
node_label_method = "one-hop-norm"
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384"

# Categorical covariate settings: two keys ("batch" and "fov"), each with its
# own entry in the embedding sizes string.
cat_covariates_keys = "batch fov"
cat_covariates_embeds_nums = "3 10"
cat_covariates_embeds_injection = "gene_expr_decoder"
# NOTE(review): `cat_covariates_no_edges` is read inside the loop but never
# assigned in this cell; it has to exist in the notebook namespace already
# (e.g. from another cell that was run earlier) -- confirm the intended value,
# and that it provides one entry per covariate key if the training script
# expects that.

# Contrastive loss settings (formatted as "0.0" in the argument string).
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # The dataset name (and therefore the job name) encodes the subsample
    # percentage, so each iteration produces a distinct job.
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment starts with a space, so an empty-separator join yields
    # one argument string for the training script.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file for this subsample and submit it.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

## 5. NicheCompass Reference Model Training

### 5.1 seqFISH Mouse Organogenesis Imputed

In [None]:
# Submit reference-model training job 2 for the imputed seqFISH mouse
# organogenesis dataset.

# --- Job identity ---
task = "reference"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 2

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
n_neighbors = 12
n_hvg = 4000
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = "4096"
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
# NOTE(review): both --condition_key and --cat_covariates_keys are passed;
# confirm the training script still accepts --condition_key.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --filter_genes"
    f" --n_hvg {n_hvg}"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [12]:
# Submit reference-model training job 4 for the imputed seqFISH mouse
# organogenesis dataset.

# --- Job identity ---
task = "reference"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 4

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
n_neighbors = 12
n_hvg = 4000
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = "4096"
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 30.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
# NOTE(review): both --condition_key and --cat_covariates_keys are passed;
# confirm the training script still accepts --condition_key.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --filter_genes"
    f" --n_hvg {n_hvg}"
    " --nichenet_keep_target_genes_ratio 0.1"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12420065


### 5.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Submit reference-model training job 1 for the STARmap PLUS mouse CNS
# dataset.

# --- Job identity ---
task = "reference"
dataset = "starmap_plus_mouse_cns"
job_id = 1

# --- Values interpolated into the command line below ---
# Only the first three batches are used for this run. The cell previously
# assigned the full "batch1 ... batch20" list first and overwrote it on the
# next line; that dead assignment has been removed.
reference_batches = "batch1 batch2 batch3"
n_neighbors = 12 # 8, 12
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 2048 # 2048
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
# Set the covariate keys here rather than inheriting whatever a previously
# executed cell left in the kernel (NameError on a fresh kernel, or a stale
# value from an unrelated dataset otherwise).
# NOTE(review): "batch" / "True" match what the preceding cell leaves behind
# when the notebook is run top to bottom, and pair with the single embedding
# size below; confirm they are the intended settings for this dataset.
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0. # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Submit reference-model training job 3 for the STARmap PLUS mouse CNS
# dataset, using all 20 batches.

# --- Job identity ---
task = "reference"
dataset = "starmap_plus_mouse_cns"
job_id = 3

# --- Values interpolated into the command line below ---
reference_batches = (
    "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
    " batch9 batch10 batch11 batch12 batch13 batch14 batch15 batch16 batch17 batch18"
    " batch19 batch20"
)
n_neighbors = 12 # 8, 12
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 2048
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
n_cond_embed = 20 # 3, 20, None
lambda_cat_covariates_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.03125 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
# NOTE(review): this cell passes --condition_key / --n_cond_embed, whereas the
# other cells that call the same script pass --cat_covariates_keys,
# --cat_covariates_no_edges and --cat_covariates_embeds_nums; confirm the
# script still accepts the flags used here.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.3 NanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

In [None]:
# Submit reference-model training job 1 for the NanoString CosMx human NSCLC
# dataset, with "batch" as the only categorical covariate key.

# --- Job identity ---
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 1

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3"
n_neighbors = 12 # 4, 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 4096, 2048, 512
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Submit a reference-model training job for the NanoString CosMx human NSCLC
# dataset, with "fov" as the only categorical covariate key.

# --- Job identity ---
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
# NOTE(review): job_id 1 is also used by the preceding NSCLC cell with the
# same dataset, label method and task. The job file and log paths are built
# from the job name, and the timestamp suffix from job_id, so running both
# cells makes them share those names. Confirm whether a distinct id is wanted.
job_id = 1

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3"
n_neighbors = 12 # 4, 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 4096, 2048, 512
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "fov"
cat_covariates_no_edges = "False"
cat_covariates_embeds_nums = "10"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Submit reference-model training job 10 for the NanoString CosMx human NSCLC
# dataset, with two categorical covariate keys ("batch" and "fov").

# --- Job identity ---
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 10

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3"
n_neighbors = 12 # 4, 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 4096, 2048, 512
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
# The three covariate settings below are space-separated lists with one entry
# per key.
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 10"
lambda_cat_covariates_contrastive = 100000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Submit reference-model training job 11 for the NanoString CosMx human NSCLC
# dataset, with the categorical covariate settings set to None.

# --- Job identity ---
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 11

# --- Values interpolated into the command line below ---
reference_batches = "batch1 batch2 batch3"
n_neighbors = 12 # 4, 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 4096, 2048, 512
# Alternative value tried for the injection setting: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
# These None values are formatted into the argument string as the literal
# text "None" (e.g. " --cat_covariates_keys None").
# NOTE(review): presumably the training script maps that text back to a
# missing value; verify against its argument parsing.
cat_covariates_keys = None
cat_covariates_no_edges = None
cat_covariates_embeds_nums = None
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Paths and names handed to submit_python_script ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a single space, so the adjacent literals
# concatenate into one flat argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Reference model training job for the NanoString CosMx human NSCLC dataset.
# The cell sets the run parameters, assembles the command line arguments for
# the training script and hands everything to `submit_python_script`.

# Job identification
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 15

# Data and graph parameters (alternatives that were tried are kept as comments)
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
n_neighbors = 12 # 4, 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512 # 4096, 2048, 512

# Categorical covariates (one embedding size per covariate key)
cat_covariates_embeds_injection = "encoder gene_expr_decoder" # "encoder gene_expr_decoder"
cat_covariates_keys = "patient batch fov"
cat_covariates_embeds_nums = "5 8 20"

# Loss weights
lambda_cat_covariates_contrastive = 250000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment carries its own leading space, so joining without a
# separator yields the complete argument string.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.4 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Reference model training job for the Vizgen MERFISH human ovarian cancer
# dataset. The cell sets the run parameters, assembles the command line
# arguments for the training script and hands everything to
# `submit_python_script`.

# Job identification
task = "reference"
dataset = "vizgen_merfish_human_ovarian_cancer"
job_id = 2

# Data and graph parameters (alternatives that were tried are kept as comments)
reference_batches = "batch1 batch2 batch3 batch4"
n_neighbors = 4 # 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 2048

# Categorical covariates (one entry per covariate key)
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
cat_covariates_keys = "patient batch"
cat_covariates_no_edges = "True True"
cat_covariates_embeds_nums = "2 4"

# Loss weights
lambda_cat_covariates_contrastive = 0. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0. # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment carries its own leading space, so joining without a
# separator yields the complete argument string.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.4 Vizgen MERFISH Human Lung Cancer

In [None]:
# Reference model training job for the Vizgen MERFISH human lung cancer
# dataset. The cell sets the run parameters, assembles the command line
# arguments for the training script and hands everything to
# `submit_python_script`.

# Job identification
task = "reference"
dataset = "vizgen_merfish_human_lung_cancer"
job_id = 1

# Data and graph parameters (alternatives that were tried are kept as comments)
reference_batches = "batch1 batch2"
n_neighbors = 4 # 8, 12
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 2048

# Categorical covariates.
# `cat_covariates_keys` and `cat_covariates_no_edges` are assigned here so the
# cell no longer depends on values left in the kernel by a previously executed
# cell (NameError on a fresh kernel, or two covariates paired with the single
# entry in `cat_covariates_embeds_nums` after the ovarian cancer cell).
# NOTE(review): a single `batch` covariate is assumed, to match the one entry
# in `cat_covariates_embeds_nums` - confirm against the dataset.
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "2"

# Loss weights
lambda_cat_covariates_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.125 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Adjacent string literals inside the parentheses are concatenated; every
# fragment carries its own leading space.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.5 Spatial ATAC-RNA-Seq Mouse Embryo & Brain

In [None]:
# Reference model training job for the spatial ATAC-RNA-seq mouse brain
# dataset. Besides the gene expression arguments, the ATAC modality is enabled
# via --include_atac_modality / --filter_peaks and weighted with
# --lambda_chrom_access_recon.
#
# NOTE(review): this cell reads `cat_covariates_keys`,
# `cat_covariates_no_edges`, `cat_covariates_embeds_injection` and
# `cat_covariates_embeds_nums` without assigning them. It therefore uses
# whatever an earlier executed cell left in the kernel and raises NameError on
# a fresh kernel - confirm the intended values and assign them here.
# NOTE(review): `--mebocost_species` and `--contrastive_logits_ratio` differ
# from the sibling cells in this notebook, which pass `--species` and
# `--contrastive_logits_pos_ratio` / `--contrastive_logits_neg_ratio` - verify
# against the argument parser of the training script.
task = "reference"
dataset = "spatial_atac_rna_seq_mouse_brain"
reference_batches = "None"
n_neighbors = 12
species = "mouse"
node_label_method = "one-hop-norm"
# Given as a string here (sibling cells use an int); the formatted argument is
# the same either way.
edge_batch_size = "4096"

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"
# Command line arguments for the training script; every fragment carries its
# own leading space.
script_args = f" --dataset {dataset}" \
              f" --reference_batches {reference_batches}" \
              f" --n_neighbors {n_neighbors}" \
              " --filter_genes" \
              " --n_hvg 3000" \
              " --nichenet_keep_target_genes_ratio 1." \
              " --nichenet_max_n_target_genes_per_gp 250" \
              " --include_mebocost_gps" \
              f" --mebocost_species {species}" \
              " --gp_filter_mode subset" \
              " --combine_overlap_gps" \
              " --overlap_thresh_source_genes 0.9" \
              " --overlap_thresh_target_genes 0.9" \
              " --overlap_thresh_genes 0.9" \
              " --counts_key counts" \
              " --condition_key batch" \
              f" --cat_covariates_keys {cat_covariates_keys}" \
              f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
              " --spatial_key spatial" \
              " --adj_key spatial_connectivities" \
              " --mapping_entity_key mapping_entity" \
              " --gp_targets_mask_key nichecompass_gp_targets" \
              " --gp_sources_mask_key nichecompass_gp_sources" \
              " --gp_names_key nichecompass_gp_names" \
              " --include_atac_modality" \
              " --filter_peaks" \
              " --min_cell_peak_thresh_ratio 0.0005" \
              f" --model_label {node_label_method}_{task}" \
              " --active_gp_names_key nichecompass_active_gp_names" \
              " --latent_key nichecompass_latent" \
              " --active_gp_thresh_ratio 0.05" \
              " --gene_expr_recon_dist nb" \
              f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
              f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
              " --log_variational" \
              f" --node_label_method {node_label_method}" \
              " --n_layers_encoder 1" \
              " --n_hidden_encoder None" \
              " --conv_layer_encoder gcnconv" \
              " --n_epochs 100" \
              " --n_epochs_all_gps 25" \
              " --n_epochs_no_cat_covariates_contrastive 0" \
              " --lr 0.001" \
              " --lambda_edge_recon 500000." \
              " --lambda_gene_expr_recon 300." \
              " --lambda_chrom_access_recon 100." \
              " --lambda_cat_covariates_contrastive 0." \
              " --contrastive_logits_ratio 0." \
              " --lambda_group_lasso 0." \
              " --lambda_l1_masked 5." \
              f" --edge_batch_size {edge_batch_size}" \
              " --node_batch_size None" \
              f" --timestamp_suffix _{job_id}"

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

### 5.6 Xenium Human Breast Cancer

In [None]:
# Reference model training job for the Xenium human breast cancer dataset.
# The cell sets the run parameters, assembles the command line arguments for
# the training script and hands everything to `submit_python_script`.

# Job identification
task = "reference"
dataset = "xenium_human_breast_cancer"
job_id = 2

# Data and graph parameters (alternatives that were tried are kept as comments)
reference_batches = "batch1 batch2"
n_neighbors = 4 # 4, 8, 12
species = "human"
node_label_method = "one-hop-attention"
edge_batch_size = 4096 # 4096 (4, 8), 1024 (12)

# Categorical covariates.
# `cat_covariates_keys` and `cat_covariates_no_edges` are assigned here so the
# cell no longer depends on values left in the kernel by a previously executed
# cell (NameError on a fresh kernel, or a covariate list that does not match
# the single entry in `cat_covariates_embeds_nums`).
# NOTE(review): a single `batch` covariate is assumed, to match the one entry
# in `cat_covariates_embeds_nums` - confirm against the dataset.
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "2"

# Loss weights
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0. # 0., 5., 10.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Adjacent string literals inside the parentheses are concatenated; every
# fragment carries its own leading space.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

## 6. NicheCompass Reference Query Mapping

### 6.1 seqFISH Mouse Organogenesis Imputed

#### 6.1.1 Reference Model Training

In [5]:
# Reference model training job (reference batches only) for the imputed
# seqFISH mouse organogenesis dataset. The cell sets the run parameters,
# assembles the command line arguments for the training script and hands
# everything to `submit_python_script`.

# Job identification
task = "reference_query"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 1

# Data and graph parameters
reference_batches = "batch1 batch2 batch3 batch4"
n_neighbors = 12
n_hvg = 3000
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 4096

# Categorical covariates (one entry per covariate key)
cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"

# Loss weights
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_reference_only"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment carries its own leading space, so joining without a
# separator yields the complete argument string.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --filter_genes",
    f" --n_hvg {n_hvg}",
    " --nichenet_keep_target_genes_ratio 0.1",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    " --condition_key batch",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}_reference_only",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

# This job overrides the default time limit, partition and qos to run on the
# interactive GPU partition.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    t="12:00:00",
    p="interactive_gpu_p",
    gres="gpu:1",
    qos="interactive_gpu",
    nice=9999,
)

Submitted batch job 12320752


#### 6.1.2 Query Mapping

In [6]:
# Query mapping job for the imputed seqFISH mouse organogenesis dataset.
# The cell sets the run parameters, assembles the command line arguments for
# the query mapping script and hands everything to `submit_python_script`.

# Job identification
task = "reference_query"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 1

# Data and graph parameters
query_batches = "batch5 batch6"
n_neighbors = 12
node_label_method = "one-hop-norm"
# Passed as --load_timestamp together with the reference model label below.
load_timestamp = "01072023_165203_1"

# Loss weights and batch size
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.
edge_batch_size = 4096

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# Parenthesised implicit concatenation replaces the backslash continuations:
# the last fragment previously ended in a dangling backslash, so the statement
# only terminated because a blank line happened to follow it. Every fragment
# carries its own leading space.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    f" --reference_model_label {node_label_method}_{task}_reference_only"
    f" --load_timestamp {load_timestamp}"
    f" --query_model_label {node_label_method}_{task}_query_only"
    f" --reference_query_model_label {node_label_method}_{task}_query_mapping"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
)

# This job overrides the default time limit, partition and qos to run on the
# interactive GPU partition.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    t="12:00:00",
    p="interactive_gpu_p",
    gres="gpu:1",
    qos="interactive_gpu",
    nice=9999,
)

Submitted batch job 12323212


### 6.2 STARmap PLUS Mouse Central Nervous System

#### 6.2.1 Reference Model Training

In [12]:
# Reference model training job for the STARmap PLUS mouse central nervous
# system dataset. The cell sets the run parameters, assembles the command line
# arguments for the training script and hands everything to
# `submit_python_script`.

# Job identification
task = "reference_query"
dataset = "starmap_plus_mouse_cns"
job_id = 4

# Data and graph parameters (alternatives that were tried are kept as comments)
reference_batches = "batch1 batch2"
n_neighbors = 8 # 8, 12
n_sampled_neighbors = 4
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# Categorical covariates (one entry per covariate key)
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"

# Loss weights
lambda_cat_covariates_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.0625 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment carries its own leading space, so joining without a
# separator yields the complete argument string.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 0.1",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}_reference_only",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
    f" --timestamp_suffix _{job_id}",
])

# Remaining Slurm settings (time limit, partition, gres, qos) use the
# defaults of `submit_python_script`.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=9999,
)

Submitted batch job 12428141


#### 6.2.2 Query Mapping

In [4]:
# Query mapping job for the STARmap PLUS mouse central nervous system dataset.
# The cell sets the run parameters, assembles the command line arguments for
# the query mapping script and hands everything to `submit_python_script`.

# Job identification
task = "reference_query"
dataset = "starmap_plus_mouse_cns"
job_id = 1

# Data and graph parameters
# NOTE(review): the reference training cell in section 6.2.1 is shown with
# n_neighbors = 8 and job_id = 4, whereas this cell uses n_neighbors = 12 and a
# load_timestamp ending in "_1" - confirm that the intended reference run is
# loaded and that the neighborhood size matches it.
query_batches = "batch3"
n_neighbors = 12
node_label_method = "one-hop-norm"
load_timestamp = "06072023_114143_1"

# Loss weights and batch size
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.
edge_batch_size = 1024

# Job file locations and the script to run
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# Parenthesised implicit concatenation replaces the backslash continuations:
# the last fragment previously ended in a dangling backslash, so the statement
# only terminated because a blank line happened to follow it. Every fragment
# carries its own leading space.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    f" --reference_model_label {node_label_method}_{task}_reference_only"
    f" --load_timestamp {load_timestamp}"
    f" --query_model_label {node_label_method}_{task}_query_only"
    f" --reference_query_model_label {node_label_method}_{task}_query_mapping"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
)

# This job overrides the default time limit, partition and qos to run on the
# interactive GPU partition.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    t="12:00:00",
    p="interactive_gpu_p",
    gres="gpu:1",
    qos="interactive_gpu",
    nice=9999,
)

Submitted batch job 12422983
