# Slurm Job Submission

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 20.03.2023
- **Date of Last Modification:** 05.09.2023

## 1. Setup

### 1.1 Import Libraries

In [1]:
import numpy as np
import os
import glob
import scanpy as sc
import gc

### 1.2 Define Parameters

In [2]:
conda_env_name = "nichecompass"
reproducibility_conda_env_name = "nichecompass-reproducibility"

### 1.3 Define Functions

In [3]:
def submit_python_script(
        job_name_prefix,
        job_id,
        job_folder_path,
        conda_env_name,
        script_folder_path,
        script_name,
        script_args,
        t="48:00:00",
        p="gpu_p",
        gres="gpu:1",
        qos="gpu_normal",
        cores="20",
        mem='160',
        nice=10000):
    """
    Write a Slurm batch file that runs a Python script and submit it with
    ``sbatch``.

    The batch file is written to
    ``{job_folder_path}/job_{job_id}_{job_name_prefix}.cmd``. It sources
    ``$HOME/.bashrc``, activates ``conda_env_name``, changes into
    ``{script_folder_path}/slurm_jobs`` and runs
    ``python ../../{script_name}{script_args}``.

    Parameters
    ----------
    job_name_prefix:
        Second part of the job name; the job name is
        ``{job_id}_{job_name_prefix}``.
    job_id:
        First part of the job name (any value that can be formatted into a
        string).
    job_folder_path:
        Folder in which the batch file is written. The folder and a ``logs``
        subfolder are created if they do not exist.
    conda_env_name:
        Name of the conda environment that the job activates.
    script_folder_path:
        The job changes into ``{script_folder_path}/slurm_jobs`` before
        running the script.
    script_name:
        File name of the Python script; it is invoked as
        ``../../{script_name}`` relative to the folder above.
    script_args:
        Command line arguments appended verbatim directly after the script
        name, so the string has to start with a space.
    t:
        Value of the ``#SBATCH -t`` directive.
    p:
        Value of the ``#SBATCH -p`` directive.
    gres:
        Value of ``#SBATCH --gres``; only written if ``p`` contains ``"gpu"``.
    qos:
        Value of ``#SBATCH --qos``.
    cores:
        Value of the ``#SBATCH -c`` directive.
    mem:
        Value of ``#SBATCH --mem``; a ``G`` suffix is appended.
    nice:
        Value of ``#SBATCH --nice``.

    Returns
    -------
    None. The output of ``sbatch`` is echoed; a failed submission is reported
    on stderr instead of raising, so that a loop over several jobs keeps
    going.
    """
    # Imported locally so that the function does not depend on additional
    # imports in the setup cell.
    import subprocess
    import sys

    job_name = f"{job_id}_{job_name_prefix}"
    job_file_path = f"{job_folder_path}/job_{job_name}.cmd"
    # NOTE(review): these log paths are relative and are written to the job
    # file as is, while the folder created below is
    # `{job_folder_path}/logs`. Presumably Slurm resolves them against the
    # directory from which `sbatch` is called, which may be a different
    # `logs` folder -- confirm before relying on the location.
    out_file_path = f"logs/out_{job_name}.txt"
    err_file_path = f"logs/err_{job_name}.txt"

    os.makedirs(job_folder_path + "/logs", exist_ok=True)

    job_lines = [
        "#!/bin/bash",
        f"#SBATCH -J {job_name}",
        f"#SBATCH -o {out_file_path}",
        f"#SBATCH -e {err_file_path}",
        f"#SBATCH -t {t}",
        f"#SBATCH -p {p}",
        f"#SBATCH -c {cores}",
    ]
    if "gpu" in p:
        # Only request generic resources on GPU partitions.
        job_lines.append(f"#SBATCH --gres={gres}")
    job_lines += [
        f"#SBATCH --qos={qos}",
        f"#SBATCH --mem={mem}G",
        f"#SBATCH --nice={nice}",
        "source $HOME/.bashrc",
        f"conda activate {conda_env_name}",
        f"cd {script_folder_path}/slurm_jobs",
        f"python ../../{script_name}{script_args}",
    ]

    with open(job_file_path, "w") as handle:
        handle.write("\n".join(job_lines) + "\n")

    # Pass the arguments as a list (no shell involved), so that paths with
    # spaces or shell metacharacters cannot break or alter the command.
    try:
        result = subprocess.run(
            ["sbatch", job_file_path],
            capture_output=True,
            text=True,
            check=False)
    except OSError as err:
        # E.g. `sbatch` is not installed on this machine.
        print(f"Could not run sbatch for '{job_file_path}': {err}",
              file=sys.stderr)
        return

    if result.stdout:
        print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="", file=sys.stderr)
    if result.returncode != 0:
        print(f"sbatch exited with code {result.returncode} for "
              f"'{job_file_path}'.",
              file=sys.stderr)

## 5. NicheCompass Reference Model

### 5.3 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 5.3.2 train reference

In [None]:
job_id = 1
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 3
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 5
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.0078125 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 1000000.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 7
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.0078125 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 1000000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 9
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.015625 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 1000000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 11
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.015625 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 500000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 14
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.03125 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 500000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 16
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.015625 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 100000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 18
cat_covariates_embeds_nums = "3 30 15" # "3 30 15"
contrastive_logits_pos_ratio = 0.015625 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 1000000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01

In [None]:
job_id = 20
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0.015625 # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 1000000.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.1 # 0.01

In [4]:
# no contrastive, higher covar embedding node
job_id = 21
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 15" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
lambda_edge_recon = 5000000.
lambda_gene_expr_recon = 3000.

In [6]:
# no contrastive, higher covar embedding node
job_id = 22
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 30" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
lambda_edge_recon = 5000000.
lambda_gene_expr_recon = 3000.

In [8]:
# no contrastive, decrease kl
job_id = 23
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 25000000.
lambda_gene_expr_recon = 15000.

In [10]:
# no contrastive, decrease kl
job_id = 24
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 25000000.
lambda_gene_expr_recon = 15000.

In [12]:
# no contrastive, decrease kl
job_id = 25
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [14]:
# no contrastive, decrease kl
job_id = 26
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [16]:
# no contrastive, decrease kl, -5r3 - 6 (batch3 and batch4 are left out of reference_batches)
job_id = 27
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [18]:
# no contrastive, -5r3 - 13, decrease kl (batch3 and batch8 are left out of reference_batches)
job_id = 28
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [None]:
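# no contrastive, -5r3 - 6
# NOTE(review): the recon weights below are the same "decrease kl" values as in job 27 (50000000. / 30000.) -- confirm this is intended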
job_id = 29
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [None]:
# no contrastive, -5r3 - 13
job_id = 30
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# NOTE(review): these recon weights equal those of jobs 21/22 (5000000. / 3000.), so the KL weight does not appear to be decreased in this run
lambda_edge_recon = 5000000.
lambda_gene_expr_recon = 3000.

In [4]:
# no contrastive, decrease kl
job_id = 31
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 500000000.
lambda_gene_expr_recon = 300000.

In [6]:
# no contrastive, decrease kl
job_id = 32
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 500000000.
lambda_gene_expr_recon = 300000.

In [8]:
# no contrastive, decrease kl
job_id = 33
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 5000000000.
lambda_gene_expr_recon = 3000000.

In [10]:
# no contrastive, decrease kl
job_id = 34
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 5000000000.
lambda_gene_expr_recon = 3000000.

In [12]:
# no contrastive, decrease kl
job_id = 35
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000000.
lambda_gene_expr_recon = 30000000.

In [14]:
# no contrastive, decrease kl
job_id = 36
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 100. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000000.
lambda_gene_expr_recon = 30000000.

In [4]:
# no contrastive, decrease kl
job_id = 41
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [7]:
# no contrastive, decrease kl, -5r3 - 6
job_id = 42
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [9]:
# no contrastive, -5r3 - 13, decrease kl
job_id = 43
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [18]:
# no contrastive, decrease kl
job_id = 44
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 100. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [20]:
# no contrastive, decrease kl, -5r3 - 6
job_id = 45
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 100. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [22]:
# no contrastive, -5r3 - 13, decrease kl
job_id = 46
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 100. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [12]:
# no contrastive, decrease kl
job_id = 47
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 50. # 0 100
lambda_l1_addon = 500. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [14]:
# no contrastive, decrease kl, -5r3 - 6
job_id = 48
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 50. # 0 100
lambda_l1_addon = 500. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [16]:
# no contrastive, -5r3 - 13, decrease kl
job_id = 49
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 50. # 0 100
lambda_l1_addon = 500. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [23]:
# Submit the training job of the NicheCompass reference model.
# Besides the values set here, this cell reads job_id, reference_batches,
# cat_covariates_embeds_nums, contrastive_logits_pos_ratio,
# lambda_cat_covariates_contrastive, lambda_l1_masked, lambda_l1_addon,
# active_gp_thresh_ratio, lambda_edge_recon and lambda_gene_expr_recon from
# the hyperparameter cell that was run before it.
dataset = "nanostring_cosmx_human_nsclc"
task = "reference"
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Hyperparameters that are the same for every run
# (reference_batches is set per run, e.g.
# "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8")
cat_covariates_keys = "batch fov patient"
species = "human"
node_label_method = "one-hop-norm"
n_neighbors = 4
n_sampled_neighbors = 4
edge_batch_size = 512
cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
cat_covariates_no_edges = "True False True"
contrastive_logits_neg_ratio = 0.

# One entry per command line option, in the order in which they are passed
# to the training script.
train_args = [
    # data
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    f"--n_neighbors {n_neighbors}",
    "--no-filter_genes",
    # gene programs
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    # keys
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--mapping_entity_key mapping_entity",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    # model
    "--n_addon_gp 100",
    f"--active_gp_thresh_ratio {active_gp_thresh_ratio}",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder 960",
    "--conv_layer_encoder gatv2conv",
    # training
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    f"--lambda_edge_recon {lambda_edge_recon}",
    f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    f"--lambda_l1_masked {lambda_l1_masked}",
    f"--lambda_l1_addon {lambda_l1_addon}",
    f"--edge_batch_size {edge_batch_size}",
    "--node_batch_size 256",
    f"--n_sampled_neighbors {n_sampled_neighbors}",
    f"--timestamp_suffix _{job_id}",
]
# The argument string has to start with a space because it is appended
# directly to the script name.
script_args = " " + " ".join(train_args)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    t="24:00:00",
    cores=8,
    mem=128,
    nice=10000,
)

Submitted batch job 14017568


#### 5.3.2 metrics

In [None]:
# Collect the labels (folder names) of all reference models whose stored
# AnnData object contains all 8 batches.
dataset = "nanostring_cosmx_human_nsclc"
model_type = "reference"
available_models = glob.glob(f"../artifacts/{dataset}/models/{model_type}/*")

models = []
for model in available_models:
    model_label = model.split("/")[-1]
    adata_file_path = f"{model}/{dataset}_{model_type}.h5ad"

    if os.path.exists(adata_file_path):
        model_adata = sc.read_h5ad(adata_file_path)
        # only embeddings with all samples
        n_batches = model_adata.obs.batch.nunique()
        if n_batches == 8:
            models.append(model_label)
            print(model_label)
        # Drop the reference before the next model is loaded
        del model_adata

    # Runs for every folder, also for those without a stored AnnData object
    gc.collect()

In [7]:
models

['05102023_213259_9',
 '12102023_161937_34',
 '05102023_213300_18',
 '30092023_022140_303',
 '06102023_193137_21',
 '12102023_161937_31',
 '05102023_213300_14',
 '06102023_193137_22',
 '06102023_193136_23',
 '12102023_161937_35',
 '05102023_213300_11',
 '12102023_161937_36',
 '30092023_022140_301',
 '12102023_161937_32',
 '12102023_161937_33',
 '05102023_213259_5',
 '06102023_173137_24',
 '05102023_213300_20',
 '05102023_213300_1',
 '05102023_213300_16',
 '06102023_193136_25',
 '06102023_193048_26']

In [7]:
# Submit one integration metrics job per model. Time limit, partition, cores
# and memory are not set here, so the defaults of `submit_python_script`
# apply.
task = "reference"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"
models = [
    "19102023_172844_41",
    "19102023_180356_44",
    "19102023_172844_47",
]

for model in models:
    # The model label doubles as job id and as the `--model` argument
    submit_python_script(
        job_name_prefix="metrics",
        job_id=model,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name="nsclc_integration_metrics.py",
        script_args=f" --model {model}",
    )

Submitted batch job 14022455
Submitted batch job 14022456
Submitted batch job 14022457


In [15]:
len(models)

22

#### 5.3.3 cluster and visualize 

In [9]:
# Submit one CPU job per reference run that clusters the latent space with
# the Leiden resolution below and visualizes the result.
runs = ["25", "27", "28", "29", "30", "1"]

# Identical for all runs
leiden_resolution = 0.5
task = "reference"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"

for run in runs:
    # The run suffix is used both as job id and to select the model
    cluster_args = (
        f" --model_type {task}"
        f" --suffix {run}"
        " --cluster"
        f" --leiden_resolution {leiden_resolution}"
    )
    submit_python_script(
        job_name_prefix="cluster_latent",
        job_id=run,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name="nsclc_visualize_and_cluster_models.py",
        script_args=cluster_args,
        t="24:00:00",
        p="cpu_p",
        qos="cpu_normal",
        cores="8",
        mem="64",
    )

Submitted batch job 14011278
Submitted batch job 14011279
Submitted batch job 14011280
Submitted batch job 14011281
Submitted batch job 14011282


In [9]:
# Submit one GPU job per reference query mapping run that clusters the latent
# space with the Leiden resolution below and visualizes the result.
#
# Run lists that were assigned earlier in this cell and overwritten before
# the loop (kept for reference only, they have no effect):
#   ['27_3', '28_3', '27_4', '28_8']
#   ['27_3', '28_3']
#   ['42_4', '43_8', '45_4', '46_8', '48_4', '49_8']
#   ['27_4', '28_8']
runs = ['42_3', '43_3', '45_3', '46_3', '48_3', '49_3']

# Identical for all runs, therefore set once instead of in every iteration
leiden_resolution = 0.5
task = 'reference_query'
model_type = 'reference_query_mapping'
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"

for run in runs:
    # The run suffix is used both as job id and to select the model
    submit_python_script(
        job_name_prefix='cluster_latent',
        job_id=run,
        script_folder_path=script_folder_path,
        job_folder_path=job_folder_path,
        script_name='nsclc_visualize_and_cluster_models.py',
        conda_env_name=conda_env_name,
        p='gpu_p',
        qos='gpu_normal',
        cores='8',
        mem='64',
        t="24:00:00",
        script_args=(
            f' --model_type {model_type}'
            f' --suffix {run}'
            ' --cluster'
            f' --leiden_resolution {leiden_resolution}'
        ),
    )

Submitted batch job 14030564
Submitted batch job 14030565
Submitted batch job 14030566
Submitted batch job 14030567
Submitted batch job 14030568
Submitted batch job 14030569


#### 5.3.4 query mapping
Each reference timestamp has two query jobs: run the first one, rename manually, then run the second one.

In [4]:
# Settings of the query mapping jobs for the reference models 27 and 28
dataset = "nanostring_cosmx_human_nsclc"
task = "reference_query"
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# One row per job; the columns are:
# job_id, timestamp, query, l1m, l1a, l_edge, l_gex
jobs = [
    ["27_3", "06102023_173136_27", "batch3", 0, 0, 50000000, 30000],
    ["27_4", "06102023_173136_27", "batch4", 0, 0, 50000000, 30000],
    ["28_3", "06102023_193135_28", "batch3", 0, 0, 50000000, 30000],
    ["28_8", "06102023_193135_28", "batch8", 0, 0, 50000000, 30000],
]

In [None]:
# no contrastive, decrease kl
job_id = 41
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 1000. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

# no contrastive, decrease kl
job_id = 44
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 0. # 0 100
lambda_l1_addon = 100. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

# no contrastive, decrease kl
job_id = 47
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8" 
cat_covariates_embeds_nums = "3 30 5" # "3 30 15"
contrastive_logits_pos_ratio = 0. # 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
lambda_cat_covariates_contrastive = 0.
lambda_l1_masked = 50. # 0 100
lambda_l1_addon = 500. # 0 100 1000
active_gp_thresh_ratio = 0.03 # 0.01
# decrease weight of KL divergence in the loss
lambda_edge_recon = 50000000.
lambda_gene_expr_recon = 30000.

In [10]:
# Settings of the query mapping jobs for the reference models 42 to 49
dataset = "nanostring_cosmx_human_nsclc"
task = "reference_query"
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# One row per job; the columns are:
# job_id, timestamp, query, l1m, l1a, l_edge, l_gex
jobs = [
    ["42_3", "19102023_172844_42", "batch3", 0, 1000, 50000000, 30000],
    ["42_4", "19102023_172844_42", "batch4", 0, 1000, 50000000, 30000],
    ["43_3", "19102023_172844_43", "batch3", 0, 1000, 50000000, 30000],
    ["43_8", "19102023_172844_43", "batch8", 0, 1000, 50000000, 30000],
    ["45_3", "19102023_180839_45", "batch3", 0, 100, 50000000, 30000],
    ["45_4", "19102023_180839_45", "batch4", 0, 100, 50000000, 30000],
    ["46_3", "19102023_184159_46", "batch3", 0, 100, 50000000, 30000],
    ["46_8", "19102023_184159_46", "batch8", 0, 100, 50000000, 30000],
    ["48_3", "19102023_172905_48", "batch3", 50, 500, 50000000, 30000],
    ["48_4", "19102023_172905_48", "batch4", 50, 500, 50000000, 30000],
    ["49_3", "19102023_172905_49", "batch3", 50, 500, 50000000, 30000],
    ["49_8", "19102023_172905_49", "batch8", 50, 500, 50000000, 30000],
]

In [11]:
# Submit one query mapping job per row of `jobs`. `dataset`, `task` and
# `jobs` are read from the cell that defines the job table.
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/lustre/groups/imm01/workspace/irene.bonafonte/Projects/2023May_nichecompass/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

for job in jobs:
    # Row layout: job[0] is the job id, job[1] the timestamp of the reference
    # model, job[2] the query batch, job[3] the masked L1 weight, job[5] the
    # edge reconstruction weight and job[6] the gene expression
    # reconstruction weight.
    # NOTE(review): job[4] (the addon L1 weight column) is not passed to the
    # script -- confirm whether that is intended.
    load_timestamp = job[1]
    query_batch = job[2]
    l1_masked_weight = job[3]
    edge_recon_weight = job[5]
    gene_expr_recon_weight = job[6]

    # One entry per command line option, in the order in which they are
    # passed to the mapping script.
    mapping_args = [
        f"--dataset {dataset}",
        f"--query_batches {query_batch}",
        "--n_neighbors 4",
        "--spatial_key spatial",
        "--mapping_entity_key mapping_entity",
        "--gp_names_key nichecompass_gp_names",
        "--reference_model_label reference",
        f"--load_timestamp {load_timestamp}",
        "--query_model_label query",
        "--reference_query_model_label reference_query_mapping",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        f"--lambda_edge_recon {edge_recon_weight}",
        f"--lambda_gene_expr_recon {gene_expr_recon_weight}",
        "--lambda_cat_covariates_contrastive 0.",
        "--contrastive_logits_pos_ratio 0.",
        "--contrastive_logits_neg_ratio 0.",
        "--lambda_group_lasso 0.",
        f"--lambda_l1_masked {l1_masked_weight}",
        "--edge_batch_size 512",
        "--node_batch_size 256",
        "--n_sampled_neighbors 4",
    ]
    # The argument string has to start with a space because it is appended
    # directly to the script name.
    script_args = " " + " ".join(mapping_args)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job[0],
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        t="24:00:00",
        cores="8",
        mem="128",
    )

Submitted batch job 14022892
Submitted batch job 14022893
Submitted batch job 14022894
Submitted batch job 14022895
Submitted batch job 14022896
Submitted batch job 14022897
Submitted batch job 14022898
Submitted batch job 14022899
Submitted batch job 14022900
Submitted batch job 14022901
Submitted batch job 14022902
Submitted batch job 14022903


In [None]:
# 27 5 & 6, 28 5 & 13, 29 5 & 6, 30 5 & 13

In [None]:
# Run configuration -- author's label: no contrastive, -5r3 - 6
job_id = 29

# Space-separated batch names, kept in a single string.
reference_batches = "batch1 batch2 batch5 batch6 batch7 batch8"

# Other value noted by the author: "3 30 15"
cat_covariates_embeds_nums = "3 30 5"

# Other value noted by the author: 0.01
active_gp_thresh_ratio = 0.03

# --- Contrastive settings (both zero for this run) ---
# Other values noted by the author: 0 0.0078125 (4) 0.015625 (8) 0.03125 (16)
contrastive_logits_pos_ratio = 0.0
lambda_cat_covariates_contrastive = 0.0

# --- L1 regularisation weights ---
# Other values noted by the author: 0 100
lambda_l1_masked = 0.0
# Other values noted by the author: 0 100 1000
lambda_l1_addon = 1000.0

# --- Reconstruction loss weights ---
# Author's note: decrease weight of KL divergence in the loss
lambda_edge_recon = 50_000_000.0
lambda_gene_expr_recon = 30_000.0

In [None]:
# Fixed settings for this run. Every value below is interpolated into the
# command-line argument string assembled in `script_args`.
cat_covariates_keys = "batch fov patient"
species = "human"
node_label_method = "one-hop-norm"
n_neighbors = 4
n_sampled_neighbors = 4
edge_batch_size = 512
# Other value noted by the author: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_no_edges = "True False True"
contrastive_logits_neg_ratio = 0.

# Command-line arguments for the training script. Each fragment starts with
# a space so the adjacent string literals concatenate into one well-formed
# argument list. `dataset`, `task`, `job_id`, `reference_batches` and the
# lambda_* / ratio values are expected to have been defined by earlier cells.
# Parentheses are used instead of trailing backslashes so that a stray blank
# line or trailing space cannot silently break the continuation.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 100"
    f" --active_gp_thresh_ratio {active_gp_thresh_ratio}"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder 960"
    " --conv_layer_encoder gatv2conv"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    f" --lambda_edge_recon {lambda_edge_recon}"
    f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    # Pass the dedicated add-on weight here. Previously this flag received
    # `lambda_l1_masked`, which left `lambda_l1_addon` unused.
    f" --lambda_l1_addon {lambda_l1_addon}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size 256"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
    f" --timestamp_suffix _{job_id}"
)

#### 5.3.4 GP test


In [7]:
# Submit the GP test script (differential_gp.py) for the reference model:
# one Slurm job per (run, Leiden resolution) pair.
runs = ['43']
leidens = [0.35]

# These settings are identical for every run, so they are set once up front.
task = 'reference'
model_type = 'reference'
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/lustre/groups/imm01/workspace/irene.bonafonte/Projects/"
    "2023May_nichecompass/nichecompass-reproducibility/"
    f"scripts/{task}"
)

for run, leiden in zip(runs, leidens):
    submit_python_script(
        job_name_prefix='gp_test',
        job_id=run,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name='differential_gp.py',
        # Each fragment starts with a space so the pieces join into one
        # argument string.
        script_args=(
            f' --model_type {model_type}'
            f' --suffix {run}'
            f' --leiden_resolution {leiden}'
        ),
        t="48:00:00",
        p='gpu_p',
        qos='gpu_normal',
        cores='8',
        mem='64',
    )

Submitted batch job 14126041


In [9]:
# Submit the GP test script (differential_gp.py) for the reference-query
# mapping model: one Slurm job per (run, Leiden resolution) pair.

# Full list of runs with their Leiden resolutions ...
runs = ['43_3', '43_8', '49_3', '49_8']
leidens = [0.35, 0.5, 0.4, 0.4]

# ... which is immediately overridden, so only this run is submitted.
runs = ['43_8']
leidens = [0.5]

# These settings are identical for every run, so they are set once up front.
task = 'reference_query'
model_type = 'reference_query_mapping'
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/lustre/groups/imm01/workspace/irene.bonafonte/Projects/"
    "2023May_nichecompass/nichecompass-reproducibility/"
    f"scripts/{task}"
)

for run, leiden in zip(runs, leidens):
    submit_python_script(
        job_name_prefix='gp_test',
        job_id=run,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name='differential_gp.py',
        # Each fragment starts with a space so the pieces join into one
        # argument string.
        script_args=(
            f' --model_type {model_type}'
            f' --suffix {run}'
            f' --leiden_resolution {leiden}'
        ),
        t="48:00:00",
        p='gpu_p',
        qos='gpu_normal',
        cores='8',
        mem='64',
    )

Submitted batch job 14126201
