In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import torch

sc.settings.verbosity = 0

In [2]:
import GenKI as gk
from GenKI.preprocesing import build_adata
from GenKI.dataLoader import DataLoader
from GenKI.train import VGAE_trainer
from GenKI import utils

%load_ext autoreload
%autoreload 2

In [3]:
# subset data as an example

adata = build_adata("../data/adata.h5ad", scale_data=True)
adata
target_cells = 6800
adata.n_obs/target_cells
adata_1 = adata[np.random.choice(adata.obs_names, size=target_cells, replace=True)]


load counts from ../data/adata.h5ad


In [4]:
# load data

data_wrapper = DataLoader(
                adata_1, # adata object
                target_gene = ['LYAR', 'SSRP1', 'PTPN14', 'TP53', 'YY1'], # KO gene name
                target_cell = None, # obsname for cell type, if none use all
                obs_label = "ident", # colname for genes
                GRN_file_dir = "GRNs", # folder name for GRNs
                rebuild_GRN = True, # whether build GRN by pcNet
                pcNet_name = "grNet", # GRN file name
                verbose = True, # whether verbose
                n_cpus = 8, # multiprocessing
                )

data_wt = data_wrapper.load_data()
data_ko = data_wrapper.load_kodata()

use all the cells (6800) in adata
build GRN


2024-11-08 09:19:21,428	INFO worker.py:1816 -- Started a local Ray instance.


ray init, using 8 CPUs


OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 192.168.149.71, ID: 27213e26c57aaaf0c660834942260c20191b1388746f89b7ded5a91e) where the task (task ID: 32889d5baaef152b21d34d60398f4f43eec9347501000000, name=pc_net_parallel, pid=11814, memory used=0.12GB) was running was 5.38GB / 5.65GB (0.952387), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: a7fdad37598c34cd203fe0d3f781f151c1aebbfde821b8b6342b4b8e) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 192.168.149.71`. To see the logs of the worker, use `ray logs worker-a7fdad37598c34cd203fe0d3f781f151c1aebbfde821b8b6342b4b8e*out -ip 192.168.149.71. Top 10 memory users:
PID	MEM(GB)	COMMAND
11543	3.09	/home/zero/miniforge3/bin/python3.12 -m ipykernel_launcher -f /home/zero/.local/share/jupyter/runtim...
2065	0.32	/usr/lib/kf6/baloo_file_extractor
11814	0.12	ray::pc_net_parallel
11615	0.11	/home/zero/miniforge3/lib/python3.12/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ra...
11754	0.10	/home/zero/miniforge3/bin/python3.12 -u /home/zero/miniforge3/lib/python3.12/site-packages/ray/_priv...
11674	0.08	/home/zero/miniforge3/bin/python3.12 -u /home/zero/miniforge3/lib/python3.12/site-packages/ray/autos...
11726	0.07	/home/zero/miniforge3/lib/python3.12/site-packages/ray/core/src/ray/raylet/raylet --raylet_socket_na...
11675	0.06	/home/zero/miniforge3/bin/python3.12 /home/zero/miniforge3/lib/python3.12/site-packages/ray/dashboar...
11497	0.06	/opt/brave-bin/brave --type=renderer --string-annotations --crashpad-handler-pid=2218 --enable-crash...
11792	0.05	/home/zero/miniforge3/bin/python3.12 -u /home/zero/miniforge3/lib/python3.12/site-packages/ray/dashb...
Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.

[33m(raylet)[0m [2024-11-08 09:20:21,367 E 11726 11726] (raylet) node_manager.cc:3069: 8 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 27213e26c57aaaf0c660834942260c20191b1388746f89b7ded5a91e, IP: 192.168.149.71) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 192.168.149.71`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


In [None]:
# init trainer

hyperparams = {"epochs": 100, 
               "lr": 7e-4, 
               "beta": 1e-4, 
               "seed": 8096}
log_dir=None 

sensei = VGAE_trainer(data_wt, 
                     epochs=hyperparams["epochs"], 
                     lr=hyperparams["lr"], 
                     log_dir=log_dir, 
                     beta=hyperparams["beta"],
                     seed=hyperparams["seed"],
                     verbose=False,
                     )

In [None]:
# %%timeit

sensei.train()

In [None]:
# save model

sensei.save_model('adata_genki')

In [None]:
# get distance between wt and ko

z_mu_wt, z_std_wt = sensei.get_latent_vars(data_wt)
z_mu_ko, z_std_ko = sensei.get_latent_vars(data_ko)
dis = gk.utils.get_distance(z_mu_ko, z_std_ko, z_mu_wt, z_std_wt, by="KL")
print(dis.shape)

In [None]:
# raw ranked gene list

res_raw = utils.get_generank(data_wt, dis, rank=True)
res_raw.head()

In [None]:
# if permutation test

null = sensei.pmt(data_ko, n=100, by="KL")
res = utils.get_generank(data_wt, dis, null,)

# save_significant_as = 'gene_list_example')
res