# G-only KEGG-based network architecture

> 

In [None]:
import numpy as np
import pandas as pd

from EnvDL.core import ensure_dir_path_exists 
from EnvDL.dlfn import g2fc_datawrapper, BigDataset, plDNN_general
from EnvDL.dlfn import ResNet2d, BasicBlock2d
from EnvDL.dlfn import LSUV_

import torch
import torch.nn.functional as F # F.mse_loss
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger

from EnvDL.dlfn import kegg_connections_build, kegg_connections_clean, kegg_connections_append_y_hat, kegg_connections_sanitize_names
from EnvDL.dlfn import VNNHelper, VisableNeuralNetwork, Linear_block_reps
from EnvDL.dlfn import plDNN_general, BigDataset
from EnvDL.dlfn import reverse_edge_dict, reverse_node_props
from EnvDL.dlfn import VNNVAEHelper, plVNNVAE
from EnvDL.dlfn import kegg_connections_build, kegg_connections_clean, kegg_connections_append_y_hat, kegg_connections_sanitize_names
from EnvDL.dlfn import VNNHelper, VisableNeuralNetwork, Linear_block_reps
from EnvDL.dlfn import ListDataset, plVNN
from EnvDL.dlfn import plDNN_general, BigDataset

In [None]:
# Where this notebook writes its artifacts; the last non-empty path component
# doubles as the run prefix used for logger / file names.
cache_path = '../nbs_artifacts/02.40_g2fc_G_ACGT_VNN_baseline_rx/'
save_prefix = list(filter(None, cache_path.split('/')))[-1]

# Run settings:
max_epoch, batch_size = 2, 48

# VNN settings, one value per node group (input, edge, output).
# Output width of each node's block:
default_out_nodes_inp, default_out_nodes_edge, default_out_nodes_out = 4, 32, 1

# Dropout value per node group:
default_drop_nodes_inp, default_drop_nodes_edge, default_drop_nodes_out = 0.0, 0.0, 0.0

# Number of block repetitions per node group:
default_reps_nodes_inp, default_reps_nodes_edge, default_reps_nodes_out = 1, 1, 1

In [None]:
# Index of the GPU to make current when CUDA is available (only 0 or 1 is honoured).
use_gpu_num = 0

device = "cuda" if torch.cuda.is_available() else "cpu"
# Only touch the CUDA runtime when a GPU is actually present; calling
# torch.cuda.set_device on a CPU-only host raises instead of falling back.
if device == "cuda" and use_gpu_num in [0, 1]:
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

Using cuda device


In [None]:
# Prepare the artifact directory configured in `cache_path` before anything is written to it.
# NOTE(review): presumably creates the directory when missing (inferred from the helper's name) — confirm in EnvDL.core.
ensure_dir_path_exists(dir_path = cache_path)

## Fit Using VNNHelper

In [None]:

# Load the data needed for the G-only model and build `kegg_gene_brite`,
# the subset of KEGG entries that carry a non-empty BRITE path.
X = g2fc_datawrapper()
X.set_split()
X.load_all(name_list = ['obs_geno_lookup', 'YMat', 'KEGG_slices',], store=True)
# NOTE(review): presumably centers/scales YMat using only the training split — confirm in g2fc_datawrapper.
X.calc_cs('YMat', version = 'np', filter = 'val:train')
ACGT_gene_slice_list =     X.get('KEGG_slices', ops_string='')
parsed_kegg_gene_entries = X.get('KEGG_entries')


# Restrict to only those with pathway
kegg_gene_brite = [e for e in parsed_kegg_gene_entries if 'BRITE' in e]

# also require to have a non-empty path
kegg_gene_brite = [e for e in kegg_gene_brite if e['BRITE']['BRITE_PATHS'] != []]

# Report the retained fraction. Scale to percent *before* rounding so the
# printed value cannot pick up float artifacts (e.g. 0.57*100 -> 56.99999999999999),
# and tolerate an empty entry list instead of dividing by zero.
n_retained = len(kegg_gene_brite)
n_entries = len(parsed_kegg_gene_entries)
pct_retained = round(100 * n_retained / n_entries, 2) if n_entries else 0.0
print('Retaining '+ str(pct_retained)+'%, '+str(n_retained)+'/'+str(n_entries)+' Entries')
# kegg_gene_brite[1]['BRITE']['BRITE_PATHS']

Loading and storing default `phno`.


Retaining 43.53%, 6067/13939 Entries


In [None]:
# Build the KEGG BRITE edge dictionary, then clean it, attach the output node,
# and replace '.' with '_' in node names.
# `n_genes` is derived from the filtered list rather than hard-coded so the cell
# stays correct if the upstream filtering (and hence the entry count) changes.
# NOTE(review): assumes n_genes is meant to cover every entry in kegg_gene_brite — confirm in kegg_connections_build.
kegg_connections = kegg_connections_build(kegg_gene_brite = kegg_gene_brite,
                                          n_genes = len(kegg_gene_brite))
kegg_connections = kegg_connections_clean(         kegg_connections = kegg_connections)
kegg_connections = kegg_connections_append_y_hat(  kegg_connections = kegg_connections)
kegg_connections = kegg_connections_sanitize_names(kegg_connections = kegg_connections,
                                                   replace_chars = {'.':'_'})

  0%|          | 0/6067 [00:00<?, ?it/s]

100%|██████████| 6067/6067 [00:00<00:00, 56952.52it/s]

Removed node "Others"





In [None]:
# initialize helper for input nodes
myvnn = VNNHelper(edge_dict = kegg_connections)

# Map each input-node name (e.g. '100383860') to the position of its entry in
# `parsed_kegg_gene_entries`; that position is later used to index the per-gene tensors.
# A set makes the membership test O(1) instead of scanning the node list per entry.
find_names = set(myvnn.nodes_inp)
lookup_dict = {}

# Unlike brite_node_to_list_idx_dict (built in a later cell from the filtered
# kegg_gene_brite subset), the indices here refer to the full, unfiltered entry list.
for i, entry in enumerate(parsed_kegg_gene_entries):
    # Skip entries with no BRITE annotation or an empty BRITE path.
    if 'BRITE' not in entry or entry['BRITE']['BRITE_PATHS'] == []:
        continue
    # The node name is the last element of the first BRITE path.
    name = entry['BRITE']['BRITE_PATHS'][0][-1]
    if name in find_names:
        lookup_dict[name] = i

# Show only a small preview; dumping the full mapping floods the notebook output.
dict(list(lookup_dict.items())[:10])

{'100278565': 0,
 '100383860': 1,
 '100383837': 3,
 '100191673': 8,
 '100275685': 9,
 '103630585': 13,
 '100194370': 16,
 '100194192': 17,
 '100273289': 18,
 '100037826': 19,
 '100192899': 21,
 '100304252': 26,
 '100280063': 28,
 '103630746': 30,
 '100282726': 31,
 '100285519': 33,
 '100272424': 35,
 '100381824': 36,
 '100277985': 40,
 '100283343': 42,
 '100284607': 45,
 '100191772': 48,
 '100277385': 49,
 '100381530': 50,
 '103630860': 51,
 '100384769': 52,
 '103630917': 55,
 '100194239': 56,
 '103644387': 57,
 '103631010': 58,
 '103631056': 63,
 '100191905': 64,
 '100281199': 68,
 '100283731': 70,
 '103631177': 72,
 '100281329': 76,
 '100274981': 78,
 '100382167': 86,
 '100383301': 87,
 '100285254': 90,
 '100280817': 91,
 '542672': 92,
 '542290': 93,
 '103631462': 94,
 '100283425': 96,
 '103631568': 97,
 '100280768': 98,
 '103631596': 102,
 '542230': 105,
 '103631647': 106,
 '100383383': 107,
 '100274307': 111,
 '100273485': 112,
 '100191592': 113,
 '100273170': 114,
 '100384051': 11

In [None]:
# Permute gene identities: every input node keeps its name but is pointed at the
# tensor index of a randomly chosen input node. Seeded so the shuffle is repeatable.
torch.manual_seed(5461)

inp_nodes = myvnn.nodes_inp
original_slots = torch.tensor([lookup_dict[node] for node in inp_nodes])
shuffled_slots = original_slots[torch.randperm(original_slots.shape[0])].tolist()
permuted_by_node = dict(zip(inp_nodes, shuffled_slots))

# Nodes without a permuted slot fall back to their existing index.
lookup_dict = {
    node: permuted_by_node.get(node, slot)
    for node, slot in lookup_dict.items()
}

In [None]:
# Name of each retained BRITE gene node -> its position in kegg_gene_brite.
brite_node_to_list_idx_dict = {
    str(entry['BRITE']['BRITE_PATHS'][0][-1]): i
    for i, entry in enumerate(kegg_gene_brite)
}

# Get the input sizes for the graph: for every input node, the product of all
# dimensions after the first of the tensor that `lookup_dict` assigns to it.
# Materialised as a list (not a one-shot zip iterator) so the cell that consumes
# it can be re-run without silently receiving an exhausted iterator.
size_in_zip = list(zip(
    myvnn.nodes_inp,
    [np.prod(ACGT_gene_slice_list[lookup_dict[e]].shape[1:]) for e in myvnn.nodes_inp],
))


In [None]:

# Input width of each input node, taken from the sizes computed earlier.
myvnn.set_node_props(key = 'inp', node_val_zip = size_in_zip)

# Every remaining property is a single value broadcast over one node group.
# Each row is (property key, node-group attribute on myvnn, value); rows are
# applied top to bottom, and the group is looked up at call time.
node_prop_plan = [
    # output sizes
    ('out',     'nodes_inp',  default_out_nodes_inp),
    ('out',     'nodes_edge', default_out_nodes_edge),
    ('out',     'nodes_out',  default_out_nodes_out),
    # input nodes are flagged for flattening
    ('flatten', 'nodes_inp',  True),
    # block repetitions
    ('reps',    'nodes_inp',  default_reps_nodes_inp),
    ('reps',    'nodes_edge', default_reps_nodes_edge),
    ('reps',    'nodes_out',  default_reps_nodes_out),
    # dropout
    ('drop',    'nodes_inp',  default_drop_nodes_inp),
    ('drop',    'nodes_edge', default_drop_nodes_edge),
    ('drop',    'nodes_out',  default_drop_nodes_out),
]
for prop_key, group_attr, prop_val in node_prop_plan:
    group_nodes = getattr(myvnn, group_attr)
    myvnn.set_node_props(
        key = prop_key,
        node_val_zip = zip(group_nodes, [prop_val] * len(group_nodes)),
    )

# With node outputs fixed, derive the input size of the edge nodes.
myvnn.calc_edge_inp()

# myvnn.mk_digraph(include = ['node_name', 'inp_size', 'out_size'])
# myvnn.mk_digraph(include = [''])

In [None]:
# Fetch the per-gene KEGG slices again, this time with conversion ops applied.
# NOTE(review): the ops string presumably yields float torch tensors (asarray -> from_numpy -> float) — confirm in g2fc_datawrapper.get.
# The next cell overwrites `vals` with a reordered subset, so re-run this cell before re-running that one.
vals = X.get('KEGG_slices', ops_string='asarray from_numpy float')

In [None]:
# restrict to the tensors that will be used, ordered like myvnn.nodes_inp
vals = [vals[lookup_dict[i]] for i in myvnn.nodes_inp]
# Move to the device selected at the top of the notebook instead of a hard-coded
# 'cuda', so the choice is made in one place and a CPU-only host does not fail here.
vals = [val.to(device) for val in vals]

In [None]:
# The tensor list now holds exactly one tensor per input node, in node order,
# so each node simply maps to its own position in myvnn.nodes_inp.
new_lookup_dict = {
    node_name: position
    for position, node_name in enumerate(myvnn.nodes_inp)
}

In [None]:
# Assemble the network from the helper's node properties and graph structure.
# `node_to_inp_num_dict` tells each input node which position of the tensor list to read.
model = VisableNeuralNetwork(
    node_props = myvnn.node_props,
    Linear_block = Linear_block_reps,
    edge_dict = myvnn.edge_dict,
    dependancy_order = myvnn.dependancy_order,
    node_to_inp_num_dict = new_lookup_dict
)
# Use the device chosen at the top of the notebook rather than a hard-coded 'cuda'.
model = model.to(device)
# # with torch.no_grad(): print(model(vals))

In [None]:
# # if randomizing y
# torch.manual_seed(2608434)

# y_trn = X.get('YMat', ops_string='cs filter:val:train asarray from_numpy float')
# y_trn = y_trn[torch.randperm(y_trn.shape[0])]


# y_val = X.get('YMat', ops_string='cs filter:val:train asarray from_numpy float')
# y_val = y_val[torch.randperm(y_val.shape[0])]


In [None]:

def _make_split_loader(split, shuffle):
    """Build the DataLoader for one data split.

    split   -- split key understood by `X.get`, e.g. 'val:train' or 'val:test'
    shuffle -- forwarded to DataLoader
    Reads `X`, `vals` and `batch_size` from the notebook namespace.
    NOTE(review): targets are requested on 'cuda:0' regardless of `use_gpu_num` — confirm this is intended.
    """
    split_dataset = BigDataset(
        lookups_are_filtered = True,
        lookup_obs =  X.get(split,             ops_string='                   asarray from_numpy'),
        lookup_geno = X.get('obs_geno_lookup', ops_string='   filter:' + split + ' asarray from_numpy'),
        y =           X.get('YMat',            ops_string='cs filter:' + split + ' asarray from_numpy float cuda:0'),
        G =           vals,
        G_type = 'list',
        # send_batch_to_gpu = 'cuda:0'
    )
    return DataLoader(split_dataset, batch_size = batch_size, shuffle = shuffle)


# Training batches are reshuffled; validation keeps a fixed order.
training_dataloader   = _make_split_loader('val:train', shuffle = True)
validation_dataloader = _make_split_loader('val:test',  shuffle = False)


In [None]:
# LSUV_(model, data = next(iter(training_dataloader))[1])

Applying orthogonal init (zero init if dim < 2) to params in 8868 module(s).
Applying LSUV to 8868 module(s) (up to 10 iters per module):
Module  0 after  2 itr(s) | Mean: -0.002 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  1 after  2 itr(s) | Mean: -0.140 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  2 after  2 itr(s) | Mean: -0.124 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  3 after  2 itr(s) | Mean: -0.154 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  4 after  2 itr(s) | Mean:  0.174 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  5 after  2 itr(s) | Mean: -0.386 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  6 after  2 itr(s) | Mean: -0.199 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  7 after  2 itr(s) | Mean:  0.315 | Std: 1.000 | <class 'torch.nn.modules.linear.Linear'>
Module  8 after  2 itr(s) | Mean: -0.344 | Std: 1.000 | <class 'torch.nn.modules.linea

In [None]:
# Wrap the network in its Lightning module.
VNN = plDNN_general(model)
optimizer = VNN.configure_optimizers()

# TensorBoard run name marks this as the run without LSUV initialisation.
run_name = f"{save_prefix}_no_LSUV"
logger = TensorBoardLogger("tb_vnn_logs", name=run_name)

trainer = pl.Trainer(
    max_epochs=max_epoch,
    logger=logger,
)
trainer.fit(
    model=VNN,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)


/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/kickd/miniconda3/envs/fastai/lib/python3.11/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: tb_vnn_logs/02.40_g2fc_G_ACGT_VNN_baseline_rx
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
  loss = F.mse_loss(pred, y_i)
/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  loss = F.mse_loss(pred, y_i)


  loss = F.mse_loss(pred, y_i)


Validation: |          | 0/? [00:00<?, ?it/s]

  loss = F.mse_loss(pred, y_i)


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [None]:
# import time, json
# save_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

# json_path = cache_path+''.join(['lookup_dict','__'+save_time,'.json'])
# with open(json_path, 'w', encoding='utf-8') as f: 
#     json.dump(new_lookup_dict, f, ensure_ascii=False, indent=4)    

# pt_path = cache_path+''.join([save_prefix,'__'+save_time,'.pt'])

# torch.save(VNN.mod, pt_path)

: 