# G-only, KEGG-based network architecture

> 

In [None]:
import numpy as np
import pandas as pd

from EnvDL.core import ensure_dir_path_exists 
from EnvDL.dlfn import g2fc_datawrapper, BigDataset, plDNN_general
from EnvDL.dlfn import ResNet2d, BasicBlock2d
from EnvDL.dlfn import LSUV_

import torch
import torch.nn.functional as F # F.mse_loss
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger


from EnvDL.dlfn import kegg_connections_build, kegg_connections_clean, kegg_connections_append_y_hat, kegg_connections_sanitize_names
from EnvDL.dlfn import VNNHelper, VisableNeuralNetwork, Linear_block_reps
from EnvDL.dlfn import plDNN_general, BigDataset
from EnvDL.dlfn import reverse_edge_dict, reverse_node_props
from EnvDL.dlfn import VNNVAEHelper, plVNNVAE

In [None]:
# Output locations: cache_path is the artifact directory for this notebook and
# save_prefix names both the TensorBoard run and the saved model file.
# NOTE(review): cache_path says "02.41" while save_prefix says "02.40" — confirm which is intended.
cache_path = '../nbs_artifacts/02.41_g2fc_G_ACGT_VNN_vae_wide/'
save_prefix = "vnn-02.40-vnnvae-wide"

# Run settings: passed to pl.Trainer(max_epochs=...) and to both DataLoaders.
max_epoch  = 80
batch_size = 48

# VNN settings: one default per node group (input / edge / output nodes).
# `out` values are used as each node's 'out' property.
default_out_nodes_inp   = 4
default_out_nodes_edge  = 32
default_out_nodes_out   = 512

# `drop` values are used as each node's 'drop' property (0.0 for every group here).
default_drop_nodes_inp  = 0.0
default_drop_nodes_edge = 0.0
default_drop_nodes_out  = 0.0

# `reps` values are used as each node's 'reps' property.
default_reps_nodes_inp  = 1
default_reps_nodes_edge = 1
default_reps_nodes_out  = 1


# VAE settings: both latent sizes are tied to the output-node width and are
# handed to VNNVAEHelper as latent_inp_size / latent_size.
default_latent_inp_size = default_out_nodes_out
default_latent_size     = default_out_nodes_out

In [None]:
# Select the compute device.
# `use_gpu_num` picks which CUDA device becomes the current one; any value other
# than 0 or 1 leaves the current device untouched.
use_gpu_num = 0

device = "cuda" if torch.cuda.is_available() else "cpu"
# Only touch the CUDA runtime when a GPU is actually present: calling
# `torch.cuda.set_device` without one raises instead of honouring the cpu fallback.
if device == "cuda" and use_gpu_num in [0, 1]:
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

Using cuda device


In [None]:
# Make sure the artifact directory is in place before anything is written into it
# (the lookup JSON and the saved model are both written under cache_path later on).
# Presumably creates the directory when missing — confirm against EnvDL.core.
ensure_dir_path_exists(dir_path = cache_path)

## Fit Using VNNHelper

In [None]:

# Load the data wrapper and pull out the per-gene tensors plus the parsed KEGG entries.
X = g2fc_datawrapper()
X.set_split()
X.load_all(name_list = ['obs_geno_lookup', 'YMat', 'KEGG_slices',], store=True)
# Presumably computes centering/scaling statistics for YMat from the training split
# only ('val:train'); the 'cs' op used when building the dataloaders relies on it — confirm.
X.calc_cs('YMat', version = 'np', filter = 'val:train')
ACGT_gene_slice_list =     X.get('KEGG_slices', ops_string='')
parsed_kegg_gene_entries = X.get('KEGG_entries')


# Keep only entries that carry a BRITE annotation *and* a non-empty list of BRITE paths.
kegg_gene_brite = [
    e for e in parsed_kegg_gene_entries
    if 'BRITE' in e and e['BRITE']['BRITE_PATHS'] != []
]

n_kept, n_total = len(kegg_gene_brite), len(parsed_kegg_gene_entries)
# Scale to percent *before* rounding: `round(x, 4)*100` can reintroduce float noise
# (e.g. round(0.07, 4)*100 == 7.000000000000001).
print(f'Retaining {round(100 * n_kept / n_total, 2)}%, {n_kept}/{n_total} Entries')
# Example of a single entry's paths: kegg_gene_brite[1]['BRITE']['BRITE_PATHS']

Loading and storing default `phno`.
Retaining 43.53%, 6067/13939 Entries


In [None]:
# Build the edge dictionary for the KEGG BRITE graph, then clean it, attach the
# 'y_hat' node, and sanitize node names ('.' -> '_').
# n_genes is derived from the filtered list instead of the hard-coded 6067 so the
# cell stays correct if the BRITE filter ever keeps a different number of entries.
kegg_connections = kegg_connections_build(kegg_gene_brite = kegg_gene_brite, 
                                          n_genes = len(kegg_gene_brite))
kegg_connections = kegg_connections_clean(         kegg_connections = kegg_connections)
kegg_connections = kegg_connections_append_y_hat(  kegg_connections = kegg_connections)
kegg_connections = kegg_connections_sanitize_names(kegg_connections = kegg_connections, 
                                                   replace_chars = {'.':'_'})

100%|██████████| 6067/6067 [00:00<00:00, 59925.54it/s]




Removed node "Others"


In [None]:
# initialize helper for input nodes
myvnn = VNNHelper(edge_dict = kegg_connections)

# Map each input node's BRITE name to its index in the *full* entry list
# (parsed_kegg_gene_entries); that index is what ACGT_gene_slice_list is looked up with below.
find_names = myvnn.nodes_inp # e.g. ['100383860', '100278565', ... ]
input_name_set = set(find_names)  # set membership avoids rescanning the list for every entry

lookup_dict = {}
for i, entry in enumerate(parsed_kegg_gene_entries):
    # skip entries without a BRITE annotation or with no BRITE path
    if 'BRITE' not in entry or entry['BRITE']['BRITE_PATHS'] == []:
        continue
    name = entry['BRITE']['BRITE_PATHS'][0][-1]
    if name in input_name_set:
        lookup_dict[name] = i

# Same idea as lookup_dict, but indexed against the filtered subset (kegg_gene_brite)
# rather than the full entry list. Not referenced again in the visible notebook.
brite_node_to_list_idx_dict = {
    str(entry['BRITE']['BRITE_PATHS'][0][-1]): i
    for i, entry in enumerate(kegg_gene_brite)
}

# Get the input sizes for the graph: product of every dimension after the first
# of each input node's tensor.
size_in_zip = zip(
    myvnn.nodes_inp,
    [np.prod(ACGT_gene_slice_list[lookup_dict[e]].shape[1:]) for e in myvnn.nodes_inp])

# init input node sizes
myvnn.set_node_props(key = 'inp', node_val_zip = size_in_zip)


def _set_group_defaults(vnn, key, group_defaults):
    """Set node property `key` to one constant value per node group.

    `group_defaults` is an iterable of (nodes, value) pairs; groups are applied in
    the order given, one `set_node_props` call per group.
    """
    for nodes, value in group_defaults:
        vnn.set_node_props(key = key, node_val_zip = zip(nodes, [value for _ in nodes]))


# init node output sizes
_set_group_defaults(myvnn, 'out', [
    (myvnn.nodes_inp,  default_out_nodes_inp),
    (myvnn.nodes_edge, default_out_nodes_edge),
    (myvnn.nodes_out,  default_out_nodes_out),
])

# options should be controlled by node_props: input nodes are marked 'flatten'
_set_group_defaults(myvnn, 'flatten', [(myvnn.nodes_inp, True)])

# init block repetitions
_set_group_defaults(myvnn, 'reps', [
    (myvnn.nodes_inp,  default_reps_nodes_inp),
    (myvnn.nodes_edge, default_reps_nodes_edge),
    (myvnn.nodes_out,  default_reps_nodes_out),
])

# init dropout
_set_group_defaults(myvnn, 'drop', [
    (myvnn.nodes_inp,  default_drop_nodes_inp),
    (myvnn.nodes_edge, default_drop_nodes_edge),
    (myvnn.nodes_out,  default_drop_nodes_out),
])

# init edge node input size (propagate forward input/edge outputs)
myvnn.calc_edge_inp()

# Optional visual check of the graph:
# myvnn.mk_digraph(include = ['node_name', 'inp_size', 'out_size'])

In [None]:
# Build the list of per-gene input tensors, ordered like myvnn.nodes_inp.
all_kegg_slices = X.get('KEGG_slices', ops_string='asarray from_numpy float')
# Keep only the tensors that feed an input node and move them to the device chosen
# in the device cell (rather than a hard-coded 'cuda').
# NOTE(review): the y tensors in the dataloader cell are still pinned to 'cuda:0'
# through their ops_string, so a CPU-only run needs that changed as well.
vals = [all_kegg_slices[lookup_dict[name]].to(device) for name in myvnn.nodes_inp]
del all_kegg_slices  # drop the extra reference to the unfiltered list

In [None]:
# Re-key the lookup so each input node maps to its position in myvnn.nodes_inp,
# i.e. its index in the filtered tensor list rather than in the full entry list.
new_lookup_dict = {
    node_name: position
    for position, node_name in enumerate(myvnn.nodes_inp)
}

In [None]:
# Forward (encoder-side) network built from the graph description held by myvnn.
# node_to_inp_num_dict maps each input node name to its position in the input
# tensor list (new_lookup_dict is built from enumerate-order of myvnn.nodes_inp).
model_vnn = VisableNeuralNetwork(
    node_props = myvnn.node_props,
    Linear_block = Linear_block_reps,
    edge_dict = myvnn.edge_dict,
    dependancy_order = myvnn.dependancy_order,
    node_to_inp_num_dict = new_lookup_dict
)

In [None]:
# Build the mirrored graph description used for the decoder side.
# reverse the edges in the edge dict (and deduplicate one to many relationships)
kegg_connections_reversed = reverse_edge_dict(edge_dict = kegg_connections)

# use existing graph to define properties of new one (switch inputs/outputs)
# conversion_dict swaps the 'out' and 'inp' keys; 'flatten' is mapped to ''
# (presumably dropping it for the reversed graph — confirm in reverse_node_props).
prop_dict_reversed = reverse_node_props(
    prop_dict = myvnn.node_props, 
    conversion_dict = {'out':'inp',
                        'inp':'out',
                    'flatten':''})

# use the VNNHelper class to setup the connections but then pass in all the nodes' properties directly
myvnn_rev = VNNHelper(edge_dict = kegg_connections_reversed)
# node_props must be assigned before calc_edge_inp(), which is run on the helper
# once the reversed properties are in place.
myvnn_rev.node_props = prop_dict_reversed
myvnn_rev.calc_edge_inp()
# myvnn_rev.mk_digraph(include = ['node_name', 'inp_size', 'out_size'])

In [None]:
# now the reversed version (decoder side), built from myvnn_rev
# 'y_hat' is registered as input number 0 — presumably the single entry point of
# the reversed graph; confirm against VisableNeuralNetwork.
model_vnn_reverse = VisableNeuralNetwork(
    node_props = myvnn_rev.node_props,
    Linear_block = Linear_block_reps,
    edge_dict = myvnn_rev.edge_dict,
    dependancy_order = myvnn_rev.dependancy_order,
    node_to_inp_num_dict = {'y_hat': 0},
    return_dict = True # With multiple outputs a dictionary will automatically be returned.
)

In [None]:
def _make_split_dataloader(split, shuffle):
    """Build a DataLoader over the observations belonging to one data split.

    Parameters
    ----------
    split : str
        Split key understood by `X.get`, e.g. 'val:train' or 'val:test'. It is
        used both as the name of the observation lookup and as the
        `filter:<split>` op applied to 'obs_geno_lookup' and 'YMat'.
    shuffle : bool
        Forwarded to `DataLoader(shuffle=...)`.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader over a `BigDataset` that shares the module-level `vals` tensor
        list and uses the module-level `batch_size`.
    """
    dataset = BigDataset(
        lookups_are_filtered = True,
        lookup_obs =  X.get(split,             ops_string='                   asarray from_numpy'),
        lookup_geno = X.get('obs_geno_lookup', ops_string=f'   filter:{split} asarray from_numpy'),
        y =           X.get('YMat',            ops_string=f'cs filter:{split} asarray from_numpy float cuda:0'),
        G =           vals,
        G_type = 'list',
        # send_batch_to_gpu = 'cuda:0'
    )
    return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle)


# Training batches are reshuffled; test batches keep a fixed order.
training_dataloader   = _make_split_dataloader('val:train', shuffle = True)
validation_dataloader = _make_split_dataloader('val:test',  shuffle = False)

In [None]:
# Combine the forward VNN (encoder) and the reversed VNN (decoder) into one VAE module.
VVH = VNNVAEHelper(
    latent_size = default_latent_size,
    latent_inp_size = default_latent_inp_size,
    decoder = model_vnn_reverse,
    encoder = model_vnn,
)

# # Initialization with LSUV seems to cause infs to be predicted, so it is left disabled.
# # Kept for reference: step through and init the fc components too
# LSUV_(VVH.encoder.to('cuda'), data = next(iter(training_dataloader))[1])
# x_enc = VVH.encoder.to('cuda')( next(iter(training_dataloader))[1] )
# LSUV_(VVH.fc_mu.to('cuda'),      data = x_enc)
# LSUV_(VVH.fc_log_var.to('cuda'), data = x_enc)
# mu, log_var = VVH.reparam(x_enc)
# _, _, z = VVH.sample(mu, log_var)
# LSUV_(VVH.fc_reverse.to('cuda'), data = z)
# LSUV_(VVH.decoder.to('cuda'), data = [VVH.reparam_rev(z)])

# Quick smoke test of a forward pass:
# VVH(next(iter(training_dataloader))[1])

# Lightning wrapper around the VAE helper.
plVVH = plVNNVAE(VVH, lookup_dict = new_lookup_dict)

# Runs are logged under tb_vnnvae_logs/<save_prefix>.
logger = TensorBoardLogger("tb_vnnvae_logs", name = save_prefix)
trainer = pl.Trainer(logger = logger, max_epochs = max_epoch)

# Per the Lightning log captured below this cell, the module defines no
# `validation_step`, so the validation loader is accepted but the val loop is skipped.
trainer.fit(
    model = plVVH,
    train_dataloaders = training_dataloader,
    val_dataloaders = validation_dataloader,
)

/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/kickd/miniconda3/envs/fastai/lib/python3.11/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/pytorch/trainer/configuration_validator.py:72: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/tor

Training: |          | 0/? [00:00<?, ?it/s]

/home/kickd/miniconda3/envs/fastai/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


: 

In [None]:
import json
import os
import time

# One timestamp shared by both artifacts so the lookup and the model can be paired later.
save_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

# os.path.join keeps the paths valid whether or not cache_path ends with a separator.
json_path = os.path.join(cache_path, f'lookup_dict__{save_time}.json')
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(new_lookup_dict, f, ensure_ascii=False, indent=4)

pt_path = os.path.join(cache_path, f'{save_prefix}__{save_time}.pt')

# Saves the wrapped VNNVAEHelper object itself (not a state_dict), so reloading
# needs the same class definitions to be importable.
torch.save(plVVH.VNNVAEHelper, pt_path)

In [None]:
# Optional sanity check (disabled): confirm that a re-loaded vnnvae can be trained further.
# Uncommenting this rebinds plVVH, logger and trainer.
# NOTE(review): recent PyTorch releases default `torch.load` to `weights_only=True`,
# which may reject a fully pickled module — confirm and pass `weights_only=False` if needed.
# VVH2 = torch.load(pt_path)

# plVVH = plVNNVAE(VVH2, lookup_dict = new_lookup_dict)

# logger = pl.loggers.TensorBoardLogger("tb_vnnvae_logs", name=save_prefix)
# trainer = pl.Trainer(max_epochs=4, logger=logger)

# trainer.fit(model=plVVH, train_dataloaders=training_dataloader, val_dataloaders=validation_dataloader)