# G-only KEGG-based network architecture

> Encode genotypes with a saved KEGG-based VNN-VAE and explore the resulting embedding with t-SNE.

In [None]:
import numpy as np
import pandas as pd

from EnvDL.core import ensure_dir_path_exists 
from EnvDL.dlfn import g2fc_datawrapper, BigDataset, plDNN_general
from EnvDL.dlfn import ResNet2d, BasicBlock2d
from EnvDL.dlfn import LSUV_

import torch
import torch.nn.functional as F # F.mse_loss
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger


from EnvDL.dlfn import kegg_connections_build, kegg_connections_clean, kegg_connections_append_y_hat, kegg_connections_sanitize_names
from EnvDL.dlfn import VNNHelper, VisableNeuralNetwork, Linear_block_reps
from EnvDL.dlfn import plDNN_general, BigDataset
from EnvDL.dlfn import reverse_edge_dict, reverse_node_props
from EnvDL.dlfn import VNNVAEHelper, plVNNVAE

In [None]:
# Directory holding the cached artifacts that the following cells read from
# (and write the embedding to).
cache_path = "../nbs_artifacts/02.41_g2fc_G_ACGT_VNN_vae_wide/"
# Prefix of the saved model's file name.
# NOTE(review): the prefix says 02.40 while the cache directory says 02.41 -- confirm this is intended.
save_prefix = 'vnn-02.40-vnnvae-wide'

In [None]:
import json

# Timestamp embedded in both cached file names; kept in one place so the
# lookup table and the serialized model are always requested as a pair.
run_timestamp = '2023-12-28-13-52-53'

json_path = cache_path+'lookup_dict__'+run_timestamp+'.json'

# Lookup table saved alongside the model. A later cell iterates over its keys
# to select and order the input tensors.
with open(json_path, 'r', encoding='utf-8') as f:
    restored_lookup_dict = json.load(f)

# Built from `save_prefix` rather than repeating the literal; the resulting
# path is identical to the previously hard-coded one.
pt_path = cache_path+save_prefix+'__'+run_timestamp+'.pt'
# NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load artifacts written by a trusted run.
VVH2 = torch.load(pt_path)
# VVH2.to('cuda')

In [None]:
# use_gpu_num = 0

# device = "cuda" if torch.cuda.is_available() else "cpu"
# if use_gpu_num in [0, 1]: 
#     torch.cuda.set_device(use_gpu_num)
# print(f"Using {device} device")

Using cuda device


In [None]:
# ensure_dir_path_exists(dir_path = cache_path)

## Fit Using VNNHelper

In [None]:

# Load the data wrapper and pull out the parsed KEGG entries that are filtered
# below into kegg_gene_brite.
X = g2fc_datawrapper()
X.set_split()
X.load_all(name_list = ['obs_geno_lookup', 'YMat', 'KEGG_slices',], store=True) 
# NOTE(review): presumably computes centering/scaling statistics for YMat from
# the training split -- confirm against g2fc_datawrapper.calc_cs.
X.calc_cs('YMat', version = 'np', filter = 'val:train')
ACGT_gene_slice_list =     X.get('KEGG_slices', ops_string='')
parsed_kegg_gene_entries = X.get('KEGG_entries')

# Restrict to entries that have a BRITE annotation with a non-empty path list
kegg_gene_brite = [
    e for e in parsed_kegg_gene_entries
    if 'BRITE' in e and e['BRITE']['BRITE_PATHS'] != []
]

n_kept = len(kegg_gene_brite)
n_total = len(parsed_kegg_gene_entries)
# Scale to percent *before* rounding. Rounding the fraction first and then
# multiplying by 100 can print float noise such as 56.99999999999999%.
pct_kept = round(100 * n_kept / n_total, 2)
print('Retaining '+str(pct_kept)+'%, '+str(n_kept)+'/'+str(n_total)+' Entries')
# kegg_gene_brite[1]['BRITE']['BRITE_PATHS']


Loading and storing default `phno`.
Retaining 43.53%, 6067/13939 Entries


In [None]:
# Build the edge dictionary from the retained BRITE entries, then pass it
# through the clean / append-y_hat / sanitize-names helpers in turn.
# n_genes follows the length of kegg_gene_brite instead of a hard-coded 6067,
# so it cannot go stale when the filter above changes.
# NOTE(review): assumes n_genes is the number of leading entries of
# kegg_gene_brite to process -- confirm against kegg_connections_build.
kegg_connections = kegg_connections_build(kegg_gene_brite = kegg_gene_brite, 
                                          n_genes = len(kegg_gene_brite))
kegg_connections = kegg_connections_clean(         kegg_connections = kegg_connections)
kegg_connections = kegg_connections_append_y_hat(  kegg_connections = kegg_connections)
# '.' in node names is replaced by '_'
kegg_connections = kegg_connections_sanitize_names(kegg_connections = kegg_connections, 
                                                   replace_chars = {'.':'_'})

  0%|          | 0/6067 [00:00<?, ?it/s]

100%|██████████| 6067/6067 [00:00<00:00, 55676.03it/s]

Removed node "Others"





In [None]:
# initialize helper for input nodes
myvnn = VNNHelper(edge_dict = kegg_connections)

# Get a mapping of brite names to tensor list index
find_names = myvnn.nodes_inp # e.g. ['100383860', '100278565', ... ]
# Set copy for O(1) membership tests; scanning the list for every entry was
# quadratic. Assumes the names are hashable (strings in the example above).
wanted_names = set(find_names)
lookup_dict = {}

# Indices are positions in the *full* parsed_kegg_gene_entries list, not in the
# filtered kegg_gene_brite subset. A later cell uses them to index the list
# returned for 'KEGG_slices'.
for i, entry in enumerate(parsed_kegg_gene_entries):
    # Skip entries without a BRITE annotation or with an empty path list
    if 'BRITE' not in entry:
        continue
    brite_paths = entry['BRITE']['BRITE_PATHS']
    if brite_paths == []:
        continue
    # Last element of the first BRITE path is the name matched against the
    # helper's input nodes
    name = brite_paths[0][-1]
    if name in wanted_names:
        # A repeated name keeps the index of its last occurrence
        lookup_dict[name] = i


In [None]:
# Fetch every KEGG slice through the wrapper's 'asarray from_numpy float' ops
all_slices = X.get('KEGG_slices', ops_string='asarray from_numpy float')

# Keep only the slices named in restored_lookup_dict, in that dict's key order
# (presumably the input order the saved model was built with -- confirm)
vals = [all_slices[lookup_dict[gene]] for gene in restored_lookup_dict]
# Drop the reference to the full list, as the original rebinding of `vals` did
del all_slices
# vals = [vals[lookup_dict[i]] for i in myvnn.nodes_inp]
# send to gpu
# vals = [val.to('cuda') for val in vals]

In [None]:
# Run the restored object's encoder without tracking gradients
with torch.no_grad():
    res = VVH2.encode(vals)

# Convert the encoding to a NumPy array ...
res = torch.Tensor(res).numpy()
# ... and save it next to the model file, swapping '.pt' for '__emb.npy'
np.save(file=pt_path.replace('.pt', '__emb.npy'), arr=res)

In [None]:
from sklearn.manifold import TSNE

# Fixed seed for the 2-D projection. With init='random' and no random_state,
# every run produced a different layout, while a later cell draws a hand-tuned
# boundary line through the layout and so needs it to be stable.
tsne_seed = 0

X_embedded = TSNE(n_components=2, learning_rate='auto',
                  init='random', perplexity=3,
                  random_state=tsne_seed
                  ).fit_transform(res)
X_embedded.shape

(4926, 2)

In [None]:
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
# Pass the two t-SNE coordinates explicitly. Given only the bare 2-D array,
# Plotly Express treats it as wide-form data and draws each column against the
# row index instead of dimension 1 against dimension 2.
px.scatter(x=X_embedded[:, 0], y=X_embedded[:, 1],
           labels={'x': 'x1', 'y': 'x2'})

In [None]:
# 'phno' table from the data wrapper (presumably the phenotype records);
# the following cells read its 'Hybrid' and 'Geno_Idx' columns.
labels = X.get("phno")

In [None]:
# Put the t-SNE coordinates side by side with the hybrid names.
# The names are the unique (Hybrid, Geno_Idx) pairs ordered by Geno_Idx.
# NOTE(review): assumes row k of X_embedded belongs to the k-th Geno_Idx and
# that both frames have the same number of rows -- confirm.
df_embedded = pd.concat(
    [
        pd.DataFrame(X_embedded, columns=['x1', 'x2']),
        (
            labels[['Hybrid', 'Geno_Idx']]
            .drop_duplicates()
            .sort_values('Geno_Idx')
            .reset_index(drop=True)
            [['Hybrid']]
        ),
    ],
    axis=1,
)

In [None]:
# Split 'Hybrid' on the first '/' into its two parent names
df_embedded[['p1', 'p2'] ] = df_embedded.Hybrid.str.split('/', n=1, expand=True)

# Boundary line x2 = 1.5*x1 + 2 in the embedding plane.
# NOTE(review): these constants look hand-picked for one particular t-SNE
# layout -- re-check them whenever the embedding is recomputed.
boundary_slope, boundary_intercept = 1.5, 2

# Vectorised test of which points fall below the line. The previous version
# rebuilt list(df_embedded[...]) on every iteration, which is quadratic.
# Cast to float64 so the comparison uses the same precision as before.
x1 = df_embedded['x1'].astype('float64')
x2 = df_embedded['x2'].astype('float64')
below_boundary = boundary_slope*x1 + boundary_intercept > x2

# Keep the labels as the *strings* 'True'/'False': the summary cell that
# follows selects pivoted columns by exactly those names.
df_embedded['bool'] = below_boundary.map({True: 'True', False: 'False'})

px.scatter(
    df_embedded,
    x = 'x1', y = 'x2', hover_data = ['Hybrid'],
    color = 'bool')

In [None]:
# Rows per (side of the boundary, parent 1, parent 2) combination
cross_counts = (
    df_embedded.loc[:, ['p1', 'p2', 'bool']]
    .assign(n = 1)
    .groupby(['bool', 'p1', 'p2'])
    .count()
    .reset_index()
)

# Stack the p1 and p2 roles so each parent is tallied regardless of its
# position in the cross, then tally per (side, parent).
# NOTE(review): .count() counts rows of cross_counts and ignores the value of
# `n`; this equals a sum only while every cross occurs once -- confirm intent.
parent_counts = pd.concat([
    cross_counts.loc[:, ['bool', 'p1', 'n']],
    cross_counts.loc[:, ['bool', 'p2', 'n']].rename(columns={'p2':'p1'})
    ]).groupby(['bool', 'p1']).count().reset_index()

# Share of each parent within its side: n divided by that side's total
side_totals = parent_counts.groupby('bool').agg(total = ('n', 'sum')).reset_index()
parent_share = parent_counts.merge(side_totals)
parent_share['avg'] = parent_share.n/parent_share.total

# One row per parent with the 'False' and 'True' shares as columns
temp = parent_share.pivot(index='p1', columns='bool', values='avg').reset_index()

# filter to parents in both
# .copy() makes the filtered frame independent, so adding a column below does
# not write into a slice of the pivoted frame.
temp = temp.loc[(temp['False'].notna() & temp['True'].notna()), ].copy()

# Order by difference in representation
temp['abs_diff'] = (temp['False'] - temp['True']).abs()
temp = temp.sort_values('abs_diff', ascending=False).reset_index(drop=True)
temp.head(20)

bool,p1,False,True,abs_diff
0,LH244,0.195804,0.047355,0.148449
1,PHZ51,0.017483,0.069203,0.051721
2,LH185,0.055944,0.022266,0.033678
3,PHP02,0.006993,0.039829,0.032836
4,PHB47,0.01049,0.029375,0.018885
5,PHK76,0.020979,0.039515,0.018536
6,LH198,0.006993,0.023312,0.016319
7,LH195,0.06993,0.082061,0.012131
8,CG110,0.01049,0.000209,0.01028
9,PHT69,0.048951,0.058645,0.009694


In [None]:
# First 20 parents of the table, already sorted by descending abs_diff
px.scatter(temp.head(20), x = 'p1', y = 'abs_diff')

In [None]:
# Parents to highlight: the four with the largest abs_diff in the table above
search_strs = [
    'LH244',
    'PHZ51',
    'LH185',
    'PHP02']


# Flag hybrids that have at least one parent in search_strs.
# This overwrites the 'True'/'False' string labels in 'bool' with real
# booleans, as the original cell did. The vectorised isin replaces a
# comprehension that rebuilt list(df_embedded[...]) on every iteration.
df_embedded['bool'] = (
    df_embedded['p1'].isin(search_strs) | df_embedded['p2'].isin(search_strs)
)

px.scatter(
    df_embedded,
    x = 'x1', y = 'x2', hover_data = ['Hybrid'],
    color = 'bool')