In [1]:
import numpy as np
import pandas as pd
import pickle
import math
import sys
sys.path.append('..')

import torch
import torch.nn as nn
from torch.functional import F

import torch_geometric
from torch_geometric.data import Data, DataLoader, Dataset
from torch_geometric.utils import from_networkx, to_networkx, to_dense_adj, degree
import torch_geometric.transforms as T

import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm

from models.graph_transformer.euclidean_graph_transformer import GraphTransformerEncoder, PostEncoding
from models.graph_transformer.autoencoder_base import DeepSNEM, LinearDecoder
from utils.data_gen import ucsv2graph, SNDatasetAuto, load_prot_embs

import re
import gc

# Release GPU memory that PyTorch's caching allocator holds but no tensor uses.
torch.cuda.empty_cache()
# Let cuDNN benchmark its available algorithms and pick one per input configuration.
torch.backends.cudnn.benchmark = True

# Load a sample graph

In [4]:
# Protein name -> global index lookup that is handed to the graph loaders below.
unique_prots = '../data/prot_embeddings/new_features/proteins.csv'
unique_df = pd.read_csv(unique_prots)

# A protein's global index is its row position in the `proteins` column.
global_dict = {
    protein: position
    for position, protein in enumerate(unique_df['proteins'].to_numpy())
}

def create_ugraph(sample, global_dict, weighted=False, data_dir='../snac_data/'):
    """Read one edge-list CSV and return it as a networkx graph and a PyG ``Data``.

    The CSV must provide ``node1``, ``node2`` and ``sign`` columns; ``sign`` is
    kept as the only edge attribute. Every node whose name is a key of
    ``global_dict`` receives the matching value as its ``global_idx`` node
    attribute.

    Args:
        sample: Path of the CSV file, relative to ``data_dir``.
        global_dict: Mapping from node name to its global index.
        weighted: If True the edge list is read as a directed graph
            (``nx.DiGraph``); otherwise as an undirected ``nx.Graph``.
        data_dir: Directory that ``sample`` is resolved against. Defaults to
            the previously hard-coded ``'../snac_data/'`` so existing calls
            behave as before.

    Returns:
        Tuple ``(G, data)``: the networkx graph and ``from_networkx(G)``.
    """
    import os  # local import: the notebook's import cell does not provide it

    df = pd.read_csv(os.path.join(data_dir, sample))

    # The flag only decides between a directed and an undirected graph; the
    # columns and the edge attribute are the same in both cases.
    graph_cls = nx.DiGraph if weighted else nx.Graph
    G = nx.from_pandas_edgelist(df, source='node1', target='node2',
                                edge_attr=['sign'], create_using=graph_cls())

    nx.set_node_attributes(G, global_dict, 'global_idx')

    # edge_index is left exactly as from_networkx produces it; it is not
    # remapped to the global indices stored in `global_idx`.
    data = from_networkx(G)

    return G, data

def to_categorical(y, num_classes):
    """One-hot encode integer class labels.

    Row ``i`` of the ``num_classes`` x ``num_classes`` uint8 identity matrix is
    the one-hot vector of class ``i``, so indexing that matrix with ``y``
    returns the encoding of every label in ``y``.
    """
    one_hot_table = np.identity(num_classes, dtype=np.uint8)
    return one_hot_table[y]

In [7]:
# Listing of all sample graph paths; one entry is picked as the sample to inspect.
unweighted_fnames = '../data/graph_info_df/samples_all.csv'
u_fnames = pd.read_csv(unweighted_fnames)
u_path_list = u_fnames.path_list.to_numpy()
usample = u_path_list[30]

# Listing of the weighted graph files; note it is read from two directories up.
weighted_fnames = '../../snac_data/file_info_weighted.csv'
w_fnames = pd.read_csv(weighted_fnames)
w_path_list = w_fnames.files_weighted.to_numpy()
wsample = w_path_list[2]

# NOTE(review): create_ugraph resolves the sample against '../snac_data/' by
# default, while the weighted listing above is read from '../../snac_data/'.
# The FileNotFoundError recorded under this cell suggests the two roots
# disagree -- confirm which one is correct.
# NOTE(review): `weighted` is left at its default (False) for a weighted
# sample -- confirm this is intended.
_, graph = create_ugraph(wsample, global_dict)

FileNotFoundError: [Errno 2] File ../snac_data/graphs_weighted/AML001_CD34_24H_BRD-K16300438_10/graph.csv does not exist: '../snac_data/graphs_weighted/AML001_CD34_24H_BRD-K16300438_10/graph.csv'

In [4]:
# Load the selected sample through the project loader.
data = ucsv2graph(usample, global_dict=global_dict)
# Dense adjacency that carries the edge `sign` attribute in its trailing dimension.
adj = to_dense_adj(data.edge_index, edge_attr=data.sign)
adj.shape

torch.Size([1, 62, 62, 2])

# Create the appropriate data loader

In [9]:
# Earlier random split, kept for reference:
#X, val = train_test_split(u_path_list, test_size=0.3)

# Each split is read from one column of its own CSV file.
X_path = '../data/graph_info_df/all_pairs3_train_graphs.csv'
X = pd.read_csv(X_path)['x'].to_numpy()

val_path = '../data/graph_info_df/val_set_1.csv'
val = pd.read_csv(val_path)['graphs'].to_numpy()

test_path = '../data/graph_info_df/test_set.csv'
test = pd.read_csv(test_path)['graphs'].to_numpy()

# Wrap every split in the project dataset, sharing the global protein index.
train_data, val_data, pred_data = (
    SNDatasetAuto(paths, global_dict) for paths in (X, val, test)
)

# One graph per batch everywhere; only the training loader pins host memory.
train_loader = DataLoader(train_data, batch_size=1, num_workers=12, pin_memory=True)
# pred_loader and test_loader are two separate loaders over the same pred_data.
val_loader, pred_loader, test_loader = (
    DataLoader(dataset, batch_size=1, num_workers=12)
    for dataset in (val_data, pred_data, pred_data)
)

# Test the Graph Transformer

In [10]:
# SIZE is passed to the embedding loader and used as the decoder's original_dim;
# EMB_DIM is used as the encoder's n_hid and the decoder's emb_dim.
SIZE = 1024
EMB_DIM = 1024

# NOTE(review): the FileNotFoundError recorded under this cell names
# 'data/prot_embeddings/new_features/proteins.csv' (no '../' prefix), unlike the
# '../data/...' paths used elsewhere in this notebook. load_prot_embs appears to
# resolve its path against the working directory -- confirm.
prot_embs = load_prot_embs(SIZE, norm=False)
encoder = GraphTransformerEncoder(n_layers=1, n_heads=4, n_hid=EMB_DIM, pretrained_weights=prot_embs)
decoder = LinearDecoder(emb_dim=EMB_DIM, original_dim=SIZE)

FileNotFoundError: [Errno 2] File data/prot_embeddings/new_features/proteins.csv does not exist: 'data/prot_embeddings/new_features/proteins.csv'

In [7]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
autoenc = DeepSNEM(encoder, decoder).to(dev)

In [8]:
# map_location puts the checkpoint tensors on `dev` regardless of the device
# they were saved from, so loading does not depend on the training machine.
autoenc.load_state_dict(torch.load('../gt_1024_tl_1_leaky_relu.pt', map_location=dev))

<All keys matched successfully>

In [9]:
# Run the encoder once and pool its output two ways over dim 0 (presumably the
# node dimension) instead of paying for two identical forward passes.
# NOTE(review): no `.eval()` call is visible before this cell, so if the encoder
# contains dropout these values are stochastic -- confirm.
node_embs = autoenc.encoder(data.to(dev))
enc_sum = node_embs.sum(0)
enc_mean = node_embs.mean(0)

In [10]:
# Display the sum-pooled encoder output of the sample graph.
enc_sum

tensor([-1.5340, -1.6266, -1.9134,  ...,  8.9540, 17.2479, 14.8186],
       device='cuda:0', grad_fn=<SumBackward1>)

In [11]:
@torch.no_grad()
def emb_csv(model, loader):
    """Encode every batch of ``loader`` and return one sum-pooled row per batch.

    The model is switched to eval mode. For each batch, ``model.encode`` is
    called on the batch moved to the notebook-level device ``dev`` and the
    result is summed over dim 0. A row is therefore the embedding of a single
    graph only when the loader yields one graph per batch (``batch_size=1``).

    Args:
        model: Object exposing ``eval()`` and ``encode(batch)``.
        loader: Sized iterable of batches that support ``.to(device)``.

    Returns:
        float64 ``np.ndarray`` whose first dimension is ``len(loader)`` and
        whose remaining dimensions are those of one pooled output, with rows in
        iteration order. An empty loader yields an array of shape ``(0, 0)``.
    """
    model.eval()

    # Allocated on the first batch so that both the row count and the width
    # come from the loader and model actually passed in, rather than from an
    # unrelated notebook global and a hard-coded 1024.
    embeddings = None
    for row, graph in enumerate(tqdm(loader)):
        pooled = model.encode(graph.to(dev)).sum(0).cpu().numpy()
        if embeddings is None:
            embeddings = np.zeros((len(loader),) + pooled.shape)
        embeddings[row] = pooled

    if embeddings is None:
        return np.zeros((0, 0))
    return embeddings

In [12]:
# Embed every graph served by pred_loader (the loader built from the test-set paths).
embs = emb_csv(autoenc, pred_loader)

HBox(children=(FloatProgress(value=0.0, max=69170.0), HTML(value='')))




In [13]:
# Turn each sample path into an embedding name, e.g.
# 'graphs_combined/NAME/graph.csv' -> 'NAME_emb'. The three substitutions are
# applied in the same order as before.
def upl_f(x):
    """Drop every 'graphs_combined/' occurrence from the path."""
    return re.sub('graphs_combined/', '', x)


def upl_f_csv(x):
    """Drop every literal '.csv' from the path."""
    # The dot is escaped: unescaped it matches any character, so something
    # like '_csv' inside a sample name would have been removed as well.
    return re.sub(r'\.csv', '', x)


def upl_f_emb(x):
    """Replace every '/graph' with '_emb'."""
    return re.sub('/graph', '_emb', x)


upl = np.array([upl_f_emb(upl_f_csv(upl_f(path))) for path in u_path_list])

In [14]:
# Header row: the name column followed by one stringified index per embedding dimension.
cols = ['emb'] + [str(dim) for dim in range(1024)]

In [15]:
# df1 holds the cleaned embedding names, df2 the embedding values (one row per array row).
df1 = pd.DataFrame(upl)
df2 = pd.DataFrame(embs)

In [16]:
# Place the name column next to the embedding columns, then label all columns.
# NOTE(review): df1's rows come from u_path_list (samples_all.csv) while df2's
# rows follow pred_loader (test_set.csv). This positional concat assumes both
# list the same graphs in the same order -- confirm.
df = pd.concat([df1, df2], axis=1)
df.columns = cols

In [17]:
# Write the named embeddings to the working directory; to_csv also writes the
# DataFrame index as the first column by default.
df.to_csv('gt_1024_tl_1_leaky_relu_sum.csv')

In [18]:
# Look up the encoder's embedding layer at the sample graph's global indices.
autoenc.encoder.emb_layer(data.global_idx.to(dev))

tensor([[ 7.4923e-02,  2.9273e-02, -7.0417e-02,  ..., -1.1095e-01,
         -1.1349e-01,  5.2326e-05],
        [ 4.8571e-03,  3.3093e-02, -9.5274e-02,  ..., -7.3341e-02,
          3.9810e-02, -7.8793e-02],
        [-8.0703e-02,  8.7508e-03, -1.3737e-01,  ...,  5.4448e-03,
         -6.4654e-02, -5.8495e-02],
        ...,
        [ 3.5466e-02,  3.0272e-02, -1.3990e-01,  ..., -1.6121e-01,
          2.1723e-02, -1.5147e-01],
        [ 4.9526e-02, -2.5785e-02, -1.4737e-01,  ..., -1.2276e-01,
          4.0719e-02, -1.1648e-01],
        [ 2.2043e-02,  2.2198e-02, -8.6441e-02,  ..., -7.5116e-02,
          4.1186e-02,  1.0438e-02]], device='cuda:0',
       grad_fn=<EmbeddingBackward>)

In [19]:
# Display what DeepSNEM.encode returns for the sample graph.
autoenc.encode(data.to(dev))

tensor([[-0.0299, -0.0463, -0.0749,  ...,  0.1469,  0.3684,  0.2623],
        [-0.0051, -0.0062, -0.0463,  ...,  0.1392,  0.4092,  0.2838],
        [-0.0354, -0.0427, -0.0718,  ...,  0.1512,  0.3832,  0.2253],
        ...,
        [-0.0308, -0.0196, -0.0186,  ...,  0.1238,  0.2252,  0.2401],
        [-0.0273, -0.0261, -0.0068,  ...,  0.1427,  0.2194,  0.2415],
        [-0.0260, -0.0227, -0.0029,  ...,  0.1567,  0.2310,  0.2486]],
       device='cuda:0', grad_fn=<LeakyReluBackward0>)