In [31]:
import numpy as np
import pandas as pd
import pickle
import math
import sys
sys.path.append('..')

import torch
import torch.nn as nn
from torch.functional import F

import torch_geometric
from torch_geometric.data import Data, DataLoader, Dataset
from torch_geometric.utils import add_self_loops, degree, to_dense_adj,remove_self_loops
import torch_geometric.transforms as T

import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm.notebook import tqdm

from models.graph_transformer.euclidean_graph_transformer import GraphTransformerEncoder
from models.deep_graph_infomax.infomax import DeepGraphInfomax
from models.graph_transformer.autoencoder_base import DeepSNEM, LinearDecoder, FermiDiracDecoder
from utils.data_gen import ucsv2graph, SNDatasetAuto, load_prot_embs, ucsv2graph_infomax, PositionalEmbedding

import re
import gc

torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

# Load a sample graph negative_sampling

In [32]:
unique_prots = 'data/prot_embeddings/new_features/proteins.csv'
unique_df = pd.read_csv(unique_prots)
global_dict = {}

for idx, prot in enumerate(unique_df.proteins.to_numpy()):
    global_dict[prot] = idx
    
def calc_pos_mat(emb_dim):
    posemb = PositionalEmbedding(emb_dim)
    pos_dists = [i for i in np.arange(0, 30)]
    pos_mat = torch.zeros(60,60)
    for i in pos_dists:
        for j in pos_dists:
            ipos = posemb(torch.tensor([np.float(i)]).cuda())[0][0]
            jpos = posemb(torch.tensor([np.float(j)]).cuda())[0][0]
            pos_mat[int(2*i),int(2*j)] = torch.matmul(ipos, jpos.T)
    pos_mat = torch.sqrt(pos_mat)
    return pos_mat

pos_mat = calc_pos_mat(512)

In [33]:
unweighted_fnames = 'data/graph_info_df/samples_all.csv'
u_fnames = pd.read_csv(unweighted_fnames)
u_path_list = u_fnames.path_list.to_numpy()
usample = u_path_list[10]

unweighted_total = '../snac_data/file_info.csv'
u_total = pd.read_csv(unweighted_total)

weighted_fnames = '../snac_data/file_info_weighted.csv'
w_fnames = pd.read_csv(weighted_fnames)
w_path_list = w_fnames.files_weighted.to_numpy()
wsample = w_path_list[2]

dup_fnames = 'data/graph_info_df/samples_for_dupl.csv'
dup_fnames = pd.read_csv(dup_fnames)

In [34]:
data = ucsv2graph_infomax(usample, global_dict, pos_mat)

In [35]:
dup_fnames

Unnamed: 0.1,Unnamed: 0,path_list,embed_id
0,0,graphs_combined/CPC005_A375_24H_BRD-K27305650-...,CPC005_A375_24H_BRD-K27305650-003-01-4_10_emb_1
1,1,graphs_combined/CPC005_A375_24H_BRD-K27305650-...,CPC005_A375_24H_BRD-K27305650-003-01-4_10_emb_10
2,2,graphs_combined/CPC005_A375_24H_BRD-K27305650-...,CPC005_A375_24H_BRD-K27305650-003-01-4_10_emb_100
3,3,graphs_combined/CPC005_A375_24H_BRD-K27305650-...,CPC005_A375_24H_BRD-K27305650-003-01-4_10_emb_11
4,4,graphs_combined/CPC005_A375_24H_BRD-K27305650-...,CPC005_A375_24H_BRD-K27305650-003-01-4_10_emb_12
...,...,...,...
663,663,graphs_combined/RAD001_HCC515_6H_BRD-A19037878...,RAD001_HCC515_6H_BRD-A19037878_10_emb_13
664,664,graphs_combined/RAD001_HCC515_6H_BRD-A19037878...,RAD001_HCC515_6H_BRD-A19037878_10_emb_14
665,665,graphs_combined/RAD001_HCC515_6H_BRD-A19037878...,RAD001_HCC515_6H_BRD-A19037878_10_emb_15
666,666,graphs_combined/RAD001_HCC515_6H_BRD-A19037878...,RAD001_HCC515_6H_BRD-A19037878_10_emb_16


# Create the appropriate data loader

In [5]:
#X, val = train_test_split(u_path_list, test_size=0.3)

X_path = 'data/graph_info_df/all_pairs3_train_graphs.csv'
X = pd.read_csv(X_path)
X = X.x.to_numpy()

val_path = 'data/graph_info_df/val_set_1.csv'
val = pd.read_csv(val_path)
val = val.graphs.to_numpy()

test_path = 'data/graph_info_df/test_set.csv'
test = pd.read_csv(test_path)
test = test.graphs.to_numpy()

samples_all_path = 'data/graph_info_df/samples_all.csv'
samples_all = pd.read_csv(samples_all_path)
samples_all = samples_all.path_list.to_numpy()

train_data = SNDatasetAuto(X, global_dict)
val_data = SNDatasetAuto(val, global_dict)
pred_data = SNDatasetAuto(test, global_dict)
samples_all_data = SNDatasetAuto(samples_all, global_dict)

train_loader = DataLoader(train_data, batch_size=1, num_workers=12, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=1, num_workers=12)
pred_loader = DataLoader(pred_data, batch_size=1, num_workers=12)
samples_all_loader = DataLoader(samples_all_data, batch_size=1, num_workers=12)

# Test the Graph Transformer

In [6]:
SIZE = 512
EMB_DIM = 512

prot_embs = load_prot_embs(SIZE, norm=False)
encoder = GraphTransformerEncoder(n_layers=1, n_heads=4, n_hid=EMB_DIM, pretrained_weights=prot_embs[0])
decoder = LinearDecoder(emb_dim=EMB_DIM, original_dim=SIZE)

In [7]:
dev = torch.device('cuda')
#autoenc = DeepSNEM(encoder, decoder).to(dev)

prot_embs = load_prot_embs(512, norm=False)
enc = GraphTransformerEncoder(n_layers=2, n_heads=4, n_hid=512, 
                            pretrained_weights=prot_embs[0]).to(dev)

model = DeepGraphInfomax(hidden_channels=512, encoder=enc,
                                     summary= lambda z, *args, **kwargs: z.mean(dim=0)).to(dev)

In [15]:
#model.load_state_dict(torch.load('embeddings/deep_graph_infomax/dgi_512_tl_1.pt'))

In [37]:
summ = model(data.to(dev))

In [17]:
@torch.no_grad()
def emb_csv(model, loader):
    model.eval()
    embeddings = np.zeros((len(u_path_list), 1024))
    
    idx = 0
    for graph in tqdm(loader):
        embeddings[idx] = model.encode(graph.to(dev)).sum(0).cpu().numpy()
        idx += 1
        
    return embeddings

@torch.no_grad()
def emb_infomax(model, loader):
    model.eval()
    embeddings = np.zeros((len(u_path_list), 512))
    
    idx = 0
    for graph in tqdm(loader):
        embeddings[idx] = model(graph.to(dev))[2].detach().cpu().numpy()
        idx += 1
        
    return embeddings

In [18]:
embs = emb_infomax(model, samples_all_loader)

HBox(children=(FloatProgress(value=0.0, max=69170.0), HTML(value='')))




AttributeError: 'Batch' object has no attribute 'seq_mat'

In [None]:
upl_f = lambda x : re.sub('graphs_combined/','', x)
upl_f_csv = lambda x : re.sub('.csv','', x)
upl_f_emb = lambda x : re.sub('/graph','_emb',x)

upl = [*map(upl_f, samples_all)]
upl = [*map(upl_f_csv, upl)]
upl = [*map(upl_f_emb, upl)]
upl = np.array(upl)

In [None]:
cols = ['emb',*map(str, *[range(512)])]

In [None]:
df1 = pd.DataFrame(upl)
df2 = pd.DataFrame(embs)

In [None]:
df = pd.concat([df1, df2], axis=1)
df.columns = cols

In [None]:
#df.to_csv('embeddings/autoencoder_graph/gt_1024_tl_1_leaky_relu_sum.csv')
df.to_csv('embeddings/deep_graph_infomax/dgi_512_tl_1.csv')