In [1]:
import numpy as np
import pandas as pd
import pickle
import math
import sys
sys.path.append('..')

import torch
import torch.nn as nn
from torch.functional import F

import torch_geometric
from torch_geometric.data import Data, DataLoader, Dataset
from torch_geometric.utils import add_self_loops, degree, to_dense_adj,remove_self_loops
import torch_geometric.transforms as T

import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm.notebook import tqdm

from models.graph_transformer.euclidean_graph_transformer import GraphTransformerEncoder
from models.deep_graph_infomax.infomax import DeepGraphInfomax
from models.graph_transformer.autoencoder_base import DeepSNEM, LinearDecoder, FermiDiracDecoder
from utils.data_gen import SNLDataset, load_prot_embs, wcsv2graph, load_prot_embs_go, SNDatasetInfomax

import re
import gc

# Ask PyTorch to release cached GPU memory that is currently unused.
torch.cuda.empty_cache()
# Let cuDNN benchmark the available algorithms and pick one automatically.
torch.backends.cudnn.benchmark = True
# Last expression of the cell, so its value is shown as the cell output.
torch.cuda.is_available()

True

# Load a sample graph (negative sampling)

In [2]:
# Protein name -> integer index lookup. A protein's index is its row
# position in proteins.csv.
unique_prots = 'data/prot_embeddings/new_features/proteins.csv'
unique_df = pd.read_csv(unique_prots)
global_dict = {
    prot: idx
    for idx, prot in enumerate(unique_df['proteins'].to_numpy())
}

In [3]:
# Unweighted sample graphs: read the path list and derive a cell id per path.
unweighted_fnames = 'data/graph_info_df/samples_all.csv'
u_fnames = pd.read_csv(unweighted_fnames)
u_path_list = u_fnames['path_list'].to_numpy()

# The cell id is the third '_'-separated token of each graph path.
us_cellid = [re.split('_', graph_path)[2] for graph_path in u_path_list]
u_fnames['cell_id'] = us_cellid
cellid = np.array(us_cellid)
usample = u_path_list[85]  # a single example path picked by position

# Table read from file_info.csv.
unweighted_total = '../snac_data/file_info.csv'
u_total = pd.read_csv(unweighted_total)

# Weighted graphs: path list plus a single example path.
weighted_fnames = '../snac_data/file_info_weighted.csv'
w_fnames = pd.read_csv(weighted_fnames)
w_path_list = w_fnames['files_weighted'].to_numpy()
wsample = w_path_list[2]

# Table read from samples_for_dupl.csv.
dup_fnames = pd.read_csv('data/graph_info_df/samples_for_dupl.csv')

# Create the appropriate data loader

In [4]:
# Graph path lists for each split come from precomputed CSV files.
# (A random split was used previously:
#  X, val = train_test_split(u_path_list, test_size=0.3))
X_path = 'data/graph_info_df/all_pairs3_train_graphs.csv'
X = pd.read_csv(X_path)['x'].to_numpy()

val_path = 'data/graph_info_df/val_set_1.csv'
val = pd.read_csv(val_path)['graphs'].to_numpy()

test_path = 'data/graph_info_df/test_set.csv'
test = pd.read_csv(test_path)['graphs'].to_numpy()

samples_all_path = 'data/graph_info_df/samples_all.csv'
samples_all = pd.read_csv(samples_all_path)['path_list'].to_numpy()

# Short aliases for the weighted / unweighted path lists.
pl = w_path_list
plu = u_path_list

# Datasets first, then their loaders (same construction order as before).
# The third SNLDataset argument is an all-zero array shaped like `pl`.
train_loader = SNLDataset(pl, global_dict, np.zeros_like(pl))
samples_all_data = SNDatasetInfomax(samples_all, global_dict)

train_loader = DataLoader(
    train_loader,
    batch_size=1,
    num_workers=12,
    pin_memory=True,
)
samples_all_loader = DataLoader(
    samples_all_data,
    batch_size=1,
    num_workers=12,
)

# Test the Graph Transformer

In [8]:
# Size passed to load_prot_embs_go and hidden width of the encoder.
SIZE = 512
EMB_DIM = 512

# `dev` is needed by the `.to(dev)` calls below. It was previously only
# defined in a later cell, so this cell raised NameError on a fresh kernel.
dev = torch.device('cuda')

prot_embs = load_prot_embs_go(SIZE, norm=False)
# Graph-level summary: mean of the encoder output over dim 0.
summarizer = lambda z, *args, **kwargs: z.mean(dim=0)
# The first element of `prot_embs` is passed in as the pretrained weights.
encoder = GraphTransformerEncoder(n_layers=1, n_heads=4, n_hid=EMB_DIM, pretrained_weights=prot_embs[0],
                                  summarizer=summarizer).to(dev)
decoder = FermiDiracDecoder(1.0).to(dev)
autoenc = DeepSNEM(encoder, decoder).to(dev)

In [9]:
# Everything in this cell is placed on the CUDA device.
dev = torch.device('cuda')

prot_embs = load_prot_embs_go(512, norm=False)

# Encoder used inside the Deep Graph Infomax model. `summarizer` is the
# mean-over-dim-0 function defined in the previous cell.
enc = GraphTransformerEncoder(
    n_layers=1,
    n_heads=4,
    n_hid=512,
    pretrained_weights=prot_embs[0],
    summarizer=summarizer,
)
enc = enc.to(dev)

# The DGI summary is likewise the mean of its first argument over dim 0.
model = DeepGraphInfomax(
    hidden_channels=512,
    encoder=enc,
    summary=lambda z, *args, **kwargs: z.mean(dim=0),
)
model = model.to(dev)

In [10]:
# Restore the saved weights into `model`. The return value of
# load_state_dict is the last expression, so it is shown as the cell output.
# NOTE(review): torch.load unpickles the file by default; only load
# checkpoints that come from a trusted source.
model.load_state_dict(torch.load('embeddings/deep_graph_infomax/GO_dgi_512_tl_1_un.pt'))

<All keys matched successfully>

In [11]:
@torch.no_grad()
def emb_csv(model, loader, emb_dim=512):
    """Compute one summary embedding per item yielded by ``loader``.

    Each item is moved to the module-level device ``dev``, passed through
    ``model.encoder`` and reduced with ``model.encoder.summarize``. The
    result is written to one row of the returned array. The model is put
    in eval mode and gradients are disabled by the decorator.

    The number of rows is taken from ``len(loader)`` rather than from a
    global path list, so the output matches whatever is iterated.

    Args:
        model: Object exposing ``eval()``, a callable ``encoder`` and
            ``encoder.summarize``.
        loader: Sized iterable (dataset or data loader) whose items
            support ``.to(device)``.
        emb_dim: Length of one summary embedding. Defaults to 512, the
            width that was previously hard-coded.

    Returns:
        numpy.ndarray of shape ``(len(loader), emb_dim)``; row ``i`` holds
        the embedding of the ``i``-th item of ``loader``.
    """
    model.eval()
    embeddings = np.zeros((len(loader), emb_dim))

    for idx, graph in enumerate(tqdm(loader)):
        s = model.encoder(graph.to(dev))
        # Assumes summarize() returns a vector of length emb_dim (TODO confirm).
        embeddings[idx] = model.encoder.summarize(s).cpu().numpy()

    return embeddings

In [None]:
# Embed every graph in `samples_all_data`, one row per graph.
# NOTE(review): the dataset itself is passed here, not the
# `samples_all_loader` built from it earlier; confirm that iterating the
# un-batched dataset is intended.
embs = emb_csv(model, samples_all_data)

HBox(children=(FloatProgress(value=0.0, max=69170.0), HTML(value='')))

In [30]:
# Turn each graph path in `samples_all` into a row label:
#   1. drop the 'graphs_combined/' directory prefix,
#   2. drop the '.csv' extension,
#   3. replace '/graph' with '_emb'.
upl_f = lambda x : re.sub('graphs_combined/','', x)
# The dot is escaped so only a literal '.csv' is removed. The previous
# pattern '.csv' matched any character followed by 'csv'.
upl_f_csv = lambda x : re.sub(r'\.csv','', x)
upl_f_emb = lambda x : re.sub('/graph','_emb',x)

upl = np.array([upl_f_emb(upl_f_csv(upl_f(path))) for path in samples_all])

In [117]:
# Row labels for the *weighted* graph files in `w_path_list`:
#   1. drop the 'graphs_weighted/' directory prefix,
#   2. drop the '.csv' extension,
#   3. replace '/graph' with '_emb_1'.
upl_f = lambda x : re.sub('graphs_weighted/','', x)
# The dot is escaped so only a literal '.csv' is removed. The previous
# pattern '.csv' matched any character followed by 'csv'.
upl_f_csv = lambda x : re.sub(r'\.csv','', x)
upl_f_emb = lambda x : re.sub('/graph','_emb_1',x)

# Stored under its own name. This cell sits between the cell that builds
# `upl` from `samples_all` and the cells that pair `upl` with `embs`, so
# rebinding `upl` here would attach weighted labels to embeddings computed
# from `samples_all` when the notebook is run top to bottom. Use
# `upl_weighted` in place of `upl` when exporting embeddings of `w_path_list`.
upl_weighted = np.array([upl_f_emb(upl_f_csv(upl_f(path))) for path in w_path_list])

In [31]:
# Column names for the export: a label column followed by '0' .. '511'.
cols = ['emb'] + [str(dim) for dim in range(512)]

In [32]:
# Wrap the label strings and the embedding matrix in DataFrames so they can
# be joined column-wise.
df1 = pd.DataFrame(data=upl)
df2 = pd.DataFrame(data=embs)

In [33]:
# Put labels and embeddings side by side (aligned on the row index), then
# name the columns with `cols`.
df = pd.concat([df1, df2], axis='columns')
df.columns = cols

In [None]:
# Earlier export target, kept for reference:
#df.to_csv('embeddings/autoencoder_graph/gt_1024_tl_1_leaky_relu_sum.csv')
# Write labels and embeddings to CSV. `index` is left at its default, so the
# row index is written as the first column.
df.to_csv('embeddings/deep_graph_infomax/G_dgi_512_un.csv')