In [89]:
import uproot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from collections import namedtuple, defaultdict
import open3d as o3d
import random
random.seed(42)
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import add_self_loops
from torchvision import transforms
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

In [90]:
hf_neighbor_pairs_inCluster = h5py.File("./neighbor_pairs_inCluster.hdf5", 'r')
hf_neighbor_pairs_notInCluster = h5py.File("./neighbor_pairs_notInCluster.hdf5", 'r')
hf_cellFeatures = h5py.File("./cellFeatures.hdf5", 'r')

In [91]:
true_pair = hf_neighbor_pairs_inCluster.get('neighbor_pairs_inCluster')[:]
false_pair = hf_neighbor_pairs_notInCluster.get('neighbor_pairs_notInCluster')[:]
cellFeatures = hf_cellFeatures.get('cellFeatures')[:]

In [92]:
hf_neighbor_pairs_inCluster.close()
hf_neighbor_pairs_notInCluster.close()
hf_cellFeatures.close()

In [93]:
print("true_pair.shape: ",true_pair.shape)
print("false_pair.shape: ",false_pair.shape)
print("cellFeatures.shape: ",cellFeatures.shape)

true_pair.shape:  (256793, 2)
false_pair.shape:  (993449, 2)
cellFeatures.shape:  (187652, 8)


In [94]:
true_pair_250k = true_pair[:250000]
false_pair_250k = false_pair[:250000]

## Concatenating +ve and -ve examples

In [95]:
total_indices = np.concatenate((true_pair_250k,false_pair_250k),axis=0)

In [96]:
total_indices.shape

(500000, 2)

## Making labels 1 for +ve e.g and 0 for -ve e.g

In [97]:
total_label = np.concatenate(([1]*250000,[0]*250000),axis=0)

In [98]:
total_label.shape

(500000,)

## Randomizing data and dividing into train, test parts

In [99]:
arr = np.arange(500000)
np.random.shuffle(arr)
total_indices_rand = total_indices[arr]
total_label_rand =  total_label[arr]

In [100]:
total_indices_train, total_indices_test, truth_label_train, truth_label_test = train_test_split(
    total_indices_rand, total_label_rand, train_size=0.70)

In [101]:
total_indices_train.shape

(350000, 2)

In [102]:
total_indices_test.shape

(150000, 2)

In [103]:
# creating pair and no pair indices file
with h5py.File('./test_pair_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("test_pair", data = total_indices_test)

with h5py.File('./test_truth_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("test_truth_label", data = truth_label_test)

### Based on the train-test cell indices find the cell features and scale them

In [104]:
# find the unique cell indices in the pair
train_index_set = set()
for item in total_indices_train:
    if item[0] not in train_index_set:
        train_index_set.add(item[0])
    elif item[1] not in train_index_set:
        train_index_set.add(item[1])

In [105]:
len(train_index_set)

108462

In [106]:
# find the unique cell indices in the pair
test_index_set = set()
for item in total_indices_test:
    if item[0] not in test_index_set:
        test_index_set.add(item[0])
    elif item[1] not in test_index_set:
        test_index_set.add(item[1])

In [107]:
len(test_index_set)

87301

## MinMax Scaling training set

In [108]:
train_index_list = list(train_index_set)
test_index_list = list(test_index_set)

In [109]:
with h5py.File('./test_index_list_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("test_index_list", data = test_index_list)

In [110]:
for item in test_index_list:
    if item in train_index_list:
        index = test_index_list.index(item)
        test_index_list.pop(index)

In [111]:
len(test_index_list)

45973

In [112]:
cellFeatures_train = cellFeatures[train_index_list]
cellFeatures_test = cellFeatures[test_index_list]

In [113]:
scaler = MinMaxScaler()
cellFeatures_trainS = scaler.fit_transform(cellFeatures_train)
scaler_filename = "./scaler_neighbor.save"
joblib.dump(scaler, scaler_filename)

['./scaler_neighbor.save']

In [114]:
scaler = joblib.load('./scaler_neighbor.save') 
cellFeatures_testS = scaler.transform(cellFeatures_test)

### Insert cellFeatures_train/test in a cellFeaturesScaled array 

In [115]:
cellFeatures.shape

(187652, 8)

In [116]:
cellFeaturesScaled =np.zeros((187652, 8))
cell = 0
for i in train_index_list:
    cellFeaturesScaled[i] = cellFeatures_trainS[cell]
    cell = cell+1

In [117]:
cell = 0
for i in test_index_list:
    cellFeaturesScaled[i] = cellFeatures_testS[cell]
    cell = cell+1

In [118]:
# creating Scaled Cell Feature file
with h5py.File('./cellFeaturesScaled_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("cellFeaturesScaled", data = cellFeaturesScaled)

In [119]:
total_indices_train

array([[108470, 129112],
       [ 88516, 105476],
       [ 84083,  84340],
       ...,
       [155598, 166075],
       [  8224,  19715],
       [ 21126,  21127]])

In [120]:
train_edge_source_noBD =[]
for pair in total_indices_train:
    train_edge_source_noBD.append(pair[0])

train_edge_dest_noBD =[]
for pair in total_indices_train:
    train_edge_dest_noBD.append(pair[1])

In [121]:
test_edge_source_noBD =[]
for pair in total_indices_test:
    test_edge_source_noBD.append(pair[0])

test_edge_dest_noBD =[]
for pair in total_indices_test:
    test_edge_dest_noBD.append(pair[1])

In [122]:
train_edge_source_BD =[]
for pair in total_indices_train:
    train_edge_source_BD.append(pair[0])
    train_edge_source_BD.append(pair[1])

In [123]:
train_edge_source_BD = np.array(train_edge_source_BD)

In [124]:
train_edge_source_BD.shape

(700000,)

In [125]:
train_edge_dest_BD =[]
for pair in total_indices_train:
    train_edge_dest_BD.append(pair[1])
    train_edge_dest_BD.append(pair[0])

In [126]:
train_edge_dest_BD = np.array(train_edge_dest_BD)

In [127]:
train_edge_dest_BD.shape

(700000,)

In [128]:
test_edge_source_BD =[]
for pair in total_indices_test:
    test_edge_source_BD.append(pair[0])
    test_edge_source_BD.append(pair[1])
    
test_edge_dest_BD =[]
for pair in total_indices_test:
    test_edge_dest_BD.append(pair[1])
    test_edge_dest_BD.append(pair[0])

In [129]:
test_edge_source_BD = np.array(test_edge_source_BD)
test_edge_dest_BD = np.array(test_edge_dest_BD)

In [130]:
test_edge_source_BD.shape

(300000,)

In [131]:
test_edge_dest_BD.shape

(300000,)

In [132]:
test_edge_source_BD

array([ 36759,  36760,  49458, ...,   4262, 101605, 101606])

In [133]:
test_edge_dest_BD

array([ 36760,  36759,  72008, ...,   4197, 101606, 101605])

In [134]:
with h5py.File('./train_edge_source_BD.hdf5', 'w') as f: 
    dset = f.create_dataset("train_edge_source_BD", data = train_edge_source_BD)

with h5py.File('./train_edge_dest_BD.hdf5', 'w') as f: 
    dset = f.create_dataset("train_edge_dest_BD", data = train_edge_dest_BD)

In [135]:
with h5py.File('./test_edge_source_BD.hdf5', 'w') as f: 
    dset = f.create_dataset("test_edge_source_BD", data = test_edge_source_BD)

with h5py.File('./test_edge_dest_BD.hdf5', 'w') as f: 
    dset = f.create_dataset("test_edge_dest_BD", data = test_edge_dest_BD)

In [136]:
with h5py.File('./truth_label_train_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("truth_label_train", data = truth_label_train)

with h5py.File('./truth_label_test_neighbor.hdf5', 'w') as f: 
    dset = f.create_dataset("truth_label_test", data = truth_label_test)

In [137]:
with h5py.File('./train_edge_source_noBD.hdf5', 'w') as f: 
    dset = f.create_dataset("train_edge_source_noBD", data = train_edge_source_noBD)

with h5py.File('./train_edge_dest_noBD.hdf5', 'w') as f: 
    dset = f.create_dataset("train_edge_dest_noBD", data = train_edge_dest_noBD)
    
with h5py.File('./test_edge_source_noBD.hdf5', 'w') as f: 
    dset = f.create_dataset("test_edge_source_noBD", data = test_edge_source_noBD)

with h5py.File('./test_edge_dest_noBD.hdf5', 'w') as f: 
    dset = f.create_dataset("test_edge_dest_noBD", data = test_edge_dest_noBD)