In [75]:
import warnings
warnings.filterwarnings('ignore')


In [76]:
import os
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

#printing
from torch_geometric import utils
import networkx as nx

# Torch
from torch_geometric.nn import GCNConv
from torch.utils.data import random_split
#Data manipulation
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import os
from typing import Dict, Tuple
import pandas as pd


In [77]:
type(DataLoader)

type

In [78]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [115]:
Edges = Dict[Tuple[int, int], Tuple[int, ...]]
Nodes = Dict[int, Tuple[int, ...]]
all_edges: Dict[Tuple[int, int], Tuple[int, ...]] = {}

def data_loaders(data_directory):
    data_loaders = {}
    for domain_name in os.listdir(data_directory):
        domain_path = os.path.join(data_directory, domain_name)
        data_loader = data_loader_from_domain(domain_path)
        data_loaders[domain_name] = data_loader
    return data_loaders

def data_loader_from_domain(domain_path):
    domain_name = os.path.basename(domain_path)
    dataset = []
    number_of_problems = 0
    for problem_name in os.listdir(domain_path):
        if problem_name == "empty_causal_graphs":
            continue
        number_of_problems+=1
        problem_path = os.path.join(domain_path, problem_name)
        data = problem_path_to_data(problem_path)
        dataset.append(data)
        # Generate list of data objects from our problem path
        
        # Iterate over all the problems in the domain
        # Generate a data object for each problem
        # train_test_split everything
        # train the model
    print(f"Number of problems in {domain_name}: ", number_of_problems)
    train_test_split = int(len(dataset)*0.8)
    train_test_datasets = random_split(dataset, [train_test_split, len(dataset)-train_test_split])

    data_loader = DataLoader(train_test_datasets, batch_size=8, shuffle=True)
    print(f"Number of batches in {domain_name}: ", len(data_loader))
    print(f"Number of nodes in first file of {domain_name}: ", data_loader.dataset[0].x.size(0))





    

def problem_path_to_data(problem_path):
    # print(problem_path)
    cg_df = pd.read_csv(os.path.join(problem_path, 'cg.csv'), index_col=[0, 1])
    cg_df.sort_index(inplace=True)
    nodes_df = pd.read_csv(os.path.join(problem_path, 'nodes.csv'), index_col=0)

    edges = cg_df.index
    edge_features_list = cg_df[['type_pre_eff', 'type_eff_eff']].values
    edge_labels = cg_df['label'].values
    edge_dict = {}

    # Unlucky naming, but the edge_features is a vector representing features of a single edge
    # edge_feature_list is the dictionary of all the edges and their respective features
    for edge, edge_features, label in zip(edges, edge_features_list, edge_labels):
        edge_dict[tuple(edge)] = (edge_features, label)

    edge_features, edge_labels = zip(*[edge_dict[edge] for edge in sorted(edge_dict.keys())])


    data = Data(
        x=torch.tensor(nodes_df.values, dtype=torch.float),
        edge_index=torch.tensor(list(sorted(edge_dict.keys())), dtype=torch.long).t().contiguous(),
        edge_attr=torch.tensor(edge_features, dtype=torch.float),
        y=torch.tensor(edge_labels, dtype=torch.bool)
    )
    return data


    
    
def draw_graph(data: Data):
    g = utils.to_networkx(data)

    color = ['green' if data.y[i] else 'red' for i in range(data.y.size(0))]
    import matplotlib as plt
    a = nx.draw_networkx(g,node_size=200, pos=nx.spectral_layout(g), edge_color=color, node_color='green', with_labels=True)

dler = data_loader_from_domain('graph_training_data/satellite')
dler


Number of problems in satellite:  233
Number of batches in satellite:  1


AttributeError: 'Subset' object has no attribute 'x'

In [80]:
training_domains("graph_training_data")

In [None]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

class MyData(Data):
    my_id = 0
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.my_id = MyData.my_id
        MyData.my_id += 1
    def __cat_dim__(self, key, value, *args, **kwargs):
         if key == 'foo':
             return None
         else:
             return super().__cat_dim__(key, value, *args, **kwargs)

edge_index = torch.tensor([
   [0, 1, 1, 2],
   [1, 0, 2, 1],
])



foo = torch.randn(16)

data = MyData(edge_index=edge_index, foo=foo)
data1 = MyData(edge_index=edge_index, foo=foo)
data2 = MyData(edge_index=edge_index, foo=foo)
data_list = [data, data1, data2]
loader = DataLoader(data_list, batch_size=2, shuffle=True)
batch = next(iter(loader))

print(batch[0].my_id)
print(batch[1].my_id)





In [208]:
OneBatch = Data
class Net(torch.nn.Module):
    def __init__(self, features_num):
        super(Net, self).__init__()
        self.conv1 = GCNConv(features_num, 128)
        self.conv2 = GCNConv(128, 64)

    def encode(self, data):
        x = self.conv1(data.x, data.edge_index) # convolution 1
        x = x.relu()
        return self.conv2(x, data.edge_index) # convolution 2



    def decode(self, z, edge_index): # only pos and neg edges
        # if not test:
        #     print('pos_edge_index', pos_edge_index.shape)
        #     print('neg_edge_index', neg_edge_index.shape)
            # print('pos_edge_index', pos_edge_index)
            # print('neg_edge_index', neg_edge_index)

        # if not test:
        #     print('edge_index 0 shape:', edge_index[0].shape)
        #     # print("Edge index 0: ", edge_index[0])
        #TODO  edge_index[0] 7 4 9

        #     print("Edge index 1 shape: ", edge_index[1].shape)
        #     # print("Edge index 1: ", edge_index[1])
        #TODO edge_index[1] 5 3 9
        #     print("z shape: ", z.shape)

        # Multiply adjecency matrix with latent space using the COO format of 
        # Edge index[0] and Edge index[1]
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product 
        # if not test:
        #     print('logits shape:', logits.shape)
        #     print('logits', logits)
        #     4/0

        return logits

    def decode_all(self, z): 
        prob_adj = z @ z.t() # get adj NxN
        return (prob_adj > 0).nonzero(as_tuple=False).t() # get predicted edge_list 

In [232]:
num_node_features = next(iter(data_loader)).x.shape[1]
model, data = Net(num_node_features).to(device), data.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
batch = next(iter(data_loader))
print(batch[0].x.shape)
print(batch[1].x.shape)


torch.Size([12, 2])
torch.Size([13, 2])


In [257]:
def train():
    model.train()  # Flag to modify the gradient

    batch = next(iter(data_loader))  # This is next level shit
    # print(batch[0])
    # print(batch[1])
    edge_index = batch.edge_index
    optimizer.zero_grad()
    z = model.encode(batch) 
    link_logits = model.decode(z, edge_index) # decode
    # print(link_logits)
    link_labels = batch.y
    # print(link_labels)
    link_labels = link_labels.type(torch.float)
    # print(link_labels)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test():
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        z = model.encode() # encode train
        link_logits = model.decode(z, pos_edge_index, neg_edge_index, test=True) # decode test or val
        link_probs = link_logits.sigmoid() # apply sigmoid
        
        link_labels = get_link_labels(neg_edge_index, pos_edge_index) # get link
        
        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
    return perfs

In [258]:
best_val_perf = test_perf = 0
for epoch in range(1, 101):
    train_loss = train()
    # val_perf, tmp_test_perf = test()
    # if val_perf > best_val_perf:
    #     best_val_perf = val_perf
    #     test_perf = tmp_test_perf
    # log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    log = 'Epoch: {:03d}, Loss: {:.4f}'
    if epoch % 10 == 0:
    #     print(log.format(epoch, train_loss, best_val_perf, test_perf))
        print(log.format(epoch, train_loss))

Epoch: 010, Loss: 1.4836
Epoch: 020, Loss: 0.0211
Epoch: 030, Loss: 0.4479
Epoch: 040, Loss: 0.4292
Epoch: 050, Loss: 0.3755
Epoch: 060, Loss: 0.3718
Epoch: 070, Loss: 0.2479
Epoch: 080, Loss: 0.3556
Epoch: 090, Loss: 0.0783
Epoch: 100, Loss: 0.2439


In [175]:
t1 = 10*torch.rand((2,5))
t2 = 10*torch.rand((2,5))
t1 = t1.round()
t2 = -t2.round()
a = torch.cat([t1,t2], dim=-1)
a.shape

t1[1]

tensor([ 9.,  8.,  4.,  5., 10.])

In [114]:
import torch
from torch.utils.data import random_split

# transform = transforms.Compose([transforms.ToTensor(), 
#                                         transforms.Normalize((0.5,), (0.5,))])
# dataset = MNIST(root = './data', train = train, transform = transform, download=True)
# train_set, val_set = torch.utils.data.random_split(dataset, [50000, 10000])



# data = range(200)

# train,test,val = torch.utils.data.random_split(data, [0.8, 0.1, 0.1])

# train, test, val = list(train), list(test), list(val)

# train

random_split(range(30), [0.3, 0.3, 0.4], generator=torch.Generator().manual_seed(42))


ValueError: Sum of input lengths does not equal the length of the input dataset!

In [110]:
import torch
from torchvision.datasets import ImageFolder
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from torchvision.transforms import Compose, ToTensor, Resize
from torch.utils.data import DataLoader

def train_val_dataset(dataset, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets

dataset = ImageFolder('C:\Datasets\lcms-dataset', transform=Compose([Resize((224,224)),ToTensor()]))
print(len(dataset))
datasets = train_val_dataset(dataset)
print(len(datasets['train']))
print(len(datasets['val']))
# The original dataset is available in the Subset class
print(datasets['train'].dataset)

dataloaders = {x:DataLoader(datasets[x],32, shuffle=True, num_workers=4) for x in ['train','val']}
x,y = next(iter(dataloaders['train']))
print(x.shape, y.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Datasets\\lcms-dataset'