# Create IsCyclic dataset

In [1]:

# ! Use "gnn" conda environment to run this notebook.

In [2]:
import os
import pickle
import sys

import torch

sys.path.append("../gnnexp")
from models import GNN_Custom_IsCyclic
from preprocessing_iscyclic import create_isCyclic_dataset

torch.manual_seed(12345)

<torch._C.Generator at 0x7f9c5c078570>

## Data

In [3]:
dataset, data_objs = create_isCyclic_dataset(saved=True)
print(dataset[0].x)
print(dataset[0].edge_index)
print(dataset[0].y)

Dataset size:  951
tensor([[1.],
        [1.],
        [1.]])
tensor([[0, 0, 1, 1, 2, 2],
        [1, 2, 0, 2, 0, 1]])
tensor(1., dtype=torch.float64)


In [4]:
with open("../data/IsCyclic/index.pkl", "rb") as file:
    indices = pickle.load(file)

## Model

In [5]:
model = GNN_Custom_IsCyclic(
    in_features=dataset.num_node_features,
    h_features=64,
)

In [6]:
state_dict = torch.load(
    "../graph_classification_model_weights/IsCyclic_weights.pt"
)
model.load_state_dict(state_dict)
model.eval()

GNN_Custom_IsCyclic(
  (conv1): GraphConvolution (1 -> 64)
  (conv2): GraphConvolution (64 -> 64)
  (conv3): GraphConvolution (64 -> 64)
  (dense1): Linear(in_features=64, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=2, bias=True)
)

## Reference: GEM style data

In [7]:
ckpt = torch.load(f"../ckpt/Mutagenicity_base_h20_o20.pth.tar")

In [8]:
ckpt.keys()

dict_keys(['epoch', 'model_type', 'optimizer', 'model_state', 'optimizer_state', 'cg'])

In [9]:
ckpt['cg'].keys()

dict_keys(['adj', 'feat', 'label', 'pred', 'train_idx', 'val_idx', 'test_idx', 'gid'])

In [10]:
print(f'{"KEY":<10}: {"OBJECT":<25}: {"TYPE":<15}: SHAPE/LEN\n')
for key, val in ckpt['cg'].items():
    try:
        print(f"{key:<10}: {str(type(val)):<25}: {str(val.dtype):<15}: {val.shape}")
    except: # object doesn't have method named "shape"
        print(f"{key:<10}: {str(type(val)):<25}: {str(type(val[0])):<15}: {len(val)}")

KEY       : OBJECT                   : TYPE           : SHAPE/LEN

adj       : <class 'torch.Tensor'>   : torch.float64  : torch.Size([4336, 100, 100])
feat      : <class 'torch.Tensor'>   : torch.float64  : torch.Size([4336, 100, 14])
label     : <class 'torch.Tensor'>   : torch.int32    : torch.Size([4336])
pred      : <class 'numpy.ndarray'>  : float64        : (1, 4336, 2)
train_idx : <class 'list'>           : <class 'int'>  : 3468
val_idx   : <class 'list'>           : <class 'int'>  : 434
test_idx  : <class 'list'>           : <class 'int'>  : 434
gid       : <class 'torch.Tensor'>   : torch.int64    : torch.Size([4336])


## CKPT

In [11]:
# MAX_ADJ_SIZE = max([
#     graph.edge_index[0].unique().size(0)
#     for graph in dataset
# ])
# print(MAX_ADJ_SIZE)

MAX_ADJ_SIZE = 100 # The authors of GEM have hard coded this.

In [12]:
adjacencies = list()
features = list()
labels = list()
preds = list()
g_ids = list()
skipped_graphs = list()

for g_id, graph in enumerate(dataset):
    if graph.edge_index[0].unique().size(0) > MAX_ADJ_SIZE:
        print(f"Skipped {g_id}")
        skipped_graphs.append(g_id)
        continue
    adj = torch.zeros(
        size=(MAX_ADJ_SIZE, MAX_ADJ_SIZE)
    ).float()
    rows = graph.edge_index[0]
    cols = graph.edge_index[1]
    for row, col in zip(rows, cols):
        adj[int(row), int(col)] = 1.0
    
    feat = graph.x.float()
    feat_size = feat.size(0)
    extra_size = MAX_ADJ_SIZE - feat_size
    feat_extra = torch.zeros(
        size=(extra_size, dataset.num_node_features)
    ).float()
    feat = torch.cat((feat, feat_extra), dim=0)

    label = graph.y.long()
    pred = model(feat, adj)

    adjacencies.append(adj)
    features.append(feat)
    labels.append(label)
    preds.append(pred)
    g_ids.append(g_id)

adjacencies = torch.stack(adjacencies)
features = torch.stack(features)
labels = torch.Tensor(labels).long()
preds = torch.stack(preds).detach().unsqueeze(0).numpy()
g_ids = torch.Tensor(g_ids).long()

Skipped 147
Skipped 148
Skipped 149
Skipped 311
Skipped 312
Skipped 313


In [13]:
cg_dict = {
    'adj': adjacencies,
    'feat': features,
    'label': labels,
    'pred': preds,
    'gid': g_ids,
}

In [14]:
acc = 100 * (labels.numpy() == preds.argmax(axis=-1)).sum() / len(labels)
print(f"Accuracy: {acc:.2f} %")

Accuracy: 85.71 %


In [15]:
print(f'{"KEY":<10}: {"OBJECT":<25}: {"TYPE":<15}: SHAPE/LEN\n')
for key, val in cg_dict.items():
    try:
        print(f"{key:<10}: {str(type(val)):<25}: {str(val.dtype):<15}: {val.shape}")
    except: # object doesn't have method named "shape"
        print(f"{key:<10}: {str(type(val)):<25}: {str(type(val[0])):<15}: {len(val)}")

KEY       : OBJECT                   : TYPE           : SHAPE/LEN

adj       : <class 'torch.Tensor'>   : torch.float32  : torch.Size([945, 100, 100])
feat      : <class 'torch.Tensor'>   : torch.float32  : torch.Size([945, 100, 1])
label     : <class 'torch.Tensor'>   : torch.int64    : torch.Size([945])
pred      : <class 'numpy.ndarray'>  : float32        : (1, 945, 2)
gid       : <class 'torch.Tensor'>   : torch.int64    : torch.Size([945])


## Save

In [16]:
new_ckpt = dict()
new_ckpt['model_state'] = model.state_dict()
new_ckpt['cg'] = cg_dict

In [18]:
train_set_1 = list()
val_set_1 = list()
test_set_1 = list()

for set_ in ['train', 'val', 'test']:
    for idx in indices[f"idx_{set_}"]:
        if idx in skipped_graphs or idx >= len(new_ckpt['cg']['label']):
            print(f"Skipped {idx}")
            continue
        label = new_ckpt['cg']['label'][idx]
        pred = new_ckpt['cg']['pred'][0][idx].argmax(axis=-1)
        if label == pred == 1:
            eval(f"{set_}_set_1.append(int(idx))")

train_set_1 = list(set(train_set_1))
val_set_1 = list(set(val_set_1))
test_set_1 = list(set(test_set_1))

Skipped 949
Skipped 947
Skipped 946
Skipped 311
Skipped 945
Skipped 948
Skipped 313
Skipped 149
Skipped 312
Skipped 147
Skipped 148
Skipped 950


### Eval set as part of training set

In [19]:
new_ckpt['cg']['train_idx'] = train_set_1 + test_set_1
new_ckpt['cg']['val_idx'] = val_set_1
new_ckpt['cg']['test_idx'] = test_set_1

In [20]:
os.makedirs("../ckpt", exist_ok=True)
torch.save(new_ckpt, f"../ckpt/IsCyclic_eval_as_train.pt")

### Eval set as test set

In [21]:
new_ckpt['cg']['train_idx'] = train_set_1
new_ckpt['cg']['val_idx'] = val_set_1
new_ckpt['cg']['test_idx'] = test_set_1

In [22]:
os.makedirs("../ckpt", exist_ok=True)
torch.save(new_ckpt, f"../ckpt/IsCyclic_eval_as_eval.pt")

## Rough

In [23]:
correct = 0
for graph_idx in range(new_ckpt['cg']['adj'].shape[0]):
    feat = new_ckpt['cg']['feat'][graph_idx, :].float().unsqueeze(0)
    adj = new_ckpt['cg']['adj'][graph_idx].float().unsqueeze(0)
    label = new_ckpt['cg']['label'][graph_idx].long().unsqueeze(0)
    pred = model(feat, adj).argmax(dim=-1)
    if label == pred:
        correct += 1
acc = 100 * correct/new_ckpt['cg']['adj'].shape[0]
print(f"accuracy: "f"{acc:.2f} %")

accuracy: 85.71 %


In [24]:
len(test_set_1)

161

In [25]:
len(train_set_1)

125