# Create NCI1 checkpoint

In [1]:

# ! Use "gnn" conda environment to run this notebook.

In [2]:
import os
import pickle
import sys

import torch

# Make the project's local modules importable; the path is relative to the
# notebook's working directory.
sys.path.append("../gnnexp")
from models import GNN_Custom_NCI1
from preprocessing_nci1 import NCI1Dataset

# Seed torch's global random number generator with a fixed value.
torch.manual_seed(12345)

<torch._C.Generator at 0x7f73f1b725b0>

## Data

In [3]:
# Build the NCI1 dataset object from the local data directory.
dataset = NCI1Dataset("../data/NCI1")

In [4]:
# Number of graphs in the dataset.
len(dataset)

4110

In [5]:
# Load the stored split indices (looked up later as "idx_train", "idx_val",
# "idx_test"). pickle can execute arbitrary code while loading, so only point
# this at a trusted file.
with open("../data/NCI1/index.pkl", "rb") as index_file:
    indices = pickle.load(index_file)

## Model

In [6]:
# Instantiate the classifier; its input width follows the dataset's node features.
model = GNN_Custom_NCI1(in_features=dataset.num_node_features, h_features=128)

In [7]:
# Load the trained weights onto the CPU: every tensor fed to the model in this
# notebook is created on the CPU, and map_location keeps the load working even
# if the weights were saved from a GPU session.
state_dict = torch.load(
    "../graph_classification_model_weights/NCI1_weights.pt",
    map_location="cpu",
)
model.load_state_dict(state_dict)
# Switch to evaluation mode; eval() is the cell's last expression, so its
# return value (the model) is displayed as the cell output.
model.eval()

GNN_Custom_NCI1(
  (conv1): GraphConvolution (37 -> 128)
  (conv2): GraphConvolution (128 -> 128)
  (dense1): Linear(in_features=128, out_features=16, bias=True)
  (dense2): Linear(in_features=16, out_features=8, bias=True)
  (dense3): Linear(in_features=8, out_features=2, bias=True)
)

## Reference: GEM-style data

In [8]:
# Existing checkpoint, loaded only as a reference for the expected layout.
# map_location keeps the load independent of the device it was saved from.
# NOTE(review): the 'cg' entry holds a numpy array under 'pred'; on PyTorch
# versions where torch.load defaults to weights_only=True this may be rejected —
# confirm, and pass weights_only=False only if the file is trusted.
ckpt = torch.load("../ckpt/Mutagenicity_base_h20_o20.pth.tar", map_location="cpu")

In [9]:
# Top-level entries of the reference checkpoint.
ckpt.keys()

dict_keys(['epoch', 'model_type', 'optimizer', 'model_state', 'optimizer_state', 'cg'])

In [10]:
# Entries of the 'cg' sub-dictionary, the part this notebook rebuilds for NCI1.
ckpt['cg'].keys()

dict_keys(['adj', 'feat', 'label', 'pred', 'train_idx', 'val_idx', 'test_idx', 'gid'])

In [11]:
# Summarise every entry of the reference 'cg' dictionary: container type,
# element type, and shape (or length for plain sequences).
print(f'{"KEY":<10}: {"OBJECT":<25}: {"TYPE":<15}: SHAPE/LEN\n')
for key, val in ckpt['cg'].items():
    try:
        print(f"{key:<10}: {str(type(val)):<25}: {str(val.dtype):<15}: {val.shape}")
    except AttributeError:  # plain sequences (e.g. lists) have no dtype/shape
        print(f"{key:<10}: {str(type(val)):<25}: {str(type(val[0])):<15}: {len(val)}")

KEY       : OBJECT                   : TYPE           : SHAPE/LEN

adj       : <class 'torch.Tensor'>   : torch.float64  : torch.Size([4336, 100, 100])
feat      : <class 'torch.Tensor'>   : torch.float64  : torch.Size([4336, 100, 14])
label     : <class 'torch.Tensor'>   : torch.int32    : torch.Size([4336])
pred      : <class 'numpy.ndarray'>  : float64        : (1, 4336, 2)
train_idx : <class 'list'>           : <class 'int'>  : 3468
val_idx   : <class 'list'>           : <class 'int'>  : 434
test_idx  : <class 'list'>           : <class 'int'>  : 434
gid       : <class 'torch.Tensor'>   : torch.int64    : torch.Size([4336])


## CKPT

In [12]:
# Padded size shared by all adjacency/feature matrices: the largest node count
# in the dataset. Counting the rows of `x` (rather than the unique edge sources)
# also accounts for nodes without outgoing edges, which would otherwise leave
# the padded size smaller than a graph's feature matrix.
MAX_ADJ_SIZE = max(graph.x.size(0) for graph in dataset)
print(MAX_ADJ_SIZE)

111


In [13]:
adjacencies = list()
features = list()
labels = list()
preds = list()
g_ids = list()

# Inference only: with autograd disabled no computation graph is kept alive
# for each of the stored predictions.
with torch.no_grad():
    for g_id, graph in enumerate(dataset):
        # Dense adjacency matrix, zero-padded to MAX_ADJ_SIZE x MAX_ADJ_SIZE.
        adj = torch.zeros(size=(MAX_ADJ_SIZE, MAX_ADJ_SIZE)).float()
        rows = graph.edge_index[0].long()
        cols = graph.edge_index[1].long()
        # One indexed write instead of a Python-level loop over the edges.
        adj[rows, cols] = 1.0

        # Node features, padded with zero rows up to MAX_ADJ_SIZE.
        feat = graph.x.float()
        feat_size = feat.size(0)
        extra_size = MAX_ADJ_SIZE - feat_size
        feat_extra = torch.zeros(
            size=(extra_size, dataset.num_node_features)
        ).float()
        feat = torch.cat((feat, feat_extra), dim=0)

        label = graph.y.long()
        pred = model(feat, adj)

        adjacencies.append(adj)
        features.append(feat)
        # Store a plain int so the final tensor can be built directly as int64.
        labels.append(int(label))
        preds.append(pred)
        g_ids.append(g_id)

adjacencies = torch.stack(adjacencies)
features = torch.stack(features)
# Build integer tensors directly instead of converting through float32.
labels = torch.tensor(labels, dtype=torch.long)
# Leading axis of size 1 mirrors the (1, N, 2) layout of the reference 'pred'.
preds = torch.stack(preds).detach().unsqueeze(0).numpy()
g_ids = torch.tensor(g_ids, dtype=torch.long)

In [14]:
# Accuracy (%) of the stored predictions against the labels, over all graphs.
# NOTE(review): the recorded value (~52.9 %) is close to chance for a model with
# two outputs — confirm the model is fed inputs in the form it was trained on.
100 * (labels.numpy() == preds.argmax(axis=-1)).sum() / len(labels)

52.91970802919708

In [15]:
# Collect the per-graph arrays under the key names used by the reference checkpoint.
cg_dict = dict(
    adj=adjacencies,
    feat=features,
    label=labels,
    pred=preds,
    gid=g_ids,
)

In [16]:
# Summarise every entry of the new 'cg' dictionary: container type, element
# type, and shape (or length for plain sequences).
print(f'{"KEY":<10}: {"OBJECT":<25}: {"TYPE":<15}: SHAPE/LEN\n')
for key, val in cg_dict.items():
    try:
        print(f"{key:<10}: {str(type(val)):<25}: {str(val.dtype):<15}: {val.shape}")
    except AttributeError:  # plain sequences (e.g. lists) have no dtype/shape
        print(f"{key:<10}: {str(type(val)):<25}: {str(type(val[0])):<15}: {len(val)}")

KEY       : OBJECT                   : TYPE           : SHAPE/LEN

adj       : <class 'torch.Tensor'>   : torch.float32  : torch.Size([4110, 111, 111])
feat      : <class 'torch.Tensor'>   : torch.float32  : torch.Size([4110, 111, 37])
label     : <class 'torch.Tensor'>   : torch.int64    : torch.Size([4110])
pred      : <class 'numpy.ndarray'>  : float32        : (1, 4110, 2)
gid       : <class 'torch.Tensor'>   : torch.int64    : torch.Size([4110])


## Save

In [17]:
# Checkpoint payload: the model weights plus the graph data dictionary.
new_ckpt = {
    'model_state': model.state_dict(),
    'cg': cg_dict,
}

In [18]:
# For each split keep only the graphs whose label and predicted class are
# both 1. A dict keyed by split name replaces building and eval()-ing a code
# string to pick the target list.
class1_hits = {'train': list(), 'val': list(), 'test': list()}

for set_ in ['train', 'val', 'test']:
    for idx in indices[f"idx_{set_}"]:
        label = int(new_ckpt['cg']['label'][idx])
        pred = int(new_ckpt['cg']['pred'][0][idx].argmax(axis=-1))
        if label == pred == 1:
            class1_hits[set_].append(int(idx))

train_set_1 = class1_hits['train']
val_set_1 = class1_hits['val']
test_set_1 = class1_hits['test']

### Eval set as part of training set

In [19]:
# Variant 1: the test-split entries are appended to the training indices and
# are also kept as the test indices.
new_ckpt['cg'].update(
    train_idx=train_set_1 + test_set_1,
    val_idx=val_set_1,
    test_idx=test_set_1,
)

In [20]:
# Make sure the output directory exists before writing the checkpoint file.
os.makedirs(name="../ckpt", exist_ok=True)
torch.save(obj=new_ckpt, f="../ckpt/NCI1_eval_as_train.pt")

### Eval set as test set

In [21]:
# Variant 2: the three splits are stored separately; the training indices no
# longer include the test-split entries.
new_ckpt['cg'].update(
    train_idx=train_set_1,
    val_idx=val_set_1,
    test_idx=test_set_1,
)

In [22]:
# Make sure the output directory exists before writing the checkpoint file.
os.makedirs(name="../ckpt", exist_ok=True)
torch.save(obj=new_ckpt, f="../ckpt/NCI1_eval_as_eval.pt")

## Rough sanity check

In [24]:
# Sanity check: re-run the model on the tensors stored in the checkpoint
# dictionary, one graph at a time with a leading axis of size 1, and report
# the accuracy over all stored graphs.
num_graphs = new_ckpt['cg']['adj'].shape[0]
correct = 0
with torch.no_grad():  # inference only; no autograd bookkeeping needed
    for graph_idx in range(num_graphs):
        feat = new_ckpt['cg']['feat'][graph_idx, :].float().unsqueeze(0)
        adj = new_ckpt['cg']['adj'][graph_idx].float().unsqueeze(0)
        label = new_ckpt['cg']['label'][graph_idx].long().unsqueeze(0)
        pred = model(feat, adj).argmax(dim=-1)
        if label == pred:
            correct += 1
acc = 100 * correct / num_graphs
# The loop covers every stored graph, so the figure is labelled as such rather
# than with the `set_` loop variable left over from an earlier cell (which made
# the output read "Test accuracy").
print(f"Overall accuracy: {acc:.2f} %")

Test accuracy: 52.92 %
