# Module for GGNN experimentation

In [1]:
import torch as tt
from tqdm.auto import tqdm
import numpy as np
import sys
sys.path.append("..")
import csv
import json
import time
import os
import random
from torch.autograd import Variable
from python.utils import SimilarityLoss
from python.data_loader import GraphDataLoader
from python.model import GGNN

  assert (state_dim >= annotation_dim, 'state_dim must be no less than annotation_dim')


In [9]:
# Experiment-wide settings: data root, batch size and the edge type to predict.
DIR = '/home/af9562/'
target_edge_type = 1
batch_size = 20

# One shared seed for every RNG in play (NumPy, stdlib random, PyTorch).
seed = 0
np.random.seed(seed)
random.seed(seed)
tt.manual_seed(seed)

<torch._C.Generator at 0x7fa8b4018fd0>

In [10]:
# Graph loader over the "newMethod" dataset directory; the same instance is
# used by the following cells to batch the test, validation and train splits.
loader = GraphDataLoader(
    directory=DIR+'import_prediction/data/graphs/newMethod/',
    hidden_size=10,
    directed=False,
    max_nodes=300,
    target_edge_type=target_edge_type,
)

In [11]:
# Read the two held-out splits from disk.
with open(DIR+'import_prediction/data/graphs/newMethod/valid.json', 'r') as handle:
    valid_raw_data = json.load(handle)
with open(DIR+'import_prediction/data/graphs/newMethod/test.json', 'r') as handle:
    test_raw_data = json.load(handle)

# Batch both splits without shuffling. The test split uses the "targets_1"
# target mode, the validation split uses "generate".
test_data = loader.initialize(
    test_raw_data,
    batch_size=batch_size,
    shuffle=False,
    targets="targets_1",
)
val_data = loader.initialize(
    valid_raw_data,
    batch_size=batch_size,
    shuffle=False,
    targets="generate",
)

In [12]:
# Read the training split and batch it with shuffling enabled; targets use the
# "generateOnPass" mode.
with open(DIR+'import_prediction/data/graphs/newMethod/train.json', 'r') as handle:
    train_raw_data = json.load(handle)

train_data = loader.initialize(
    train_raw_data,
    batch_size=batch_size,
    shuffle=True,
    targets="generateOnPass",
)

In [13]:
def run_epoch(model, data, epoch, is_training):
    """Run one pass over ``data`` using the embedding-based objective.

    NOTE(review): a later cell defines another ``run_epoch`` with the same
    signature; on a top-to-bottom run that later definition replaces this one.

    Relies on the notebook-level names ``criterion``, ``optimizer`` and
    ``clamp_gradient_norm`` being defined before the call.

    Args:
        model: module exposing ``forward_sigma(features, adj_matrix)``.
        data: iterable of ``(adj_matrix, features, mask, src, pos)`` batches.
        epoch: epoch number, used only in the progress-bar text.
        is_training: if true, the model is put in train mode and the optimizer
            is stepped after every batch; otherwise the model is put in eval
            mode and the forward pass runs without gradient tracking.

    Returns:
        The sum of the per-batch loss values.
    """
    if is_training:
        model.train()
        phase = "Training"
    else:
        model.eval()
        phase = "Evaluating"

    total_loss = 0
    step = 0
    accuracy = 0

    batches = tqdm(data)

    for adj_matrix, features, mask, src, pos in batches:
        step += 1

        adj_matrix = tt.DoubleTensor(adj_matrix)
        features = tt.DoubleTensor(features)
        mask = tt.DoubleTensor(mask)

        # Gradients only need resetting when a backward pass will follow.
        if is_training:
            optimizer.zero_grad()
            model.zero_grad()

        # Skip autograd bookkeeping during evaluation: no backward pass is run,
        # so building the graph would only cost memory.
        with tt.set_grad_enabled(is_training):
            embeddings = model.forward_sigma(features, adj_matrix)
            loss, acc = criterion(embeddings, mask, src, pos)

        accuracy += acc
        batches.set_description("Epoch %d. %s. Step acc %f, Total acc %f " % (epoch, phase, acc, accuracy/step))
        # Tensor.item() replaces np.asscalar, which NumPy removed in 1.23.
        total_loss += loss.item()

        if is_training:
            # NOTE(review): retain_graph=True is kept from the original; it is
            # only required if the graph is reused after this backward call.
            loss.backward(retain_graph=True)
            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm.
            tt.nn.utils.clip_grad_norm_(model.parameters(),
                                        clamp_gradient_norm)
            optimizer.step()

    print("Acc: ", accuracy/step)
    return total_loss

In [14]:
def run_epoch(model, data, epoch, is_training):
    """Run one pass over ``data`` and return the mean loss per batch.

    Relies on the notebook-level names ``criterion``, ``optimizer`` and
    ``clamp_gradient_norm`` being defined before the call.

    Args:
        model: module exposing
            ``forward_src(features, adj_matrix, src, batch_size, option_size)``.
        data: iterable of ``(adj_matrix, features, src, mask)`` tensor batches;
            ``adj_matrix`` and ``features`` are indexed as 4-D tensors whose
            two leading axes are flattened before the forward pass.
        epoch: epoch number, used only in the printed banner.
        is_training: if true, the model is put in train mode and the optimizer
            is stepped after every batch; otherwise the model is put in eval
            mode and the forward pass runs without gradient tracking.

    Returns:
        The summed per-batch loss divided by the number of batches.
    """
    if is_training:
        model.train()
        print("Epoch %d. Training" %epoch)
    else:
        model.eval()
        print("Epoch %d. Evaluating" %epoch)

    total_loss = 0
    step = 0
    accuracy = 0

    batches = tqdm(data)

    for adj_matrix, features, src, mask in batches:
        step += 1

        # Gradients only need resetting when a backward pass will follow.
        if is_training:
            optimizer.zero_grad()
            model.zero_grad()

        # Remember the two leading axis sizes, then merge them so the model
        # receives 3-D inputs; everything is cast to float64 / int64.
        batch_size = adj_matrix.shape[0]
        option_size = adj_matrix.shape[1]
        adj_matrix = adj_matrix.view(-1, adj_matrix.shape[2], adj_matrix.shape[3]).double()
        src = src.view(-1).long()
        mask = mask.double()
        features = features.view(-1, features.shape[2], features.shape[3]).double()

        # Skip autograd bookkeeping during evaluation: no backward pass is run,
        # so building the graph would only cost memory.
        with tt.set_grad_enabled(is_training):
            distances = model.forward_src(features, adj_matrix, src, batch_size, option_size)
            loss, acc = criterion(distances, mask)

        # Tensor.item() replaces np.asscalar, which NumPy removed in 1.23.
        loss_value = loss.item()
        accuracy += acc
        batches.set_description("Acc: step=%f, total=%f. Loss: %f," % (acc, accuracy/step, loss_value))
        total_loss += loss_value

        if is_training:
            # NOTE(review): retain_graph=True is kept from the original; it is
            # only required if the graph is reused after this backward call.
            loss.backward(retain_graph=True)
            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm.
            tt.nn.utils.clip_grad_norm_(model.parameters(),
                                        clamp_gradient_norm)
            optimizer.step()

    print("Acc: ", accuracy/step)
    return total_loss/step

In [15]:
def save_model(model, path):
    """Write ``model``'s state dict to ``path`` via ``tt.save``.

    The saved object is a dict with a single key, ``"model_weights"``, holding
    ``model.state_dict()``.
    """
    tt.save({"model_weights": model.state_dict()}, path)

In [16]:
def train(model, epochs, patience, train_data, val_data):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs ``run_epoch`` once on ``train_data`` (training) and once on
    ``val_data`` (evaluation), appends one row to the CSV at the notebook-level
    ``log_file``, and saves the model to the notebook-level ``best_model_file``
    whenever the validation loss improves.

    Args:
        model: model passed through to ``run_epoch`` and ``save_model``.
        epochs: maximum number of epochs to run.
        patience: stop once this many epochs have passed since the best
            validation loss without a new improvement.
        train_data: batches for the training pass.
        val_data: batches for the validation pass.
    """
    best_val_loss, best_val_loss_epoch = float("inf"), 0
    for epoch in range(epochs):
        train_loss = run_epoch(model, train_data, epoch, True)
        val_loss = run_epoch(model, val_data, epoch, False)

        log_entry = {
                    'epoch': epoch,
                    'train_loss': train_loss,
                    'valid_loss': val_loss,
                    }

        # Emit the column names once, the first time this log file is written,
        # so the CSV is self-describing.
        needs_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
        # newline='' lets the csv module control line endings itself, as the
        # csv documentation requires for files handed to a writer.
        with open(log_file, 'a', newline='') as f:
            w = csv.DictWriter(f, log_entry.keys())
            if needs_header:
                w.writeheader()
            w.writerow(log_entry)

        if val_loss < best_val_loss:
            save_model(model, best_model_file)
            print(" (Best epoch so far, cum. val. loss decreased to %.5f from %.5f. "
                  "Saving to '%s')" % (val_loss, best_val_loss, best_model_file))
            best_val_loss = val_loss
            best_val_loss_epoch = epoch
        elif epoch - best_val_loss_epoch >= patience:
            print("Stopping training after %i epochs without improvement on "
                  "validation loss." % patience)
            break

In [17]:
# Keep run artifacts under the same root as the data. The previous value
# pointed at a macOS external volume ('/Volumes/My Passport/...') while every
# other path in this notebook lives under DIR.
log_dir = os.path.join(DIR, 'import_prediction/logs/')
# Create the directory up front so the first log write or checkpoint save
# cannot fail after an epoch has already been computed.
os.makedirs(log_dir, exist_ok=True)

# One id per run (timestamp + process id) names both the CSV log and the
# best-model checkpoint.
run_id = "_".join([time.strftime("%Y-%m-%d-%H-%M-%S"), str(os.getpid())])
log_file = os.path.join(log_dir, "%s_log.csv" % run_id)
best_model_file = os.path.join(log_dir, "%s_model_best.pickle" % run_id)

# Model dimensions are taken from the loader so they match the batched data.
model = GGNN(state_dim=loader.hidden_size, 
             annotation_dim=loader.annotation_size,
             n_edge_types=loader.edge_types,
             n_nodes=loader.max_nodes,
             n_steps=6)
# run_epoch casts its inputs with .double(), so the parameters are cast too.
model.double()

# Notebook-level names read by run_epoch.
criterion = SimilarityLoss(margin=0.5)
optimizer = tt.optim.Adam(model.parameters(), lr=0.01)
clamp_gradient_norm = 1.0

In [18]:
# Train for at most 10 epochs, stopping early after 3 epochs without a better
# validation loss.
train(
    model=model,
    epochs=10,
    patience=3,
    train_data=train_data,
    val_data=val_data,
)

Epoch 0. Training


HBox(children=(FloatProgress(value=0.0, max=937.0), HTML(value='')))

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "../python/data_loader.py", line 167, in __getitem__
    return self.getitem_complex(index)
  File "../python/data_loader.py", line 196, in getitem_complex
    matrix, features, mask, src, pos = self.getitem_simple(index)
  File "../python/data_loader.py", line 181, in getitem_simple
    features = np.pad(graph['annotations'],
  File "<__array_function__ internals>", line 5, in pad
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/numpy/lib/arraypad.py", line 748, in pad
    pad_width = _as_pairs(pad_width, array.ndim, as_index=True)
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/numpy/lib/arraypad.py", line 519, in _as_pairs
    raise ValueError("index can't contain negative values")
ValueError: index can't contain negative values


In [19]:
# Single evaluation pass over the test split; `epoch` only labels the output.
test_loss = run_epoch(
    model=model,
    data=test_data,
    epoch=1,
    is_training=False,
)

Epoch 1. Evaluating


HBox(children=(FloatProgress(value=0.0, max=109.0), HTML(value='')))

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "../python/data_loader.py", line 167, in __getitem__
    return self.getitem_complex(index)
  File "../python/data_loader.py", line 196, in getitem_complex
    matrix, features, mask, src, pos = self.getitem_simple(index)
  File "../python/data_loader.py", line 181, in getitem_simple
    features = np.pad(graph['annotations'],
  File "<__array_function__ internals>", line 5, in pad
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/numpy/lib/arraypad.py", line 748, in pad
    pad_width = _as_pairs(pad_width, array.ndim, as_index=True)
  File "/home/af9562/.conda/envs/import/lib/python3.8/site-packages/numpy/lib/arraypad.py", line 519, in _as_pairs
    raise ValueError("index can't contain negative values")
ValueError: index can't contain negative values
