# Importing some utilities

In [18]:
import sys
sys.path.append('../python/')
from json2graph import jsonFile2graph
import networkx as nx
from vocabulary import Vocabulary
from graphUtils import plot_graph, graph2data, data2graph
from neuralmodel import GNN_MoRec

generator = 'VIATRA'

# Generate dataset

Generating the vocabularies. These objects contains a dictionary that associates each node type or edge type to an integer between $[0\dots n-1]$ where $n$ is the length of the vocabulary.

In [19]:
vocab_edges = Vocabulary()
vocab_nodes = Vocabulary()

Function that takes a graph and adds the inverse edges (see background section in the paper).

In [20]:
def addOpposite(G):
    to_add = []
    for n1,n2,e in G.edges:
        typee = G[n1][n2][e]['type']
        if (typee == 'eSuperTypes' or
           typee == 'eType' or typee == 'eOpposite'):
            to_add.append((n2,n1,typee+'_inv'))
    for n1,n2,t in to_add:
        G.add_edge(n1,n2,type=t)
    return G

Loading the models (graphs) generated by the generator.

In [21]:
import glob

def passFilter(G):
    return len(G) >= 4

files = glob.glob("../syntheticGraphs/"+generator+"/"+'Ecore'+"/*.json")
mine = []
for f in files:
    try:
        G = jsonFile2graph(f)
        G = addOpposite(G)
        if not passFilter(G):
            continue
        data = graph2data(G,0,vocab_nodes,vocab_edges)
        mine.append(data)
    except:
        continue

Loading the real models (graphs).

In [22]:
files = glob.glob("../realGraphs/"+'Ecore'+"/R1/*.json")
real = []
for f in files:
    G = jsonFile2graph(f)
    G = addOpposite(G)
    data = graph2data(G,1,vocab_nodes,vocab_edges)
    real.append(data)

Undersampling the dataset to obtain a balanced dataset i.e., $50/50$.

In [23]:
import random
random.seed(123)

if len(mine) > len(real):
    mine = random.sample(mine,len(real))
elif len(mine) < len(real):
    real = random.sample(real,len(mine))

Merging the real and generated graphs:

In [24]:
import random
random.seed(3)
dataset = mine + real
random.shuffle(dataset)
print('Len train:', len(dataset))

Len train: 500


Splitting the set into train test and validation.

In [25]:
from torch.utils.data import random_split
import torch
train_len = int(0.6*len(dataset))
val_len = int(0.15*len(dataset))
test_len = len(dataset) - int(0.6*len(dataset)) - int(0.15*len(dataset))
train, val, test = random_split(dataset, [train_len, val_len ,test_len], 
                                generator=torch.Generator().manual_seed(42))

# Training phase

Generating loaders to train the model.

In [26]:
from torch_geometric.data import DataLoader
train_loader = DataLoader(train, batch_size=32, num_workers = 5, shuffle=True)
val_loader = DataLoader(val, batch_size=1, num_workers = 5, shuffle=True)

Function that evaluate the model using the validation set.

In [27]:
def evaluation(model, loader):
    model.eval()
    count = 0.0
    with torch.no_grad():
        for data in loader:
            pred = model(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr.cuda(),dim=1),data.batch.cuda())
            if pred[0].item() > 0.5:
                pred = 1
            else:
                pred = 0
            if pred == data.y.long().item():
                count = count + 1
    return count/len(loader)

Training procedure with early stopping:

In [28]:
from nnUtils import EarlyStopping
import torch
import torch.nn as nn

path_to_model = './trainedModels/Ecore'+'-'+generator+'-GNN'
model = GNN_MoRec(64,64,0.0,vocab_nodes,vocab_edges).cuda()

epochs = 200
criterion = nn.BCELoss()

opt = torch.optim.Adam(model.parameters(), lr=0.001)
es = EarlyStopping(opt, model, path_to_model,mode='max',patience=50)


for e in range(epochs):
    total_loss = 0.0
    b = 1
    model.train()
    for data in train_loader:
        
        opt.zero_grad()
        
        pred = model(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr.cuda(),dim=1),data.batch.cuda())
        
        loss = criterion(torch.squeeze(pred), data.y.float().cuda())
        total_loss += loss.item()
        
        loss.backward()
        opt.step()
        b = b + 1
        
    val_acc = evaluation(model, val_loader)
    print('Epoch',e,'Loss',total_loss/b)
    print('Eval',val_acc)
    
    if es.step(val_acc,e):
        break
        
#Resultant model

model2 = None
try:
    model2 = GNN_MoRec(64,64,0.0,vocab_nodes,vocab_edges).cuda()
    checkpoint = torch.load(path_to_model)
    model2.load_state_dict(checkpoint['model_state_dict'])
    model2.eval()
except:
    print('Saving model')
    torch.save({
            'model_state_dict': model.state_dict()
            }, path_to_model)
    model2 = GNN_MoRec(64,64,0.0,vocab_nodes,vocab_edges).cuda()
    checkpoint = torch.load(path_to_model)
    model2.load_state_dict(checkpoint['model_state_dict'])
    model2.eval()


Epoch 0 Loss 0.3816955780441111
Eval 0.8666666666666667
Epoch 1 Loss 0.2512251856652173
Eval 0.88
Epoch 2 Loss 0.1979134665294127
Eval 0.88
Epoch 3 Loss 0.20431396974758667
Eval 0.88
Epoch 4 Loss 0.2025154862891544
Eval 0.8933333333333333
Epoch 5 Loss 0.171533156525005
Eval 0.8666666666666667
Epoch 6 Loss 0.14951868558471854
Eval 0.9066666666666666
Epoch 7 Loss 0.132079567252235
Eval 0.8933333333333333
Epoch 8 Loss 0.11733673766932705
Eval 0.8933333333333333
Epoch 9 Loss 0.12481300465085289
Eval 0.9066666666666666
Epoch 10 Loss 0.1327871179038828
Eval 0.8933333333333333
Epoch 11 Loss 0.1059197641231797
Eval 0.8933333333333333
Epoch 12 Loss 0.11424577608704567
Eval 0.9066666666666666
Epoch 13 Loss 0.09990355270830067
Eval 0.9066666666666666
Epoch 14 Loss 0.08505792170763016
Eval 0.8933333333333333
Epoch 15 Loss 0.09877586821940812
Eval 0.9066666666666666
Epoch 16 Loss 0.0891965196616101
Eval 0.88
Epoch 17 Loss 0.10983198119158094
Eval 0.9333333333333333
Epoch 18 Loss 0.09027370319447735

# Performing C2ST

Evaluating the model over the test set and reporting the accuracy.

In [29]:
test_loader = DataLoader(test, batch_size=1, num_workers = 5, shuffle=True)

model2.eval()
count = 0
i0 = 0
i1 = 0
for data in test_loader:
    
    pred = model2(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr,dim=1).cuda(),data.batch.cuda())
    if pred[0].item() > 0.5:
        pred = 1
    else:
        pred = 0
    if pred == data.y.long().item():
        count = count + 1
    
print('Acc', count/len(test_loader))

Acc 0.944


Performing C2ST using the accuracy and the length of the test set.

In [30]:
from C2ST import C2ST_pvalue

acc =  count/len(test_loader)
n_test = len(test_loader)
print('p-value:', C2ST_pvalue(acc,n_test))
print('samples', n_test)

p-value: 1.570266400322334e-23
samples 125


# Interpreting

For all graph in the test set that is synthetic and the model is sure that it is synthetic, the attention map is printed over it.

In [None]:
from interpretation import heatMap, plot_graph_attention, importantSubgraph, getMapAttention
i = 0
for data in test:
    G = data2graph(data,vocab_nodes,vocab_edges)
    batch = torch.zeros(len(G)).long()
    atts = model2.getAttentions(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr.cuda(),dim=1),batch.cuda())
    map_colors = getMapAttention(G,atts)
    
    pred = model2(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr.cuda(),dim=1),batch.cuda())
    if pred[0].item() < 0.1 and data.y.item() == 0:
        plot_graph_attention(G,map_colors)
        #plot_graph_attention(importantSubgraph(G, atts.detach().cpu().numpy(), 0.2, 2),map_colors)
        #heatMap(G,atts,str(i),'./interpretation/'+'Ecore'+'/'+generator+'/')
        #heatMap(importantSubgraph(G, atts.detach().cpu().numpy(), 0.2, 2),atts,str(i),
        #        './interpretation/'+'Ecore'+'/'+generator+'/subgraph/')
        i = i + 1
        print('--'*50)