# Importing some utilities

In [None]:
import sys
sys.path.append('../python/')
from json2graph import jsonFile2graph
import networkx as nx
from vocabulary import Vocabulary
from graphUtils import plot_graph, graph2data, data2graph
from neuralmodel import GNN_MoRec

# Experiment configuration (edit these two values to run another experiment).
# generator must belong to {VIATRA,RANDOMEMF,ALLOY,RAND}.
# It selects the sub-folder of ../syntheticGraphs/ that is loaded and is part
# of the file name of the trained model.
generator = 'VIATRA'
# modelType must belong to {Ecore,RDS,Yakindu}.
# It selects the data folders that are read and the edge types for which
# addOpposite creates inverse edges.
modelType = 'Yakindu'

# Generate dataset

Generating the vocabularies. Each of these objects contains a dictionary that maps each node type or edge type to an integer in $[0, n-1]$, where $n$ is the size of the vocabulary.

In [None]:
# Two independent vocabularies, one for edge types and one for node types.
# The same two objects are passed to graph2data for both the synthetic and the
# real graphs, to the GNN_MoRec constructor and to data2graph.
vocab_edges = Vocabulary()
vocab_nodes = Vocabulary()

Function that takes a graph and adds the inverse edges (see background section in the paper).

In [None]:
def addOpposite(G, model_type=None):
    """Add an inverse edge for every edge whose type is invertible.

    For each edge ``n1 -> n2`` whose ``'type'`` attribute belongs to the
    invertible edge types of the selected modelling language, an edge
    ``n2 -> n1`` of type ``'<type>_inv'`` is added to ``G`` in place.

    Args:
        G: multigraph whose ``G.edges`` yields ``(source, target, key)``
            triples and whose edges carry a ``'type'`` attribute.
        model_type: 'Ecore', 'RDS' or 'Yakindu'. When omitted, the
            notebook-level ``modelType`` setting is used.

    Returns:
        The same graph object ``G`` (mutated in place).

    Raises:
        ValueError: if the model type is not one of the supported ones.
            Previously this case silently returned ``None``.
    """
    # Edge types that receive an inverse edge, per modelling language.
    invertible_types = {
        'Ecore': {'eSuperTypes', 'eType', 'eOpposite'},
        'RDS': {'elements', 'columns', 'indexes', 'column', 'indexColumns'},
        'Yakindu': {'vertices', 'regions'},
    }
    if model_type is None:
        model_type = modelType
    if model_type not in invertible_types:
        raise ValueError(
            'Unsupported model type %r; expected one of %s'
            % (model_type, sorted(invertible_types)))
    wanted = invertible_types[model_type]

    # Collect first and add afterwards, so the edge view is not modified
    # while it is being iterated.
    to_add = []
    for n1, n2, e in G.edges:
        typee = G[n1][n2][e]['type']
        if typee in wanted:
            to_add.append((n2, n1, typee + '_inv'))
    for n1, n2, t in to_add:
        G.add_edge(n1, n2, type=t)
    return G

Loading the models (graphs) generated by the generator.

In [None]:
import glob

def passFilter(G, min_nodes=4):
    """Tell whether a graph is large enough to be kept in the dataset.

    Args:
        G: any sized object; for a networkx graph ``len(G)`` is its number
            of nodes.
        min_nodes: minimum size required to keep the graph. Defaults to 4,
            the threshold previously hard-coded here.

    Returns:
        bool: True when ``len(G) >= min_nodes``.
    """
    return len(G) >= min_nodes

# Load the synthetic graphs produced by the selected generator.
# glob returns paths in a file-system dependent order; sorting makes the
# content of `mine` (and therefore the seeded sampling, shuffling and split
# performed later) reproducible across machines.
files = sorted(glob.glob("../syntheticGraphs/"+generator+"/"+modelType+"/*.json"))
mine = []
load_errors = []  # (path, exception) for every file that could not be loaded
for f in files:
    try:
        G = jsonFile2graph(f)
        G = addOpposite(G)
        if not passFilter(G):
            # Too small: deliberately left out, this is not an error.
            continue
        # 0 is the class label given to synthetic graphs.
        data = graph2data(G,0,vocab_nodes,vocab_edges)
    except Exception as err:
        # Best effort: a broken file is skipped, but it is recorded and
        # reported below instead of being silently ignored. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        load_errors.append((f, err))
        continue
    mine.append(data)

if load_errors:
    print('Skipped', len(load_errors), 'of', len(files),
          'synthetic files that could not be loaded; first error:',
          load_errors[0][0], repr(load_errors[0][1]))

Loading the real models (graphs).

In [None]:
# Load the real graphs. glob returns paths in a file-system dependent order;
# sorting makes the content of `real` (and the seeded sampling, shuffling and
# split performed later) reproducible across machines.
files = sorted(glob.glob("../realGraphs/"+modelType+"/R1/*.json"))
real = []
for f in files:
    G = jsonFile2graph(f)
    G = addOpposite(G)
    # 1 is the class label given to real graphs.
    data = graph2data(G,1,vocab_nodes,vocab_edges)
    real.append(data)

Undersampling the larger class to obtain a balanced dataset, i.e., a $50/50$ split between synthetic and real graphs.

In [None]:
import random
random.seed(123)

# Balance the classes: whichever list is longer is randomly cut down to the
# size of the shorter one. Nothing is sampled when both already match.
n_balanced = min(len(mine), len(real))
if len(mine) > n_balanced:
    mine = random.sample(mine, n_balanced)
elif len(real) > n_balanced:
    real = random.sample(real, n_balanced)

Merging the real and generated graphs:

In [None]:
import random
# Concatenate both classes and shuffle them with a fixed seed.
random.seed(3)
dataset = mine + real
random.shuffle(dataset)
# This is the size of the whole dataset: the train/validation/test split has
# not happened yet, so the former 'Len train:' label was misleading.
print('Len dataset:', len(dataset))

Splitting the dataset into train, validation, and test sets (60%, 15%, and the remaining ~25%, respectively).

In [None]:
from torch.utils.data import random_split
import torch
# 60% train, 15% validation; the test set takes whatever is left so that the
# three sizes always add up to the dataset size.
n_total = len(dataset)
train_len = int(0.6*n_total)
val_len = int(0.15*n_total)
test_len = n_total - train_len - val_len

# Dedicated, seeded generator so the split is the same on every run.
split_rng = torch.Generator().manual_seed(42)
train, val, test = random_split(dataset, [train_len, val_len, test_len],
                                generator=split_rng)

# Training phase

Generating loaders to train the model.

In [None]:
from torch_geometric.data import DataLoader
# Training graphs are served in shuffled mini-batches of 32; validation graphs
# are served one at a time. Both loaders use the same number of workers.
loader_workers = 5
train_loader = DataLoader(train, batch_size=32, shuffle=True,
                          num_workers=loader_workers)
val_loader = DataLoader(val, batch_size=1, shuffle=True,
                        num_workers=loader_workers)

Function that evaluates the accuracy of the model on a given loader (used below with the validation set).

In [None]:
def evaluation(model, loader):
    """Compute the classification accuracy of ``model`` over ``loader``.

    Every batch is moved to the device holding the model parameters, the
    model output is thresholded at 0.5 and the resulting labels are compared
    with ``data.y``. Unlike the previous version, every graph of a batch is
    scored, so the result is also correct for ``batch_size > 1``; for
    ``batch_size == 1`` the value is unchanged.

    Args:
        model: ``torch.nn.Module`` called as
            ``model(x, edge_index, edge_attr, batch)``; expected to return
            one probability per graph of the batch.
        loader: iterable of batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``batch`` and ``y``.

    Returns:
        float: fraction of correctly classified graphs, or 0.0 when the
        loader yields no graph (previously a ZeroDivisionError).
    """
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            pred = model(data.x.to(device), data.edge_index.to(device),
                         torch.squeeze(data.edge_attr.to(device), dim=1),
                         data.batch.to(device))
            # Flatten so that one entry corresponds to one graph.
            probs = pred.reshape(-1)
            labels = data.y.reshape(-1).long().to(probs.device)
            correct += ((probs > 0.5).long() == labels).sum().item()
            total += labels.numel()

    if total == 0:
        return 0.0
    return correct / total

Training procedure with early stopping:

In [None]:
from nnUtils import EarlyStopping
import torch
import torch.nn as nn

# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 200

# File where the trained model is stored, one per (modelType, generator) pair.
path_to_model = './trainedModels/'+modelType+'-'+generator+'-GNN'

# Classifier (moved to the GPU) and binary cross-entropy loss.
model = GNN_MoRec(64, 64, 0.0, vocab_nodes, vocab_edges).cuda()
criterion = nn.BCELoss()

# Adam optimiser plus an early-stopping helper that is fed the validation
# accuracy ('max' mode) with a patience of 50.
opt = torch.optim.Adam(model.parameters(), lr=0.001)
es = EarlyStopping(opt, model, path_to_model, mode='max', patience=50)


# Training loop: one pass over train_loader per epoch, followed by an
# evaluation on the validation set that drives early stopping.
for e in range(epochs):
    total_loss = 0.0
    # Number of batches processed in this epoch. It starts at 0 (it used to
    # start at 1, which made the printed mean loss divide by batches + 1).
    b = 0
    model.train()
    for data in train_loader:

        opt.zero_grad()

        pred = model(data.x.cuda(), data.edge_index.cuda(),
          torch.squeeze(data.edge_attr.cuda(),dim=1),data.batch.cuda())

        # Flatten instead of squeeze: squeeze turns the output of a batch
        # holding a single graph into a 0-dim tensor, whose shape no longer
        # matches the target expected by BCELoss.
        loss = criterion(pred.reshape(-1), data.y.float().cuda().reshape(-1))
        total_loss += loss.item()

        loss.backward()
        opt.step()
        b = b + 1

    val_acc = evaluation(model, val_loader)
    # max(b, 1) avoids a division by zero when the training loader is empty.
    print('Epoch',e,'Loss',total_loss/max(b, 1))
    print('Eval',val_acc)

    # es.step returns a truthy value when training should stop.
    if es.step(val_acc,e):
        break
        
#Resultant model

def _load_trained_model(path):
    """Build a fresh GNN_MoRec on the GPU and load the weights stored at ``path``.

    The file is expected to contain a dictionary with a 'model_state_dict'
    entry. The returned model is in evaluation mode.
    """
    loaded = GNN_MoRec(64,64,0.0,vocab_nodes,vocab_edges).cuda()
    checkpoint = torch.load(path)
    loaded.load_state_dict(checkpoint['model_state_dict'])
    loaded.eval()
    return loaded


model2 = None
try:
    # Preferred source: the file already present at path_to_model.
    model2 = _load_trained_model(path_to_model)
except Exception as err:
    # Fallback: the file is missing or unusable, so the weights of the model
    # currently in memory are saved and then reloaded. The reason is printed
    # instead of being hidden, and Ctrl-C is no longer swallowed.
    print('Could not load', path_to_model, '-', repr(err))
    print('Saving model')
    torch.save({
            'model_state_dict': model.state_dict()
            }, path_to_model)
    model2 = _load_trained_model(path_to_model)


# Performing C2ST

Evaluating the model over the test set and reporting the accuracy.

In [None]:
# One graph per batch, so each iteration below scores exactly one test graph.
test_loader = DataLoader(test, batch_size=1, num_workers = 5, shuffle=True)

model2.eval()
count = 0  # number of correctly classified test graphs
# i0 and i1 are not used in this cell; they are kept in case other cells
# reference them.
i0 = 0
i1 = 0
# Pure inference: no_grad avoids building the autograd graph for every graph.
with torch.no_grad():
    for data in test_loader:

        pred = model2(data.x.cuda(), data.edge_index.cuda(),
              torch.squeeze(data.edge_attr,dim=1).cuda(),data.batch.cuda())
        # Threshold the output at 0.5 to obtain the predicted class.
        if pred[0].item() > 0.5:
            pred = 1
        else:
            pred = 0
        if pred == data.y.long().item():
            count = count + 1

print('Acc', count/len(test_loader))

Performing C2ST using the accuracy and the length of the test set.

In [None]:
from C2ST import C2ST_pvalue

# The test loader yields one graph per batch, so its length is the number of
# test samples.
n_test = len(test_loader)
acc = count/n_test
print('p-value:', C2ST_pvalue(acc,n_test))
print('samples', n_test)

# Interpreting

For every graph in the test set that is synthetic and that the model confidently classifies as synthetic (output below $0.1$), the attention map is plotted over the graph.

In [None]:
from interpretation import heatMap, plot_graph_attention, importantSubgraph, getMapAttention
# Plot the attention map of every synthetic test graph (label 0) that the
# model confidently classifies as synthetic (output below 0.1).
i = 0  # number of graphs plotted so far
for data in test:
    G = data2graph(data,vocab_nodes,vocab_edges)
    # A single graph is processed at a time: every node belongs to batch 0.
    batch = torch.zeros(len(G)).long()

    x = data.x.cuda()
    edge_index = data.edge_index.cuda()
    edge_attr = torch.squeeze(data.edge_attr.cuda(),dim=1)
    batch = batch.cuda()

    # Pure inference: no gradients are needed for the prediction or the
    # attention weights.
    with torch.no_grad():
        pred = model2(x, edge_index, edge_attr, batch)
        if not (pred[0].item() < 0.1 and data.y.item() == 0):
            # Not a confidently detected synthetic graph: skip it before
            # paying for the attention computation.
            continue
        atts = model2.getAttentions(x, edge_index, edge_attr, batch)

    map_colors = getMapAttention(G,atts)
    plot_graph_attention(G,map_colors)
    #plot_graph_attention(importantSubgraph(G, atts.detach().cpu().numpy(), 0.2, 2),map_colors)
    #heatMap(G,atts,str(i),'./interpretation/'+modelType+'/'+generator+'/')
    #heatMap(importantSubgraph(G, atts.detach().cpu().numpy(), 0.2, 2),atts,str(i),
    #        './interpretation/'+modelType+'/'+generator+'/subgraph/')
    i = i + 1
    print('--'*50)