In [1]:
import sys
sys.path.append('../')
import dmg.model2graph.model2graph as m2g
import dmg.model2graph.metafilter as mf
from networkx.algorithms.isomorphism import is_isomorphic
import dmg.graphUtils as gu
import glob
import dmg.rds.rdsPallete as rds
import random
random.seed(123)

# Load dataset

In [2]:
metafilter_refs = ['Database.elements', 
                           'Table.indexes',
                           'Table.columns',
                           'Index.indexColumns',
                           'IndexColumn.column',
                           'Reference.primaryKeyColumns',
                           'Reference.foreignKeyColumns',
                           'Column.primaryReferences',
                           'Column.foreignReferences']
metafilter_cla = ['Database', 'Column','Table',
                          'Index', 'IndexColumn','Reference']  
metafilter_atts = None
metafilterobj = mf.MetaFilter(references = metafilter_refs, 
                     attributes = metafilter_atts,
                     classes = metafilter_cla)
meta_models = ['../data/metamodels/rds_manual.ecore']

In [3]:
files = glob.glob("../data/rdsDataset/train/*")
graphs = []
for f in files:
    graphs.append(m2g.getGraphFromModel(f, 
                              meta_models, metafilterobj,
                              consider_atts = False))

In [4]:
print('Number of graphs:', len(graphs))

Number of graphs: 362


In [5]:
files = glob.glob("../data/rdsDataset/val/*")
graphs_val = []
for f in files:
    graphs_val.append(m2g.getGraphFromModel(f, 
                              meta_models, metafilterobj,
                              consider_atts = False))

In [6]:
print('Number of graphs:', len(graphs_val))

Number of graphs: 122


In [10]:
from torch_geometric.data import DataLoader
from dmg.deeplearning.dataGeneration import sequence2data, data2graph

listDatas_val = []
batch_size = 64
max_len = 3
print('Preparing seqs')
for g in graphs_val:
    sequence = rds.rds_pallete.graphToSequence(g)
    if len(sequence[-1][0].edges()) == 0:
        continue
    listDatas_val = listDatas_val + sequence2data(sequence, rds.rds_pallete, max_len)
loader_val = DataLoader(listDatas_val, batch_size=batch_size, 
                        num_workers = 0, 
                        shuffle=False)
print('Seqs finished')

Preparing seqs
Seqs finished


In [11]:
do_eval = False

if not do_eval:
    graphs = graphs + graphs_val

In [12]:
print('Number of graphs:', len(graphs))

Number of graphs: 484


# Training

In [21]:
from dmg.deeplearning.generativeModel import GenerativeModel
import torch
import torch.nn as nn
import multiprocess as mp

def f(g):
    import dmg.rds.rdsPallete as rds
    from dmg.deeplearning.dataGeneration import sequence2data
    max_len = 3
    sequence = rds.rds_pallete.graphToSequence(g)
    if sequence[-1][0].edges()==0:
        return None
    return sequence2data(sequence, rds.rds_pallete, max_len)

epochs = 100
hidden_dim = 128


criterion_node = nn.CrossEntropyLoss(reduction = 'mean',ignore_index=-1)
criterion_action = nn.CrossEntropyLoss(reduction = 'mean')
criterion_finish = nn.BCELoss(reduction = 'mean')
model = GenerativeModel(hidden_dim, rds.dic_nodes_rds, rds.dic_edges_rds, rds.dic_operations_rds)
opt = torch.optim.Adam(model.parameters(), lr=0.001)
#scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.1)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    listDatas = []
    #preparing training set
    print('Preparing seqs')
    #for g in graphs:
    #    sequence = yp.yakindu_pallete.graphToSequence(g)
    #    listDatas = listDatas + sequence2data(sequence, yp.yakindu_pallete, max_len)
    with mp.Pool(5) as pool:
        listDatas = pool.map(f, graphs)
    listDatas = [r for rr in listDatas if rr != None for r in rr]
    print('Seqs finished')
    loader = DataLoader(listDatas, batch_size=batch_size, 
                            num_workers = 0, 
                            shuffle=False)
    #training
    for data in loader:
        opt.zero_grad()
        action, nodes, finish = model(data.x, data.edge_index, 
                        torch.squeeze(data.edge_attr,dim=1), 
                data.batch, data.sequence, data.nodes, data.len_seq, data.action)
        
        nodes = torch.unsqueeze(nodes, dim = 2).repeat(1,1,2)
        nodes[:,:,0] = 1 - nodes[:,:,1]
            
        L = torch.max(data.len_seq).item()
        gTruth = data.sequence_masked[:,0:L]
        loss = (criterion_node(nodes.reshape(-1,2), gTruth.flatten()) +
                    criterion_action(action, data.action) +
                    criterion_finish(finish.flatten(), data.finished.float())) / 3
        total_loss += loss.item()
        loss.backward()
        opt.step()
    #validation
    if do_eval:
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for data in loader_val:
                action, nodes, finish = model(data.x, data.edge_index, 
                            torch.squeeze(data.edge_attr,dim=1), 
                    data.batch, data.sequence, data.nodes, data.len_seq, data.action)
                nodes = torch.unsqueeze(nodes, dim = 2).repeat(1,1,2)
                nodes[:,:,0] = 1 - nodes[:,:,1]

                L = torch.max(data.len_seq).item()
                gTruth = data.sequence_masked[:,0:L]
                loss = (criterion_node(nodes.reshape(-1,2), gTruth.flatten()) +
                        criterion_action(action, data.action) +
                        criterion_finish(finish.flatten(), data.finished.float())) / 3
                val_loss+= loss.item()
        
    print('Epoch',epoch,'Loss Traning',total_loss/(len(loader)))
    #scheduler.step()
    if do_eval:
        print('Epoch',epoch,'Loss Val',val_loss/(len(loader_val)))
        

Preparing seqs


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

# Loading test

In [None]:
files = glob.glob("../data/yakinduDataset/test/*")
graphs_test = []
for f in files:
    graphs_test.append(m2g.getGraphFromModel(f, 
                              meta_models, metafilterobj,
                              consider_atts = False))
print('Number of graphs:', len(graphs_test))    

# Sample models

In [None]:
from dmg.deeplearning.generativeModel import sampleGraph

model.eval()
samples = [sampleGraph(yp.G_initial_yak, yp.yakindu_pallete, model, 50) for i in range(len(graphs_test))]

# Visual comparison

## Size

In [None]:
import seaborn as sns

sns.distplot([len(G) for G in samples], hist=False, kde=True, 
             bins=int(180/5), color = 'red', label = 'NN')
sns.distplot([len(G) for G in graphs_test], hist=False, kde=True, 
             bins=int(180/5), color = 'blue', label = 'Real')

## Degree

In [None]:
import numpy as np
import dmg.realism.metrics as mt
sns.distplot([np.mean(mt.getListDegree(G)) for G in samples], hist=False, kde=True, 
             bins=int(180/5), color = 'red', label = 'NN')
sns.distplot([np.mean(mt.getListDegree(G)) for G in graphs_test], hist=False, kde=True, 
             bins=int(180/5), color = 'blue', label = 'Real')

## MPC

In [None]:
dims = list(yp.dic_edges_yak.keys())
sns.distplot([np.mean(list(mt.MPC(G,dims).values())) for G in samples], hist=False, kde=True, 
             bins=int(180/5), color = 'red', label = 'NN')
sns.distplot([np.mean(list(mt.MPC(G,dims).values())) for G in graphs_test], hist=False, kde=True, 
             bins=int(180/5), color = 'blue', label = 'Real')

# Check isomorf and consistency

In [None]:
import matplotlib.pyplot as plt

h = plt.hist([len(G) for G in samples], bins = 20)

In [None]:
h = plt.hist([len(G) for G in graphs], bins = 20)

In [None]:
#check isomorf
iso = []
for s in samples:
    for g in graphs:
        if (is_isomorphic(s,g,gu.node_match_type, gu.edge_match_type)):
            iso.append(s)
            break
print(len(iso)*100/len(samples),'% iso')
not_iso = [g for g in samples if not g in iso]

In [None]:
import numpy as np
import seaborn as sns

sns.set_theme(style="whitegrid")
ax = sns.boxplot(x=[len(G) for G in iso])
print('Mean size:', np.mean([len(G) for G in iso]))

In [None]:
from dmg.yakindu.yakinduConsistency import inconsistent
#check consistency
inconsistents = []
for s in samples:
    if inconsistent(s):
        inconsistents.append(s)
print(len(inconsistents)*100/len(samples),'% inconsistents')
not_inconsistents = [g for g in samples if not g in inconsistents]

In [None]:
ax = sns.boxplot(x=[len(G) for G in inconsistents])
print('Mean size:', np.mean([len(G) for G in inconsistents]))

In [None]:
clean_new_models = [g for g in not_iso if not g in inconsistents]

In [None]:
ax = sns.boxplot(x=[len(G) for G in clean_new_models])
print('Mean size:', np.mean([len(G) for G in clean_new_models]))

In [None]:
ax = sns.boxplot(x=[len(G) for G in graphs])
print('Mean size:', np.mean([len(G) for G in graphs]))

In [None]:
print(len(clean_new_models),'clean models')
print(len(clean_new_models)*100/len(samples),'% clean models')

# Plot a sample of clean models

In [None]:
dot = gu.plotGraphViz(random.sample(clean_new_models,1)[0])
dot.format = 'pdf'
dot.view(filename='example', directory='./')

In [None]:
#m2g.getModelFromGraph(['../data/metamodels/yakinduSimplified.ecore'], clean_new_models[0])

In [None]:
m2g.serializeGraphModel('example.xmi',['../data/metamodels/yakinduSimplified.ecore'], 'Statechart', clean_new_models[0])